git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
msgr: simple exponential backoff, with tunable initial and max delay
author Sage Weil <sage@newdream.net>
Tue, 13 Oct 2009 06:04:24 +0000 (23:04 -0700)
committer Sage Weil <sage@newdream.net>
Tue, 13 Oct 2009 17:27:06 +0000 (10:27 -0700)
src/config.cc
src/config.h
src/msg/SimpleMessenger.cc
src/msg/SimpleMessenger.h

index 3befe3e9f24bc29d3885f2ce2189b99f42a687bb..685af0977d59ad756f5c1aa721505e9368f3c1fc 100644 (file)
@@ -341,8 +341,8 @@ static struct config_option config_optionsp[] = {
        OPTION(clock_lock, 0, OPT_BOOL, false),
        OPTION(clock_tare, 0, OPT_BOOL, false),
        OPTION(ms_tcp_nodelay, 0, OPT_BOOL, true),
-       OPTION(ms_retry_interval, 0, OPT_DOUBLE, 2.0),  // how often to attempt reconnect
-       OPTION(ms_fail_interval, 0, OPT_DOUBLE, 15.0),  // fail after this long
+       OPTION(ms_initial_backoff, 0, OPT_DOUBLE, .2),
+       OPTION(ms_max_backoff, 0, OPT_DOUBLE, 15.0),
        OPTION(ms_die_on_failure, 0, OPT_BOOL, false),
        OPTION(ms_nocrc, 0, OPT_BOOL, false),
        OPTION(ms_die_on_bad_msg, 0, OPT_BOOL, false),
index 3b9a0824e8df215f4eb6f2524ef629297934c635..1abf65b5716786fce69d62c04d10cc53530c9147 100644 (file)
@@ -117,8 +117,8 @@ struct md_config_t {
   */
 
   bool ms_tcp_nodelay;
-  double ms_retry_interval;
-  double ms_fail_interval;
+  double ms_initial_backoff;
+  double ms_max_backoff;
   bool ms_die_on_failure;
   bool ms_nocrc;
   bool ms_die_on_bad_msg;
index 0e5e7a21c86a20683d6c356cb46ce1cd7b0d23f7..12bdd1f1e002cf683e306c65ff32e35bc1f44264 100644 (file)
@@ -1000,7 +1000,7 @@ int SimpleMessenger::Pipe::connect()
       state = STATE_OPEN;
       connect_seq = cseq + 1;
       assert(connect_seq == reply.connect_seq);
-      first_fault = last_attempt = utime_t();
+      backoff = utime_t();
       dout(20) << "connect success " << connect_seq << ", lossy = " << policy.lossy << dendl;
 
       if (!reader_running) {
@@ -1124,33 +1124,26 @@ void SimpleMessenger::Pipe::fault(bool onconnect, bool onread)
     return;
   } 
 
+
   utime_t now = g_clock.now();
   if (state != STATE_CONNECTING) {
     if (!onconnect)
       dout(0) << "fault initiating reconnect" << dendl;
     connect_seq++;
     state = STATE_CONNECTING;
-    first_fault = now;
-  } else if (first_fault.sec() == 0) {
+    backoff = utime_t();
+  } else if (backoff == utime_t()) {
     if (!onconnect)
       dout(0) << "fault first fault" << dendl;
-    first_fault = now;
+    backoff.set_from_double(g_conf.ms_initial_backoff);
   } else {
-
-#warning clean me up
-
-    utime_t failinterval = now - first_fault;
-    utime_t retryinterval = now - last_attempt;
-    if (!onconnect) dout(10) << "fault failure was " << failinterval 
-                            << " ago, last attempt was at " << last_attempt
-                            << ", " << retryinterval << " ago" << dendl;
-    // wait
-    now += 1.0;
-    dout(10) << "fault waiting until " << now << dendl;
-    cond.WaitUntil(lock, now);
+    dout(10) << "fault waiting " << backoff << dendl;
+    cond.WaitInterval(lock, backoff);
+    backoff += backoff;
+    if (backoff > g_conf.ms_max_backoff)
+      backoff.set_from_double(g_conf.ms_max_backoff);
     dout(10) << "fault done waiting or woke up" << dendl;
   }
-  last_attempt = now;
 }
 
 void SimpleMessenger::Pipe::fail()
index 230af5d55e20bc3f11d98abb1dae881d94f5407c..c385110b645aee5a90ac9e360f6f6431e70341f9 100644 (file)
@@ -103,8 +103,7 @@ private:
   protected:
     Connection *connection_state;
 
-    utime_t first_fault;   // time of original failure
-    utime_t last_attempt;  // time of last reconnect attempt
+    utime_t backoff;         // backoff time
 
     bool reader_running;
     bool writer_running;