git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/MonClient: use keepalive2 to verify the mon session is live 1556/head
author Sage Weil <sage@inktank.com>
Fri, 28 Mar 2014 04:33:21 +0000 (21:33 -0700)
committer Sage Weil <sage@inktank.com>
Fri, 28 Mar 2014 23:09:26 +0000 (16:09 -0700)
Verify that the mon is responding by checking the keepalive2 reply
timestamp.  We cannot rely solely on TCP timing out and returning an
error.

Fixes: #7888
Signed-off-by: Sage Weil <sage@inktank.com>
src/common/config_opts.h
src/mon/MonClient.cc

index 87ce3375d7092a006bc8036e843bd6dfcc8492bd..69f9dcf6c90cae4480eee304a768669aa8394d21 100644 (file)
@@ -241,6 +241,7 @@ OPTION(auth_service_ticket_ttl, OPT_DOUBLE, 60*60)
 OPTION(auth_debug, OPT_BOOL, false)          // if true, assert when weird things happen
 OPTION(mon_client_hunt_interval, OPT_DOUBLE, 3.0)   // try new mon every N seconds until we connect
 OPTION(mon_client_ping_interval, OPT_DOUBLE, 10.0)  // ping every N seconds
+OPTION(mon_client_ping_timeout, OPT_DOUBLE, 30.0)   // fail if we don't hear back
 OPTION(mon_client_hunt_interval_backoff, OPT_DOUBLE, 2.0) // each time we reconnect to a monitor, double our timeout
 OPTION(mon_client_hunt_interval_max_multiple, OPT_DOUBLE, 10.0) // up to a max of 10*default (30 seconds)
 OPTION(mon_client_max_log_entries_per_message, OPT_INT, 1000)
index f494242e6df2d67d1cf60426190dba3b4d737407..fcb622bd495cdd738baff9185f594361fbde1b7c 100644 (file)
@@ -628,6 +628,11 @@ void MonClient::_reopen_session(int rank, string name)
   state = MC_STATE_NEGOTIATING;
   hunting = true;
 
+  // send an initial keepalive to ensure our timestamp is valid by the
+  // time we are in an OPENED state (by sequencing this before
+  // authentication).
+  messenger->send_keepalive(cur_con.get());
+
   MAuth *m = new MAuth;
   m->protocol = 0;
   m->monmap_epoch = monmap.get_epoch();
@@ -696,9 +701,20 @@ void MonClient::tick()
       _renew_subs();
 
     messenger->send_keepalive(cur_con.get());
-   
+
     if (state == MC_STATE_HAVE_SESSION) {
       send_log();
+
+      if (cct->_conf->mon_client_ping_timeout > 0 &&
+         cur_con->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+       utime_t lk = cur_con->get_last_keepalive_ack();
+       utime_t interval = ceph_clock_now(cct) - lk;
+       if (interval > cct->_conf->mon_client_ping_timeout) {
+         ldout(cct, 1) << "no keepalive since " << lk << " (" << interval
+                       << " seconds), reconnecting" << dendl;
+         _reopen_session();
+       }
+      }
     }
   }