git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/MonClient: use keepalive2 to verify the mon session is live
author Sage Weil <sage@inktank.com>
Fri, 28 Mar 2014 04:33:21 +0000 (21:33 -0700)
committer Sage Weil <sage@inktank.com>
Fri, 28 Mar 2014 23:49:59 +0000 (16:49 -0700)
Verify that the mon is responding by checking the keepalive2 reply
timestamp.  We cannot rely solely on TCP timing out and returning an
error.

Fixes: #7888
Signed-off-by: Sage Weil <sage@inktank.com>
(cherry picked from commit 056151a6334c054505c54e59af40f203a0721f28)

src/common/config_opts.h
src/mon/MonClient.cc

index b6d92af0914f129d320149b318ae982dba1e3cfb..b41db5a178388c787620cdfca12ae0d4f3827e6e 100644 (file)
@@ -226,6 +226,7 @@ OPTION(auth_service_ticket_ttl, OPT_DOUBLE, 60*60)
 OPTION(auth_debug, OPT_BOOL, false)          // if true, assert when weird things happen
 OPTION(mon_client_hunt_interval, OPT_DOUBLE, 3.0)   // try new mon every N seconds until we connect
 OPTION(mon_client_ping_interval, OPT_DOUBLE, 10.0)  // ping every N seconds
+OPTION(mon_client_ping_timeout, OPT_DOUBLE, 30.0)   // fail if we don't hear back
 OPTION(mon_client_hunt_interval_backoff, OPT_DOUBLE, 2.0) // each time we reconnect to a monitor, double our timeout
 OPTION(mon_client_hunt_interval_max_multiple, OPT_DOUBLE, 10.0) // up to a max of 10*default (30 seconds)
 OPTION(mon_client_max_log_entries_per_message, OPT_INT, 1000)
index c5903685c7f5a94f15dba25ab825006d79a5838f..d726f88dc6fd3ee9905db5255f155533a9856422 100644 (file)
@@ -556,6 +556,11 @@ void MonClient::_reopen_session(int rank, string name)
   state = MC_STATE_NEGOTIATING;
   hunting = true;
 
+  // send an initial keepalive to ensure our timestamp is valid by the
+  // time we are in an OPENED state (by sequencing this before
+  // authentication).
+  messenger->send_keepalive(cur_con.get());
+
   MAuth *m = new MAuth;
   m->protocol = 0;
   m->monmap_epoch = monmap.get_epoch();
@@ -624,9 +629,20 @@ void MonClient::tick()
       _renew_subs();
 
     messenger->send_keepalive(cur_con.get());
-   
+
     if (state == MC_STATE_HAVE_SESSION) {
       send_log();
+
+      if (cct->_conf->mon_client_ping_timeout > 0 &&
+         cur_con->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+       utime_t lk = cur_con->get_last_keepalive_ack();
+       utime_t interval = ceph_clock_now(cct) - lk;
+       if (interval > cct->_conf->mon_client_ping_timeout) {
+         ldout(cct, 1) << "no keepalive since " << lk << " (" << interval
+                       << " seconds), reconnecting" << dendl;
+         _reopen_session();
+       }
+      }
     }
   }