From: Sage Weil Date: Fri, 28 Mar 2014 04:33:21 +0000 (-0700) Subject: mon/MonClient: use keepalive2 to verify the mon session is live X-Git-Tag: v0.67.8~24^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=80e0a0a8fee2f6f903f612734b2cc72eae703eae;p=ceph.git mon/MonClient: use keepalive2 to verify the mon session is live Verify that the mon is responding by checking the keepalive2 reply timestamp. We cannot rely solely on TCP timing out and returning an error. Fixes: #7888 Signed-off-by: Sage Weil (cherry picked from commit 056151a6334c054505c54e59af40f203a0721f28) --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index b6d92af0914..b41db5a1783 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -226,6 +226,7 @@ OPTION(auth_service_ticket_ttl, OPT_DOUBLE, 60*60) OPTION(auth_debug, OPT_BOOL, false) // if true, assert when weird things happen OPTION(mon_client_hunt_interval, OPT_DOUBLE, 3.0) // try new mon every N seconds until we connect OPTION(mon_client_ping_interval, OPT_DOUBLE, 10.0) // ping every N seconds +OPTION(mon_client_ping_timeout, OPT_DOUBLE, 30.0) // fail if we don't hear back OPTION(mon_client_hunt_interval_backoff, OPT_DOUBLE, 2.0) // each time we reconnect to a monitor, double our timeout OPTION(mon_client_hunt_interval_max_multiple, OPT_DOUBLE, 10.0) // up to a max of 10*default (30 seconds) OPTION(mon_client_max_log_entries_per_message, OPT_INT, 1000) diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc index c5903685c7f..d726f88dc6f 100644 --- a/src/mon/MonClient.cc +++ b/src/mon/MonClient.cc @@ -556,6 +556,11 @@ void MonClient::_reopen_session(int rank, string name) state = MC_STATE_NEGOTIATING; hunting = true; + // send an initial keepalive to ensure our timestamp is valid by the + // time we are in an OPENED state (by sequencing this before + // authentication). + messenger->send_keepalive(cur_con.get()); + MAuth *m = new MAuth; m->protocol = 0; m->monmap_epoch = monmap.get_epoch(); @@ -624,9 +629,20 @@ void MonClient::tick() _renew_subs(); messenger->send_keepalive(cur_con.get()); - + if (state == MC_STATE_HAVE_SESSION) { send_log(); + + if (cct->_conf->mon_client_ping_timeout > 0 && + cur_con->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) { + utime_t lk = cur_con->get_last_keepalive_ack(); + utime_t interval = ceph_clock_now(cct) - lk; + if (interval > cct->_conf->mon_client_ping_timeout) { + ldout(cct, 1) << "no keepalive since " << lk << " (" << interval + << " seconds), reconnecting" << dendl; + _reopen_session(); + } + } } }