From 056151a6334c054505c54e59af40f203a0721f28 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Thu, 27 Mar 2014 21:33:21 -0700
Subject: [PATCH] mon/MonClient: use keepalive2 to verify the mon session is live

Verify that the mon is responding by checking the keepalive2 reply
timestamp. We cannot rely solely on TCP timing out and returning an
error.

Fixes: #7888
Signed-off-by: Sage Weil
---
 src/common/config_opts.h | 1 +
 src/mon/MonClient.cc | 18 +++++++++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 87ce3375d7092..69f9dcf6c90ca 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -241,6 +241,7 @@ OPTION(auth_service_ticket_ttl, OPT_DOUBLE, 60*60)
 OPTION(auth_debug, OPT_BOOL, false) // if true, assert when weird things happen
 OPTION(mon_client_hunt_interval, OPT_DOUBLE, 3.0) // try new mon every N seconds until we connect
 OPTION(mon_client_ping_interval, OPT_DOUBLE, 10.0) // ping every N seconds
+OPTION(mon_client_ping_timeout, OPT_DOUBLE, 30.0) // fail if we don't hear back
 OPTION(mon_client_hunt_interval_backoff, OPT_DOUBLE, 2.0) // each time we reconnect to a monitor, double our timeout
 OPTION(mon_client_hunt_interval_max_multiple, OPT_DOUBLE, 10.0) // up to a max of 10*default (30 seconds)
 OPTION(mon_client_max_log_entries_per_message, OPT_INT, 1000)
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
index f494242e6df2d..fcb622bd495cd 100644
--- a/src/mon/MonClient.cc
+++ b/src/mon/MonClient.cc
@@ -628,6 +628,11 @@ void MonClient::_reopen_session(int rank, string name)
   state = MC_STATE_NEGOTIATING;
   hunting = true;
 
+  // send an initial keepalive to ensure our timestamp is valid by the
+  // time we are in an OPENED state (by sequencing this before
+  // authentication).
+  messenger->send_keepalive(cur_con.get());
+
   MAuth *m = new MAuth;
   m->protocol = 0;
   m->monmap_epoch = monmap.get_epoch();
@@ -696,9 +701,20 @@ void MonClient::tick()
       _renew_subs();
 
     messenger->send_keepalive(cur_con.get());
-    
+
     if (state == MC_STATE_HAVE_SESSION) {
       send_log();
+
+      if (cct->_conf->mon_client_ping_timeout > 0 &&
+          cur_con->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+        utime_t lk = cur_con->get_last_keepalive_ack();
+        utime_t interval = ceph_clock_now(cct) - lk;
+        if (interval > cct->_conf->mon_client_ping_timeout) {
+          ldout(cct, 1) << "no keepalive since " << lk << " (" << interval
+                        << " seconds), reconnecting" << dendl;
+          _reopen_session();
+        }
+      }
     }
   }
 }
-- 
2.39.5