From: John Spray
Date: Thu, 12 Oct 2017 10:57:50 +0000 (+0100)
Subject: mon: handle monitor lag when killing mgrs
X-Git-Tag: v12.2.2~31^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F18726%2Fhead;p=ceph.git

mon: handle monitor lag when killing mgrs

Fixes: http://tracker.ceph.com/issues/20629
Signed-off-by: John Spray
(cherry picked from commit 3c3776b30a2da3f5d2ece15c57713c0ce191c778)
---

diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc
index 86895671583..a307dd4df5c 100644
--- a/src/mon/MgrMonitor.cc
+++ b/src/mon/MgrMonitor.cc
@@ -538,7 +538,28 @@ void MgrMonitor::tick()
     return;
 
   const auto now = ceph::coarse_mono_clock::now();
-  const auto cutoff = now - std::chrono::seconds(g_conf->get_val<int64_t>("mon_mgr_beacon_grace"));
+
+  const auto mgr_beacon_grace = std::chrono::seconds(
+      g_conf->get_val<int64_t>("mon_mgr_beacon_grace"));
+
+  // Note that this is the mgr daemon's tick period, not ours (the
+  // beacon is sent with this period).
+  const auto mgr_tick_period = std::chrono::seconds(
+      g_conf->get_val<int64_t>("mgr_tick_period"));
+
+  if (last_tick != ceph::coarse_mono_clock::time_point::min()
+      && (now - last_tick > (mgr_beacon_grace - mgr_tick_period))) {
+    // This case handles either local slowness (calls being delayed
+    // for whatever reason) or cluster election slowness (a long gap
+    // between calls while an election happened)
+    dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
+            "(slow election?) of " << now - last_tick << " seconds" << dendl;
+    for (auto &i : last_beacon) {
+      i.second = now;
+    }
+  }
+
+  last_tick = now;
 
   // Populate any missing beacons (i.e. no beacon since MgrMonitor
   // instantiation) with the current time, so that they will
@@ -556,6 +577,7 @@ void MgrMonitor::tick()
   // Cull standbys first so that any remaining standbys
   // will be eligible to take over from the active if we cull him.
   std::list<uint64_t> dead_standbys;
+  const auto cutoff = now - mgr_beacon_grace;
   for (const auto &i : pending_map.standbys) {
     auto last_beacon_time = last_beacon.at(i.first);
     if (last_beacon_time < cutoff) {
@@ -614,6 +636,7 @@ void MgrMonitor::on_restart()
 {
   // Clear out the leader-specific state.
   last_beacon.clear();
+  last_tick = ceph::coarse_mono_clock::now();
 }
 
 
diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h
index f7fb506cd1b..82315d3550d 100644
--- a/src/mon/MgrMonitor.h
+++ b/src/mon/MgrMonitor.h
@@ -118,6 +118,11 @@ public:
   void count_metadata(const string& field, std::map<string,int> *out);
 
   friend class C_Updated;
+
+  // When did the mon last call into our tick() method? Used for detecting
+  // when the mon was not updating us for some period (e.g. during slow
+  // election) to reset last_beacon timeouts
+  ceph::coarse_mono_clock::time_point last_tick;
 };
 
 #endif