From f81e6dc8abe3535a54cbf683992c95c4eaf4e186 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 12 Oct 2017 11:57:50 +0100 Subject: [PATCH] mon: handle monitor lag when killing mgrs Fixes: http://tracker.ceph.com/issues/20629 Signed-off-by: John Spray (cherry picked from commit 3c3776b30a2da3f5d2ece15c57713c0ce191c778) --- src/mon/MgrMonitor.cc | 25 ++++++++++++++++++++++++- src/mon/MgrMonitor.h | 5 +++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index 86895671583..a307dd4df5c 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -538,7 +538,28 @@ void MgrMonitor::tick() return; const auto now = ceph::coarse_mono_clock::now(); - const auto cutoff = now - std::chrono::seconds(g_conf->get_val<int64_t>("mon_mgr_beacon_grace")); + + const auto mgr_beacon_grace = std::chrono::seconds( + g_conf->get_val<int64_t>("mon_mgr_beacon_grace")); + + // Note that this is the mgr daemon's tick period, not ours (the + // beacon is sent with this period). + const auto mgr_tick_period = std::chrono::seconds( + g_conf->get_val<int64_t>("mgr_tick_period")); + + if (last_tick != ceph::coarse_mono_clock::time_point::min() + && (now - last_tick > (mgr_beacon_grace - mgr_tick_period))) { + // This case handles either local slowness (calls being delayed + // for whatever reason) or cluster election slowness (a long gap + // between calls while an election happened) + dout(4) << __func__ << ": resetting beacon timeouts due to mon delay " + "(slow election?) of " << now - last_tick << " seconds" << dendl; + for (auto &i : last_beacon) { + i.second = now; + } + } + + last_tick = now; // Populate any missing beacons (i.e. no beacon since MgrMonitor // instantiation) with the current time, so that they will @@ -556,6 +577,7 @@ void MgrMonitor::tick() // Cull standbys first so that any remaining standbys // will be eligible to take over from the active if we cull him. 
std::list<uint64_t> dead_standbys; + const auto cutoff = now - mgr_beacon_grace; for (const auto &i : pending_map.standbys) { auto last_beacon_time = last_beacon.at(i.first); if (last_beacon_time < cutoff) { @@ -614,6 +636,7 @@ void MgrMonitor::on_restart() { // Clear out the leader-specific state. last_beacon.clear(); + last_tick = ceph::coarse_mono_clock::now(); } diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h index f7fb506cd1b..82315d3550d 100644 --- a/src/mon/MgrMonitor.h +++ b/src/mon/MgrMonitor.h @@ -118,6 +118,11 @@ public: void count_metadata(const string& field, std::map<string,int> *out); friend class C_Updated; + + // When did the mon last call into our tick() method? Used for detecting + // when the mon was not updating us for some period (e.g. during slow + // election) to reset last_beacon timeouts + ceph::coarse_mono_clock::time_point last_tick; }; #endif -- 2.47.3