From: John Spray Date: Thu, 12 Oct 2017 10:57:50 +0000 (+0100) Subject: mon: handle monitor lag when killing mgrs X-Git-Tag: v13.0.1~336^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3c3776b30a2da3f5d2ece15c57713c0ce191c778;p=ceph.git mon: handle monitor lag when killing mgrs Fixes: http://tracker.ceph.com/issues/20629 Signed-off-by: John Spray --- diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index befab26d9dcb..9a340d950eff 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -490,7 +490,28 @@ void MgrMonitor::tick() return; const auto now = ceph::coarse_mono_clock::now(); - const auto cutoff = now - std::chrono::seconds(g_conf->get_val<int64_t>("mon_mgr_beacon_grace")); + + const auto mgr_beacon_grace = std::chrono::seconds( + g_conf->get_val<int64_t>("mon_mgr_beacon_grace")); + + // Note that this is the mgr daemon's tick period, not ours (the + // beacon is sent with this period). + const auto mgr_tick_period = std::chrono::seconds( + g_conf->get_val<int64_t>("mgr_tick_period")); + + if (last_tick != ceph::coarse_mono_clock::time_point::min() + && (now - last_tick > (mgr_beacon_grace - mgr_tick_period))) { + // This case handles either local slowness (calls being delayed + // for whatever reason) or cluster election slowness (a long gap + // between calls while an election happened) + dout(4) << __func__ << ": resetting beacon timeouts due to mon delay " + "(slow election?) of " << now - last_tick << " seconds" << dendl; + for (auto &i : last_beacon) { + i.second = now; + } + } + + last_tick = now; // Populate any missing beacons (i.e. no beacon since MgrMonitor // instantiation) with the current time, so that they will @@ -508,6 +529,7 @@ void MgrMonitor::tick() // Cull standbys first so that any remaining standbys // will be eligible to take over from the active if we cull him. 
std::list<uint64_t> dead_standbys; + const auto cutoff = now - mgr_beacon_grace; for (const auto &i : pending_map.standbys) { auto last_beacon_time = last_beacon.at(i.first); if (last_beacon_time < cutoff) { @@ -566,6 +588,7 @@ void MgrMonitor::on_restart() { // Clear out the leader-specific state. last_beacon.clear(); + last_tick = ceph::coarse_mono_clock::now(); } diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h index 2f24c4ad1ce2..5507615af435 100644 --- a/src/mon/MgrMonitor.h +++ b/src/mon/MgrMonitor.h @@ -115,6 +115,11 @@ public: void count_metadata(const string& field, std::map<string,int> *out); friend class C_Updated; + + // When did the mon last call into our tick() method? Used for detecting + // when the mon was not updating us for some period (e.g. during slow + // election) to reset last_beacon timeouts + ceph::coarse_mono_clock::time_point last_tick; }; #endif