From: Patrick Donnelly Date: Fri, 27 Aug 2021 01:16:30 +0000 (-0400) Subject: mon: do not quickly mark mds laggy when MON_DOWN X-Git-Tag: v17.1.0~872^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=89c901a4944158c21eb26d53676a709fa2964c9d;p=ceph.git mon: do not quickly mark mds laggy when MON_DOWN The MDS may be sending beacons to a partitioned or newly restarted monitor. This will wrongly cause the current leader to believe the MDS is unavailable. Fixes: https://tracker.ceph.com/issues/43216 Signed-off-by: Patrick Donnelly --- diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index 526fc44607e8..8754f4fb6b80 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -764,6 +764,13 @@ options: services: - mon with_legacy: true +- name: mds_beacon_mon_down_grace + type: secs + level: advanced + desc: tolerance in seconds for missed MDS beacons to monitors + fmt_desc: The interval without beacons before Ceph declares an MDS laggy + when a monitor is down. 
+ default: 1_min # skip safety assertions on FSMap (in case of bugs where we want to continue anyway) - name: mon_mds_skip_sanity type: bool diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 928fce753d0a..0ad30683e239 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -2136,6 +2136,11 @@ bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap) // check beacon timestamps std::vector<mds_gid_t> to_remove; + const bool mon_down = mon.is_mon_down(); + const auto mds_beacon_mon_down_grace = + g_conf().get_val<std::chrono::seconds>("mds_beacon_mon_down_grace"); + const auto quorum_age = std::chrono::seconds(mon.quorum_age()); + const bool new_quorum = quorum_age < mds_beacon_mon_down_grace; for (auto it = last_beacon.begin(); it != last_beacon.end(); ) { auto& [gid, beacon_info] = *it; auto since_last = std::chrono::duration<double>(now-beacon_info.stamp); @@ -2152,6 +2157,14 @@ bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap) << " (gid: " << gid << " addr: " << info.addrs << " state: " << ceph_mds_state_name(info.state) << ")" << " since " << since_last.count() << dendl; + if ((mon_down || new_quorum) && since_last < mds_beacon_mon_down_grace) { + /* The MDS may be sending beacons to a monitor not yet in quorum or + * temporarily partitioned. Hold off on removal for a little longer... + */ + dout(10) << "deferring removal for mds_beacon_mon_down_grace during MON_DOWN" << dendl; + ++it; + continue; + } // If the OSDMap is writeable, we can blocklist things, so we can // try failing any laggy MDS daemons. Consider each one for failure. 
if (!info.laggy()) { diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index 330ecbc70d95..4c9566b396a3 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -221,6 +221,13 @@ public: return age.count(); } + bool is_mon_down() const { + int max = monmap->size(); + int actual = get_quorum().size(); + auto now = ceph::real_clock::now(); + return actual < max && now > monmap->created.to_real_time(); + } + // -- elector -- private: std::unique_ptr<Paxos> paxos;