From 88fca472196bd68cee0a5c8f650f80ea1ee18ba5 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 11 Jul 2017 19:36:32 -0700 Subject: [PATCH] MDSMonitor: show laggy MDSs at higher debug level Also, print laggy daemons even if the OSDMap is not yet writeable. This is mostly for operators wanting to see a more visible message that an MDS has been replaced. Related-to: http://tracker.ceph.com/issues/19706 Signed-off-by: Patrick Donnelly --- src/mon/MDSMonitor.cc | 55 ++++++++++++++++++++----------------------- src/mon/MDSMonitor.h | 2 +- 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 608e1aeedc3..ac22848dc48 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -1906,21 +1906,14 @@ bool MDSMonitor::maybe_expand_cluster(std::shared_ptr fs) * is available, fail this daemon (remove from map) and pass its * role to another daemon. */ -void MDSMonitor::maybe_replace_gid(mds_gid_t gid, - const beacon_info_t &beacon, +void MDSMonitor::maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose) { assert(mds_propose != nullptr); assert(osd_propose != nullptr); - const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid); const auto fscid = pending_fsmap.mds_roles.at(gid); - dout(10) << "no beacon from " << gid << " " << info.addr << " mds." - << info.rank << "." << info.inc - << " " << ceph_mds_state_name(info.state) - << " since " << beacon.stamp << dendl; - // We will only take decisive action (replacing/removing a daemon) // if we have some indicating that some other daemon(s) are successfully // getting beacons through recently. @@ -2144,32 +2137,36 @@ void MDSMonitor::tick() } } - // If the OSDMap is writeable, we can blacklist things, so we can - // try failing any laggy MDS daemons. Consider each one for failure. - if (mon->osdmon()->is_writeable()) { - bool propose_osdmap = false; - - map::iterator p = last_beacon.begin(); - while (p != last_beacon.end()) { - mds_gid_t gid = p->first; - auto beacon_info = p->second; - ++p; - - if (!pending_fsmap.gid_exists(gid)) { - // clean it out - last_beacon.erase(gid); - continue; - } + bool propose_osdmap = false; + bool osdmap_writeable = mon->osdmon()->is_writeable(); + auto p = last_beacon.begin(); + while (p != last_beacon.end()) { + mds_gid_t gid = p->first; + auto beacon_info = p->second; + ++p; - if (beacon_info.stamp < cutoff) { - maybe_replace_gid(gid, beacon_info, &do_propose, &propose_osdmap); - } + if (!pending_fsmap.gid_exists(gid)) { + // clean it out + last_beacon.erase(gid); + continue; } - if (propose_osdmap) { - request_proposal(mon->osdmon()); + if (beacon_info.stamp < cutoff) { + auto &info = pending_fsmap.get_info_gid(gid); + dout(1) << "no beacon from mds." << info.rank << "." << info.inc + << " (gid: " << gid << " addr: " << info.addr + << " state: " << ceph_mds_state_name(info.state) << ")" + << " since " << beacon_info.stamp << dendl; + // If the OSDMap is writeable, we can blacklist things, so we can + // try failing any laggy MDS daemons. Consider each one for failure. + if (osdmap_writeable) { + maybe_replace_gid(gid, info, &do_propose, &propose_osdmap); + } } } + if (propose_osdmap) { + request_proposal(mon->osdmon()); + } for (auto i : pending_fsmap.filesystems) { auto fs = i.second; diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h index 05946b51c3c..65cb7551ac1 100644 --- a/src/mon/MDSMonitor.h +++ b/src/mon/MDSMonitor.h @@ -129,7 +129,7 @@ class MDSMonitor : public PaxosService { bool maybe_promote_standby(std::shared_ptr fs); bool maybe_expand_cluster(std::shared_ptr fs); - void maybe_replace_gid(mds_gid_t gid, const beacon_info_t &beacon, + void maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose); void tick() override; // check state, take actions -- 2.47.3