From 20509bb6c82e872127ab838d45402be0d0b91b5f Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 9 Sep 2021 19:39:33 -0400 Subject: [PATCH] MDSMonitor: handle damaged from standby-replay This change does a few things: - if a state transition is invalid or a beacon is garbage, the MDSMonitor now evicts the MDS instead of ignoring the problem. - standby state validation is moved to prepare_beacon where eviction can happen. - standby-replay may indicate the rank is damaged (failure to replay the journal). - if the rank is damaged, both the rank holder and standby-replay daemon (if any) will be removed. Fixes: https://tracker.ceph.com/issues/52565 Signed-off-by: Patrick Donnelly --- PendingReleaseNotes | 3 + src/mds/FSMap.h | 6 ++ src/mds/MDSMap.h | 3 + src/mon/MDSMonitor.cc | 125 ++++++++++++++++++++++-------------------- 4 files changed, 77 insertions(+), 60 deletions(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 893465269a853..ff1b2a1aa10db 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -43,6 +43,9 @@ * OSD: Ceph now uses mclock_scheduler as its default osd_op_queue to provide QoS. +* CephFS: Failure to replay the journal by a standby-replay daemon will now + cause the rank to be marked damaged. + * RGW: S3 bucket notification events now contain an `eTag` key instead of `etag`, and eventName values no longer carry the `s3:` prefix, fixing deviations from the message format observed on AWS. 
diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h index 4f732128cc934..9795816b829e8 100644 --- a/src/mds/FSMap.h +++ b/src/mds/FSMap.h @@ -523,8 +523,14 @@ public: bool filesystem_exists(fs_cluster_id_t fscid) const {return filesystems.count(fscid) > 0;} Filesystem::const_ref get_filesystem(fs_cluster_id_t fscid) const {return std::const_pointer_cast<const Filesystem>(filesystems.at(fscid));} Filesystem::ref get_filesystem(fs_cluster_id_t fscid) {return filesystems.at(fscid);} + Filesystem::ref get_filesystem(mds_gid_t gid) { + return filesystems.at(mds_roles.at(gid)); + } Filesystem::const_ref get_filesystem(void) const {return std::const_pointer_cast<const Filesystem>(filesystems.begin()->second);} Filesystem::const_ref get_filesystem(std::string_view name) const; + Filesystem::const_ref get_filesystem(mds_gid_t gid) const { + return filesystems.at(mds_roles.at(gid)); + } std::vector<Filesystem::const_ref> get_filesystems(void) const; diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 521c40158ba7f..2b4021ba1f492 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -446,6 +446,9 @@ public: return get_state_gid(it->second); } + auto get_gid(mds_rank_t r) const { + return up.at(r); + } const auto& get_info(mds_rank_t m) const { return mds_info.at(up.at(m)); } diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 928fce753d0af..f568f524cee58 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -473,23 +473,6 @@ bool MDSMonitor::preprocess_beacon(MonOpRequestRef op) // is there a state change here? if (info.state != state) { - // legal state change? 
- if ((info.state == MDSMap::STATE_STANDBY || - info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) { - dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state) - << " -> " << ceph_mds_state_name(state) << ")" << dendl; - goto reply; - } - - if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY) - && info.rank != MDS_RANK_NONE) - { - dout(4) << "mds_beacon MDS can't go back into standby after taking rank: " - "held rank " << info.rank << " while requesting state " - << ceph_mds_state_name(state) << dendl; - goto reply; - } - _note_beacon(m); return false; } @@ -691,15 +674,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op) * know which FS it was part of. Nor does this matter. Sending an empty * MDSMap is sufficient for getting the MDS to respawn. */ - wait_for_finished_proposal(op, new LambdaContext([op, this](int r){ - if (r >= 0) { - auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap()); - mon.send_reply(op, m.detach()); - } else { - dispatch(op); // try again - } - })); - return true; + goto null; } const auto& info = pending.get_info_gid(gid); @@ -716,14 +691,27 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op) return true; } - if (info.state == MDSMap::STATE_STOPPING && + // legal state change? 
+ if ((info.state == MDSMap::STATE_STANDBY && state > 0) || + (info.state == MDSMap::STATE_STANDBY_REPLAY && state > 0 && state != MDSMap::STATE_DAMAGED)) { + /* N.B.: standby-replay can indicate the rank is damaged due to failure to replay */ + dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state) + << " -> " << ceph_mds_state_name(state) << ")" << dendl; + goto evict; + } else if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY) + && info.rank != MDS_RANK_NONE) + { + dout(4) << "mds_beacon MDS can't go back into standby after taking rank: " + "held rank " << info.rank << " while requesting state " + << ceph_mds_state_name(state) << dendl; + goto evict; + } else if (info.state == MDSMap::STATE_STOPPING && state != MDSMap::STATE_STOPPING && state != MDSMap::STATE_STOPPED) { // we can't transition to any other states from STOPPING dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change" << dendl; - _note_beacon(m); - return true; + goto evict; } if (info.laggy()) { @@ -770,8 +758,6 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op) pending_daemon_health_rm.insert(erased_gid); } } - - } else if (state == MDSMap::STATE_DAMAGED) { if (!mon.osdmon()->is_writeable()) { dout(1) << __func__ << ": DAMAGED from rank " << info.rank @@ -780,47 +766,40 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op) return false; } + auto rank = info.rank; + // Record this MDS rank as damaged, so that other daemons // won't try to run it. 
- dout(0) << __func__ << ": marking rank " - << info.rank << " damaged" << dendl; + dout(0) << __func__ << ": marking rank " << rank << " damaged" << dendl; + + auto fs = pending.get_filesystem(gid); + auto rankgid = fs->mds_map.get_gid(rank); + auto rankinfo = pending.get_info_gid(rankgid); + auto followergid = fs->mds_map.get_standby_replay(rank); + + ceph_assert(gid == rankgid || gid == followergid); utime_t until = ceph_clock_now(); until += g_conf().get_val<double>("mon_mds_blocklist_interval"); - const auto blocklist_epoch = mon.osdmon()->blocklist(info.addrs, until); - request_proposal(mon.osdmon()); - pending.damaged(gid, blocklist_epoch); - last_beacon.erase(gid); - - // Respond to MDS, so that it knows it can continue to shut down - auto beacon = make_message<MMDSBeacon>( - mon.monmap->fsid, m->get_global_id(), - m->get_name(), pending.get_epoch(), state, seq, - CEPH_FEATURES_SUPPORTED_DEFAULT); - mon.send_reply(op, beacon.detach()); - } else if (state == MDSMap::STATE_DNE) { - if (!mon.osdmon()->is_writeable()) { - dout(1) << __func__ << ": DNE from rank " << info.rank - << " waiting for osdmon writeable to blocklist it" << dendl; - mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); - return false; + const auto blocklist_epoch = mon.osdmon()->blocklist(rankinfo.addrs, until); + if (followergid != MDS_GID_NONE) { + fail_mds_gid(pending, followergid); + last_beacon.erase(followergid); } - - fail_mds_gid(pending, gid); - ceph_assert(mon.osdmon()->is_writeable()); request_proposal(mon.osdmon()); + pending.damaged(rankgid, blocklist_epoch); + last_beacon.erase(rankgid); - // Respond to MDS, so that it knows it can continue to shut down - auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid, - m->get_global_id(), m->get_name(), pending.get_epoch(), state, seq, - CEPH_FEATURES_SUPPORTED_DEFAULT); - mon.send_reply(op, beacon.detach()); + /* MDS expects beacon reply back */ + } else if (state == MDSMap::STATE_DNE) { + dout(1) << __func__ << ": DNE from " << info << dendl; + 
goto evict; } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) { // Standby daemons should never modify their own // state. Reject any attempts to do so. derr << "standby " << gid << " attempted to change state to " << ceph_mds_state_name(state) << ", rejecting" << dendl; - return true; + goto evict; } else if (info.state != MDSMap::STATE_STANDBY && state != info.state && !MDSMap::state_transition_valid(info.state, state)) { // Validate state transitions for daemons that hold a rank @@ -828,7 +807,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op) << "reported invalid state transition " << ceph_mds_state_name(info.state) << " -> " << ceph_mds_state_name(state) << dendl; - return true; + goto evict; } else { if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) { const auto &fscid = pending.mds_roles.at(gid); @@ -860,6 +839,32 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op) } })); + return true; + +evict: + if (!mon.osdmon()->is_writeable()) { + dout(1) << __func__ << ": waiting for writeable OSDMap to evict" << dendl; + mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + + fail_mds_gid(pending, gid); + request_proposal(mon.osdmon()); + dout(5) << __func__ << ": pending map now:" << dendl; + print_map(pending); + + goto null; + +null: + wait_for_finished_proposal(op, new LambdaContext([op, this](int r){ + if (r >= 0) { + auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap()); + mon.send_reply(op, m.detach()); + } else { + dispatch(op); // try again + } + })); + return true; } -- 2.39.5