From 20509bb6c82e872127ab838d45402be0d0b91b5f Mon Sep 17 00:00:00 2001
From: Patrick Donnelly <pdonnell@redhat.com>
Date: Thu, 9 Sep 2021 19:39:33 -0400
Subject: [PATCH] MDSMonitor: handle damaged from standby-replay

This change does a few things:

- if a state transition is invalid or a beacon is garbage, the
  MDSMonitor now evicts the MDS instead of ignoring the problem.

- standby state validation is moved from preprocess_beacon to
  prepare_beacon, where eviction can happen.

- a standby-replay daemon may now indicate that the rank is damaged
  (due to failure to replay the journal).

- if the rank is damaged, both the rank holder and standby-replay daemon
  (if any) will be removed.

Fixes: https://tracker.ceph.com/issues/52565
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
---
 PendingReleaseNotes   |   3 +
 src/mds/FSMap.h       |   6 ++
 src/mds/MDSMap.h      |   3 +
 src/mon/MDSMonitor.cc | 125 ++++++++++++++++++++++--------------------
 4 files changed, 77 insertions(+), 60 deletions(-)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 893465269a853..ff1b2a1aa10db 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -43,6 +43,9 @@
 
 * OSD: Ceph now uses mclock_scheduler as its default osd_op_queue to provide QoS.
 
+* CephFS: Failure to replay the journal by a standby-replay daemon will now
+  cause the rank to be marked damaged.
+
 * RGW: S3 bucket notification events now contain an `eTag` key instead of `etag`,
   and eventName values no longer carry the `s3:` prefix, fixing deviations from
   the message format observed on AWS.
diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h
index 4f732128cc934..9795816b829e8 100644
--- a/src/mds/FSMap.h
+++ b/src/mds/FSMap.h
@@ -523,8 +523,14 @@ public:
   bool filesystem_exists(fs_cluster_id_t fscid) const {return filesystems.count(fscid) > 0;}
   Filesystem::const_ref get_filesystem(fs_cluster_id_t fscid) const {return std::const_pointer_cast<const Filesystem>(filesystems.at(fscid));}
   Filesystem::ref get_filesystem(fs_cluster_id_t fscid) {return filesystems.at(fscid);}
+  Filesystem::ref get_filesystem(mds_gid_t gid) {
+    return filesystems.at(mds_roles.at(gid));
+  }
   Filesystem::const_ref get_filesystem(void) const {return std::const_pointer_cast<const Filesystem>(filesystems.begin()->second);}
   Filesystem::const_ref get_filesystem(std::string_view name) const;
+  Filesystem::const_ref get_filesystem(mds_gid_t gid) const {
+    return filesystems.at(mds_roles.at(gid));
+  }
 
   std::vector<Filesystem::const_ref> get_filesystems(void) const;
 
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index 521c40158ba7f..2b4021ba1f492 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -446,6 +446,9 @@ public:
     return get_state_gid(it->second);
   }
 
+  auto get_gid(mds_rank_t r) const {
+    return up.at(r);
+  }
   const auto& get_info(mds_rank_t m) const {
     return mds_info.at(up.at(m));
   }
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index 928fce753d0af..f568f524cee58 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -473,23 +473,6 @@ bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
 
   // is there a state change here?
   if (info.state != state) {
-    // legal state change?
-    if ((info.state == MDSMap::STATE_STANDBY ||
-	 info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
-      dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
-	       << " -> " << ceph_mds_state_name(state) << ")" << dendl;
-      goto reply;
-    }
-
-    if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
-        && info.rank != MDS_RANK_NONE)
-    {
-      dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
-                 "held rank " << info.rank << " while requesting state "
-              << ceph_mds_state_name(state) << dendl;
-      goto reply;
-    }
-    
     _note_beacon(m);
     return false;
   }
@@ -691,15 +674,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
        * know which FS it was part of. Nor does this matter. Sending an empty
        * MDSMap is sufficient for getting the MDS to respawn.
        */
-      wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
-        if (r >= 0) {
-          auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
-          mon.send_reply(op, m.detach());
-        } else {
-          dispatch(op);        // try again
-        }
-      }));
-      return true;
+      goto null;
     }
 
     const auto& info = pending.get_info_gid(gid);
@@ -716,14 +691,27 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
       return true;
     }
 
-    if (info.state == MDSMap::STATE_STOPPING &&
+    // legal state change?
+    if ((info.state == MDSMap::STATE_STANDBY && state > 0) ||
+        (info.state == MDSMap::STATE_STANDBY_REPLAY && state > 0 && state != MDSMap::STATE_DAMAGED)) {
+      /* N.B.: standby-replay can indicate the rank is damaged due to failure to replay */
+      dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
+	       << " -> " << ceph_mds_state_name(state) << ")" << dendl;
+      goto evict;
+    } else if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
+        && info.rank != MDS_RANK_NONE)
+    {
+      dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
+                 "held rank " << info.rank << " while requesting state "
+              << ceph_mds_state_name(state) << dendl;
+      goto evict;
+    } else if (info.state == MDSMap::STATE_STOPPING &&
         state != MDSMap::STATE_STOPPING &&
         state != MDSMap::STATE_STOPPED) {
       // we can't transition to any other states from STOPPING
       dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
 	       << dendl;
-      _note_beacon(m);
-      return true;
+      goto evict;
     }
 
     if (info.laggy()) {
@@ -770,8 +758,6 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
           pending_daemon_health_rm.insert(erased_gid);
         }
       }
-
-
     } else if (state == MDSMap::STATE_DAMAGED) {
       if (!mon.osdmon()->is_writeable()) {
         dout(1) << __func__ << ": DAMAGED from rank " << info.rank
@@ -780,47 +766,40 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
         return false;
       }
 
+      auto rank = info.rank;
+
       // Record this MDS rank as damaged, so that other daemons
       // won't try to run it.
-      dout(0) << __func__ << ": marking rank "
-              << info.rank << " damaged" << dendl;
+      dout(0) << __func__ << ": marking rank " << rank << " damaged" << dendl;
+
+      auto fs = pending.get_filesystem(gid);
+      auto rankgid = fs->mds_map.get_gid(rank);
+      auto rankinfo = pending.get_info_gid(rankgid);
+      auto followergid = fs->mds_map.get_standby_replay(rank);
+
+      ceph_assert(gid == rankgid || gid == followergid);
 
       utime_t until = ceph_clock_now();
       until += g_conf().get_val<double>("mon_mds_blocklist_interval");
-      const auto blocklist_epoch = mon.osdmon()->blocklist(info.addrs, until);
-      request_proposal(mon.osdmon());
-      pending.damaged(gid, blocklist_epoch);
-      last_beacon.erase(gid);
-
-      // Respond to MDS, so that it knows it can continue to shut down
-      auto beacon = make_message<MMDSBeacon>(
-			mon.monmap->fsid, m->get_global_id(),
-			m->get_name(), pending.get_epoch(), state, seq,
-			CEPH_FEATURES_SUPPORTED_DEFAULT);
-      mon.send_reply(op, beacon.detach());
-    } else if (state == MDSMap::STATE_DNE) {
-      if (!mon.osdmon()->is_writeable()) {
-        dout(1) << __func__ << ": DNE from rank " << info.rank
-                << " waiting for osdmon writeable to blocklist it" << dendl;
-        mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
-        return false;
+      const auto blocklist_epoch = mon.osdmon()->blocklist(rankinfo.addrs, until);
+      if (followergid != MDS_GID_NONE) {
+        fail_mds_gid(pending, followergid);
+        last_beacon.erase(followergid);
       }
-
-      fail_mds_gid(pending, gid);
-      ceph_assert(mon.osdmon()->is_writeable());
       request_proposal(mon.osdmon());
+      pending.damaged(rankgid, blocklist_epoch);
+      last_beacon.erase(rankgid);
 
-      // Respond to MDS, so that it knows it can continue to shut down
-      auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
-          m->get_global_id(), m->get_name(), pending.get_epoch(), state, seq,
-          CEPH_FEATURES_SUPPORTED_DEFAULT);
-      mon.send_reply(op, beacon.detach());
+      /* MDS expects beacon reply back */
+    } else if (state == MDSMap::STATE_DNE) {
+      dout(1) << __func__ << ": DNE from " << info << dendl;
+      goto evict;
     } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
       // Standby daemons should never modify their own
       // state.  Reject any attempts to do so.
       derr << "standby " << gid << " attempted to change state to "
            << ceph_mds_state_name(state) << ", rejecting" << dendl;
-      return true;
+      goto evict;
     } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
                !MDSMap::state_transition_valid(info.state, state)) {
       // Validate state transitions for daemons that hold a rank
@@ -828,7 +807,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
            << "reported invalid state transition "
            << ceph_mds_state_name(info.state) << " -> "
            << ceph_mds_state_name(state) << dendl;
-      return true;
+      goto evict;
     } else {
       if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
         const auto &fscid = pending.mds_roles.at(gid);
@@ -860,6 +839,32 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
     }
   }));
 
+  return true;
+
+evict:
+  if (!mon.osdmon()->is_writeable()) {
+    dout(1) << __func__ << ": waiting for writeable OSDMap to evict" << dendl;
+    mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+    return false;
+  }
+
+  fail_mds_gid(pending, gid);
+  request_proposal(mon.osdmon());
+  dout(5) << __func__ << ": pending map now:" << dendl;
+  print_map(pending);
+
+  goto null;
+
+null:
+  wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
+    if (r >= 0) {
+      auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
+      mon.send_reply(op, m.detach());
+    } else {
+      dispatch(op);        // try again
+    }
+  }));
+
   return true;
 }
 
-- 
2.39.5