MDSMonitor: handle damaged from standby-replay

author Patrick Donnelly <pdonnell@redhat.com>

Thu, 9 Sep 2021 23:39:33 +0000 (19:39 -0400)

committer Patrick Donnelly <pdonnell@redhat.com>

Thu, 9 Sep 2021 23:47:01 +0000 (19:47 -0400)
author Patrick Donnelly <pdonnell@redhat.com>
Thu, 9 Sep 2021 23:39:33 +0000 (19:39 -0400)
committer Patrick Donnelly <pdonnell@redhat.com>
Thu, 9 Sep 2021 23:47:01 +0000 (19:47 -0400)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index 893465269a853b7eb62149c2f1880d956f82745e..ff1b2a1aa10db6c6fffa27c0132fad480b54fdb7 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -43,6 +43,9 @@
  
  * OSD: Ceph now uses mclock_scheduler as its default osd_op_queue to provide QoS.
  
+* CephFS: A standby-replay daemon that fails to replay the journal will now
+  cause the rank it follows to be marked damaged.
+
  * RGW: S3 bucket notification events now contain an `eTag` key instead of `etag`,
    and eventName values no longer carry the `s3:` prefix, fixing deviations from
    the message format observed on AWS.
diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h

index 4f732128cc934a130030a5e20f8d35938a6d02fd..9795816b829e8d924f4264efb01c35b7ae6c6d4d 100644 (file)
--- a/src/mds/FSMap.h
+++ b/src/mds/FSMap.h
@@ -523,8 +523,14 @@ public:
    bool filesystem_exists(fs_cluster_id_t fscid) const {return filesystems.count(fscid) > 0;}
    Filesystem::const_ref get_filesystem(fs_cluster_id_t fscid) const {return std::const_pointer_cast<const Filesystem>(filesystems.at(fscid));}
    Filesystem::ref get_filesystem(fs_cluster_id_t fscid) {return filesystems.at(fscid);}
+  Filesystem::ref get_filesystem(mds_gid_t gid) {  // mutable handle to the filesystem that `gid` is mapped to via mds_roles
+    return filesystems.at(mds_roles.at(gid));  // gid -> mds_roles entry (fscid) -> filesystems entry; at() presumably throws std::out_of_range on an unknown key -- TODO confirm container types
+  }
    Filesystem::const_ref get_filesystem(void) const {return std::const_pointer_cast<const Filesystem>(filesystems.begin()->second);}
    Filesystem::const_ref get_filesystem(std::string_view name) const;
+  Filesystem::const_ref get_filesystem(mds_gid_t gid) const {  // read-only counterpart of the gid-based lookup
+    return filesystems.at(mds_roles.at(gid));  // same two-step lookup (mds_roles, then filesystems); result is returned as a const_ref
+  }
  
    std::vector<Filesystem::const_ref> get_filesystems(void) const;
  
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h

index 521c40158ba7f73f0d8afef311a5168ed7988085..2b4021ba1f49260201aeb49980af0c7e88b8d1fa 100644 (file)
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -446,6 +446,9 @@ public:
      return get_state_gid(it->second);
    }
  
+  auto get_gid(mds_rank_t r) const {  // value stored in `up` for rank r (callers treat it as the rank holder's gid)
+    return up.at(r);  // at() presumably throws std::out_of_range when r has no entry in `up` -- TODO confirm container type
+  }
    const auto& get_info(mds_rank_t m) const {
      return mds_info.at(up.at(m));
    }
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc

index 928fce753d0afe1b10ed6ff6884f88d2611dd905..f568f524cee58a9fa28addd2fc44a64636993409 100644 (file)
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -473,23 +473,6 @@ bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
  
    // is there a state change here?
    if (info.state != state) {
-    // legal state change?
-    if ((info.state == MDSMap::STATE_STANDBY ||
-        info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
-      dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
-              << " -> " << ceph_mds_state_name(state) << ")" << dendl;
-      goto reply;
-    }
-
-    if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
-        && info.rank != MDS_RANK_NONE)
-    {
-      dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
-                 "held rank " << info.rank << " while requesting state "
-              << ceph_mds_state_name(state) << dendl;
-      goto reply;
-    }
-    
      _note_beacon(m);
      return false;
    }
@@ -691,15 +674,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
         * know which FS it was part of. Nor does this matter. Sending an empty
         * MDSMap is sufficient for getting the MDS to respawn.
         */
-      wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
-        if (r >= 0) {
-          auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
-          mon.send_reply(op, m.detach());
-        } else {
-          dispatch(op);        // try again
-        }
-      }));
-      return true;
+      goto null;
      }
  
      const auto& info = pending.get_info_gid(gid);
@@ -716,14 +691,27 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
        return true;
      }
  
-    if (info.state == MDSMap::STATE_STOPPING &&
+    // legal state change?
+    if ((info.state == MDSMap::STATE_STANDBY && state > 0) ||
+        (info.state == MDSMap::STATE_STANDBY_REPLAY && state > 0 && state != MDSMap::STATE_DAMAGED)) {
+      /* N.B.: standby-replay can indicate the rank is damaged due to failure to replay */
+      dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
+              << " -> " << ceph_mds_state_name(state) << ")" << dendl;
+      goto evict;
+    } else if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
+        && info.rank != MDS_RANK_NONE)
+    {
+      dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
+                 "held rank " << info.rank << " while requesting state "
+              << ceph_mds_state_name(state) << dendl;
+      goto evict;
+    } else if (info.state == MDSMap::STATE_STOPPING &&
          state != MDSMap::STATE_STOPPING &&
          state != MDSMap::STATE_STOPPED) {
        // we can't transition to any other states from STOPPING
        dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
                << dendl;
-      _note_beacon(m);
-      return true;
+      goto evict;
      }
  
      if (info.laggy()) {
@@ -770,8 +758,6 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
            pending_daemon_health_rm.insert(erased_gid);
          }
        }
-
-
      } else if (state == MDSMap::STATE_DAMAGED) {
        if (!mon.osdmon()->is_writeable()) {
          dout(1) << __func__ << ": DAMAGED from rank " << info.rank
@@ -780,47 +766,40 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
          return false;
        }
  
+      auto rank = info.rank;
+
        // Record this MDS rank as damaged, so that other daemons
        // won't try to run it.
-      dout(0) << __func__ << ": marking rank "
-              << info.rank << " damaged" << dendl;
+      dout(0) << __func__ << ": marking rank " << rank << " damaged" << dendl;
+
+      auto fs = pending.get_filesystem(gid);
+      auto rankgid = fs->mds_map.get_gid(rank);
+      auto rankinfo = pending.get_info_gid(rankgid);
+      auto followergid = fs->mds_map.get_standby_replay(rank);
+
+      ceph_assert(gid == rankgid || gid == followergid);
  
        utime_t until = ceph_clock_now();
        until += g_conf().get_val<double>("mon_mds_blocklist_interval");
-      const auto blocklist_epoch = mon.osdmon()->blocklist(info.addrs, until);
-      request_proposal(mon.osdmon());
-      pending.damaged(gid, blocklist_epoch);
-      last_beacon.erase(gid);
-
-      // Respond to MDS, so that it knows it can continue to shut down
-      auto beacon = make_message<MMDSBeacon>(
-                       mon.monmap->fsid, m->get_global_id(),
-                       m->get_name(), pending.get_epoch(), state, seq,
-                       CEPH_FEATURES_SUPPORTED_DEFAULT);
-      mon.send_reply(op, beacon.detach());
-    } else if (state == MDSMap::STATE_DNE) {
-      if (!mon.osdmon()->is_writeable()) {
-        dout(1) << __func__ << ": DNE from rank " << info.rank
-                << " waiting for osdmon writeable to blocklist it" << dendl;
-        mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
-        return false;
+      const auto blocklist_epoch = mon.osdmon()->blocklist(rankinfo.addrs, until);
+      if (followergid != MDS_GID_NONE) {
+        fail_mds_gid(pending, followergid);
+        last_beacon.erase(followergid);
        }
-
-      fail_mds_gid(pending, gid);
-      ceph_assert(mon.osdmon()->is_writeable());
        request_proposal(mon.osdmon());
+      pending.damaged(rankgid, blocklist_epoch);
+      last_beacon.erase(rankgid);
  
-      // Respond to MDS, so that it knows it can continue to shut down
-      auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
-          m->get_global_id(), m->get_name(), pending.get_epoch(), state, seq,
-          CEPH_FEATURES_SUPPORTED_DEFAULT);
-      mon.send_reply(op, beacon.detach());
+      /* MDS expects beacon reply back */
+    } else if (state == MDSMap::STATE_DNE) {
+      dout(1) << __func__ << ": DNE from " << info << dendl;
+      goto evict;
      } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
        // Standby daemons should never modify their own
        // state.  Reject any attempts to do so.
        derr << "standby " << gid << " attempted to change state to "
             << ceph_mds_state_name(state) << ", rejecting" << dendl;
-      return true;
+      goto evict;
      } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
                 !MDSMap::state_transition_valid(info.state, state)) {
        // Validate state transitions for daemons that hold a rank
@@ -828,7 +807,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
             << "reported invalid state transition "
             << ceph_mds_state_name(info.state) << " -> "
             << ceph_mds_state_name(state) << dendl;
-      return true;
+      goto evict;
      } else {
        if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
          const auto &fscid = pending.mds_roles.at(gid);
@@ -860,6 +839,32 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
      }
    }));
  
+  return true;
+
+evict:
+  if (!mon.osdmon()->is_writeable()) {
+    dout(1) << __func__ << ": waiting for writeable OSDMap to evict" << dendl;
+    mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+    return false;
+  }
+
+  fail_mds_gid(pending, gid);
+  request_proposal(mon.osdmon());
+  dout(5) << __func__ << ": pending map now:" << dendl;
+  print_map(pending);
+
+  goto null;
+
+null:
+  wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
+    if (r >= 0) {
+      auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
+      mon.send_reply(op, m.detach());
+    } else {
+      dispatch(op);        // try again
+    }
+  }));
+
    return true;
  }
author	Patrick Donnelly <pdonnell@redhat.com>
	Thu, 9 Sep 2021 23:39:33 +0000 (19:39 -0400)
committer	Patrick Donnelly <pdonnell@redhat.com>
	Thu, 9 Sep 2021 23:47:01 +0000 (19:47 -0400)
PendingReleaseNotes		patch \| blob \| history
src/mds/FSMap.h		patch \| blob \| history
src/mds/MDSMap.h		patch \| blob \| history
src/mon/MDSMonitor.cc		patch \| blob \| history