]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
cephfs,mon: require confirmation to fail unhealthy MDS
authorRishabh Dave <ridave@redhat.com>
Fri, 8 Mar 2024 15:39:18 +0000 (21:09 +0530)
committerRishabh Dave <ridave@redhat.com>
Mon, 3 Jun 2024 13:24:25 +0000 (18:54 +0530)
When running the command "ceph mds fail" for an MDS that is unhealthy
due to MDS_CACHE_OVERSIZED or MDS_TRIM, the user must pass the
confirmation flag (--yes-i-really-mean-it). Otherwise, the command
fails and prints an appropriate error message.

Restarting an MDS with such health warnings is not recommended since it
will have a slow recovery during restart, which will create new problems.

Fixes: https://tracker.ceph.com/issues/61866
Signed-off-by: Rishabh Dave <ridave@redhat.com>
(cherry picked from commit eeda00eea5043d3ba806695a207b732cb53b35c4)

src/mon/FSCommands.h
src/mon/MDSMonitor.cc
src/mon/MDSMonitor.h
src/mon/MonCommands.h

index a8714129693cf86bac7df0ff8ab5d5c4482f72ed..79609c33544678cc6cddb2e27c548cecee114728 100644 (file)
@@ -91,4 +91,12 @@ public:
     std::ostream &ss) = 0;
 };
 
+
+static constexpr auto errmsg_for_unhealthy_mds = \
+  "MDS has one of two health warnings which could extend recovery: "
+  "MDS_TRIM or MDS_CACHE_OVERSIZED. MDS failover is not recommended "
+  "since it might cause unexpected file system unavailability. If "
+  "you wish to proceed, pass --yes-i-really-mean-it";
+
+
 #endif
index 88894d73fc887dd5728d2fb978465ec87d3ff79e..c619eb79c3f33daebf3bc17975c70a9ecbc90aa3 100644 (file)
@@ -1460,6 +1460,23 @@ out:
   }
 }
 
+bool MDSMonitor::has_health_warnings(vector<mds_metric_t> warnings)
+{
+  for (auto& [gid, health] : pending_daemon_health) {
+    for (auto& metric : health.metrics) {
+      // metric.type here is the type of health warning. We are only
+      // looking for types of health warnings passed to this func member
+      // through variable "warnings".
+      auto it = std::find(warnings.begin(), warnings.end(), metric.type);
+      if (it != warnings.end()) {
+       return true;
+      }
+    }
+  }
+
+  return false;
+}
+
 int MDSMonitor::filesystem_command(
     FSMap &fsmap,
     MonOpRequestRef op,
@@ -1497,6 +1514,8 @@ int MDSMonitor::filesystem_command(
   } else if (prefix == "mds fail") {
     string who;
     cmd_getval(cmdmap, "role_or_gid", who);
+    bool confirm = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
 
     MDSMap::mds_info_t failed_info;
     mds_gid_t gid = gid_from_arg(fsmap, who, ss);
@@ -1516,6 +1535,12 @@ int MDSMonitor::filesystem_command(
       return -EPERM;
     }
 
+    if (!confirm &&
+        has_health_warnings({MDS_HEALTH_TRIM, MDS_HEALTH_CACHE_OVERSIZED})) {
+      ss << errmsg_for_unhealthy_mds;
+      return -EPERM;
+    }
+
     r = fail_mds(fsmap, ss, who, &failed_info);
     if (r < 0 && r == -EAGAIN) {
       mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
index c41246992d5c83c7402258ff91868ea2e077d44c..0157a47177c0e62cf1acad5518fd6848416349c9 100644 (file)
@@ -20,6 +20,7 @@
 
 #include <map>
 #include <set>
+#include <vector>
 
 #include "include/types.h"
 #include "PaxosFSMap.h"
@@ -51,6 +52,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand
   bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
   bool prepare_update(MonOpRequestRef op) override;
   bool should_propose(double& delay) override;
+  bool has_health_warnings(std::vector<mds_metric_t> warnings);
 
   bool should_print_status() const {
     auto& fs = get_fsmap();
index cb412c1494c0a584e63af19fa3fefeb423ce5072..a6bb07132a6e70fdc06b7a00c028e0efc34733ff 100644 (file)
@@ -319,7 +319,8 @@ COMMAND_WITH_FLAG("mds set_state "
        "name=gid,type=CephInt,range=0 "
        "name=state,type=CephInt,range=0|20",
        "set mds state of <gid> to <numeric-state>", "mds", "rw", FLAG(HIDDEN))
-COMMAND("mds fail name=role_or_gid,type=CephString",
+COMMAND("mds fail name=role_or_gid,type=CephString "
+        "name=yes_i_really_mean_it,type=CephBool,req=false",
        "Mark MDS failed: trigger a failover if a standby is available",
         "mds", "rw")
 COMMAND("mds repaired name=role,type=CephString",