git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
cephfs,mon: require confirmation to fail unhealthy MDS
author Rishabh Dave <ridave@redhat.com>
Fri, 8 Mar 2024 15:39:18 +0000 (21:09 +0530)
committer Rishabh Dave <ridave@redhat.com>
Mon, 3 Jun 2024 13:46:11 +0000 (19:16 +0530)
When running the command "ceph mds fail" for an MDS that is unhealthy
due to MDS_CACHE_OVERSIZED or MDS_TRIM, the user must pass the
confirmation flag. Otherwise, the command will fail and print an
appropriate error message.

Restarting an MDS with such health warnings is not recommended since it
will have a slow recovery during restart, which will create new problems.

Fixes: https://tracker.ceph.com/issues/61866
Signed-off-by: Rishabh Dave <ridave@redhat.com>
(cherry picked from commit eeda00eea5043d3ba806695a207b732cb53b35c4)

src/mon/FSCommands.h
src/mon/MDSMonitor.cc
src/mon/MDSMonitor.h
src/mon/MonCommands.h

index cd9009724e089d9e28843d6892690097ca245c2b..3cac437a6c9a095b1bc728102edb10cf0602dfca 100644 (file)
@@ -87,4 +87,12 @@ public:
     std::ostream &ss) = 0;
 };
 
+// Refusal text for "mds fail" on an MDS flagged MDS_TRIM/MDS_CACHE_OVERSIZED.
+inline constexpr auto errmsg_for_unhealthy_mds =
+  "MDS has one of two health warnings which could extend recovery: "
+  "MDS_TRIM or MDS_CACHE_OVERSIZED. MDS failover is not recommended "
+  "since it might cause unexpected file system unavailability. If "
+  "you wish to proceed, pass --yes-i-really-mean-it";
+
+
 #endif
index 4cf6eafc773b37103e5925ae01500b59428d25f8..357f944df21aefd6920286a4167ab1e82ff04313 100644 (file)
@@ -1492,6 +1492,23 @@ out:
   }
 }
 
+bool MDSMonitor::has_health_warnings(vector<mds_metric_t> warnings)
+{
+  // Report whether any entry in pending_daemon_health carries a metric
+  // whose type is listed in "warnings". Every entry is scanned, so the
+  // answer is not tied to one particular MDS.
+  const auto is_listed = [&warnings](const auto& metric) {
+    return std::find(warnings.begin(), warnings.end(), metric.type) !=
+      warnings.end();
+  };
+  return std::any_of(
+    pending_daemon_health.begin(), pending_daemon_health.end(),
+    [&is_listed](const auto& daemon) {
+      const auto& metrics = daemon.second.metrics;
+      return std::any_of(metrics.begin(), metrics.end(), is_listed);
+    });
+}
+
 int MDSMonitor::filesystem_command(
     FSMap &fsmap,
     MonOpRequestRef op,
@@ -1529,6 +1546,8 @@ int MDSMonitor::filesystem_command(
   } else if (prefix == "mds fail") {
     string who;
     cmd_getval(cmdmap, "role_or_gid", who);
+    bool confirm = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
 
     MDSMap::mds_info_t failed_info;
     mds_gid_t gid = gid_from_arg(fsmap, who, ss);
@@ -1548,6 +1567,12 @@ int MDSMonitor::filesystem_command(
       return -EPERM;
     }
 
+    if (!confirm &&
+        has_health_warnings({MDS_HEALTH_TRIM, MDS_HEALTH_CACHE_OVERSIZED})) {
+      ss << errmsg_for_unhealthy_mds;
+      return -EPERM;
+    }
+
     r = fail_mds(fsmap, ss, who, &failed_info);
     if (r < 0 && r == -EAGAIN) {
       mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
index 25f27535c77a6eec5c1aeed48f13d7b1af17bc86..b0f88cd31302d779dace82506cc3c2edc1b46730 100644 (file)
@@ -20,6 +20,7 @@
 
 #include <map>
 #include <set>
+#include <vector>
 
 #include "include/types.h"
 #include "PaxosFSMap.h"
@@ -51,6 +52,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand
   bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
   bool prepare_update(MonOpRequestRef op) override;
   bool should_propose(double& delay) override;
+  // True if any pending_daemon_health entry reports one of "warnings".
+  bool has_health_warnings(std::vector<mds_metric_t> warnings);
 
   bool should_print_status() const {
     auto& fs = get_fsmap();
index 14bb3602c9b574033987d9775349ed140c723013..26fec133a6cde1c2894b1f6a5215c22db3c34f57 100644 (file)
@@ -319,7 +319,8 @@ COMMAND_WITH_FLAG("mds set_state "
        "name=gid,type=CephInt,range=0 "
        "name=state,type=CephInt,range=0|20",
        "set mds state of <gid> to <numeric-state>", "mds", "rw", FLAG(HIDDEN))
-COMMAND("mds fail name=role_or_gid,type=CephString",
+COMMAND("mds fail name=role_or_gid,type=CephString "
+        "name=yes_i_really_mean_it,type=CephBool,req=false",
        "Mark MDS failed: trigger a failover if a standby is available",
         "mds", "rw")
 COMMAND("mds repaired name=role,type=CephString",