From: Rishabh Dave Date: Fri, 8 Mar 2024 15:39:18 +0000 (+0530) Subject: cephfs,mon: require confirmation to fail unhealthy MDS X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6cc97db86e996aa3d30a851adb7e25f2b07c6b7e;p=ceph.git cephfs,mon: require confirmation to fail unhealthy MDS When running the command "ceph mds fail" for an MDS that is unhealthy due to MDS_CACHE_OVERSIZED or MDS_TRIM, the user must pass the confirmation flag. Otherwise, the command will fail and print an appropriate error message. Restarting an MDS with such health warnings is not recommended since it will have a slow recovery during restart, which will create new problems. Fixes: https://tracker.ceph.com/issues/61866 Signed-off-by: Rishabh Dave (cherry picked from commit eeda00eea5043d3ba806695a207b732cb53b35c4) --- diff --git a/src/mon/FSCommands.h b/src/mon/FSCommands.h index 44dff4e4cdd..6334cef6b9f 100644 --- a/src/mon/FSCommands.h +++ b/src/mon/FSCommands.h @@ -86,4 +86,12 @@ public: std::ostream &ss) = 0; }; + +static constexpr auto errmsg_for_unhealthy_mds = \ + "MDS has one of two health warnings which could extend recovery: " + "MDS_TRIM or MDS_CACHE_OVERSIZED. MDS failover is not recommended " + "since it might cause unexpected file system unavailability. If " + "you wish to proceed, pass --yes-i-really-mean-it"; + + #endif diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 750828b3dba..6d84351c0b6 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -1455,6 +1455,23 @@ out: } } +bool MDSMonitor::has_health_warnings(vector warnings) +{ + for (auto& [gid, health] : pending_daemon_health) { + for (auto& metric : health.metrics) { + // metric.type here is the type of health warning. We are only + // looking for types of health warnings passed to this func member + // through variable "warnings". 
+ auto it = std::find(warnings.begin(), warnings.end(), metric.type); + if (it != warnings.end()) { + return true; + } + } + } + + return false; +} + int MDSMonitor::filesystem_command( FSMap &fsmap, MonOpRequestRef op, @@ -1492,6 +1509,8 @@ int MDSMonitor::filesystem_command( } else if (prefix == "mds fail") { string who; cmd_getval(cmdmap, "role_or_gid", who); + bool confirm = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", confirm); MDSMap::mds_info_t failed_info; mds_gid_t gid = gid_from_arg(fsmap, who, ss); @@ -1511,6 +1530,12 @@ int MDSMonitor::filesystem_command( return -EPERM; } + if (!confirm && + has_health_warnings({MDS_HEALTH_TRIM, MDS_HEALTH_CACHE_OVERSIZED})) { + ss << errmsg_for_unhealthy_mds; + return -EPERM; + } + r = fail_mds(fsmap, ss, who, &failed_info); if (r < 0 && r == -EAGAIN) { mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h index c41246992d5..0157a47177c 100644 --- a/src/mon/MDSMonitor.h +++ b/src/mon/MDSMonitor.h @@ -20,6 +20,7 @@ #include #include +#include #include "include/types.h" #include "PaxosFSMap.h" @@ -51,6 +52,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand bool preprocess_query(MonOpRequestRef op) override; // true if processed. 
bool prepare_update(MonOpRequestRef op) override; bool should_propose(double& delay) override; + bool has_health_warnings(std::vector warnings); bool should_print_status() const { auto& fs = get_fsmap(); diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 24cd0ca7c25..8d09cd6a72f 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -319,7 +319,8 @@ COMMAND_WITH_FLAG("mds set_state " "name=gid,type=CephInt,range=0 " "name=state,type=CephInt,range=0|20", "set mds state of to ", "mds", "rw", FLAG(HIDDEN)) -COMMAND("mds fail name=role_or_gid,type=CephString", +COMMAND("mds fail name=role_or_gid,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", "Mark MDS failed: trigger a failover if a standby is available", "mds", "rw") COMMAND("mds repaired name=role,type=CephString",