From: Rishabh Dave Date: Fri, 8 Mar 2024 15:39:18 +0000 (+0530) Subject: cephfs,mon: require confirmation to fail unhealthy MDS X-Git-Tag: testing/wip-lusov-testing-20240611.123850-squid~6^2~10 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=4f037dec7fdf6ac23b7a06bafba5ed3e80e134be;p=ceph-ci.git cephfs,mon: require confirmation to fail unhealthy MDS When running the command "ceph mds fail" for an MDS that is unhealthy due to MDS_CACHE_OVERSIZED or MDS_TRIM, the user must pass a confirmation flag. Otherwise, the command will fail and print an appropriate error message. Restarting an MDS with such health warnings is not recommended since it will have a slow recovery during restart which will create new problems. Fixes: https://tracker.ceph.com/issues/61866 Signed-off-by: Rishabh Dave (cherry picked from commit eeda00eea5043d3ba806695a207b732cb53b35c4) --- diff --git a/src/mon/FSCommands.h b/src/mon/FSCommands.h index cd9009724e0..3cac437a6c9 100644 --- a/src/mon/FSCommands.h +++ b/src/mon/FSCommands.h @@ -87,4 +87,12 @@ public: std::ostream &ss) = 0; }; + +static constexpr auto errmsg_for_unhealthy_mds = \ + "MDS has one of two health warnings which could extend recovery: " + "MDS_TRIM or MDS_CACHE_OVERSIZED. MDS failover is not recommended " + "since it might cause unexpected file system unavailability. If " + "you wish to proceed, pass --yes-i-really-mean-it"; + + #endif diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 4cf6eafc773..357f944df21 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -1492,6 +1492,23 @@ out: } } +bool MDSMonitor::has_health_warnings(vector warnings) +{ + for (auto& [gid, health] : pending_daemon_health) { + for (auto& metric : health.metrics) { + // metric.type here is the type of health warning. We are only + // looking for types of health warnings passed to this func member + // through variable "warnings". 
+ auto it = std::find(warnings.begin(), warnings.end(), metric.type); + if (it != warnings.end()) { + return true; + } + } + } + + return false; +} + int MDSMonitor::filesystem_command( FSMap &fsmap, MonOpRequestRef op, @@ -1529,6 +1546,8 @@ int MDSMonitor::filesystem_command( } else if (prefix == "mds fail") { string who; cmd_getval(cmdmap, "role_or_gid", who); + bool confirm = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", confirm); MDSMap::mds_info_t failed_info; mds_gid_t gid = gid_from_arg(fsmap, who, ss); @@ -1548,6 +1567,12 @@ int MDSMonitor::filesystem_command( return -EPERM; } + if (!confirm && + has_health_warnings({MDS_HEALTH_TRIM, MDS_HEALTH_CACHE_OVERSIZED})) { + ss << errmsg_for_unhealthy_mds; + return -EPERM; + } + r = fail_mds(fsmap, ss, who, &failed_info); if (r < 0 && r == -EAGAIN) { mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h index 25f27535c77..b0f88cd3130 100644 --- a/src/mon/MDSMonitor.h +++ b/src/mon/MDSMonitor.h @@ -20,6 +20,7 @@ #include #include +#include #include "include/types.h" #include "PaxosFSMap.h" @@ -51,6 +52,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand bool preprocess_query(MonOpRequestRef op) override; // true if processed. 
bool prepare_update(MonOpRequestRef op) override; bool should_propose(double& delay) override; + bool has_health_warnings(std::vector warnings); bool should_print_status() const { auto& fs = get_fsmap(); diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 14bb3602c9b..26fec133a6c 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -319,7 +319,8 @@ COMMAND_WITH_FLAG("mds set_state " "name=gid,type=CephInt,range=0 " "name=state,type=CephInt,range=0|20", "set mds state of to ", "mds", "rw", FLAG(HIDDEN)) -COMMAND("mds fail name=role_or_gid,type=CephString", +COMMAND("mds fail name=role_or_gid,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", "Mark MDS failed: trigger a failover if a standby is available", "mds", "rw") COMMAND("mds repaired name=role,type=CephString",