(even as standby). Operators may ignore the error messages and continue
upgrading/restarting or follow this upgrade sequence:
- Reduce the number of ranks to 1 (`ceph fs set <fs_name> max_mds 1`),
- deactivate all other ranks (`ceph mds deactivate <fs_name>:<n>`), shutdown
- standbys leaving the one active MDS, upgrade the single active MDS, then
- upgrade/start standbys. Finally, restore the previous max_mds.
+ Reduce the number of ranks to 1 (`ceph fs set <fs_name> max_mds 1`), wait
+ for all other MDS daemons to deactivate so that only one active MDS
+ remains, upgrade that single active MDS, then upgrade/start the standbys.
+ Finally, restore the previous max_mds.
See also: https://tracker.ceph.com/issues/23172
#include "MgrStatMonitor.h"
-
static const string EXPERIMENTAL_WARNING("Warning! This feature is experimental."
"It may cause problems up to and including data loss."
"Consult the documentation at ceph.com, and if unsure, do not proceed."
return -EINVAL;
}
- if (!fs->mds_map.allows_multimds() && n > fs->mds_map.get_max_mds() &&
+ mds_rank_t oldmax = fs->mds_map.get_max_mds();
+
+ if (!fs->mds_map.allows_multimds() && n > oldmax &&
n > 1) {
ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
return -EINVAL;
ss << "may not have more than " << MAX_MDS << " MDS ranks";
return -EINVAL;
}
+
fsmap.modify_filesystem(
fs->fscid,
[n](std::shared_ptr<Filesystem> fs)
}
});
- ss << "marked " << (is_down ? "down" : "up");
+ if (is_down) {
+ ss << " marked down. ";
+ } else {
+ ss << " marked up, max_mds = " << fs->mds_map.get_max_mds();
+ }
+
} else if (var == "standby_count_wanted") {
if (interr.length()) {
ss << var << " requires an integer value";
r = 0;
mds_gid_t gid = fs->mds_map.up.at(role.rank);
ss << "telling mds." << role << " "
- << pending.get_info_gid(gid).addr << " to deactivate";
-
pending.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
info->state = MDSMap::STATE_STOPPING;
});
/**
* If a cluster is undersized (with respect to max_mds), then
- * attempt to find daemons to grow it.
+ * attempt to find daemons to grow it. If the cluster is oversized
+ * (with respect to max_mds) then shrink it by stopping its highest rank.
+ *
+ * At most one rank is added or stopped per call. Nothing is changed while
+ * the cluster is degraded or while a rank is still stopping.
+ *
+ * @param fs filesystem to resize
+ * @return true if the pending FSMap was modified and needs to be proposed,
+ *         false otherwise
*/
-bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> &fs)
+bool MDSMonitor::maybe_resize_cluster(std::shared_ptr<Filesystem> &fs)
{
- bool do_propose = false;
auto &pending = get_pending_fsmap_writeable();
+ int in = fs->mds_map.get_num_in_mds();
+ int max = fs->mds_map.get_max_mds();
- if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
- return do_propose;
+ dout(20) << __func__ << " in " << in << " max " << max << dendl;
+
+ if (fs->mds_map.is_degraded()) {
+ dout(5) << "not resizing degraded MDS cluster "
+ << fs->mds_map.fs_name << dendl;
+ return false;
}
- while (fs->mds_map.get_num_in_mds() < size_t(fs->mds_map.get_max_mds()) &&
- !fs->mds_map.is_degraded()) {
+ if (fs->mds_map.get_num_mds(CEPH_MDS_STATE_STOPPING)) {
+ dout(5) << "An MDS for " << fs->mds_map.fs_name
+ << " is stopping; waiting to resize" << dendl;
+ // Actually wait: without returning here a rank could be promoted (or a
+ // further rank stopped) while the previous deactivation is in flight.
+ return false;
+ }
+
+ if (in < max) {
mds_rank_t mds = mds_rank_t(0);
string name;
while (fs->mds_map.is_in(mds)) {
mds_gid_t newgid = pending.find_replacement_for({fs->fscid, mds},
name, g_conf->mon_force_standby_active);
if (newgid == MDS_GID_NONE) {
- break;
+ return false;
}
const auto &new_info = pending.get_info_gid(newgid);
<< mds << " (now has " << fs->mds_map.get_num_in_mds() + 1
<< " ranks)";
pending.promote(newgid, fs, mds);
- do_propose = true;
+ return true;
}
- return do_propose;
+ if (in > max) {
+ // Shrink from the top: the candidate is the highest rank currently in.
+ mds_rank_t target = in - 1;
+ mds_gid_t target_gid = fs->mds_map.get_info(target).global_id;
+ if (fs->mds_map.get_state(target) == CEPH_MDS_STATE_ACTIVE) {
+ dout(1) << "deactivating " << target << dendl;
+ mon->clog->info() << "deactivating "
+ << fs->mds_map.get_info(target).human_name();
+ // Record the state change in the writable pending map (the same map
+ // used for promote() in the grow path), not in `fsmap`.
+ pending.modify_daemon(target_gid,
+ [] (MDSMap::mds_info_t *info) {
+ info->state = MDSMap::STATE_STOPPING;
+ });
+ return true;
+ } else {
+ dout(20) << "skipping deactivate on " << target << dendl;
+ return false;
+ }
+ }
+
+ return false;
}
do_propose |= pending.check_health();
- // expand mds cluster (add new nodes to @in)?
+ // resize mds cluster (adjust @in)?
for (auto &p : pending.filesystems) {
- do_propose |= maybe_expand_cluster(p.second);
+ do_propose |= maybe_resize_cluster(p.second);
}
const auto now = ceph_clock_now();