From 19ff550c54cc59656d0e74f467453c832cc273bc Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 23 Apr 2018 12:46:20 -0700 Subject: [PATCH] MDSMonitor: do not resize cluster when MDS is starting Fixes: http://tracker.ceph.com/issues/23799 Signed-off-by: Patrick Donnelly --- src/mds/FSMap.h | 1 + src/mds/MDSMap.h | 11 +++++++++++ src/mon/MDSMonitor.cc | 45 +++++++++++++++++++++++-------------------- src/mon/MDSMonitor.h | 2 +- 4 files changed, 37 insertions(+), 22 deletions(-) diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h index 83a85e6f9ed6e..bf28546160115 100644 --- a/src/mds/FSMap.h +++ b/src/mds/FSMap.h @@ -420,6 +420,7 @@ public: size_t filesystem_count() const {return filesystems.size();} bool filesystem_exists(fs_cluster_id_t fscid) const {return filesystems.count(fscid) > 0;} std::shared_ptr get_filesystem(fs_cluster_id_t fscid) const {return std::const_pointer_cast(filesystems.at(fscid));} + std::shared_ptr get_filesystem(fs_cluster_id_t fscid) {return filesystems.at(fscid);} std::shared_ptr get_filesystem(void) const {return std::const_pointer_cast(filesystems.begin()->second);} std::shared_ptr get_filesystem(std::string_view name) const { diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index c92cf2bff2613..01935ac3b10a9 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -497,6 +497,17 @@ public: */ availability_t is_cluster_available() const; + /** + * Return whether this MDSMap is suitable for resizing based on the state + * of the ranks. 
+ */ + bool is_resizeable() const { + return !is_degraded() && + get_num_mds(CEPH_MDS_STATE_CREATING) == 0 && + get_num_mds(CEPH_MDS_STATE_STARTING) == 0 && + get_num_mds(CEPH_MDS_STATE_STOPPING) == 0; + } + // mds states bool is_down(mds_rank_t m) const { return up.count(m) == 0; } bool is_up(mds_rank_t m) const { return up.count(m); } diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 7c213e022d17e..9f14838a5f04f 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -1736,33 +1736,36 @@ int MDSMonitor::print_nodes(Formatter *f) * attempt to find daemons to grow it. If the cluster is oversized * (with respect to max_mds) then shrink it by stopping its highest rank. */ -bool MDSMonitor::maybe_resize_cluster(std::shared_ptr &fs) +bool MDSMonitor::maybe_resize_cluster(fs_cluster_id_t fscid) { + const auto &fsmap = get_fsmap(); + auto &fsmap_mds_map = fsmap.get_filesystem(fscid)->mds_map; auto &pending = get_pending_fsmap_writeable(); - int in = fs->mds_map.get_num_in_mds(); - int max = fs->mds_map.get_max_mds(); + auto pending_fs = pending.get_filesystem(fscid); + auto &pending_mds_map = pending_fs->mds_map; - dout(20) << __func__ << " in " << in << " max " << max << dendl; + int in = pending_mds_map.get_num_in_mds(); + int max = pending_mds_map.get_max_mds(); - if (fs->mds_map.is_degraded()) { - dout(5) << "not resizing degraded MDS cluster " - << fs->mds_map.fs_name << dendl; - return false; - } + dout(20) << __func__ << " in " << in << " max " << max << dendl; - if (fs->mds_map.get_num_mds(CEPH_MDS_STATE_STOPPING)) { - dout(5) << "An MDS for " << fs->mds_map.fs_name - << " is stopping; waiting to resize" << dendl; + /* Check that both the current epoch's mds_map and the pending mds_map (the + * current batch of changes) are resizeable. This is important if an MDS is + * becoming active in the next epoch. 
+ */ + if (!fsmap_mds_map.is_resizeable() || + !pending_mds_map.is_resizeable()) { + dout(5) << __func__ << " mds_map is not currently resizeable" << dendl; return false; } - if (in < max && !fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) { + if (in < max && !pending_mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) { mds_rank_t mds = mds_rank_t(0); string name; - while (fs->mds_map.is_in(mds)) { + while (pending_mds_map.is_in(mds)) { mds++; } - mds_gid_t newgid = pending.find_replacement_for({fs->fscid, mds}, + mds_gid_t newgid = pending.find_replacement_for({fscid, mds}, name, g_conf->mon_force_standby_active); if (newgid == MDS_GID_NONE) { return false; @@ -1773,15 +1776,15 @@ bool MDSMonitor::maybe_resize_cluster(std::shared_ptr &fs) << " as mds." << mds << dendl; mon->clog->info() << new_info.human_name() << " assigned to " - "filesystem " << fs->mds_map.fs_name << " as rank " - << mds << " (now has " << fs->mds_map.get_num_in_mds() + 1 + "filesystem " << pending_mds_map.fs_name << " as rank " + << mds << " (now has " << pending_mds_map.get_num_in_mds() + 1 << " ranks)"; - pending.promote(newgid, fs, mds); + pending.promote(newgid, pending_fs, mds); return true; } else if (in > max) { mds_rank_t target = in - 1; - const auto &info = fs->mds_map.get_info(target); - if (fs->mds_map.is_active(target)) { + const auto &info = pending_mds_map.get_info(target); + if (pending_mds_map.is_active(target)) { dout(1) << "deactivating " << target << dendl; mon->clog->info() << "deactivating " << info.human_name(); pending.modify_daemon(info.global_id, @@ -2013,7 +2016,7 @@ void MDSMonitor::tick() // resize mds cluster (adjust @in)? 
for (auto &p : pending.filesystems) { - do_propose |= maybe_resize_cluster(p.second); + do_propose |= maybe_resize_cluster(p.second->fscid); } const auto now = ceph_clock_now(); diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h index e7a80b331cd4f..fb2e6256f0e67 100644 --- a/src/mon/MDSMonitor.h +++ b/src/mon/MDSMonitor.h @@ -113,7 +113,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap { std::list > handlers; bool maybe_promote_standby(std::shared_ptr &fs); - bool maybe_resize_cluster(std::shared_ptr &fs); + bool maybe_resize_cluster(fs_cluster_id_t fscid); void maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose); void tick() override; // check state, take actions -- 2.39.5