From 19ff550c54cc59656d0e74f467453c832cc273bc Mon Sep 17 00:00:00 2001
From: Patrick Donnelly <pdonnell@redhat.com>
Date: Mon, 23 Apr 2018 12:46:20 -0700
Subject: [PATCH] MDSMonitor: do not resize cluster when MDS is starting

Fixes: http://tracker.ceph.com/issues/23799

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
---
 src/mds/FSMap.h       |  1 +
 src/mds/MDSMap.h      | 11 +++++++++++
 src/mon/MDSMonitor.cc | 45 +++++++++++++++++++++++--------------------
 src/mon/MDSMonitor.h  |  2 +-
 4 files changed, 37 insertions(+), 22 deletions(-)
diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h
index 83a85e6f9ed..bf285461601 100644
--- a/src/mds/FSMap.h
+++ b/src/mds/FSMap.h
@@ -420,6 +420,7 @@ public:
   size_t filesystem_count() const {return filesystems.size();}
   bool filesystem_exists(fs_cluster_id_t fscid) const {return filesystems.count(fscid) > 0;}
   std::shared_ptr<const Filesystem> get_filesystem(fs_cluster_id_t fscid) const {return std::const_pointer_cast<const Filesystem>(filesystems.at(fscid));}
+  std::shared_ptr<Filesystem> get_filesystem(fs_cluster_id_t fscid) {return filesystems.at(fscid);}
   std::shared_ptr<const Filesystem> get_filesystem(void) const {return std::const_pointer_cast<const Filesystem>(filesystems.begin()->second);}
   std::shared_ptr<const Filesystem> get_filesystem(std::string_view name) const
   {
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index c92cf2bff26..01935ac3b10 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -497,6 +497,17 @@ public:
    */
   availability_t is_cluster_available() const;
 
+  /**
+   * Return whether this MDSMap is suitable for resizing: it must not be
+   * degraded and must have no MDS in the creating, starting or stopping state.
+   */
+  bool is_resizeable() const {
+    return !is_degraded() &&
+        get_num_mds(CEPH_MDS_STATE_CREATING) == 0 &&
+        get_num_mds(CEPH_MDS_STATE_STARTING) == 0 &&
+        get_num_mds(CEPH_MDS_STATE_STOPPING) == 0;
+  }
+
   // mds states
   bool is_down(mds_rank_t m) const { return up.count(m) == 0; }
   bool is_up(mds_rank_t m) const { return up.count(m); }
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index 7c213e022d1..9f14838a5f0 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -1736,33 +1736,36 @@ int MDSMonitor::print_nodes(Formatter *f)
  * attempt to find daemons to grow it. If the cluster is oversized
  * (with respect to max_mds) then shrink it by stopping its highest rank.
  */
-bool MDSMonitor::maybe_resize_cluster(std::shared_ptr<Filesystem> &fs)
+bool MDSMonitor::maybe_resize_cluster(fs_cluster_id_t fscid)
 {
+  const auto &fsmap = get_fsmap();
+  auto &fsmap_mds_map = fsmap.get_filesystem(fscid)->mds_map;
   auto &pending = get_pending_fsmap_writeable();
-  int in = fs->mds_map.get_num_in_mds();
-  int max = fs->mds_map.get_max_mds();
+  auto pending_fs = pending.get_filesystem(fscid);
+  auto &pending_mds_map = pending_fs->mds_map;
 
-  dout(20) << __func__ << " in " << in << " max " << max << dendl;
+  int in = pending_mds_map.get_num_in_mds();
+  int max = pending_mds_map.get_max_mds();
 
-  if (fs->mds_map.is_degraded()) {
-    dout(5) << "not resizing degraded MDS cluster "
-	         << fs->mds_map.fs_name << dendl;
-    return false;
-  }
+  dout(20) << __func__ << " in " << in << " max " << max << dendl;
 
-  if (fs->mds_map.get_num_mds(CEPH_MDS_STATE_STOPPING)) {
-    dout(5) << "An MDS for " << fs->mds_map.fs_name
-	         << " is stopping; waiting to resize" << dendl;
+  /* Check that both the mds_map of the current epoch and the mds_map in
+   * pending (the current batch of changes) are resizeable. This is important
+   * if an MDS is becoming active in the next epoch.
+   */
+  if (!fsmap_mds_map.is_resizeable() ||
+      !pending_mds_map.is_resizeable()) {
+    dout(5) << __func__ << " mds_map is not currently resizeable" << dendl;
     return false;
   }
 
-  if (in < max && !fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+  if (in < max && !pending_mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
     mds_rank_t mds = mds_rank_t(0);
     string name;
-    while (fs->mds_map.is_in(mds)) {
+    while (pending_mds_map.is_in(mds)) {
       mds++;
     }
-    mds_gid_t newgid = pending.find_replacement_for({fs->fscid, mds},
+    mds_gid_t newgid = pending.find_replacement_for({fscid, mds},
                          name, g_conf->mon_force_standby_active);
     if (newgid == MDS_GID_NONE) {
       return false;
@@ -1773,15 +1776,15 @@ bool MDSMonitor::maybe_resize_cluster(std::shared_ptr<Filesystem> &fs)
             << " as mds." << mds << dendl;
 
     mon->clog->info() << new_info.human_name() << " assigned to "
-                         "filesystem " << fs->mds_map.fs_name << " as rank "
-                      << mds << " (now has " << fs->mds_map.get_num_in_mds() + 1
+                         "filesystem " << pending_mds_map.fs_name << " as rank "
+                      << mds << " (now has " << pending_mds_map.get_num_in_mds() + 1
                       << " ranks)";
-    pending.promote(newgid, fs, mds);
+    pending.promote(newgid, pending_fs, mds);
     return true;
   } else if (in > max) {
     mds_rank_t target = in - 1;
-    const auto &info = fs->mds_map.get_info(target);
-    if (fs->mds_map.is_active(target)) {
+    const auto &info = pending_mds_map.get_info(target);
+    if (pending_mds_map.is_active(target)) {
       dout(1) << "deactivating " << target << dendl;
       mon->clog->info() << "deactivating " << info.human_name();
       pending.modify_daemon(info.global_id,
@@ -2013,7 +2016,7 @@ void MDSMonitor::tick()
 
   // resize mds cluster (adjust @in)?
   for (auto &p : pending.filesystems) {
-    do_propose |= maybe_resize_cluster(p.second);
+    do_propose |= maybe_resize_cluster(p.second->fscid);
   }
 
   const auto now = ceph_clock_now();
diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h
index e7a80b331cd..fb2e6256f0e 100644
--- a/src/mon/MDSMonitor.h
+++ b/src/mon/MDSMonitor.h
@@ -113,7 +113,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap {
   std::list<std::shared_ptr<FileSystemCommandHandler> > handlers;
 
   bool maybe_promote_standby(std::shared_ptr<Filesystem> &fs);
-  bool maybe_resize_cluster(std::shared_ptr<Filesystem> &fs);
+  bool maybe_resize_cluster(fs_cluster_id_t fscid);
   void maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
       bool *mds_propose, bool *osd_propose);
   void tick() override;     // check state, take actions
-- 
2.47.3