mon/FSCommands: Set extra MDS to standby

author Douglas Fuller <dfuller@redhat.com>

Tue, 25 Jul 2017 18:06:51 +0000 (14:06 -0400)

committer Patrick Donnelly <pdonnell@redhat.com>

Tue, 17 Apr 2018 18:01:03 +0000 (11:01 -0700)
author Douglas Fuller <dfuller@redhat.com>
Tue, 25 Jul 2017 18:06:51 +0000 (14:06 -0400)
committer Patrick Donnelly <pdonnell@redhat.com>
Tue, 17 Apr 2018 18:01:03 +0000 (11:01 -0700)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index c9eaa94ed4564c20001003a352ac80055de806f3..c8e3734626013cbd235966c26637f065ffbb5ea1 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -13,10 +13,10 @@
      (even as standby). Operators may ignore the error messages and continue
      upgrading/restarting or follow this upgrade sequence:
  
-    Reduce the number of ranks to 1 (`ceph fs set <fs_name> max_mds 1`),
-    deactivate all other ranks (`ceph mds deactivate <fs_name>:<n>`), shutdown
-    standbys leaving the one active MDS, upgrade the single active MDS, then
-    upgrade/start standbys. Finally, restore the previous max_mds.
+        Reduce the number of ranks to 1 (`ceph fs set <fs_name> max_mds 1`), wait
+        for all other MDS ranks to deactivate so that only one active MDS remains,
+        upgrade that single active MDS, then upgrade/start standbys. Finally,
+        restore the previous max_mds.
  
      See also: https://tracker.ceph.com/issues/23172
  
diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc

index 0d90f663c8f451ffb0fbeb135e95001101fbc8ff..4ac5655352e23342252e2a71a2399839e955cf48 100644 (file)
--- a/src/mon/FSCommands.cc
+++ b/src/mon/FSCommands.cc
@@ -20,7 +20,6 @@
  #include "MgrStatMonitor.h"
  
  
-
  static const string EXPERIMENTAL_WARNING("Warning! This feature is experimental."
  "It may cause problems up to and including data loss."
  "Consult the documentation at ceph.com, and if unsure, do not proceed."
@@ -264,7 +263,9 @@ public:
          return -EINVAL;
        }
  
-      if (!fs->mds_map.allows_multimds() && n > fs->mds_map.get_max_mds() &&
+      mds_rank_t oldmax = fs->mds_map.get_max_mds();
+
+      if (!fs->mds_map.allows_multimds() && n > oldmax &&
           n > 1) {
         ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
         return -EINVAL;
@@ -273,6 +274,7 @@ public:
          ss << "may not have more than " << MAX_MDS << " MDS ranks";
          return -EINVAL;
        }
+
        fsmap.modify_filesystem(
            fs->fscid,
            [n](std::shared_ptr<Filesystem> fs)
@@ -437,7 +439,12 @@ public:
          }
        });
  
-      ss << "marked " << (is_down ? "down" : "up");
+      if (is_down) {
+       ss << " marked down. ";
+      } else {
+       ss << " marked up, max_mds = " << fs->mds_map.get_max_mds();
+      }
+
      } else if (var == "standby_count_wanted") {
        if (interr.length()) {
         ss << var << " requires an integer value";
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc

index 24159e70833a21e695373ef9e74f90e01c47f01c..bb15aab4bcdac567577a8e4bc036a15d63ef8862 100644 (file)
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -1323,8 +1323,6 @@ int MDSMonitor::filesystem_command(
        r = 0;
        mds_gid_t gid = fs->mds_map.up.at(role.rank);
        ss << "telling mds." << role << " "
-         << pending.get_info_gid(gid).addr << " to deactivate";
-
        pending.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
          info->state = MDSMap::STATE_STOPPING;
        });
@@ -1762,19 +1760,29 @@ int MDSMonitor::print_nodes(Formatter *f)
  
  /**
   * If a cluster is undersized (with respect to max_mds), then
- * attempt to find daemons to grow it.
+ * attempt to find daemons to grow it. If the cluster is oversized
+ * (with respect to max_mds) then shrink it by stopping its highest rank.
   */
-bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> &fs)
+bool MDSMonitor::maybe_resize_cluster(std::shared_ptr<Filesystem> &fs)
  {
-  bool do_propose = false;
    auto &pending = get_pending_fsmap_writeable();
+  int in = fs->mds_map.get_num_in_mds();
+  int max = fs->mds_map.get_max_mds();
  
-  if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
-    return do_propose;
+  dout(20) << __func__ << " in " << in << " max " << max << dendl;
+
+  if (fs->mds_map.is_degraded()) {
+    dout(5) << "not resizing degraded MDS cluster "
+                << fs->mds_map.fs_name << dendl;
+    return false;
    }
  
-  while (fs->mds_map.get_num_in_mds() < size_t(fs->mds_map.get_max_mds()) &&
-        !fs->mds_map.is_degraded()) {
+  if (fs->mds_map.get_num_mds(CEPH_MDS_STATE_STOPPING)) {
+    dout(5) << "An MDS for " << fs->mds_map.fs_name
+                << " is stopping; waiting to resize" << dendl;
+  }
+
+  if (in < max) {
      mds_rank_t mds = mds_rank_t(0);
      string name;
      while (fs->mds_map.is_in(mds)) {
@@ -1783,7 +1791,7 @@ bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> &fs)
      mds_gid_t newgid = pending.find_replacement_for({fs->fscid, mds},
                           name, g_conf->mon_force_standby_active);
      if (newgid == MDS_GID_NONE) {
-      break;
+      return false;
      }
  
      const auto &new_info = pending.get_info_gid(newgid);
@@ -1795,10 +1803,28 @@ bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> &fs)
                        << mds << " (now has " << fs->mds_map.get_num_in_mds() + 1
                        << " ranks)";
      pending.promote(newgid, fs, mds);
-    do_propose = true;
+    return true;
    }
  
-  return do_propose;
+  if (in > max) {
+    mds_rank_t target = in - 1;
+    mds_gid_t target_gid = fs->mds_map.get_info(target).global_id;
+    if (fs->mds_map.get_state(target) == CEPH_MDS_STATE_ACTIVE) {
+      dout(1) << "deactivating " << target << dendl;
+      mon->clog->info() << "deactivating "
+                       << fs->mds_map.get_info(target).human_name();
+      fsmap.modify_daemon(target_gid,
+                                 [] (MDSMap::mds_info_t *info) {
+                                   info->state = MDSMap::STATE_STOPPING;
+                                 });
+      return true;
+    } else {
+      dout(20) << "skipping deactivate on " << target << dendl;
+      return false;
+    }
+  }
+
+  return false;
  }
  
  
@@ -2012,9 +2038,9 @@ void MDSMonitor::tick()
  
    do_propose |= pending.check_health();
  
-  // expand mds cluster (add new nodes to @in)?
+  // resize mds cluster (adjust @in)?
    for (auto &p : pending.filesystems) {
-    do_propose |= maybe_expand_cluster(p.second);
+    do_propose |= maybe_resize_cluster(p.second);
    }
  
    const auto now = ceph_clock_now();
diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h

index d02af4b2d94fef3e8c8a9b932d35e4928a5d383d..e7a80b331cd4ffa66271185d212d57a307c27feb 100644 (file)
--- a/src/mon/MDSMonitor.h
+++ b/src/mon/MDSMonitor.h
@@ -113,7 +113,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap {
    std::list<std::shared_ptr<FileSystemCommandHandler> > handlers;
  
    bool maybe_promote_standby(std::shared_ptr<Filesystem> &fs);
-  bool maybe_expand_cluster(std::shared_ptr<Filesystem> &fs);
+  bool maybe_resize_cluster(std::shared_ptr<Filesystem> &fs);
    void maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
        bool *mds_propose, bool *osd_propose);
    void tick() override;     // check state, take actions
author	Douglas Fuller <dfuller@redhat.com>
	Tue, 25 Jul 2017 18:06:51 +0000 (14:06 -0400)
committer	Patrick Donnelly <pdonnell@redhat.com>
	Tue, 17 Apr 2018 18:01:03 +0000 (11:01 -0700)
PendingReleaseNotes		patch \| blob \| history
src/mon/FSCommands.cc		patch \| blob \| history
src/mon/MDSMonitor.cc		patch \| blob \| history
src/mon/MDSMonitor.h		patch \| blob \| history