mon/MDSMonitor: enforce mds_join_fs cluster affinity

author Patrick Donnelly <pdonnell@redhat.com>

Tue, 11 Feb 2020 03:20:08 +0000 (19:20 -0800)

committer Patrick Donnelly <pdonnell@redhat.com>

Thu, 13 Feb 2020 15:51:10 +0000 (07:51 -0800)
author Patrick Donnelly <pdonnell@redhat.com>
Tue, 11 Feb 2020 03:20:08 +0000 (19:20 -0800)
committer Patrick Donnelly <pdonnell@redhat.com>
Thu, 13 Feb 2020 15:51:10 +0000 (07:51 -0800)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index a6237e45d3606aa1551b74176cdb285fa0a86f3d..52175c25c2053bc4f39a4d4d635530ff64294671 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -293,7 +293,10 @@
    the racks.  In those cases, the configuration value can be increased.
  
  * MDS daemons can now be assigned to manage a particular file system via the
-  new ``mds_join_fs`` option.
+  new ``mds_join_fs`` option. For each file system, the monitors will try to
+  use only MDS daemons whose ``mds_join_fs`` equals the file system name
+  (strong affinity). Monitors may also deliberately fail over an active MDS to
+  a standby when the cluster is otherwise healthy if the standby has stronger
+  affinity.
  
  * RGW Multisite: A new fine grained bucket-granularity policy configuration
    system has been introduced and it supersedes the previous coarse zone sync
diff --git a/src/common/options.cc b/src/common/options.cc

index 708e86aeebe111172c24956547b7b549b93aa384..05e901fc84b622b0f3caab829356ec7eab4e2aa9 100644 (file)
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -7566,9 +7566,10 @@ std::vector<Option> get_mds_options() {
      .set_flag(Option::FLAG_NO_MON_UPDATE)
      .set_description("path to MDS data and keyring"),
  
-    Option("mds_join_fs", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    Option("mds_join_fs", Option::TYPE_STR, Option::LEVEL_BASIC)
      .set_default("")
-    .set_description("force mds daemon to join a specific fs")
+    .set_description("file system MDS prefers to join")
+    .set_long_description("This setting indicates which file system name the MDS should prefer to join (affinity). The monitors will try to have the MDS cluster safely reach a state where all MDS have strong affinity, even via failovers to a standby.")
      .set_flag(Option::FLAG_RUNTIME),
  
      Option("mds_max_xattr_pairs_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc

index 61e6fd37a03295600b202649586ae006caa7f08a..b95e5c840c0045e98ed826c3492bcf59a69a0bfb 100644 (file)
--- a/src/mds/FSMap.cc
+++ b/src/mds/FSMap.cc
@@ -56,10 +56,6 @@ void FSMap::dump(Formatter *f) const
      f->open_object_section("info");
      info.dump(f);
      f->dump_int("epoch", standby_epochs.at(gid));
-    auto p = standby_daemon_fscid.find(gid);
-    if (p != standby_daemon_fscid.end()) {
-      f->dump_int("fscid", p->second);
-    }
      f->close_section();
    }
    f->close_section();
@@ -83,7 +79,6 @@ FSMap &FSMap::operator=(const FSMap &rhs)
    mds_roles = rhs.mds_roles;
    standby_daemons = rhs.standby_daemons;
    standby_epochs = rhs.standby_epochs;
-  standby_daemon_fscid = rhs.standby_daemon_fscid;
  
    filesystems.clear();
    for (const auto &i : rhs.filesystems) {
@@ -137,12 +132,7 @@ void FSMap::print(ostream& out) const
    }
  
    for (const auto& p : standby_daemons) {
-    out << p.second;
-    auto q = standby_daemon_fscid.find(p.first);
-    if (q != standby_daemon_fscid.end()) {
-      out << " (" << q->second << ")";
-    }
-    out << std::endl;
+    out << p.second << std::endl;
    }
  }
  
@@ -433,8 +423,8 @@ void FSMap::get_health_checks(health_check_map_t *checks) const
      std::set<mds_rank_t> stuck_failed;
  
      for (const auto &rank : fs->mds_map.failed) {
-      auto&& replacement = find_replacement_for({fs->fscid, rank}, {});
-      if (replacement == MDS_GID_NONE) {
+      auto rep_info = find_replacement_for({fs->fscid, rank});
+      if (!rep_info) {
          stuck_failed.insert(rank);
        }
      }
@@ -482,7 +472,7 @@ void FSMap::update_compat(const CompatSet &c)
  
  void FSMap::encode(bufferlist& bl, uint64_t features) const
  {
-  ENCODE_START(8, 6, bl);
+  ENCODE_START(7, 6, bl);
    encode(epoch, bl);
    encode(next_filesystem_id, bl);
    encode(legacy_client_fscid, bl);
@@ -498,7 +488,6 @@ void FSMap::encode(bufferlist& bl, uint64_t features) const
    encode(standby_daemons, bl, features);
    encode(standby_epochs, bl);
    encode(ever_enabled_multiple, bl);
-  encode(standby_daemon_fscid, bl);
    ENCODE_FINISH(bl);
  }
  
@@ -683,9 +672,6 @@ void FSMap::decode(bufferlist::const_iterator& p)
      if (struct_v >= 7) {
        decode(ever_enabled_multiple, p);
      }
-    if (struct_v >= 8) {
-      decode(standby_daemon_fscid, p);
-    }
    }
  
    DECODE_FINISH(p);
@@ -760,7 +746,7 @@ bool FSMap::is_any_degraded() const
  
  std::map<mds_gid_t, MDSMap::mds_info_t> FSMap::get_mds_info() const
  {
-  std::map<mds_gid_t, MDSMap::mds_info_t> result;
+  std::map<mds_gid_t, mds_info_t> result;
    for (const auto &i : standby_daemons) {
      result[i.first] = i.second;
    }
@@ -775,8 +761,9 @@ std::map<mds_gid_t, MDSMap::mds_info_t> FSMap::get_mds_info() const
    return result;
  }
  
-mds_gid_t FSMap::get_available_standby(fs_cluster_id_t fscid) const
+const MDSMap::mds_info_t* FSMap::get_available_standby(fs_cluster_id_t fscid) const
  {
+  const mds_info_t* who = nullptr;
    for (const auto& [gid, info] : standby_daemons) {
      ceph_assert(info.rank == MDS_RANK_NONE);
      ceph_assert(info.state == MDSMap::STATE_STANDBY);
@@ -785,15 +772,16 @@ mds_gid_t FSMap::get_available_standby(fs_cluster_id_t fscid) const
        continue;
      }
  
-    auto p = standby_daemon_fscid.find(gid);
-    if (p != standby_daemon_fscid.end() &&
-       p->second != fscid) {
-      continue;
+    if (info.join_fscid == fscid) {
+      who = &info;
+      break;
+    } else if (info.join_fscid == FS_CLUSTER_ID_NONE) {
+      who = &info; /* vanilla standby */
+    } else if (who == nullptr) {
+      who = &info; /* standby for another fs, last resort */
      }
-
-    return gid;
    }
-  return MDS_GID_NONE;
+  return who;
  }
  
  mds_gid_t FSMap::find_mds_gid_by_name(std::string_view s) const
@@ -809,7 +797,7 @@ mds_gid_t FSMap::find_mds_gid_by_name(std::string_view s) const
  
  const MDSMap::mds_info_t* FSMap::find_by_name(std::string_view name) const
  {
-  std::map<mds_gid_t, MDSMap::mds_info_t> result;
+  std::map<mds_gid_t, mds_info_t> result;
    for (const auto &i : standby_daemons) {
      if (i.second.name == name) {
        return &(i.second);
@@ -828,7 +816,7 @@ const MDSMap::mds_info_t* FSMap::find_by_name(std::string_view name) const
    return nullptr;
  }
  
-mds_gid_t FSMap::find_replacement_for(mds_role_t role, std::string_view name) const
+const MDSMap::mds_info_t* FSMap::find_replacement_for(mds_role_t role) const
  {
    auto&& fs = get_filesystem(role.fscid);
  
@@ -837,9 +825,9 @@ mds_gid_t FSMap::find_replacement_for(mds_role_t role, std::string_view name) co
      if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
        if (info.is_frozen()) {
          /* the standby-replay is frozen, do nothing! */
-        return MDS_GID_NONE;
+        return nullptr;
        } else {
-        return gid;
+        return &info;
        }
      }
    }
@@ -923,7 +911,7 @@ void FSMap::promote(
      ceph_assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
      ceph_assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
    }
-  MDSMap::mds_info_t &info = mds_map.mds_info[standby_gid];
+  auto& info = mds_map.mds_info[standby_gid];
  
    if (mds_map.stopped.erase(assigned_rank)) {
      // The cluster is being expanded with a stopped rank
@@ -948,7 +936,6 @@ void FSMap::promote(
    if (!is_standby_replay) {
      standby_daemons.erase(standby_gid);
      standby_epochs.erase(standby_gid);
-    standby_daemon_fscid.erase(standby_gid);
    }
  
    // Indicate that Filesystem has been modified
@@ -975,7 +962,6 @@ void FSMap::assign_standby_replay(
    // Remove from the list of standbys
    standby_daemons.erase(standby_gid);
    standby_epochs.erase(standby_gid);
-  standby_daemon_fscid.erase(standby_gid);
  
    // Indicate that Filesystem has been modified
    fs->mds_map.epoch = epoch;
@@ -986,7 +972,6 @@ void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch)
    if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
      standby_daemons.erase(who);
      standby_epochs.erase(who);
-    standby_daemon_fscid.erase(who);
    } else {
      auto &fs = filesystems.at(mds_roles.at(who));
      const auto &info = fs->mds_map.mds_info.at(who);
@@ -1051,18 +1036,6 @@ void FSMap::insert(const MDSMap::mds_info_t &new_info)
    standby_epochs[new_info.global_id] = epoch;
  }
  
-void FSMap::adjust_standby_fscid(mds_gid_t standby_gid,
-                                fs_cluster_id_t fscid)
-{
-  standby_daemon_fscid[standby_gid] = fscid;
-}
-
-std::size_t FSMap::clear_standby_fscid(mds_gid_t standby_gid)
-{
-  auto count = standby_daemon_fscid.erase(standby_gid);
-  return count;
-}
-
  std::vector<mds_gid_t> FSMap::stop(mds_gid_t who)
  {
    ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
@@ -1156,3 +1129,24 @@ bool FSMap::pool_in_use(int64_t poolid) const
    }
    return false;
  }
+
+/**
+ * Erase the file system `fscid` from the map, then reset join_fscid to
+ * FS_CLUSTER_ID_NONE on every daemon (standbys and daemons held by the
+ * remaining file systems) whose join_fscid named the erased file system.
+ */
+void FSMap::erase_filesystem(fs_cluster_id_t fscid)
+{
+  // Single mutator shared by both passes below.
+  auto forget_affinity = [](auto& daemon) {
+    daemon.join_fscid = FS_CLUSTER_ID_NONE;
+  };
+
+  filesystems.erase(fscid);
+
+  // Pass 1: standby daemons.
+  for (auto& standby : standby_daemons) {
+    if (standby.second.join_fscid != fscid) {
+      continue;
+    }
+    modify_daemon(standby.first, forget_affinity);
+  }
+
+  // Pass 2: daemons recorded in the MDSMap of each remaining file system.
+  for (auto& fs_entry : filesystems) {
+    for (auto& member : fs_entry.second->mds_map.get_mds_info()) {
+      if (member.second.join_fscid != fscid) {
+        continue;
+      }
+      modify_daemon(member.first, forget_affinity);
+    }
+  }
+}
diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h

index 1c89bcd36eda9159ac9a493048a0c44b4c15f70e..4e4c6e8b007f06a33a99fed97bb758b38ef941f2 100644 (file)
--- a/src/mds/FSMap.h
+++ b/src/mds/FSMap.h
@@ -86,6 +86,7 @@ class FSMap {
  public:
    friend class MDSMonitor;
    friend class PaxosFSMap;
+  using mds_info_t = MDSMap::mds_info_t;
  
    FSMap() : compat(MDSMap::get_compat_set_default()) {}
  
@@ -99,8 +100,7 @@ public:
        ever_enabled_multiple(rhs.ever_enabled_multiple),
        mds_roles(rhs.mds_roles),
        standby_daemons(rhs.standby_daemons),
-      standby_epochs(rhs.standby_epochs),
-      standby_daemon_fscid(rhs.standby_daemon_fscid)
+      standby_epochs(rhs.standby_epochs)
    {
      filesystems.clear();
      for (const auto &i : rhs.filesystems) {
@@ -146,9 +146,9 @@ public:
    /**
     * Get state of all daemons (for all filesystems, including all standbys)
     */
-  std::map<mds_gid_t, MDSMap::mds_info_t> get_mds_info() const;
+  std::map<mds_gid_t, mds_info_t> get_mds_info() const;
  
-  mds_gid_t get_available_standby(fs_cluster_id_t fscid) const;
+  const mds_info_t* get_available_standby(fs_cluster_id_t fscid) const;
  
    /**
     * Resolve daemon name to GID
@@ -158,7 +158,7 @@ public:
    /**
     * Resolve daemon name to status
     */
-  const MDSMap::mds_info_t* find_by_name(std::string_view name) const;
+  const mds_info_t* find_by_name(std::string_view name) const;
  
    /**
     * Does a daemon exist with this GID?
@@ -176,17 +176,15 @@ public:
      return gid_exists(gid) && mds_roles.at(gid) != FS_CLUSTER_ID_NONE;
    }
  
-  /**
-   * Insert a new MDS daemon, as a standby
-   */
-  void insert(const MDSMap::mds_info_t &new_info);
+  /**
+   * Return the file system cluster ID recorded in mds_roles for this GID.
+   * The lookup uses map::at, so the GID must already be present in
+   * mds_roles (see gid_exists).
+   */
+  fs_cluster_id_t gid_fscid(mds_gid_t gid) const
+  {
+    return mds_roles.at(gid);
+  }
  
    /**
-   * Adjust an MDS daemon's fscid
+   * Insert a new MDS daemon, as a standby
     */
-  void adjust_standby_fscid(mds_gid_t standby_gid,
-                           fs_cluster_id_t fscid);
-  std::size_t clear_standby_fscid(mds_gid_t standby_gid);
+  void insert(const mds_info_t& new_info);
  
    /**
     * Assign an MDS cluster standby replay rank to a standby daemon
@@ -245,15 +243,7 @@ public:
     * Remove the filesystem (it must exist).  Caller should already
     * have failed out any MDSs that were assigned to the filesystem.
     */
-  void erase_filesystem(fs_cluster_id_t fscid)
-  {
-    filesystems.erase(fscid);
-    for (auto& p : standby_daemon_fscid) {
-      if (p.second == fscid) {
-       p.second = FS_CLUSTER_ID_NONE;
-      }
-    }
-  }
+  void erase_filesystem(fs_cluster_id_t fscid);
  
    /**
     * Reset all the state information (not configuration information)
@@ -299,7 +289,7 @@ public:
     * Given that gid exists in a filesystem or as a standby, return
     * a reference to its info.
     */
-  const MDSMap::mds_info_t& get_info_gid(mds_gid_t gid) const
+  const mds_info_t& get_info_gid(mds_gid_t gid) const
    {
      auto fscid = mds_roles.at(gid);
      if (fscid == FS_CLUSTER_ID_NONE) {
@@ -373,7 +363,7 @@ public:
     */
    bool pool_in_use(int64_t poolid) const;
  
-  mds_gid_t find_replacement_for(mds_role_t mds, std::string_view name) const;
+  const mds_info_t* find_replacement_for(mds_role_t role) const;
  
    void get_health(list<pair<health_status_t,std::string> >& summary,
                   list<pair<health_status_t,std::string> > *detail) const;
@@ -417,11 +407,8 @@ protected:
    std::map<mds_gid_t, fs_cluster_id_t> mds_roles;
  
    // For MDS daemons not yet assigned to a Filesystem
-  std::map<mds_gid_t, MDSMap::mds_info_t> standby_daemons;
+  std::map<mds_gid_t, mds_info_t> standby_daemons;
    std::map<mds_gid_t, epoch_t> standby_epochs;
-
-  // Missing entry implies no preference for a fs; NONE means assign to no fs
-  std::map<mds_gid_t, fs_cluster_id_t> standby_daemon_fscid;
  };
  WRITE_CLASS_ENCODER_FEATURES(FSMap)
  
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc

index 809d33554758fbcc7c93dfc577e026c5fdd1a3dd..1115d2b64e571516a43676941bbc823ff640b5cc 100644 (file)
--- a/src/mds/MDSMap.cc
+++ b/src/mds/MDSMap.cc
@@ -79,6 +79,7 @@ void MDSMap::mds_info_t::dump(Formatter *f) const
    f->dump_int("state_seq", state_seq);
    f->dump_stream("addr") << addrs.get_legacy_str();
    f->dump_object("addrs", addrs);
+  f->dump_int("join_fscid", join_fscid);
    if (laggy_since != utime_t())
      f->dump_stream("laggy_since") << laggy_since;
    
@@ -106,6 +107,9 @@ void MDSMap::mds_info_t::dump(std::ostream& o) const
    if (is_frozen()) {
      o << " frozen";
    }
+  if (join_fscid != FS_CLUSTER_ID_NONE) {
+    o << " join_fscid=" << join_fscid;
+  }
    o << " addr " << addrs << "]";
  }
  
@@ -526,7 +530,7 @@ void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) con
    encode(std::string(), bl); /* standby_for_name */
    encode(export_targets, bl);
    encode(mds_features, bl);
-  encode(FS_CLUSTER_ID_NONE, bl); /* standby_for_fscid */
+  encode(join_fscid, bl); /* formerly: standby_for_fscid */
    encode(false, bl);
    if (v >= 9) {
      encode(flags, bl);
@@ -576,8 +580,7 @@ void MDSMap::mds_info_t::decode(bufferlist::const_iterator& bl)
    if (struct_v >= 5)
      decode(mds_features, bl);
    if (struct_v >= 6) {
-    fs_cluster_id_t standby_for_fscid;
-    decode(standby_for_fscid, bl);
+    decode(join_fscid, bl);
    }
    if (struct_v >= 7) {
      bool standby_replay;
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h

index 91f299807c479167c29634b67c25ebb4eba29399..e9a5cd68300d7a6ad45f9e5640ded0a69cbad875 100644 (file)
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -147,6 +147,7 @@ public:
      entity_addrvec_t addrs;
      utime_t laggy_since;
      std::set<mds_rank_t> export_targets;
+    fs_cluster_id_t join_fscid = FS_CLUSTER_ID_NONE;
      uint64_t mds_features = 0;
      uint64_t flags = 0;
      enum mds_flags : uint64_t {
diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc

index 2c557ee95cfff7c9a805a79e9fc438bece152a4e..06a43c7a815a587f816a3f8eecda54d68f3db304 100644 (file)
--- a/src/mon/FSCommands.cc
+++ b/src/mon/FSCommands.cc
@@ -272,14 +272,12 @@ class FsNewHandler : public FileSystemCommandHandler
      ss << "new fs with metadata pool " << metadata << " and data pool " << data;
  
      // assign a standby to rank 0 to avoid health warnings
-    std::string _name;
-    mds_gid_t gid = fsmap.find_replacement_for({fs->fscid, 0}, _name);
+    auto info = fsmap.find_replacement_for({fs->fscid, 0});
  
-    if (gid != MDS_GID_NONE) {
-      const auto &info = fsmap.get_info_gid(gid);
-      mon->clog->info() << info.human_name() << " assigned to filesystem "
+    if (info) {
+      mon->clog->info() << info->human_name() << " assigned to filesystem "
            << fs_name << " as rank 0";
-      fsmap.promote(gid, *fs, 0);
+      fsmap.promote(info->global_id, *fs, 0);
      }
  
      return 0;
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc

index 116eb8e21064326f97f507c9a5ba91c7c48394bf..00ec6284bf45aa1ed7628ffbc49e49bc53dd9ad2 100644 (file)
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -420,28 +420,24 @@ bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
      goto ignore;
    }
  
-  // did the standby fs change
-  if (info.state == MDSMap::STATE_STANDBY &&
-      state == MDSMap::STATE_STANDBY) {
-    if (m->get_fs().size()) {
-      fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
-      auto f = fsmap.get_filesystem(m->get_fs());
-      if (f) {
-       fscid = f->fscid;
-      }
-      auto p = fsmap.standby_daemon_fscid.find(gid);
-      if (p == fsmap.standby_daemon_fscid.end() ||
-         p->second != fscid) {
-       dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
-                << " (" << m->get_fs() << ")" << dendl;
-       return false;
-      }
-    } else {
-      auto p = fsmap.standby_daemon_fscid.find(gid);
-      if (p != fsmap.standby_daemon_fscid.end()) {
-       dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
-       return false;
-      }
+  // did the join_fscid change
+  if (m->get_fs().size()) {
+    fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+    auto f = fsmap.get_filesystem(m->get_fs());
+    if (f) {
+      fscid = f->fscid;
+    }
+    if (info.join_fscid != fscid) {
+      dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
+               << " (" << m->get_fs() << ")" << dendl;
+      _note_beacon(m);
+      return false;
+    }
+  } else {
+    if (info.join_fscid != FS_CLUSTER_ID_NONE) {
+      dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
+      _note_beacon(m);
+      return false;
      }
    }
  
@@ -643,7 +639,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
         if (f) {
           fscid = f->fscid;
         }
-       pending.adjust_standby_fscid(gid, fscid);
+        new_info.join_fscid = fscid;
        }
      }
  
@@ -708,11 +704,23 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
          }
        );
      }
-  
+
      dout(5)  << "prepare_beacon mds." << info.rank
              << " " << ceph_mds_state_name(info.state)
              << " -> " << ceph_mds_state_name(state)
              << dendl;
+
+    fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+    if (m->get_fs().size()) {
+      auto f = pending.get_filesystem(m->get_fs());
+      if (f) {
+        fscid = f->fscid;
+      }
+    }
+    pending.modify_daemon(gid, [fscid](auto& info) {
+      info.join_fscid = fscid;
+    });
+
      if (state == MDSMap::STATE_STOPPED) {
        const auto fscid = pending.mds_roles.at(gid);
        const auto &fs = pending.get_filesystem(fscid);
@@ -725,7 +733,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
        auto erased = pending.stop(gid);
        erased.push_back(gid);
  
-      for (const auto &erased_gid : erased) {
+      for (const auto& erased_gid : erased) {
          last_beacon.erase(erased_gid);
          if (pending_daemon_health.count(erased_gid)) {
            pending_daemon_health.erase(erased_gid);
@@ -806,20 +814,6 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
          info.state = state;
          info.state_seq = seq;
        });
-
-      // Process standby mds_join_fs change
-      if (state == MDSMap::STATE_STANDBY) {
-       if (m->get_fs().size()) {
-         fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
-         auto f = pending.get_filesystem(m->get_fs());
-         if (f) {
-           fscid = f->fscid;
-         }
-         pending.adjust_standby_fscid(gid, fscid);
-       } else {
-         pending.clear_standby_fscid(gid);
-       }
-      }
      }
    }
  
@@ -1183,7 +1177,7 @@ out:
  
  bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
  {
-  const MDSMap::mds_info_t &info = fsmap.get_info_gid(gid);
+  const auto& info = fsmap.get_info_gid(gid);
    dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
  
    ceph_assert(mon->osdmon()->is_writeable());
@@ -1870,24 +1864,21 @@ bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
  
    if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
      mds_rank_t mds = mds_rank_t(0);
-    string name;
      while (mds_map.is_in(mds)) {
        mds++;
      }
-    auto&& newgid = fsmap.find_replacement_for({fscid, mds}, name);
-    if (newgid == MDS_GID_NONE) {
+    auto info = fsmap.find_replacement_for({fscid, mds});
+    if (!info) {
        return false;
      }
  
-    const auto &new_info = fsmap.get_info_gid(newgid);
-    dout(1) << "assigned standby " << new_info.addrs
+    dout(1) << "assigned standby " << info->addrs
              << " as mds." << mds << dendl;
-
-    mon->clog->info() << new_info.human_name() << " assigned to "
+    mon->clog->info() << info->human_name() << " assigned to "
                           "filesystem " << mds_map.fs_name << " as rank "
                        << mds << " (now has " << mds_map.get_num_in_mds() + 1
                        << " ranks)";
-    fsmap.promote(newgid, *fs, mds);
+    fsmap.promote(info->global_id, *fs, mds);
      return true;
    } else if (in > max) {
      mds_rank_t target = in - 1;
@@ -1911,86 +1902,205 @@ bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
  
  
  /**
- * If a daemon is laggy, and a suitable replacement
- * is available, fail this daemon (remove from map) and pass its
- * role to another daemon.
+ * Fail a daemon and replace it with a suitable standby.
+ *
+ * Outcomes, in the order they are checked:
+ *  - the daemon is frozen: nothing is done, returns false;
+ *  - the daemon is standby or standby-replay: it is failed and removed,
+ *    rep_info is not used, returns true;
+ *  - the daemon holds a rank (rank >= 0) and rep_info is non-null: unless
+ *    its file system has CEPH_MDSMAP_NOT_JOINABLE set (returns false), the
+ *    daemon is failed and rep_info is promoted to the same rank, returns true;
+ *  - otherwise: nothing is done, returns false.
+ *
+ * @param fsmap       map to modify
+ * @param gid         daemon to drop
+ * @param rep_info    replacement candidate, or nullptr if none is available
+ * @param osd_propose must be non-null; OR-ed with the result of fail_mds_gid
+ * @return true if the daemon was failed (fsmap was changed)
   */
-void MDSMonitor::maybe_replace_gid(FSMap &fsmap, mds_gid_t gid,
-    const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose)
+bool MDSMonitor::drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool *osd_propose)
  {
-  ceph_assert(mds_propose != nullptr);
    ceph_assert(osd_propose != nullptr);
  
    const auto fscid = fsmap.mds_roles.at(gid);
+  // info is a reference into fsmap; rank and state are copied to locals so
+  // they can still be used after fail_mds_gid() below.
+  const auto& info = fsmap.get_info_gid(gid);
+  const auto rank = info.rank;
+  const auto state = info.state;
+
+  if (info.is_frozen()) {
+    return false;
+  } else if (state == MDSMap::STATE_STANDBY_REPLAY ||
+             state == MDSMap::STATE_STANDBY) {
+    // Standbys hold no rank of their own: just fail them, no promotion.
+    dout(1)  << " failing and removing standby " << gid << " " << info.addrs
+            << " mds." << rank
+            << "." << info.inc << " " << ceph_mds_state_name(state)
+            << dendl;
+    *osd_propose |= fail_mds_gid(fsmap, gid);
+    return true;
+  } else if (rank >= 0 && rep_info) {
+    auto fs = fsmap.filesystems.at(fscid);
+    if (fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+      return false;
+    }
+    // are we in?
+    // and is there a non-laggy standby that can take over for us?
+    dout(1)  << " replacing " << gid << " " << info.addrs
+            << " mds." << rank << "." << info.inc
+            << " " << ceph_mds_state_name(state)
+            << " with " << rep_info->global_id << "/" << rep_info->name << " " << rep_info->addrs
+            << dendl;
+
+    mon->clog->warn() << "Replacing " << info.human_name()
+                      << " as rank " << rank
+                      << " with standby " << rep_info->human_name();
+
+    // Remove the old one
+    *osd_propose |= fail_mds_gid(fsmap, gid);
+
+    // Promote the replacement
+    fsmap.promote(rep_info->global_id, *fs, rank);
+
+    return true;
+  }
+  return false;
+}
+
+bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap)
+{
+  bool do_propose = false;
+  const auto now = mono_clock::now();
+  const bool osdmap_writeable = mon->osdmon()->is_writeable();
+  const auto mds_beacon_grace = g_conf().get_val<double>("mds_beacon_grace");
+  const auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval");
+
+  if (mono_clock::is_zero(last_tick)) {
+    last_tick = now;
+  }
+
+  {
+    auto since_last = std::chrono::duration<double>(now-last_tick);
+
+    if (since_last.count() > (mds_beacon_grace-mds_beacon_interval)) {
+      // This case handles either local slowness (calls being delayed
+      // for whatever reason) or cluster election slowness (a long gap
+      // between calls while an election happened)
+      dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
+              "(slow election?) of " << since_last.count() << " seconds" << dendl;
+      for (auto& p : last_beacon) {
+        p.second.stamp = now;
+      }
+    }
+  }
+
+  // make sure last_beacon is fully populated
+  for (auto& p : fsmap.mds_roles) {
+    auto& gid = p.first;
+    last_beacon.emplace(std::piecewise_construct,
+        std::forward_as_tuple(gid),
+        std::forward_as_tuple(now, 0));
+  }
  
    // We will only take decisive action (replacing/removing a daemon)
-  // if we have some indicating that some other daemon(s) are successfully
+  // if we have some indication that some other daemon(s) are successfully
    // getting beacons through recently.
    mono_time latest_beacon = mono_clock::zero();
-  for (const auto &p : last_beacon) {
+  for (const auto& p : last_beacon) {
      latest_beacon = std::max(p.second.stamp, latest_beacon);
    }
-  mono_time now = mono_clock::now();
-  chrono::duration<double> since = now-latest_beacon;
-  const bool frozen = info.is_frozen();
+  auto since = chrono::duration<double>(now-latest_beacon);
    const bool may_replace = since.count() <
        std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5);
  
-  // are we in?
-  // and is there a non-laggy standby that can take over for us?
-  mds_gid_t sgid;
-  if (info.rank >= 0 &&
-      info.state != MDSMap::STATE_STANDBY &&
-      info.state != MDSMap::STATE_STANDBY_REPLAY &&
-      may_replace &&
-      !frozen &&
-      !fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
-      (sgid = fsmap.find_replacement_for({fscid, info.rank}, info.name)) != MDS_GID_NONE)
-  {
-    
-    MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
-    dout(1)  << " replacing " << gid << " " << info.addrs
-            << " mds." << info.rank << "." << info.inc
-            << " " << ceph_mds_state_name(info.state)
-            << " with " << sgid << "/" << si.name << " " << si.addrs
-            << dendl;
+  // check beacon timestamps
+  std::vector<mds_gid_t> to_remove;
+  for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
+    auto& [gid, beacon_info] = *it;
+    auto since_last = chrono::duration<double>(now-beacon_info.stamp);
+
+    if (!fsmap.gid_exists(gid)) {
+      // gid no longer exists, remove from tracked beacons
+      it = last_beacon.erase(it);
+      continue;
+    }
  
-    mon->clog->warn() << info.human_name() 
-                      << " is not responding, replacing it "
-                      << "as rank " << info.rank
-                      << " with standby " << si.human_name();
+    if (since_last.count() >= g_conf()->mds_beacon_grace) {
+      auto& info = fsmap.get_info_gid(gid);
+      dout(1) << "no beacon from mds." << info.rank << "." << info.inc
+              << " (gid: " << gid << " addr: " << info.addrs
+              << " state: " << ceph_mds_state_name(info.state) << ")"
+              << " since " << since_last.count() << dendl;
+      // If the OSDMap is writeable, we can blacklist things, so we can
+      // try failing any laggy MDS daemons.  Consider each one for failure.
+      if (!info.laggy()) {
+        dout(1)  << " marking " << gid << " " << info.addrs
+                << " mds." << info.rank << "." << info.inc
+                << " " << ceph_mds_state_name(info.state)
+                << " laggy" << dendl;
+        fsmap.modify_daemon(info.global_id, [](auto& info) {
+            info.laggy_since = ceph_clock_now();
+        });
+        do_propose = true;
+      }
+      if (osdmap_writeable && may_replace) {
+        to_remove.push_back(gid); // drop_mds may invalidate iterator
+      }
+    }
  
-    // Remember what NS the old one was in
-    const fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
+    ++it;
+  }
  
-    // Remove the old one
-    *osd_propose |= fail_mds_gid(fsmap, gid);
+  // Fail the timed-out daemons collected above; a daemon holding a rank is
+  // handed a replacement candidate so drop_mds() can promote it.
+  for (const auto& gid : to_remove) {
+    // Take a copy, not a reference: drop_mds() calls fail_mds_gid(), which is
+    // expected to erase this daemon's entry from fsmap, and the cluster log
+    // message below reads info afterwards. A reference would dangle there.
+    const auto info = fsmap.get_info_gid(gid);
+    const mds_info_t* rep_info = nullptr;
+    if (info.rank >= 0) {
+      auto fscid = fsmap.gid_fscid(gid);
+      rep_info = fsmap.find_replacement_for({fscid, info.rank});
+    }
+    bool dropped = drop_mds(fsmap, gid, rep_info, propose_osdmap);
+    if (dropped) {
+      mon->clog->info() << "MDS " << info.human_name()
+                        << " is removed because it is dead or otherwise unavailable.";
+      do_propose = true;
+    }
+  }
  
-    // Promote the replacement
-    auto&& fs = fsmap.filesystems.at(fscid);
-    fsmap.promote(sgid, *fs, info.rank);
-
-    *mds_propose = true;
-  } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
-             info.state == MDSMap::STATE_STANDBY) && may_replace && !frozen) {
-    dout(1)  << " failing and removing " << gid << " " << info.addrs
-            << " mds." << info.rank
-            << "." << info.inc << " " << ceph_mds_state_name(info.state)
-            << dendl;
-    mon->clog->info() << "Standby " << info.human_name() << " is not "
-                         "responding, dropping it";
-    fail_mds_gid(fsmap, gid);
-    *mds_propose = true;
-  } else if (!info.laggy()) {
-      dout(1)  << " marking " << gid << " " << info.addrs
-              << " mds." << info.rank << "." << info.inc
-              << " " << ceph_mds_state_name(info.state)
-              << " laggy" << dendl;
-      fsmap.modify_daemon(info.global_id, [](auto& info) {
-          info.laggy_since = ceph_clock_now();
-      });
-      *mds_propose = true;
+  if (osdmap_writeable) {
+    for (auto& [fscid, fs] : fsmap.filesystems) {
+      if (!fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
+          fs->mds_map.is_resizeable()) {
+        // Check if a rank or standby-replay should be replaced with a stronger
+        // affinity standby. This looks at ranks and standby-replay:
+        for (const auto& [gid, info] : fs->mds_map.get_mds_info()) {
+          const auto join_fscid = info.join_fscid;
+          if (join_fscid == fscid)
+            continue;
+          const auto rank = info.rank;
+          const auto state = info.state;
+          const mds_info_t* rep_info = nullptr;
+          if (state == MDSMap::STATE_STANDBY_REPLAY) {
+            rep_info = fsmap.get_available_standby(fscid);
+          } else if (state == MDSMap::STATE_ACTIVE) {
+            rep_info = fsmap.find_replacement_for({fscid, rank});
+          } else {
+            /* N.B. !is_degraded() */
+            ceph_abort_msg("invalid state in MDSMap");
+          }
+          if (!rep_info) {
+            break;
+          }
+          bool better_affinity = false;
+          if (join_fscid == FS_CLUSTER_ID_NONE) {
+            better_affinity = (rep_info->join_fscid == fscid);
+          } else {
+            better_affinity = (rep_info->join_fscid == fscid) ||
+                              (rep_info->join_fscid == FS_CLUSTER_ID_NONE);
+          }
+          if (better_affinity) {
+            if (state == MDSMap::STATE_STANDBY_REPLAY) {
+              mon->clog->info() << "Dropping low affinity standby-replay "
+                                << info.human_name()
+                                << " in favor of higher affinity standby.";
+              *propose_osdmap |= fail_mds_gid(fsmap, gid);
+              /* Now let maybe_promote_standby do the promotion. */
+            } else {
+              mon->clog->info() << "Dropping low affinity active "
+                                << info.human_name()
+                                << " in favor of higher affinity standby.";
+              do_propose |= drop_mds(fsmap, gid, rep_info, propose_osdmap);
+            }
+            break; /* don't replace more than one per tick per fs */
+          }
+        }
+      }
+    }
    }
+  return do_propose;
  }
  
  bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
@@ -2005,35 +2115,34 @@ bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
    set<mds_rank_t> failed;
    fs.mds_map.get_failed_mds_set(failed);
    for (const auto& rank : failed) {
-    auto&& sgid = fsmap.find_replacement_for({fs.fscid, rank}, {});
-    if (sgid) {
-      auto&& info = fsmap.get_info_gid(sgid);
-      dout(1) << " taking over failed mds." << rank << " with " << sgid
-              << "/" << info.name << " " << info.addrs << dendl;
-      mon->clog->info() << "Standby " << info.human_name()
+    auto info = fsmap.find_replacement_for({fs.fscid, rank});
+    if (info) {
+      dout(1) << " taking over failed mds." << rank << " with " << info->global_id
+              << "/" << info->name << " " << info->addrs << dendl;
+      mon->clog->info() << "Standby " << info->human_name()
                          << " assigned to filesystem " << fs.mds_map.fs_name
                          << " as rank " << rank;
  
-      fsmap.promote(sgid, fs, rank);
+      fsmap.promote(info->global_id, fs, rank);
        do_propose = true;
      }
    }
  
-  if (fs.mds_map.allows_standby_replay() && !fs.mds_map.is_degraded()) {
+  if (!fs.mds_map.is_degraded() && fs.mds_map.allows_standby_replay()) {
      // There were no failures to replace, so try using any available standbys
      // as standby-replay daemons. Don't do this when the cluster is degraded
      // as a standby-replay daemon may try to read a journal being migrated.
      for (;;) {
-      auto standby_gid = fsmap.get_available_standby(fs.fscid);
-      if (standby_gid == MDS_GID_NONE) break;
-      dout(20) << "standby available mds." << standby_gid << dendl;
+      auto info = fsmap.get_available_standby(fs.fscid);
+      if (!info) break;
+      dout(20) << "standby available mds." << info->global_id << dendl;
        bool changed = false;
        for (const auto& rank : fs.mds_map.in) {
-        dout(20) << "exmaining " << rank << dendl;
+        dout(20) << "examining " << rank << dendl;
          if (fs.mds_map.is_followable(rank)) {
-          dout(1) << "  setting mds." << standby_gid
+          dout(1) << "  setting mds." << info->global_id
                    << " to follow mds rank " << rank << dendl;
-          fsmap.assign_standby_replay(standby_gid, fs.fscid, rank);
+          fsmap.assign_standby_replay(info->global_id, fs.fscid, rank);
            do_propose = true;
            changed = true;
            break;
@@ -2048,92 +2157,37 @@ bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
  
  void MDSMonitor::tick()
  {
-  // make sure mds's are still alive
-  // ...if i am an active leader
-
    if (!is_active() || !is_leader()) return;
  
    auto &pending = get_pending_fsmap_writeable();
  
    bool do_propose = false;
+  bool propose_osdmap = false;
  
    do_propose |= pending.check_health();
  
-  // resize mds cluster (adjust @in)?
-  for (auto &p : pending.filesystems) {
-    do_propose |= maybe_resize_cluster(pending, p.second->fscid);
-  }
-
-  mono_time now = mono_clock::now();
-  if (mono_clock::is_zero(last_tick)) {
-    last_tick = now;
-  }
-  chrono::duration<double> since_last = now-last_tick;
+  /* Check health and affinity of ranks */
+  do_propose |= check_health(pending, &propose_osdmap);
  
-  if (since_last.count() >
-      (g_conf()->mds_beacon_grace - g_conf()->mds_beacon_interval)) {
-    // This case handles either local slowness (calls being delayed
-    // for whatever reason) or cluster election slowness (a long gap
-    // between calls while an election happened)
-    dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
-            "(slow election?) of " << now - last_tick << " seconds" << dendl;
-    for (auto &p : last_beacon) {
-      p.second.stamp = now;
-    }
+  /* Resize the cluster according to max_mds. */
+  for (auto& p : pending.filesystems) {
+    do_propose |= maybe_resize_cluster(pending, p.second->fscid);
    }
  
-  last_tick = now;
-
-  // make sure last_beacon is fully populated
-  for (auto &p : pending.mds_roles) {
-    auto &gid = p.first;
-    last_beacon.emplace(std::piecewise_construct,
-        std::forward_as_tuple(gid),
-        std::forward_as_tuple(mono_clock::now(), 0));
+  /* Replace any failed ranks. */
+  for (auto& p : pending.filesystems) {
+    do_propose |= maybe_promote_standby(pending, *p.second);
    }
  
-
-  // check beacon timestamps
-  bool propose_osdmap = false;
-  bool osdmap_writeable = mon->osdmon()->is_writeable();
-  for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
-    mds_gid_t gid = it->first;
-    auto beacon_info = it->second;
-    chrono::duration<double> since_last = now-beacon_info.stamp;
-
-    if (!pending.gid_exists(gid)) {
-      // clean it out
-      it = last_beacon.erase(it);
-      continue;
-    }
-
-
-    if (since_last.count() >= g_conf()->mds_beacon_grace) {
-      auto &info = pending.get_info_gid(gid);
-      dout(1) << "no beacon from mds." << info.rank << "." << info.inc
-              << " (gid: " << gid << " addr: " << info.addrs
-              << " state: " << ceph_mds_state_name(info.state) << ")"
-              << " since " << since_last.count() << "s" << dendl;
-      // If the OSDMap is writeable, we can blacklist things, so we can
-      // try failing any laggy MDS daemons.  Consider each one for failure.
-      if (osdmap_writeable) {
-        maybe_replace_gid(pending, gid, info, &do_propose, &propose_osdmap);
-      }
-    }
-
-    ++it;
-  }
    if (propose_osdmap) {
      request_proposal(mon->osdmon());
    }
  
-  for (auto &p : pending.filesystems) {
-    do_propose |= maybe_promote_standby(pending, *p.second);
-  }
-
    if (do_propose) {
      propose_pending();
    }
+
+  last_tick = mono_clock::now();
  }
  
  MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h

index 1475e312d069928483c66d69a85260c8ba0eb9e4..56762a8afb7ba0b47cfb359f5b8474ce01d70031 100644 (file)
--- a/src/mon/MDSMonitor.h
+++ b/src/mon/MDSMonitor.h
@@ -73,6 +73,8 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand
    bool is_leader() const override { return mon->is_leader(); }
  
   protected:
+  using mds_info_t = MDSMap::mds_info_t;
+
    // my helpers
    template<int dblV = 7>
    void print_map(const FSMap &m);
@@ -88,7 +90,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand
  
    int fail_mds(FSMap &fsmap, std::ostream &ss,
        const std::string &arg,
-      MDSMap::mds_info_t *failed_info);
+      mds_info_t *failed_info);
  
    bool preprocess_command(MonOpRequestRef op);
    bool prepare_command(MonOpRequestRef op);
@@ -113,8 +115,8 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand
  
    bool maybe_promote_standby(FSMap& fsmap, Filesystem& fs);
    bool maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid);
-  void maybe_replace_gid(FSMap &fsmap, mds_gid_t gid,
-      const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose);
+  bool drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool* osd_propose);
+  bool check_health(FSMap &fsmap, bool* osd_propose);
    void tick() override;     // check state, take actions
  
    int dump_metadata(const FSMap &fsmap, const std::string &who, Formatter *f,
author	Patrick Donnelly <pdonnell@redhat.com>
	Tue, 11 Feb 2020 03:20:08 +0000 (19:20 -0800)
committer	Patrick Donnelly <pdonnell@redhat.com>
	Thu, 13 Feb 2020 15:51:10 +0000 (07:51 -0800)
PendingReleaseNotes		patch \| blob \| history
src/common/options.cc		patch \| blob \| history
src/mds/FSMap.cc		patch \| blob \| history
src/mds/FSMap.h		patch \| blob \| history
src/mds/MDSMap.cc		patch \| blob \| history
src/mds/MDSMap.h		patch \| blob \| history
src/mon/FSCommands.cc		patch \| blob \| history
src/mon/MDSMonitor.cc		patch \| blob \| history
src/mon/MDSMonitor.h		patch \| blob \| history