From 7278543d744974421f8c8299b83b7054bbd5eacc Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 18 Nov 2016 13:27:07 -0500 Subject: [PATCH] mds: warn if insufficient standbys exist Fixes: http://tracker.ceph.com/issues/17604 Signed-off-by: Patrick Donnelly --- doc/cephfs/health-messages.rst | 9 +++++++++ doc/cephfs/standby.rst | 14 ++++++++++++++ src/mds/FSMap.cc | 23 +++++++++++++++++++++-- src/mds/FSMap.h | 2 ++ src/mds/MDSMap.cc | 33 +++++++++++++++++++++++++++++++-- src/mds/MDSMap.h | 16 ++++++++++++++++ src/mon/FSCommands.cc | 15 +++++++++++++++ src/mon/MDSMonitor.cc | 2 ++ src/mon/MonCommands.h | 3 ++- 9 files changed, 112 insertions(+), 5 deletions(-) diff --git a/doc/cephfs/health-messages.rst b/doc/cephfs/health-messages.rst index 2a345bb7ef37..54b4f7144e90 100644 --- a/doc/cephfs/health-messages.rst +++ b/doc/cephfs/health-messages.rst @@ -32,6 +32,15 @@ they are supposed to send beacon messages every ``mds_beacon_interval`` (default 4s). The daemons may have crashed. The Ceph monitor will automatically replace laggy daemons with standbys if any are available. +Message: insufficient standby daemons available +Description: One or more file systems are configured to have a certain number +of standby daemons available (including daemons in standby-replay) but the +cluster does not have enough standby daemons. The standby daemons not in replay +count towards any file system (i.e. they may overlap). This warning can be +configured by setting ``ceph fs set <fs name> standby_count_wanted <count>``. Use +zero for ``count`` to disable. + + Daemon-reported health checks ============================= diff --git a/doc/cephfs/standby.rst b/doc/cephfs/standby.rst index ec272e33a90a..8a8bef159c0b 100644 --- a/doc/cephfs/standby.rst +++ b/doc/cephfs/standby.rst @@ -64,6 +64,20 @@ If an MDS daemon stops communicating with the monitor, the monitor will wait ``mds_beacon_grace`` seconds (default 15 seconds) before marking the daemon as *laggy*. 
+Each file system may specify a number of standby daemons to be considered +healthy. This number includes daemons in standby-replay waiting for a rank to +fail (remember that a standby-replay daemon will not be assigned to take over a +failure for another rank or a failure in another CephFS file system). The +pool of standby daemons not in replay counts towards any file system count. +Each file system may set the number of standby daemons wanted using: + +:: + + ceph fs set <fs name> standby_count_wanted <count> + +Setting ``count`` to 0 will disable the health check. + + Configuring standby daemons --------------------------- diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc index 33d70a0e91e9..35c8093a380a 100644 --- a/src/mds/FSMap.cc +++ b/src/mds/FSMap.cc @@ -291,6 +291,7 @@ void FSMap::reset_filesystem(fs_cluster_id_t fscid) new_fs->mds_map.modified = ceph_clock_now(); new_fs->mds_map.session_timeout = g_conf->mds_session_timeout; new_fs->mds_map.session_autoclose = g_conf->mds_session_autoclose; + new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted; new_fs->mds_map.enabled = true; // Persist the new FSMap @@ -300,13 +301,31 @@ void FSMap::reset_filesystem(fs_cluster_id_t fscid) void FSMap::get_health(list >& summary, list > *detail) const { - for (auto i : filesystems) { - auto fs = i.second; + mds_rank_t standby_count_wanted = 0; + for (const auto &i : filesystems) { + const auto &fs = i.second; // TODO: move get_health up into here so that we can qualify // all the messages with what filesystem they're talking about fs->mds_map.get_health(summary, detail); + + standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size())); + } + + if (standby_count_wanted) { + std::ostringstream oss; + oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more"; + summary.push_back(make_pair(HEALTH_WARN, oss.str())); + } +} + +bool 
FSMap::check_health(void) +{ + bool changed = false; + for (auto &i : filesystems) { + changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size()); } + return changed; } void FSMap::encode(bufferlist& bl, uint64_t features) const diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h index 177b999d51c2..2c15063202f5 100644 --- a/src/mds/FSMap.h +++ b/src/mds/FSMap.h @@ -468,6 +468,8 @@ public: void get_health(list >& summary, list > *detail) const; + bool check_health(void); + /** * Assert that the FSMap, Filesystem, MDSMap, mds_info_t relations are * all self-consistent. diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 1e86b5da769c..da223e8664d1 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -185,6 +185,7 @@ void MDSMap::dump(Formatter *f) const f->dump_bool("enabled", enabled); f->dump_string("fs_name", fs_name); f->dump_string("balancer", balancer); + f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted)); } void MDSMap::generate_test_instances(list& ls) @@ -228,6 +229,7 @@ void MDSMap::print(ostream& out) const out << "metadata_pool\t" << metadata_pool << "\n"; out << "inline_data\t" << (inline_data_enabled ? 
"enabled" : "disabled") << "\n"; out << "balancer\t" << balancer << "\n"; + out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n"; multimap< pair, mds_gid_t > foo; for (const auto &p : mds_info) { @@ -556,7 +558,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const ::encode(cas_pool, bl); // kclient ignores everything from here - __u16 ev = 11; + __u16 ev = 12; ::encode(ev, bl); ::encode(compat, bl); ::encode(metadata_pool, bl); @@ -576,6 +578,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const ::encode(fs_name, bl); ::encode(damaged, bl); ::encode(balancer, bl); + ::encode(standby_count_wanted, bl); ENCODE_FINISH(bl); } @@ -682,8 +685,13 @@ void MDSMap::decode(bufferlist::iterator& p) } if (ev >= 11) { - ::decode(balancer, p); + ::decode(balancer, p); } + + if (ev >= 12) { + ::decode(standby_count_wanted, p); + } + DECODE_FINISH(p); } @@ -758,3 +766,24 @@ bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next) return state_valid; } + +bool MDSMap::check_health(mds_rank_t standby_daemon_count) +{ + std::set standbys; + get_standby_replay_mds_set(standbys); + std::set actives; + get_active_mds_set(actives); + mds_rank_t standbys_avail = (mds_rank_t)standbys.size()+standby_daemon_count; + + /* If there are standby daemons available/replaying and + * standby_count_wanted is unset (default), then we set it to 1. This will + * happen during health checks by the mons. Also, during initial creation + * of the FS we will have no actives so we don't want to change the default + * yet. + */ + if (standby_count_wanted == -1 && actives.size() > 0 && standbys_avail > 0) { + set_standby_count_wanted(1); + return true; + } + return false; +} diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 19f4af4c8dbb..e1874459bfb0 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -193,6 +193,7 @@ protected: */ mds_rank_t max_mds; /* The maximum number of active MDSes. Also, the maximum rank. 
*/ + mds_rank_t standby_count_wanted; string balancer; /* The name/version of the mantle balancer (i.e. the rados obj name) */ std::set in; // currently defined cluster @@ -228,6 +229,7 @@ public: cas_pool(-1), metadata_pool(-1), max_mds(0), + standby_count_wanted(-1), ever_allowed_features(0), explicitly_allowed_features(0), inline_data_enabled(false), @@ -290,6 +292,17 @@ public: mds_rank_t get_max_mds() const { return max_mds; } void set_max_mds(mds_rank_t m) { max_mds = m; } + mds_rank_t get_standby_count_wanted(mds_rank_t standby_daemon_count) const { + assert(standby_daemon_count >= 0); + std::set s; + get_standby_replay_mds_set(s); + mds_rank_t standbys_avail = (mds_rank_t)s.size()+standby_daemon_count; + mds_rank_t wanted = std::max(0, standby_count_wanted); + return wanted > standbys_avail ? wanted - standbys_avail : 0; + } + void set_standby_count_wanted(mds_rank_t n) { standby_count_wanted = n; } + bool check_health(mds_rank_t standby_daemon_count); + const std::string get_balancer() const { return balancer; } void set_balancer(std::string val) { balancer.assign(val); } @@ -370,6 +383,9 @@ public: void get_active_mds_set(std::set& s) const { get_mds_set(s, MDSMap::STATE_ACTIVE); } + void get_standby_replay_mds_set(std::set& s) const { + get_mds_set(s, MDSMap::STATE_STANDBY_REPLAY); + } void get_failed_mds_set(std::set& s) const { s = failed; } diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index 27d2d0c6dcf4..6093b5dd4ffc 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -412,6 +412,21 @@ public: }); ss << "marked " << (is_down ? 
"down" : "up"); + } else if (var == "standby_count_wanted") { + if (interr.length()) { + ss << var << " requires an integer value"; + return -EINVAL; + } + if (n < 0) { + ss << var << " must be non-negative"; + return -ERANGE; + } + fsmap.modify_filesystem( + fs->fscid, + [n](std::shared_ptr fs) + { + fs->mds_map.set_standby_count_wanted(n); + }); } else { ss << "unknown variable " << var; return -EINVAL; diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index cfc4510dc08c..3bc9c92da3e4 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -2065,6 +2065,8 @@ void MDSMonitor::tick() if (!mon->is_leader()) return; + do_propose |= pending_fsmap.check_health(); + // expand mds cluster (add new nodes to @in)? for (auto i : pending_fsmap.filesystems) { do_propose |= maybe_expand_cluster(i.second); diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index fb0276c96da7..93293e5d577e 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -397,7 +397,8 @@ COMMAND("fs get name=fs_name,type=CephString", \ COMMAND("fs set " \ "name=fs_name,type=CephString " \ "name=var,type=CephChoices,strings=max_mds|max_file_size" - "|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags|balancer " \ + "|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags|balancer" \ + "|standby_count_wanted " \ "name=val,type=CephString " \ "name=confirm,type=CephString,req=false", \ "set mds parameter to ", "mds", "rw", "cli,rest") -- 2.47.3