#include <sstream>
using std::stringstream;
+#include "mon/health_check.h"
+
void Filesystem::dump(Formatter *f) const
{
return changed;
}
+/**
+ * Gather the health checks of every filesystem plus the standby-pool check.
+ *
+ * The checks produced by each filesystem's MDSMap are merged into *checks.
+ * An MDS_INSUFFICIENT_STANDBY warning is then added when the largest value
+ * returned by get_standby_count_wanted() across all filesystems is non-zero.
+ *
+ * @param checks  map that receives the merged health checks
+ */
+void FSMap::get_health_checks(health_check_map_t *checks) const
+{
+  // The number of standby daemons is the same for every filesystem, so
+  // convert it once up front.
+  const mds_rank_t standby_have =
+    static_cast<mds_rank_t>(standby_daemons.size());
+
+  mds_rank_t standby_shortfall = 0;
+  for (const auto &entry : filesystems) {
+    const auto &mds_map = entry.second->mds_map;
+
+    health_check_map_t per_fs_checks;
+    mds_map.get_health_checks(&per_fs_checks);
+    checks->merge(per_fs_checks);
+
+    // Keep the largest value reported by any filesystem.
+    const mds_rank_t wanted = mds_map.get_standby_count_wanted(standby_have);
+    if (standby_shortfall < wanted) {
+      standby_shortfall = wanted;
+    }
+  }
+
+  // MDS_INSUFFICIENT_STANDBY
+  if (!standby_shortfall) {
+    return;
+  }
+
+  const std::string summary = "insufficient standby daemons available";
+  auto &standby_check =
+    checks->add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, summary);
+
+  std::ostringstream detail_line;
+  detail_line << "have " << standby_daemons.size()
+              << "; want " << standby_shortfall << " more";
+  standby_check.detail.push_back(detail_line.str());
+}
+
void FSMap::encode(bufferlist& bl, uint64_t features) const
{
if (features & CEPH_FEATURE_SERVER_JEWEL) {
#include "mds/mdstypes.h"
class CephContext;
+class health_check_map_t;
#define MDS_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "base v0.20")
#define MDS_FEATURE_INCOMPAT_CLIENTRANGES CompatSet::Feature(2, "client writeable ranges")
void get_health(list<pair<health_status_t,std::string> >& summary,
list<pair<health_status_t,std::string> > *detail) const;
+ void get_health_checks(health_check_map_t *checks) const;
+
bool check_health(void);
/**
#include <sstream>
using std::stringstream;
+#include "mon/health_check.h"
+
// features
CompatSet get_mdsmap_compat_set_all() {
}
}
+/**
+ * Populate *checks with the health checks derived from this MDS map.
+ *
+ * Checks emitted:
+ *  - FS_WITH_FAILED_MDS and MDS_FAILED when the failed set is non-empty
+ *  - MDS_DAMAGED when the damaged set is non-empty
+ *  - FS_DEGRADED when is_degraded() is true, plus MDS_DEGRADED with one
+ *    detail line per up rank that is resolving, replaying, rejoining or
+ *    reconnecting
+ *
+ * Summaries keep their %num%, %plurals% and %isorare% placeholders; they are
+ * not expanded here.
+ *
+ * @param checks  map that receives the health checks
+ */
+void MDSMap::get_health_checks(health_check_map_t *checks) const
+{
+  // FS_WITH_FAILED_MDS
+  // MDS_FAILED
+  if (!failed.empty()) {
+    health_check_t& fscheck = checks->add(
+      "FS_WITH_FAILED_MDS", HEALTH_WARN,
+      "%num% filesystem%plurals% %isorare% have a failed mds daemon");
+    ostringstream ss;
+    ss << "fs " << fs_name << " has " << failed.size() << " failed mds"
+       << (failed.size() > 1 ? "s" : "");
+    fscheck.detail.push_back(ss.str());
+
+    health_check_t& check = checks->add("MDS_FAILED", HEALTH_ERR,
+                                        "%num% mds daemon%plurals% down");
+    for (const auto& p : failed) {
+      std::ostringstream oss;
+      oss << "fs " << fs_name << " mds." << p << " has failed";
+      check.detail.push_back(oss.str());
+    }
+  }
+
+  // MDS_DAMAGED
+  if (!damaged.empty()) {
+    health_check_t& check = checks->add("MDS_DAMAGED", HEALTH_ERR,
+                                        "%num% mds daemon%plurals% damaged");
+    for (const auto& p : damaged) {
+      std::ostringstream oss;
+      oss << "fs " << fs_name << " mds." << p << " is damaged";
+      check.detail.push_back(oss.str());
+    }
+  }
+
+  // FS_DEGRADED
+  // MDS_DEGRADED
+  if (is_degraded()) {
+    health_check_t& fscheck = checks->add(
+      "FS_DEGRADED", HEALTH_WARN,
+      "%num% filesystem%plurals% %isorare% degraded");
+    ostringstream ss;
+    ss << "fs " << fs_name << " is degraded";
+    fscheck.detail.push_back(ss.str());
+
+    list<string> detail;
+    for (mds_rank_t i = mds_rank_t(0); i < get_max_mds(); i++) {
+      if (!is_up(i))
+        continue;
+
+      // Look both map entries up defensively instead of dereferencing the
+      // result of find() unchecked.
+      const auto up_it = up.find(i);
+      if (up_it == up.end())
+        continue;
+      mds_gid_t gid = up_it->second;
+      const auto info = mds_info.find(gid);
+      if (info == mds_info.end())
+        continue;
+
+      // Work out the state text first.  The rank prefix is only worth
+      // emitting when the rank is actually in one of these states;
+      // otherwise every up rank would be listed (and counted by %num%)
+      // as degraded.
+      string state;
+      if (is_resolve(i))
+        state += " is resolving";
+      if (is_replay(i))
+        state += " is replaying journal";
+      if (is_rejoin(i))
+        state += " is rejoining";
+      if (is_reconnect(i))
+        state += " is reconnecting to clients";
+      if (state.empty())
+        continue;
+
+      stringstream rank_ss;
+      rank_ss << "fs " << fs_name << " mds." << info->second.name << " at "
+              << info->second.addr << " rank " << i << state;
+      detail.push_back(rank_ss.str());
+    }
+    if (!detail.empty()) {
+      health_check_t& check = checks->add(
+        "MDS_DEGRADED", HEALTH_WARN,
+        "%num% mds daemon%plurals% %isorare% degraded");
+      check.detail.insert(check.detail.end(), detail.begin(), detail.end());
+    }
+  }
+}
+
void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const
{
ENCODE_START(7, 4, bl);
*/
class CephContext;
+class health_check_map_t;
extern CompatSet get_mdsmap_compat_set_all();
extern CompatSet get_mdsmap_compat_set_default();
void get_health(list<pair<health_status_t,std::string> >& summary,
list<pair<health_status_t,std::string> > *detail) const;
+ void get_health_checks(health_check_map_t *checks) const;
+
typedef enum
{
AVAILABLE = 0,
MDS_HEALTH_CACHE_OVERSIZED
};
+/**
+ * Map an MDS health metric to the name of its health check.
+ *
+ * @param m  metric type
+ * @return   the check name for m, or "???" for a value not listed here
+ */
+static inline const char *mds_metric_name(mds_metric_t m)
+{
+  // Start from the fallback and overwrite it for every known metric.
+  const char *name = "???";
+  switch (m) {
+  case MDS_HEALTH_TRIM:
+    name = "MDS_TRIM";
+    break;
+  case MDS_HEALTH_CLIENT_RECALL:
+    name = "MDS_CLIENT_RECALL";
+    break;
+  case MDS_HEALTH_CLIENT_LATE_RELEASE:
+    name = "MDS_CLIENT_LATE_RELEASE";
+    break;
+  case MDS_HEALTH_CLIENT_RECALL_MANY:
+    name = "MDS_CLIENT_RECALL_MANY";
+    break;
+  case MDS_HEALTH_CLIENT_LATE_RELEASE_MANY:
+    name = "MDS_CLIENT_LATE_RELEASE_MANY";
+    break;
+  case MDS_HEALTH_CLIENT_OLDEST_TID:
+    name = "MDS_CLIENT_OLDEST_TID";
+    break;
+  case MDS_HEALTH_CLIENT_OLDEST_TID_MANY:
+    name = "MDS_CLIENT_OLDEST_TID_MANY";
+    break;
+  case MDS_HEALTH_DAMAGE:
+    name = "MDS_DAMAGE";
+    break;
+  case MDS_HEALTH_READ_ONLY:
+    name = "MDS_READ_ONLY";
+    break;
+  case MDS_HEALTH_SLOW_REQUEST:
+    name = "MDS_SLOW_REQUEST";
+    break;
+  case MDS_HEALTH_CACHE_OVERSIZED:
+    name = "MDS_CACHE_OVERSIZED";
+    break;
+  default:
+    break;
+  }
+  return name;
+}
+
+/**
+ * Map an MDS health metric to the summary template of its health check.
+ *
+ * The returned text contains a literal %num% placeholder; nothing in this
+ * function substitutes it.
+ *
+ * @param m  metric type
+ * @return   the summary template for m, or "???" for a value not listed here
+ */
+static inline const char *mds_metric_summary(mds_metric_t m)
+{
+  static const struct {
+    mds_metric_t type;
+    const char *text;
+  } summaries[] = {
+    { MDS_HEALTH_TRIM,
+      "%num% MDSs behind on trimming" },
+    { MDS_HEALTH_CLIENT_RECALL,
+      "%num% clients failing to respond to cache pressure" },
+    { MDS_HEALTH_CLIENT_LATE_RELEASE,
+      "%num% clients failing to respond to capability release" },
+    { MDS_HEALTH_CLIENT_RECALL_MANY,
+      "%num% MDSs have many clients failing to respond to cache pressure" },
+    { MDS_HEALTH_CLIENT_LATE_RELEASE_MANY,
+      "%num% MDSs have many clients failing to respond to capability release" },
+    { MDS_HEALTH_CLIENT_OLDEST_TID,
+      "%num% clients failing to advance oldest client/flush tid" },
+    { MDS_HEALTH_CLIENT_OLDEST_TID_MANY,
+      "%num% MDSs have clients failing to advance oldest client/flush tid" },
+    { MDS_HEALTH_DAMAGE,
+      "%num% MDSs report damaged metadata" },
+    { MDS_HEALTH_READ_ONLY,
+      "%num% MDSs are read only" },
+    { MDS_HEALTH_SLOW_REQUEST,
+      "%num% MDSs report slow requests" },
+    { MDS_HEALTH_CACHE_OVERSIZED,
+      "%num% MDSs report oversized cache" },
+  };
+
+  // Linear scan: the table is small and the metric types are all distinct.
+  for (const auto &row : summaries) {
+    if (row.type == m) {
+      return row.text;
+    }
+  }
+  return "???";
+}
+
/**
* This structure is designed to allow some flexibility in how we emit health
* complaints, such that:
#include <sstream>
#include <boost/utility.hpp>
+#include <boost/regex.hpp>
#include "MDSMonitor.h"
#include "FSCommands.h"
<< ", my e " << fsmap.epoch << dendl;
assert(version > fsmap.epoch);
+ load_health();
+
// read and decode
bufferlist fsmap_bl;
fsmap_bl.clear();
}
pending_daemon_health_rm.clear();
remove_from_metadata(t);
+
+ // health
+ health_check_map_t new_checks;
+ const auto info_map = pending_fsmap.get_mds_info();
+ for (const auto &i : info_map) {
+ const auto &gid = i.first;
+ const auto &info = i.second;
+ if (pending_daemon_health_rm.count(gid)) {
+ continue;
+ }
+ MDSHealth health;
+ auto p = pending_daemon_health.find(gid);
+ if (p != pending_daemon_health.end()) {
+ health = p->second;
+ } else {
+ bufferlist bl;
+ mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
+ if (!bl.length()) {
+ derr << "Missing health data for MDS " << gid << dendl;
+ continue;
+ }
+ bufferlist::iterator bl_i = bl.begin();
+ health.decode(bl_i);
+ }
+ for (const auto &metric : health.metrics) {
+ int const rank = info.rank;
+ health_check_t *check = &new_checks.add(
+ mds_metric_name(metric.type),
+ metric.sev,
+ mds_metric_summary(metric.type));
+ ostringstream ss;
+ ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
+ for (auto p = metric.metadata.begin();
+ p != metric.metadata.end();
+ ++p) {
+ if (p != metric.metadata.begin()) {
+ ss << ", ";
+ }
+ ss << p->first << ": " << p->second;
+ }
+ check->detail.push_back(ss.str());
+ }
+ }
+ pending_fsmap.get_health_checks(&new_checks);
+ for (auto& p : new_checks.checks) {
+ p.second.summary = boost::regex_replace(
+ p.second.summary,
+ boost::regex("%num%"),
+ stringify(p.second.detail.size()));
+ p.second.summary = boost::regex_replace(
+ p.second.summary,
+ boost::regex("%plurals%"),
+ p.second.detail.size() > 1 ? "s" : "");
+ p.second.summary = boost::regex_replace(
+ p.second.summary,
+ boost::regex("%isorare%"),
+ p.second.detail.size() > 1 ? "are" : "is");
+ }
+ encode_health(new_checks, t);
}
version_t MDSMonitor::get_trim_to()