if (!stuck_failed.empty()) {
health_check_t& fscheck = checks->get_or_add(
"FS_WITH_FAILED_MDS", HEALTH_WARN,
- "%num% filesystem%plurals% %hasorhave% a failed mds daemon");
+ "%num% filesystem%plurals% %hasorhave% a failed mds daemon", 1);
ostringstream ss;
ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
<< " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
if (standby_count_wanted) {
std::ostringstream oss, dss;
oss << "insufficient standby MDS daemons available";
- auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str());
+ auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str(), 1);
dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted
<< " more";
d.detail.push_back(dss.str());
// MDS_DAMAGE
if (!damaged.empty()) {
health_check_t& check = checks->get_or_add("MDS_DAMAGE", HEALTH_ERR,
- "%num% mds daemon%plurals% damaged");
+ "%num% mds daemon%plurals% damaged",
+ damaged.size());
for (auto p : damaged) {
std::ostringstream oss;
oss << "fs " << fs_name << " mds." << p << " is damaged";
if (is_degraded()) {
health_check_t& fscheck = checks->get_or_add(
"FS_DEGRADED", HEALTH_WARN,
- "%num% filesystem%plurals% %isorare% degraded");
+ "%num% filesystem%plurals% %isorare% degraded", 1);
ostringstream ss;
ss << "fs " << fs_name << " is degraded";
fscheck.detail.push_back(ss.str());
if ((mds_rank_t)get_num_in_mds() < get_max_mds()) {
health_check_t& check = checks->add(
"MDS_UP_LESS_THAN_MAX", HEALTH_WARN,
- "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds");
+ "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds", 1);
stringstream ss;
ss << "fs " << fs_name << " has " << get_num_in_mds()
<< " MDS online, but wants " << get_max_mds();
if ((mds_rank_t)get_num_up_mds() == 0 && get_max_mds() > 0) {
health_check_t &check = checks->add(
"MDS_ALL_DOWN", HEALTH_ERR,
- "%num% filesystem%plurals% %isorare% offline");
+ "%num% filesystem%plurals% %isorare% offline", 1);
stringstream ss;
ss << "fs " << fs_name << " is offline because no MDS is active for it.";
check.detail.push_back(ss.str());
was_snaps_ever_allowed() && !allows_multimds_snaps()) {
health_check_t &check = checks->add(
"MULTIMDS_WITH_OLDSNAPS", HEALTH_ERR,
- "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots");
+ "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots", 1);
stringstream ss;
ss << "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
check.detail.push_back(ss.str());
health_status_t severity = HEALTH_OK;
string summary;
list<string> detail;
+ int64_t count = 0;
PyObject *infols = PyDict_Items(check_info);
for (int j = 0; j < PyList_Size(infols); ++j) {
PyObject *pair = PyList_GET_ITEM(infols, j);
} else {
summary = std::move(vs);
}
+ } else if (ks == "count") {
+ if (PyLong_Check(v)) {
+ count = PyLong_AsLong(v);
+ } else {
+ derr << __func__ << " check " << check_name
+ << " count value not long" << dendl;
+ continue;
+ }
} else if (ks == "detail") {
if (!PyList_Check(v)) {
derr << __func__ << " check " << check_name
<< " unexpected key " << k << dendl;
}
}
- auto& d = out_checks.add(check_name, severity, summary);
+ auto& d = out_checks.add(check_name, severity, summary, count);
d.detail.swap(detail);
}
return type == daemon_metric::SLOW_OPS;
}
// Look up the "SLOW_OPS" health check in cm via get_or_add(), passing
// HEALTH_WARN as the severity and an empty summary string, and return a
// reference to it.
// This change passes 1 as the new trailing count argument on each call.
// NOTE(review): presumably the caller fills in the summary afterwards, since
// an empty string is passed here -- confirm against the caller.
health_check_t& _get_check(health_check_map_t& cm) const override {
- return cm.get_or_add("SLOW_OPS", HEALTH_WARN, "");
+ return cm.get_or_add("SLOW_OPS", HEALTH_WARN, "", 1);
}
bool _update(const DaemonKey& daemon,
const DaemonHealthMetric& metric) override {
return type == daemon_metric::PENDING_CREATING_PGS;
}
// Look up the "PENDING_CREATING_PGS" health check in cm via get_or_add(),
// passing HEALTH_WARN as the severity and an empty summary string, and return
// a reference to it.
// This change passes 1 as the new trailing count argument on each call.
// NOTE(review): presumably the caller fills in the summary afterwards, since
// an empty string is passed here -- confirm against the caller.
health_check_t& _get_check(health_check_map_t& cm) const override {
- return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, "");
+ return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, "", 1);
}
bool _update(const DaemonKey& osd,
const DaemonHealthMetric& metric) override {
ss << dependency_modules.size()
<< " mgr modules have failed dependencies";
}
- auto& d = checks->add("MGR_MODULE_DEPENDENCY", HEALTH_WARN, ss.str());
+ auto& d = checks->add("MGR_MODULE_DEPENDENCY", HEALTH_WARN, ss.str(),
+ dependency_modules.size());
for (auto& i : dependency_modules) {
std::ostringstream ss;
ss << "Module '" << i.first << "' has failed dependency: " << i.second;
} else if (failed_modules.size() > 1) {
ss << failed_modules.size() << " mgr modules have failed";
}
- auto& d = checks->add("MGR_MODULE_ERROR", HEALTH_ERR, ss.str());
+ auto& d = checks->add("MGR_MODULE_ERROR", HEALTH_ERR, ss.str(),
+ failed_modules.size());
for (auto& i : failed_modules) {
std::ostringstream ss;
ss << "Module '" << i.first << "' has failed: " << i.second;
if (bad_detail.size()) {
ostringstream ss;
ss << bad_detail.size() << " auth entities have invalid capabilities";
- health_check_t *check = &next.add("AUTH_BAD_CAPS", HEALTH_ERR, ss.str());
+ health_check_t *check = &next.add("AUTH_BAD_CAPS", HEALTH_ERR, ss.str(),
+ bad_detail.size());
for (auto& i : bad_detail) {
for (auto& j : i.second) {
check->detail.push_back(j);
if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_crit) {
stringstream ss, ss2;
ss << "mon%plurals% %names% %isorare% very low on available space";
- auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str());
+ auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str(), 1);
ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent
<< "% avail";
d.detail.push_back(ss2.str());
} else if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_warn) {
stringstream ss, ss2;
ss << "mon%plurals% %names% %isorare% low on available space";
- auto& d = next.add("MON_DISK_LOW", HEALTH_WARN, ss.str());
+ auto& d = next.add("MON_DISK_LOW", HEALTH_WARN, ss.str(), 1);
ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent
<< "% avail";
d.detail.push_back(ss2.str());
if (stats.store_stats.bytes_total >= g_conf()->mon_data_size_warn) {
stringstream ss, ss2;
ss << "mon%plurals% %names% %isorare% using a lot of disk space";
- auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str());
+ auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str(), 1);
ss2 << "mon." << mon->name << " is "
<< byte_u_t(stats.store_stats.bytes_total)
<< " >= mon_data_size_warn ("
g_conf()->mon_osd_down_out_interval == 0) {
ostringstream ss, ds;
ss << "mon%plurals% %names% %hasorhave% mon_osd_down_out_interval set to 0";
- auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str());
+ auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str(), 1);
ds << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
d.detail.push_back(ds.str());
}
ostringstream ss;
ss << (max-actual) << "/" << max << " mons down, quorum "
<< mon->get_quorum_names();
- auto& d = next.add("MON_DOWN", HEALTH_WARN, ss.str());
+ auto& d = next.add("MON_DOWN", HEALTH_WARN, ss.str(), max - actual);
set<int> q = mon->get_quorum();
for (int i=0; i<max; i++) {
if (q.count(i) == 0) {
if (!warns.empty())
ss << ",";
}
- auto& d = next.add("MON_CLOCK_SKEW", HEALTH_WARN, ss.str());
+ auto& d = next.add("MON_CLOCK_SKEW", HEALTH_WARN, ss.str(), details.size());
d.detail.swap(details);
}
}
if (!details.empty()) {
ostringstream ss;
ss << details.size() << " monitors have not enabled msgr2";
- auto& d = next.add("MON_MSGR2_NOT_ENABLED", HEALTH_WARN, ss.str());
+ auto& d = next.add("MON_MSGR2_NOT_ENABLED", HEALTH_WARN, ss.str(),
+ details.size());
d.detail.swap(details);
}
}
health_check_t *check = &new_checks.get_or_add(
mds_metric_name(metric.type),
metric.sev,
- mds_metric_summary(metric.type));
+ mds_metric_summary(metric.type),
+ 1);
ostringstream ss;
ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
bool first = true;
if (pending_map.active_gid == 0) {
auto level = should_warn_about_mgr_down();
if (level != HEALTH_OK) {
- next.add("MGR_DOWN", level, "no active mgr");
+ next.add("MGR_DOWN", level, "no active mgr", 0);
} else {
dout(10) << __func__ << " no health warning (never active and new cluster)"
<< dendl;
// Compose summary message saying how many PGs in what states led
// to this health check failing
std::vector<std::string> pg_msgs;
+ int64_t count = 0;
for (const auto &j : i.second.states) {
std::ostringstream msg;
msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
pg_msgs.push_back(msg.str());
+ count += j.second;
}
summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
-
-
health_check_t *check = &checks->add(
health_code,
sev,
- summary);
+ summary,
+ count);
// Compose list of PGs contributing to this health check failing
for (const auto &j : i.second.pg_messages) {
if (pg_sum.stats.sum.num_scrub_errors) {
ostringstream ss;
ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
- checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
+ checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(),
+ pg_sum.stats.sum.num_scrub_errors);
}
// LARGE_OMAP_OBJECTS
if (!detail.empty()) {
ostringstream ss;
ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
- auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str());
+ auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(),
+ pg_sum.stats.sum.num_large_omap_objects);
stringstream tip;
tip << "Search the cluster log for 'Large omap object found' for more "
<< "details.";
if (!detail.empty()) {
ostringstream ss;
ss << num_pools << " cache pools at or near target size";
- auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str());
+ auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(),
+ num_pools);
d.detail.swap(detail);
}
}
ostringstream ss;
ss << "too few PGs per OSD (" << per
<< " < min " << min_pg_per_osd << ")";
- checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str());
+ checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(),
+ min_pg_per_osd - per);
}
}
ostringstream ss;
ss << "too many PGs per OSD (" << per
<< " > max " << max_pg_per_osd << ")";
- checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str());
+ checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(),
+ per - max_pg_per_osd);
}
}
ostringstream ss;
ss << "OSD count " << osdmap.get_num_osds()
<< " < osd_pool_default_size " << osd_pool_default_size;
- checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str());
+ checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(),
+ osd_pool_default_size - osdmap.get_num_osds());
}
// SMALLER_PGP_NUM
if (!pgp_detail.empty()) {
ostringstream ss;
ss << pgp_detail.size() << " pools have pg_num > pgp_num";
- auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str());
+ auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(),
+ pgp_detail.size());
d.detail.swap(pgp_detail);
}
if (!many_detail.empty()) {
ostringstream ss;
ss << many_detail.size() << " pools have many more objects per pg than"
<< " average";
- auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str());
+ auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(),
+ many_detail.size());
d.detail.swap(many_detail);
}
}
if (full_pools) {
ostringstream ss;
ss << full_pools << " pools full";
- auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str());
+ auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools);
d.detail.swap(full_detail);
}
if (nearfull_pools) {
ostringstream ss;
ss << nearfull_pools << " pools nearfull";
- auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str());
+ auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools);
d.detail.swap(nearfull_detail);
}
}
ss << pg_sum.stats.sum.num_objects_misplaced
<< "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
<< b << "%)";
- checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str());
+ checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(),
+ pg_sum.stats.sum.num_objects_misplaced);
}
// OBJECT_UNFOUND
ostringstream ss;
ss << pg_sum.stats.sum.num_objects_unfound
<< "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
- auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str());
+ auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(),
+ pg_sum.stats.sum.num_objects_unfound);
for (auto& p : pg_stat) {
if (p.second.stats.sum.num_objects_unfound) {
ostringstream ss;
ss << warn << " slow requests are blocked > "
<< cct->_conf->mon_osd_warn_op_age << " sec";
- auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
+ auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn);
d.detail.swap(warn_detail);
int left = max;
for (auto& p : warn_osd_by_max) {
ostringstream ss;
ss << error << " stuck requests are blocked > "
<< err_age << " sec";
- auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
+ auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error);
d.detail.swap(error_detail);
int left = max;
for (auto& p : error_osd_by_max) {
} else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
summary = " reporting legacy (not per-pool) BlueStore omap usage stats";
}
- auto& d = checks->add(asum.first, HEALTH_WARN, summary);
+ auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
for (auto& s : asum.second.second) {
d.detail.push_back(s);
}
if (detail_total) {
ostringstream ss;
ss << detail_total << " pgs not scrubbed in time";
- auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
+ auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total);
if (!detail.empty()) {
d.detail.swap(detail);
if (deep_detail_total) {
ostringstream ss;
ss << deep_detail_total << " pgs not deep-scrubbed in time";
- auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
+ auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(),
+ deep_detail_total);
if (!deep_detail.empty()) {
d.detail.swap(deep_detail);
if (!detail.empty()) {
ostringstream ss;
ss << detail.size() << " pool(s) do not have an application enabled";
- auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str());
+ auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(),
+ detail.size());
stringstream tip;
tip << "use 'ceph osd pool application enable <pool-name> "
<< "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
stringstream ss;
ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
- auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str());
+ auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(),
+ snaptrimq_exceeded);
detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
d.detail.swap(detail);
}
health_status_t severity;
std::string summary;
std::list<std::string> detail;
+ int64_t count = 0;
// Encode/decode body for health_check_t. Fields are processed in a fixed
// order: severity, summary, detail, then count.
// This change raises the first DENC_START argument from 1 to 2 and leaves the
// second at 1.
// NOTE(review): presumably those arguments are (struct version, compat
// version), so older version-1 encodings still decode -- confirm against the
// denc macro definitions.
DENC(health_check_t, v, p) {
- DENC_START(1, 1, p);
+ DENC_START(2, 1, p);
denc(v.severity, p);
denc(v.summary, p);
denc(v.detail, p);
// count is only processed when struct_v is at least 2; otherwise v.count is
// left untouched by this block.
+ if (struct_v >= 2) {
+ denc(v.count, p);
+ }
DENC_FINISH(p);
}
const health_check_t& r) {
return l.severity == r.severity &&
l.summary == r.summary &&
- l.detail == r.detail;
+ l.detail == r.detail &&
+ l.count == r.count;
}
friend bool operator!=(const health_check_t& l,
const health_check_t& r) {
f->open_object_section("summary");
f->dump_string("message", summary);
+ f->dump_int("count", count);
f->close_section();
if (want_detail) {
ls.back()->severity = HEALTH_ERR;
ls.back()->summary = "summarization";
ls.back()->detail = {"one", "two", "three"};
+ ls.back()->count = 42;
}
};
WRITE_CLASS_DENC(health_check_t)
ls.push_back(new health_check_map_t);
ls.push_back(new health_check_map_t);
{
- auto& d = ls.back()->add("FOO", HEALTH_WARN, "foo");
+ auto& d = ls.back()->add("FOO", HEALTH_WARN, "foo", 2);
d.detail.push_back("a");
d.detail.push_back("b");
}
{
- auto& d = ls.back()->add("BAR", HEALTH_ERR, "bar!");
+ auto& d = ls.back()->add("BAR", HEALTH_ERR, "bar!", 3);
d.detail.push_back("c");
d.detail.push_back("d");
+ d.detail.push_back("e");
}
}
// Register a health check under `code` and return a reference to it.
//
// Asserts (ceph_assert) that `checks` holds no entry for `code` yet, then
// sets severity, summary and -- with this change -- count on the entry
// obtained from checks[code].
// The new `count` parameter is declared without a default value, so every
// call site has to pass it explicitly.
health_check_t& add(const std::string& code,
health_status_t severity,
- const std::string& summary) {
+ const std::string& summary,
+ int64_t count) {
// A code may only be added once; use get_or_add() for repeated updates.
ceph_assert(checks.count(code) == 0);
health_check_t& r = checks[code];
r.severity = severity;
r.summary = summary;
+ r.count = count;
return r;
}
// Return a reference to the health check stored at checks[code].
// NOTE(review): presumably indexing creates the entry when it is absent, as
// the function name suggests -- confirm against the type of `checks`.
//
// severity and summary are overwritten on every call. With this change,
// count is accumulated (+=) rather than replaced, so repeated calls for the
// same code add their counts together.
// The new `count` parameter is declared without a default value, so every
// call site has to pass it explicitly.
health_check_t& get_or_add(const std::string& code,
health_status_t severity,
- const std::string& summary) {
+ const std::string& summary,
+ int64_t count) {
health_check_t& r = checks[code];
r.severity = severity;
r.summary = summary;
+ r.count += count;
return r;
}
q->second.detail.end(),
p.second.detail.begin(),
p.second.detail.end());
+ q->second.count += p.second.count;
}
}
}
string err = string("OSD_") +
string(crush->get_type_name(type)) + "_DOWN";
boost::to_upper(err);
- auto& d = checks->add(err, HEALTH_WARN, ss.str());
+ auto& d = checks->add(err, HEALTH_WARN, ss.str(),
+ subtree_type_down[type].size());
for (auto j = subtree_type_down[type].rbegin();
j != subtree_type_down[type].rend();
++j) {
}
ostringstream ss;
ss << down_in_osds.size() << " osds down";
- auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str());
+ auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(),
+ down_in_osds.size());
for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
ostringstream ss;
ss << "osd." << *it << " (";
if (!osds.empty()) {
ostringstream ss;
ss << osds.size() << " osds exist in the crush map but not in the osdmap";
- auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str());
+ auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(),
+ osds.size());
for (auto osd : osds) {
ostringstream ss;
ss << "osd." << osd << " exists in crush map but not in osdmap";
out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
out += nodeepscrub ? "nodeep-scrub" : "";
auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
- "Some pool(s) have the " + out + " flag(s) set");
+ "Some pool(s) have the " + out + " flag(s) set", 0);
d.detail.splice(d.detail.end(), scrub_messages);
}
}
if (!detail.empty()) {
auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
- "full ratio(s) out of order");
+ "full ratio(s) out of order", 0);
d.detail.swap(detail);
}
}
if (full.size()) {
ostringstream ss;
ss << full.size() << " full osd(s)";
- auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str());
+ auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size());
for (auto& i: full) {
ostringstream ss;
ss << "osd." << i << " is full";
if (backfillfull.size()) {
ostringstream ss;
ss << backfillfull.size() << " backfillfull osd(s)";
- auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str());
+ auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(),
+ backfillfull.size());
for (auto& i: backfillfull) {
ostringstream ss;
ss << "osd." << i << " is backfill full";
if (nearfull.size()) {
ostringstream ss;
ss << nearfull.size() << " nearfull osd(s)";
- auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str());
+ auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size());
for (auto& i: nearfull) {
ostringstream ss;
ss << "osd." << i << " is near full";
CEPH_OSDMAP_NOREBALANCE;
if (test_flag(warn_flags)) {
ostringstream ss;
- ss << get_flag_string(get_flags() & warn_flags)
- << " flag(s) set";
- checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str());
+ string s = get_flag_string(get_flags() & warn_flags);
+ ss << s << " flag(s) set";
+ checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(),
+ s.size() /* kludgey but sufficient */);
}
}
if (!detail.empty()) {
ostringstream ss;
ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
- auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str());
+ auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size());
d.detail.swap(detail);
}
}
ostringstream ss;
ss << "crush map has legacy tunables (require " << min
<< ", min is " << g_conf()->mon_crush_min_required_version << ")";
- auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str());
+ auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0);
d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
}
}
if (crush->get_straw_calc_version() == 0) {
ostringstream ss;
ss << "crush map has straw_calc_version=0";
- auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str());
+ auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0);
d.detail.push_back(
"see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
}
if (!detail.empty()) {
ostringstream ss;
ss << detail.size() << " cache pools are missing hit_sets";
- auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str());
+ auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(),
+ detail.size());
d.detail.swap(detail);
}
}
if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
ostringstream ss;
ss << "'sortbitwise' flag is not set";
- checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str());
+ checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0);
}
// OSD_UPGRADE_FINISHED
if (!full_detail.empty()) {
ostringstream ss;
ss << full_detail.size() << " pool(s) full";
- auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str());
+ auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size());
d.detail.swap(full_detail);
}
if (!backfillfull_detail.empty()) {
ostringstream ss;
ss << backfillfull_detail.size() << " pool(s) backfillfull";
- auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str());
+ auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(),
+ backfillfull_detail.size());
d.detail.swap(backfillfull_detail);
}
if (!nearfull_detail.empty()) {
ostringstream ss;
ss << nearfull_detail.size() << " pool(s) nearfull";
- auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str());
+ auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(),
+ nearfull_detail.size());
d.detail.swap(nearfull_detail);
}
}
health_checks['RECENT_CRASH'] = {
'severity': 'warning',
'summary': '%d daemons have recently crashed' % (num),
+ 'count': num,
'detail': detail,
}
self.set_health_checks(health_checks)
checks[warning] = {
'severity': 'warning',
'summary': HEALTH_MESSAGES[warning] % n,
+ 'count': len(ls),
'detail': ls,
}
self.set_health_checks(checks)
'CHECK_FOO': {
'severity': 'warning', # or 'error'
'summary': 'summary string',
+ 'count': 4, # quantify badness
'detail': [ 'list', 'of', 'detail', 'strings' ],
},
'CHECK_BAR': {
health_checks['POOL_TOO_FEW_PGS'] = {
'severity': 'warning',
'summary': summary,
+ 'count': len(too_few),
'detail': too_few
}
if too_many:
health_checks['POOL_TOO_MANY_PGS'] = {
'severity': 'warning',
'summary': summary,
+ 'count': len(too_many),
'detail': too_many
}
health_checks['POOL_TARGET_SIZE_RATIO_OVERCOMMITTED'] = {
'severity': 'warning',
'summary': "%d subtrees have overcommitted pool target_size_ratio" % len(too_much_target_ratio),
+ 'count': len(too_much_target_ratio),
'detail': too_much_target_ratio,
}
health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
'severity': 'warning',
'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
+ 'count': len(too_much_target_bytes),
'detail': too_much_target_bytes,
}
self._health[check] = {
"severity": str(info["severity"]),
"summary": str(info["summary"]),
+ "count": 123,
"detail": [str(m) for m in info["detail"]]
}
except Exception as e: