events to a Zabbix server containing high-level information about the Ceph
cluster. This makes it easy to monitor a Ceph cluster's status and send
out notifications in case of a malfunction.
+
+* The 'mon_warn_osd_usage_min_max_delta' config option has been
+ removed and the associated health warning has been disabled because
+ the check does not account for clusters undergoing recovery or for
+ CRUSH rules that do not target all devices in the cluster.
\ No newline at end of file
OPTION(mon_warn_on_crush_straw_calc_version_zero, OPT_BOOL, true) // warn if crush straw_calc_version==0
OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true)
-OPTION(mon_warn_osd_usage_min_max_delta, OPT_FLOAT, .40) // warn if difference between min and max OSD utilizations exceeds specified amount
OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
OPTION(mon_max_log_epochs, OPT_INT, 500)
}
}
- // OSD_SKEWED_USAGE
- if (cct->_conf->mon_warn_osd_usage_min_max_delta) {
- int max_osd = -1, min_osd = -1;
- float max_osd_usage = 0.0, min_osd_usage = 1.0;
- for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
- // kb should never be 0, but avoid divide by zero in case of corruption
- if (p->second.kb <= 0)
- continue;
- float usage = ((float)p->second.kb_used) / ((float)p->second.kb);
- if (usage > max_osd_usage) {
- max_osd_usage = usage;
- max_osd = p->first;
- }
- if (usage < min_osd_usage) {
- min_osd_usage = usage;
- min_osd = p->first;
- }
- }
- float diff = max_osd_usage - min_osd_usage;
- if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) {
- auto& d = checks->add("OSD_SKEWED_USAGE", HEALTH_WARN,
- "skewed osd utilization");
- ostringstream ss;
- ss << "difference between min (osd." << min_osd << " at "
- << roundf(min_osd_usage*1000.0)/100.0
- << "%) and max (osd." << max_osd << " at "
- << roundf(max_osd_usage*1000.0)/100.0
- << "%) osd usage " << roundf(diff*1000.0)/100.0 << "% > "
- << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/100.0
- << " (mon_warn_osd_usage_min_max_delta)";
- d.detail.push_back(ss.str());
- }
- }
-
// OSD_SCRUB_ERRORS
if (pg_sum.stats.sum.num_scrub_errors) {
ostringstream ss;
}
}
- if (cct->_conf->mon_warn_osd_usage_min_max_delta) {
- float max_osd_usage = 0.0, min_osd_usage = 1.0;
- for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
- // kb should never be 0, but avoid divide by zero in case of corruption
- if (p->second.kb <= 0)
- continue;
- float usage = ((float)p->second.kb_used) / ((float)p->second.kb);
- if (usage > max_osd_usage)
- max_osd_usage = usage;
- if (usage < min_osd_usage)
- min_osd_usage = usage;
- }
- float diff = max_osd_usage - min_osd_usage;
- if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) {
- ostringstream ss;
- ss << "difference between min (" << roundf(min_osd_usage*1000.0)/10.0
- << "%) and max (" << roundf(max_osd_usage*1000.0)/10.0
- << "%) osd usage " << roundf(diff*1000.0)/10.0 << "% > "
- << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/10.0
- << "% (mon_warn_osd_usage_min_max_delta)";
- summary.push_back(make_pair(HEALTH_WARN, ss.str()));
- if (detail)
- detail->push_back(make_pair(HEALTH_WARN, ss.str()));
- }
- }
-
// recovery
list<string> sl;
overall_recovery_summary(NULL, &sl);