From 7832c53375ad24f3b54a68a3c63b056e69e3fa0d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 21 Jul 2017 11:58:08 -0400 Subject: [PATCH] mon/PGMap: remove skewed utilization warning This has a few problems: 1- It does not do its analysis over CRUSH rule roots/classes, which means that an innocent user of classes will see skewed usage (because hdds are more full than ssds, say) 2- It does not take degraded clusters into account, which means the warning will appear when a fresh OSD is added. See http://tracker.ceph.com/issues/20730 Signed-off-by: Sage Weil --- PendingReleaseNotes | 5 +++ src/common/legacy_config_opts.h | 1 - src/mon/PGMap.cc | 60 --------------------------------- 3 files changed, 5 insertions(+), 61 deletions(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 9879069535c..0812bcc3785 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -235,3 +235,8 @@ events to a Zabbix server containing high-level information of the Ceph cluster. This makes it easy to monitor a Ceph cluster's status and send out notifications in case of a malfunction. + +* The 'mon_warn_osd_usage_min_max_delta' config option has been + removed and the associated health warning has been disabled because + it does not address clusters undergoing recovery or CRUSH rules that do + not target all devices in the cluster. 
\ No newline at end of file diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h index f3cb6516ea7..300d2978a77 100644 --- a/src/common/legacy_config_opts.h +++ b/src/common/legacy_config_opts.h @@ -273,7 +273,6 @@ OPTION(mon_crush_min_required_version, OPT_STR, "firefly") OPTION(mon_warn_on_crush_straw_calc_version_zero, OPT_BOOL, true) // warn if crush straw_calc_version==0 OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0' OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true) -OPTION(mon_warn_osd_usage_min_max_delta, OPT_FLOAT, .40) // warn if difference between min and max OSD utilizations exceeds specified amount OPTION(mon_min_osdmap_epochs, OPT_INT, 500) OPTION(mon_max_pgmap_epochs, OPT_INT, 500) OPTION(mon_max_log_epochs, OPT_INT, 500) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index a01a11c4fc7..553a3c5b8ae 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -2812,40 +2812,6 @@ void PGMap::get_health_checks( } } - // OSD_SKEWED_USAGE - if (cct->_conf->mon_warn_osd_usage_min_max_delta) { - int max_osd = -1, min_osd = -1; - float max_osd_usage = 0.0, min_osd_usage = 1.0; - for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) { - // kb should never be 0, but avoid divide by zero in case of corruption - if (p->second.kb <= 0) - continue; - float usage = ((float)p->second.kb_used) / ((float)p->second.kb); - if (usage > max_osd_usage) { - max_osd_usage = usage; - max_osd = p->first; - } - if (usage < min_osd_usage) { - min_osd_usage = usage; - min_osd = p->first; - } - } - float diff = max_osd_usage - min_osd_usage; - if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) { - auto& d = checks->add("OSD_SKEWED_USAGE", HEALTH_WARN, - "skewed osd utilization"); - ostringstream ss; - ss << "difference between min (osd." << min_osd << " at " - << roundf(min_osd_usage*1000.0)/100.0 - << "%) and max (osd." 
<< max_osd << " at " - << roundf(max_osd_usage*1000.0)/100.0 - << "%) osd usage " << roundf(diff*1000.0)/100.0 << "% > " - << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/100.0 - << " (mon_warn_osd_usage_min_max_delta)"; - d.detail.push_back(ss.str()); - } - } - // OSD_SCRUB_ERRORS if (pg_sum.stats.sum.num_scrub_errors) { ostringstream ss; @@ -3443,32 +3409,6 @@ void PGMap::get_health( } } - if (cct->_conf->mon_warn_osd_usage_min_max_delta) { - float max_osd_usage = 0.0, min_osd_usage = 1.0; - for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) { - // kb should never be 0, but avoid divide by zero in case of corruption - if (p->second.kb <= 0) - continue; - float usage = ((float)p->second.kb_used) / ((float)p->second.kb); - if (usage > max_osd_usage) - max_osd_usage = usage; - if (usage < min_osd_usage) - min_osd_usage = usage; - } - float diff = max_osd_usage - min_osd_usage; - if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) { - ostringstream ss; - ss << "difference between min (" << roundf(min_osd_usage*1000.0)/10.0 - << "%) and max (" << roundf(max_osd_usage*1000.0)/10.0 - << "%) osd usage " << roundf(diff*1000.0)/10.0 << "% > " - << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/10.0 - << "% (mon_warn_osd_usage_min_max_delta)"; - summary.push_back(make_pair(HEALTH_WARN, ss.str())); - if (detail) - detail->push_back(make_pair(HEALTH_WARN, ss.str())); - } - } - // recovery list sl; overall_recovery_summary(NULL, &sl); -- 2.39.5