]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/PGMap: remove skewed utilization warning 16461/head
author Sage Weil <sage@redhat.com>
Fri, 21 Jul 2017 15:58:08 +0000 (11:58 -0400)
committer Sage Weil <sage@redhat.com>
Fri, 21 Jul 2017 15:58:08 +0000 (11:58 -0400)
This has a few problems:

1- It does not do its analysis over CRUSH rule roots/classes, which
means that an innocent user of classes will see skewed usage (because HDDs
are more full than SSDs, say)

2- It does not take degraded clusters into account, which means the warning
will appear when a fresh OSD is added.

See http://tracker.ceph.com/issues/20730

Signed-off-by: Sage Weil <sage@redhat.com>
PendingReleaseNotes
src/common/legacy_config_opts.h
src/mon/PGMap.cc

index 9879069535c25263a4e2c395e3da35135ca5e146..0812bcc378550568ef454af6643936502b89dad9 100644 (file)
   events to a Zabbix server containing high-level information of the Ceph
   cluster. This makes it easy to monitor a Ceph cluster's status and send
   out notifications in case of a malfunction.
+
+* The 'mon_warn_osd_usage_min_max_delta' config option has been
+  removed and the associated health warning has been disabled because
+  it does not address clusters undergoing recovery or CRUSH rules that do
+  not target all devices in the cluster.
\ No newline at end of file
index f3cb6516ea7f063562209da66a3b9a1caeebf148..300d2978a774d28bfa1b4c313372d53db793f59e 100644 (file)
@@ -273,7 +273,6 @@ OPTION(mon_crush_min_required_version, OPT_STR, "firefly")
 OPTION(mon_warn_on_crush_straw_calc_version_zero, OPT_BOOL, true) // warn if crush straw_calc_version==0
 OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
 OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true)
-OPTION(mon_warn_osd_usage_min_max_delta, OPT_FLOAT, .40) // warn if difference between min and max OSD utilizations exceeds specified amount
 OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
 OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
 OPTION(mon_max_log_epochs, OPT_INT, 500)
index a01a11c4fc72968a63f28d95739f62682e9ee9c3..553a3c5b8aed0dc331b98e0a07dbe4c826111f8f 100644 (file)
@@ -2812,40 +2812,6 @@ void PGMap::get_health_checks(
     }
   }
 
-  // OSD_SKEWED_USAGE
-  if (cct->_conf->mon_warn_osd_usage_min_max_delta) {
-    int max_osd = -1, min_osd = -1;
-    float max_osd_usage = 0.0, min_osd_usage = 1.0;
-    for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
-      // kb should never be 0, but avoid divide by zero in case of corruption
-      if (p->second.kb <= 0)
-        continue;
-      float usage = ((float)p->second.kb_used) / ((float)p->second.kb);
-      if (usage > max_osd_usage) {
-        max_osd_usage = usage;
-       max_osd = p->first;
-      }
-      if (usage < min_osd_usage) {
-        min_osd_usage = usage;
-       min_osd = p->first;
-      }
-    }
-    float diff = max_osd_usage - min_osd_usage;
-    if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) {
-      auto& d = checks->add("OSD_SKEWED_USAGE", HEALTH_WARN,
-                           "skewed osd utilization");
-      ostringstream ss;
-      ss << "difference between min (osd." << min_osd << " at "
-        << roundf(min_osd_usage*1000.0)/100.0
-        << "%) and max (osd." << max_osd << " at "
-        << roundf(max_osd_usage*1000.0)/100.0
-        << "%) osd usage " << roundf(diff*1000.0)/100.0 << "% > "
-        << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/100.0
-        << " (mon_warn_osd_usage_min_max_delta)";
-      d.detail.push_back(ss.str());
-    }
-  }
-
   // OSD_SCRUB_ERRORS
   if (pg_sum.stats.sum.num_scrub_errors) {
     ostringstream ss;
@@ -3443,32 +3409,6 @@ void PGMap::get_health(
     }
   }
 
-  if (cct->_conf->mon_warn_osd_usage_min_max_delta) {
-    float max_osd_usage = 0.0, min_osd_usage = 1.0;
-    for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
-      // kb should never be 0, but avoid divide by zero in case of corruption
-      if (p->second.kb <= 0)
-        continue;
-      float usage = ((float)p->second.kb_used) / ((float)p->second.kb);
-      if (usage > max_osd_usage)
-        max_osd_usage = usage;
-      if (usage < min_osd_usage)
-        min_osd_usage = usage;
-    }
-    float diff = max_osd_usage - min_osd_usage;
-    if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) {
-      ostringstream ss;
-      ss << "difference between min (" << roundf(min_osd_usage*1000.0)/10.0
-        << "%) and max (" << roundf(max_osd_usage*1000.0)/10.0
-        << "%) osd usage " << roundf(diff*1000.0)/10.0 << "% > "
-        << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/10.0
-        << "% (mon_warn_osd_usage_min_max_delta)";
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-      if (detail)
-        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-    }
-  }
-
   // recovery
   list<string> sl;
   overall_recovery_summary(NULL, &sl);