git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mon: Add warning if diff in OSD usage > config mon_warn_osd_usage_percent (40%)
author David Zafman <dzafman@redhat.com>
Wed, 15 Feb 2017 00:37:07 +0000 (16:37 -0800)
committer David Zafman <dzafman@redhat.com>
Mon, 27 Feb 2017 15:53:56 +0000 (07:53 -0800)
Signed-off-by: David Zafman <dzafman@redhat.com>
src/common/config_opts.h
src/mon/PGMonitor.cc

index d29046a45de6db8b2aa9cc2c1c3c072824cd4677..93e96ccb3fae74d7ac43051ec4555f01c13d2f54 100644 (file)
@@ -318,6 +318,7 @@ OPTION(mon_crush_min_required_version, OPT_STR, "firefly")
 OPTION(mon_warn_on_crush_straw_calc_version_zero, OPT_BOOL, true) // warn if crush straw_calc_version==0
 OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
 OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true)
+OPTION(mon_warn_osd_usage_percent, OPT_FLOAT, .40) // warn if difference in usage percent between OSDs exceeds specified percent
 OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
 OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
 OPTION(mon_max_log_epochs, OPT_INT, 500)
index ca6334722b31227ccd93abdf36b2adbfbf7c451e..44708922f22590c3f7e0d744134478a16bfe4ca6 100644 (file)
@@ -1697,6 +1697,27 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
     }
   }
 
+  if (g_conf->mon_warn_osd_usage_percent) {
+    float max_osd_perc_avail = 0.0, min_osd_perc_avail = 1.0;
+    for (auto p = pg_map.osd_stat.begin(); p != pg_map.osd_stat.end(); ++p) {
+      // kb should never be 0, but avoid divide by zero in case of corruption
+      if (p->second.kb <= 0)
+        continue;
+      float perc_avail = ((float)(p->second.kb - p->second.kb_avail)) / ((float)p->second.kb);
+      if (perc_avail > max_osd_perc_avail)
+        max_osd_perc_avail = perc_avail;
+      if (perc_avail < min_osd_perc_avail)
+        min_osd_perc_avail = perc_avail;
+    }
+    if ((max_osd_perc_avail - min_osd_perc_avail) > g_conf->mon_warn_osd_usage_percent) {
+      ostringstream ss;
+      ss << "Difference in osd space utilization " << ((max_osd_perc_avail - min_osd_perc_avail) *100) << "% greater than " << (g_conf->mon_warn_osd_usage_percent * 100) << "%";
+      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+      if (detail)
+        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+    }
+  }
+
   // recovery
   list<string> sl;
   pg_map.overall_recovery_summary(NULL, &sl);