mon: warn when pool nears target max objects/bytes

author Sage Weil <sage@inktank.com>

Wed, 5 Mar 2014 18:58:37 +0000 (10:58 -0800)

committer Sage Weil <sage@inktank.com>

Wed, 5 Mar 2014 19:59:07 +0000 (11:59 -0800)
author Sage Weil <sage@inktank.com>
Wed, 5 Mar 2014 18:58:37 +0000 (10:58 -0800)
committer Sage Weil <sage@inktank.com>
Wed, 5 Mar 2014 19:59:07 +0000 (11:59 -0800)
diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh

index 51e7ab4dbbe819c54d1e7bef5a4c45f87f5b6f51..e4efe97c27cb3c4e6e7acbe384c761b6994cbc4a 100755 (executable)
--- a/qa/workunits/cephtool/test.sh
+++ b/qa/workunits/cephtool/test.sh
@@ -95,6 +95,22 @@ ceph osd dump | grep cache3 | grep bloom | grep 'false_positive_probability: 0.0
  ceph osd tier remove data cache3
  ceph osd pool delete cache3 cache3 --yes-i-really-really-mean-it
  
+# check that a pool at/near its target max objects/bytes raises a health warning
+ceph osd pool create cache4 2
+ceph osd pool set cache4 target_max_objects 5
+ceph osd pool set cache4 target_max_bytes 1000
+for f in `seq 1 5` ; do
+    rados -p cache4 put foo$f /etc/passwd
+done
+while ! ceph df | grep cache4 | grep ' 5 ' ; do
+    echo waiting for pg stats to flush
+    sleep 2
+done
+ceph health | grep WARN | grep cache4
+ceph health detail | grep cache4 | grep 'target max' | grep objects
+ceph health detail | grep cache4 | grep 'target max' | grep 'B'
+ceph osd pool delete cache4 cache4 --yes-i-really-really-mean-it
+
  # Assumes there are at least 3 MDSes and two OSDs
  #
  
diff --git a/src/common/config_opts.h b/src/common/config_opts.h

index 8bbd58671e04ca6040a9955f9ebb585373c70b6c..fad7204a85ba038b776e432cd780deb79035f99e 100644 (file)
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -166,6 +166,7 @@ OPTION(mon_pg_warn_min_per_osd, OPT_INT, 20)  // min # pgs per (in) osd before w
  OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT, 10.0) // max skew few average in objects per pg
  OPTION(mon_pg_warn_min_objects, OPT_INT, 10000)  // do not warn below this object #
  OPTION(mon_pg_warn_min_pool_objects, OPT_INT, 1000)  // do not warn on pools below this object #
+OPTION(mon_cache_target_full_warn_ratio, OPT_FLOAT, .66) // position between pool cache_target_full and max where we start warning
  OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full"
  OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full
  OPTION(mon_globalid_prealloc, OPT_INT, 100)   // how many globalids to prealloc
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h

index d76b90d355a093d437bdda9e6acbee2b6ef4cb65..f4e8df5f0ffcab3433f655b23fa3060c88ab710d 100644 (file)
--- a/src/mon/PGMap.h
+++ b/src/mon/PGMap.h
@@ -205,8 +205,9 @@ public:
      stamp = s;
    }
  
-  pool_stat_t get_pg_pool_sum_stat(int64_t pool) {
-    ceph::unordered_map<int,pool_stat_t>::iterator p = pg_pool_sum.find(pool);
+  pool_stat_t get_pg_pool_sum_stat(int64_t pool) const {
+    ceph::unordered_map<int,pool_stat_t>::const_iterator p =
+      pg_pool_sum.find(pool);
      if (p != pg_pool_sum.end())
        return p->second;
      return pool_stat_t();
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc

index 1c64cdc8ec5ae9ea3f1c2e31ca7483b6c051ddc5..c93e8557f869ad4fe636d99736846859fbf52a91 100644 (file)
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1889,6 +1889,50 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
    check_full_osd_health(summary, detail, pg_map.full_osds, "full", HEALTH_ERR);
    check_full_osd_health(summary, detail, pg_map.nearfull_osds, "near full", HEALTH_WARN);
  
+  // warn about pools that are at/near their target max objects or bytes
+  const map<int64_t,pg_pool_t>& pools = mon->osdmon()->osdmap.get_pools();
+  for (map<int64_t,pg_pool_t>::const_iterator p = pools.begin();
+       p != pools.end(); ++p) {
+    if ((!p->second.target_max_objects && !p->second.target_max_bytes) ||
+       !pg_map.pg_pool_sum.count(p->first))
+      continue;
+    bool nearfull = false;
+    const char *name = mon->osdmon()->osdmap.get_pool_name(p->first);
+    const pool_stat_t& st = pg_map.get_pg_pool_sum_stat(p->first);
+    uint64_t ratio = p->second.cache_target_full_ratio_micro +
+      ((1000000 - p->second.cache_target_full_ratio_micro) *
+       g_conf->mon_cache_target_full_warn_ratio);
+    if (p->second.target_max_objects && (uint64_t)st.stats.sum.num_objects >
+       p->second.target_max_objects * ratio / 1000000) {
+      nearfull = true;
+      if (detail) {
+       ostringstream ss;
+       ss << "cache pool '" << name << "' with "
+          << si_t(st.stats.sum.num_objects)
+          << " objects at/near target max "
+          << si_t(p->second.target_max_objects) << " objects";
+       detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
+    }
+    if (p->second.target_max_bytes && (uint64_t)st.stats.sum.num_bytes >
+       p->second.target_max_bytes * ratio / 1000000) {
+      nearfull = true;
+      if (detail) {
+       ostringstream ss;
+       ss << "cache pool '" << mon->osdmon()->osdmap.get_pool_name(p->first)
+          << "' with " << si_t(st.stats.sum.num_bytes)
+          << "B at/near target max "
+          << si_t(p->second.target_max_bytes) << "B";
+       detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
+    }
+    if (nearfull) {
+      ostringstream ss;
+      ss << "'" << name << "' at/near target max";
+      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+    }
+  }
+
    // scrub
    if (pg_map.pg_sum.stats.sum.num_scrub_errors) {
      ostringstream ss;
author	Sage Weil <sage@inktank.com>
	Wed, 5 Mar 2014 18:58:37 +0000 (10:58 -0800)
committer	Sage Weil <sage@inktank.com>
	Wed, 5 Mar 2014 19:59:07 +0000 (11:59 -0800)
qa/workunits/cephtool/test.sh		patch \| blob \| history
src/common/config_opts.h		patch \| blob \| history
src/mon/PGMap.h		patch \| blob \| history
src/mon/PGMonitor.cc		patch \| blob \| history