mon: report pgs stuck inactive/unclean/stale in health check

author Josh Durgin <josh.durgin@dreamhost.com>

Tue, 28 Feb 2012 01:49:13 +0000 (17:49 -0800)

committer Josh Durgin <josh.durgin@dreamhost.com>

Tue, 28 Feb 2012 21:53:15 +0000 (13:53 -0800)
author Josh Durgin <josh.durgin@dreamhost.com>
Tue, 28 Feb 2012 01:49:13 +0000 (17:49 -0800)
committer Josh Durgin <josh.durgin@dreamhost.com>
Tue, 28 Feb 2012 21:53:15 +0000 (13:53 -0800)
diff --git a/src/common/config_opts.h b/src/common/config_opts.h

index 942eae5cb3160ed6a2af4e47b303e04b3ff241f4..e2a84d152c536ad224692e7feadeadd65d9482a2 100644 (file)
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -100,6 +100,7 @@ OPTION(mon_clock_drift_allowed, OPT_FLOAT, .050) // allowed clock drift between
  OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for clock drift warnings
  OPTION(mon_accept_timeout, OPT_FLOAT, 10.0)    // on leader, if paxos update isn't accepted
  OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s
+OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
  OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full"
  OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full
  OPTION(mon_globalid_prealloc, OPT_INT, 100)   // how many globalids to prealloc
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc

index ce64eb09989341c6f91e4d0123fb73f25d5da6df..69cf6d1e4efef80e3f806da256c44b6d46350665 100644 (file)
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1151,7 +1151,35 @@ enum health_status_t PGMonitor::get_health(std::ostream &ss) const
        note["repair"] += p->second;
      if (p->first & PG_STATE_SPLITTING)
        note["splitting"] += p->second;
+    if (p->first & PG_STATE_RECOVERING)
+      note["recovering"] += p->second;
+    if (p->first & PG_STATE_INCOMPLETE)
+      note["incomplete"] += p->second;
+    if (p->first & PG_STATE_BACKFILL)
+      note["backfill"] += p->second;
    }
+
+  hash_map<pg_t, pg_stat_t> stuck_pgs;
+  utime_t now(ceph_clock_now(g_ceph_context));
+  utime_t cutoff = now - utime_t(g_conf->mon_pg_stuck_threshold, 0);
+
+  pg_map.get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
+  if (!stuck_pgs.empty()) {
+    note["stuck inactive"] = stuck_pgs.size();
+  }
+  stuck_pgs.clear();
+
+  pg_map.get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
+  if (!stuck_pgs.empty()) {
+    note["stuck unclean"] = stuck_pgs.size();
+  }
+  stuck_pgs.clear();
+
+  pg_map.get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
+  if (!stuck_pgs.empty()) {
+    note["stuck stale"] = stuck_pgs.size();
+  }
+
    if (!note.empty()) {
      ret = HEALTH_WARN;
      for (map<string,int>::iterator p = note.begin(); p != note.end(); p++) {
@@ -1192,7 +1220,7 @@ int PGMonitor::dump_stuck_pg_stats(ostream& ss,
  {
    string format = "plain";
    string val;
-  int threshold = 300;
+  int threshold = g_conf->mon_pg_stuck_threshold;
    int seconds;
    ostringstream err;
author	Josh Durgin <josh.durgin@dreamhost.com>
	Tue, 28 Feb 2012 01:49:13 +0000 (17:49 -0800)
committer	Josh Durgin <josh.durgin@dreamhost.com>
	Tue, 28 Feb 2012 21:53:15 +0000 (13:53 -0800)
src/common/config_opts.h		patch | blob | history
src/mon/PGMonitor.cc		patch | blob | history