From: Sage Weil <sage@redhat.com>
Date: Fri, 19 May 2017 15:48:15 +0000 (-0400)
Subject: mon/PGMap: cap health detail messages at 50 (configurable)
X-Git-Tag: ses5-milestone6~8^2~19^2~50
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=229e56c7bcf50a53ce3c0318aac583c574e39aa9;p=ceph.git

mon/PGMap: cap health detail messages at 50 (configurable)

There are two cases where we spew health detail warnings for potentially
every pg.  Cap those detail messages at 50 and, if we exceed that, include
a message saying how many more there are.  This avoids huge lists of
detail messages going from the mgr to mon and also makes life better for
users of the health detail api.

Signed-off-by: Sage Weil <sage@redhat.com>
---

diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 85552efba63..d538460fb59 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -308,6 +308,7 @@ OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for cl
 OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds)
 OPTION(mon_timecheck_skew_interval, OPT_FLOAT, 30.0) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds)
 OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
+OPTION(mon_health_max_detail, OPT_INT, 50) // cap on per-pg lines emitted per health detail category (each stuck type, and other unhealthy pgs); pgs beyond the cap are summarized in one "N more pgs ..." line
 OPTION(mon_pg_min_inactive, OPT_U64, 1) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR.
 OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30)  // min # pgs per (in) osd before we warn the admin
 OPTION(mon_pg_warn_max_per_osd, OPT_INT, 300)  // max # pgs per (in) osd before we warn the admin
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index 31e249a22f2..0dcb7083c21 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -2320,8 +2320,10 @@ void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
 static void note_stuck_detail(
   int what,
   mempool::pgmap::unordered_map<pg_t,pg_stat_t>& stuck_pgs,
+  int max_detail,
   list<pair<health_status_t,string> > *detail)
 {
+  int n = 0;
   for (auto p = stuck_pgs.begin();
        p != stuck_pgs.end();
        ++p) {
@@ -2352,6 +2354,13 @@ static void note_stuck_detail(
     default:
       ceph_abort();
     }
+    if (--max_detail == 0) {
+      ostringstream ss;
+      ss << (stuck_pgs.size() - n) << " more pgs are also stuck " << whatname;
+      detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      break;
+    }
+    ++n;
     ss << "pg " << p->first << " is stuck " << whatname;
     if (since == utime_t()) {
       ss << " since forever";
@@ -2380,11 +2389,12 @@ static int _warn_slow_request_histogram(
     float ub = (float)(1 << i) / 1000.0;
     if (ub < cct->_conf->mon_osd_max_op_age)
       break;
-    ostringstream ss;
     if (h.h[i]) {
-      ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
-      if (detail)
+      if (detail) {
+	ostringstream ss;
+	ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
 	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
       sum += h.h[i];
     }
   }
@@ -2528,28 +2538,32 @@ void PGMap::get_health(
         get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
         note["stuck inactive"] = stuck_pgs.size();
         num_inactive_pgs += stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs,
+			  cct->_conf->mon_health_max_detail, detail);
         stuck_pgs.clear();
       }
 
       if (note.find("stuck unclean") != note.end()) {
         get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
         note["stuck unclean"] = stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs,
+			  cct->_conf->mon_health_max_detail,  detail);
         stuck_pgs.clear();
       }
 
       if (note.find("stuck undersized") != note.end()) {
         get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
         note["stuck undersized"] = stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs,
+			  cct->_conf->mon_health_max_detail,  detail);
         stuck_pgs.clear();
       }
 
       if (note.find("stuck degraded") != note.end()) {
         get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
         note["stuck degraded"] = stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs,
+			  cct->_conf->mon_health_max_detail,  detail);
         stuck_pgs.clear();
       }
 
@@ -2557,7 +2571,8 @@ void PGMap::get_health(
         get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
         note["stuck stale"] = stuck_pgs.size();
         num_inactive_pgs += stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs,
+			  cct->_conf->mon_health_max_detail,  detail);
       }
     }
   } else {
@@ -2584,6 +2599,8 @@ void PGMap::get_health(
       summary.push_back(make_pair(HEALTH_WARN, ss.str()));
     }
     if (detail) {
+      int n = 0, more = 0;
+      int max = cct->_conf->mon_health_max_detail;
       for (auto p = pg_stat.begin();
            p != pg_stat.end();
            ++p) {
@@ -2602,6 +2619,13 @@ void PGMap::get_health(
 	                        PG_STATE_BACKFILL |
 	                        PG_STATE_BACKFILL_TOOFULL)) &&
 	    stuck_pgs.count(p->first) == 0) {
+	  if (max > 0) {
+	    --max;
+	  } else {
+	    ++more;
+	    continue;
+	  }
+	  ++n;
 	  ostringstream ss;
 	  ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
 	  ss << ", acting " << p->second.acting;
@@ -2611,12 +2635,18 @@ void PGMap::get_health(
 	    const pg_pool_t *pi = osdmap.get_pg_pool(p->first.pool());
 	    if (pi && pi->min_size > 1) {
 	      ss << " (reducing pool " << osdmap.get_pool_name(p->first.pool())
-	         << " min_size from " << (int)pi->min_size << " may help; search ceph.com/docs for 'incomplete')";
+	         << " min_size from " << (int)pi->min_size
+		 << " may help; search ceph.com/docs for 'incomplete')";
 	    }
 	  }
 	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
 	}
       }
+      if (more) {
+	ostringstream ss;
+	ss << more << " more pgs are also unhealthy";
+	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
     }
   }