git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/PGMap: cap health detail messages at 50 (configurable)
author Sage Weil <sage@redhat.com>
Fri, 19 May 2017 15:48:15 +0000 (11:48 -0400)
committer Sage Weil <sage@redhat.com>
Fri, 2 Jun 2017 17:02:49 +0000 (13:02 -0400)
There are two cases where we spew health detail warnings for potentially
every PG.  Cap those detail messages at 50 (configurable via
mon_health_max_detail) and, if we exceed that, include a message saying how
many more there are.  This avoids huge lists of detail messages going from
the mgr to the mon and also makes life better for users of the health detail API.

Signed-off-by: Sage Weil <sage@redhat.com>
src/common/config_opts.h
src/mon/PGMap.cc

index 85552efba63835f40be98d4cafcb432469552da8..d538460fb59a1eef9fbd941af60ff2770c1edf01 100644 (file)
@@ -308,6 +308,7 @@ OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for cl
 OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds)
 OPTION(mon_timecheck_skew_interval, OPT_FLOAT, 30.0) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds)
 OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
+OPTION(mon_health_max_detail, OPT_INT, 50) // max detailed pgs to report in health detail
 OPTION(mon_pg_min_inactive, OPT_U64, 1) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR.
 OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30)  // min # pgs per (in) osd before we warn the admin
 OPTION(mon_pg_warn_max_per_osd, OPT_INT, 300)  // max # pgs per (in) osd before we warn the admin
index 31e249a22f27dda9ec8817bf242edd4a9edd1830..0dcb7083c21bcad4dd763f98af7864f0a364447b 100644 (file)
@@ -2320,8 +2320,10 @@ void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
 static void note_stuck_detail(
   int what,
   mempool::pgmap::unordered_map<pg_t,pg_stat_t>& stuck_pgs,
+  int max_detail,
   list<pair<health_status_t,string> > *detail)
 {
+  int n = 0;
   for (auto p = stuck_pgs.begin();
        p != stuck_pgs.end();
        ++p) {
@@ -2352,6 +2354,13 @@ static void note_stuck_detail(
     default:
       ceph_abort();
     }
+    if (--max_detail == 0) {
+      ostringstream ss;
+      ss << (stuck_pgs.size() - n) << " more pgs are also stuck " << whatname;
+      detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      break;
+    }
+    ++n;
     ss << "pg " << p->first << " is stuck " << whatname;
     if (since == utime_t()) {
       ss << " since forever";
@@ -2380,11 +2389,12 @@ static int _warn_slow_request_histogram(
     float ub = (float)(1 << i) / 1000.0;
     if (ub < cct->_conf->mon_osd_max_op_age)
       break;
-    ostringstream ss;
     if (h.h[i]) {
-      ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
-      if (detail)
+      if (detail) {
+       ostringstream ss;
+       ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
       sum += h.h[i];
     }
   }
@@ -2528,28 +2538,32 @@ void PGMap::get_health(
         get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
         note["stuck inactive"] = stuck_pgs.size();
         num_inactive_pgs += stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs,
+                         cct->_conf->mon_health_max_detail, detail);
         stuck_pgs.clear();
       }
 
       if (note.find("stuck unclean") != note.end()) {
         get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
         note["stuck unclean"] = stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs,
+                         cct->_conf->mon_health_max_detail,  detail);
         stuck_pgs.clear();
       }
 
       if (note.find("stuck undersized") != note.end()) {
         get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
         note["stuck undersized"] = stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs,
+                         cct->_conf->mon_health_max_detail,  detail);
         stuck_pgs.clear();
       }
 
       if (note.find("stuck degraded") != note.end()) {
         get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
         note["stuck degraded"] = stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs,
+                         cct->_conf->mon_health_max_detail,  detail);
         stuck_pgs.clear();
       }
 
@@ -2557,7 +2571,8 @@ void PGMap::get_health(
         get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
         note["stuck stale"] = stuck_pgs.size();
         num_inactive_pgs += stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs,
+                         cct->_conf->mon_health_max_detail,  detail);
       }
     }
   } else {
@@ -2584,6 +2599,8 @@ void PGMap::get_health(
       summary.push_back(make_pair(HEALTH_WARN, ss.str()));
     }
     if (detail) {
+      int n = 0, more = 0;
+      int max = cct->_conf->mon_health_max_detail;
       for (auto p = pg_stat.begin();
            p != pg_stat.end();
            ++p) {
@@ -2602,6 +2619,13 @@ void PGMap::get_health(
                                PG_STATE_BACKFILL |
                                PG_STATE_BACKFILL_TOOFULL)) &&
            stuck_pgs.count(p->first) == 0) {
+         if (max > 0) {
+           --max;
+         } else {
+           ++more;
+           continue;
+         }
+         ++n;
          ostringstream ss;
          ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
          ss << ", acting " << p->second.acting;
@@ -2611,12 +2635,18 @@ void PGMap::get_health(
            const pg_pool_t *pi = osdmap.get_pg_pool(p->first.pool());
            if (pi && pi->min_size > 1) {
              ss << " (reducing pool " << osdmap.get_pool_name(p->first.pool())
-                << " min_size from " << (int)pi->min_size << " may help; search ceph.com/docs for 'incomplete')";
+                << " min_size from " << (int)pi->min_size
+                << " may help; search ceph.com/docs for 'incomplete')";
            }
          }
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
       }
+      if (more) {
+       ostringstream ss;
+       ss << more << " more pgs are also unhealthy";
+       detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
     }
   }