mon/PGMap: cap health detail messages at 50 (configurable)

author Sage Weil <sage@redhat.com>

Fri, 19 May 2017 15:48:15 +0000 (11:48 -0400)

committer Sage Weil <sage@redhat.com>

Fri, 2 Jun 2017 17:02:49 +0000 (13:02 -0400)
author Sage Weil <sage@redhat.com>
Fri, 19 May 2017 15:48:15 +0000 (11:48 -0400)
committer Sage Weil <sage@redhat.com>
Fri, 2 Jun 2017 17:02:49 +0000 (13:02 -0400)
diff --git a/src/common/config_opts.h b/src/common/config_opts.h

index 85552efba63835f40be98d4cafcb432469552da8..d538460fb59a1eef9fbd941af60ff2770c1edf01 100644 (file)
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -308,6 +308,7 @@ OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for cl
  OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds)
  OPTION(mon_timecheck_skew_interval, OPT_FLOAT, 30.0) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds)
  OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
+OPTION(mon_health_max_detail, OPT_INT, 50) // max detailed pgs to report in health detail
  OPTION(mon_pg_min_inactive, OPT_U64, 1) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR.
  OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30)  // min # pgs per (in) osd before we warn the admin
  OPTION(mon_pg_warn_max_per_osd, OPT_INT, 300)  // max # pgs per (in) osd before we warn the admin
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc

index 31e249a22f27dda9ec8817bf242edd4a9edd1830..0dcb7083c21bcad4dd763f98af7864f0a364447b 100644 (file)
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -2320,8 +2320,10 @@ void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
  static void note_stuck_detail(
    int what,
    mempool::pgmap::unordered_map<pg_t,pg_stat_t>& stuck_pgs,
+  int max_detail,
    list<pair<health_status_t,string> > *detail)
  {
+  int n = 0;
    for (auto p = stuck_pgs.begin();
         p != stuck_pgs.end();
         ++p) {
@@ -2352,6 +2354,13 @@ static void note_stuck_detail(
      default:
        ceph_abort();
      }
+    if (--max_detail == 0) {
+      ostringstream ss;
+      ss << (stuck_pgs.size() - n) << " more pgs are also stuck " << whatname;
+      detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      break;
+    }
+    ++n;
      ss << "pg " << p->first << " is stuck " << whatname;
      if (since == utime_t()) {
        ss << " since forever";
@@ -2380,11 +2389,12 @@ static int _warn_slow_request_histogram(
      float ub = (float)(1 << i) / 1000.0;
      if (ub < cct->_conf->mon_osd_max_op_age)
        break;
-    ostringstream ss;
      if (h.h[i]) {
-      ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
-      if (detail)
+      if (detail) {
+       ostringstream ss;
+       ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
         detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
        sum += h.h[i];
      }
    }
@@ -2528,28 +2538,32 @@ void PGMap::get_health(
          get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
          note["stuck inactive"] = stuck_pgs.size();
          num_inactive_pgs += stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs,
+                         cct->_conf->mon_health_max_detail, detail);
          stuck_pgs.clear();
        }
  
        if (note.find("stuck unclean") != note.end()) {
          get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
          note["stuck unclean"] = stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs,
+                         cct->_conf->mon_health_max_detail,  detail);
          stuck_pgs.clear();
        }
  
        if (note.find("stuck undersized") != note.end()) {
          get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
          note["stuck undersized"] = stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs,
+                         cct->_conf->mon_health_max_detail,  detail);
          stuck_pgs.clear();
        }
  
        if (note.find("stuck degraded") != note.end()) {
          get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
          note["stuck degraded"] = stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs,
+                         cct->_conf->mon_health_max_detail,  detail);
          stuck_pgs.clear();
        }
  
@@ -2557,7 +2571,8 @@ void PGMap::get_health(
          get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
          note["stuck stale"] = stuck_pgs.size();
          num_inactive_pgs += stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs, detail);
+        note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs,
+                         cct->_conf->mon_health_max_detail,  detail);
        }
      }
    } else {
@@ -2584,6 +2599,8 @@ void PGMap::get_health(
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
      if (detail) {
+      int n = 0, more = 0;
+      int max = cct->_conf->mon_health_max_detail;
        for (auto p = pg_stat.begin();
             p != pg_stat.end();
             ++p) {
@@ -2602,6 +2619,13 @@ void PGMap::get_health(
                                 PG_STATE_BACKFILL |
                                 PG_STATE_BACKFILL_TOOFULL)) &&
             stuck_pgs.count(p->first) == 0) {
+         if (max > 0) {
+           --max;
+         } else {
+           ++more;
+           continue;
+         }
+         ++n;
           ostringstream ss;
           ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
           ss << ", acting " << p->second.acting;
@@ -2611,12 +2635,18 @@ void PGMap::get_health(
             const pg_pool_t *pi = osdmap.get_pg_pool(p->first.pool());
             if (pi && pi->min_size > 1) {
               ss << " (reducing pool " << osdmap.get_pool_name(p->first.pool())
-                << " min_size from " << (int)pi->min_size << " may help; search ceph.com/docs for 'incomplete')";
+                << " min_size from " << (int)pi->min_size
+                << " may help; search ceph.com/docs for 'incomplete')";
             }
           }
           detail->push_back(make_pair(HEALTH_WARN, ss.str()));
         }
        }
+      if (more) {
+       ostringstream ss;
+       ss << more << " more pgs are also unhealthy";
+       detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
      }
    }
author	Sage Weil <sage@redhat.com>
	Fri, 19 May 2017 15:48:15 +0000 (11:48 -0400)
committer	Sage Weil <sage@redhat.com>
	Fri, 2 Jun 2017 17:02:49 +0000 (13:02 -0400)
src/common/config_opts.h		patch | blob | history
src/mon/PGMap.cc		patch | blob | history