From: Sage Weil Date: Fri, 19 May 2017 15:48:15 +0000 (-0400) Subject: mon/PGMap: cap health detail messages at 50 (configurable) X-Git-Tag: ses5-milestone6~8^2~19^2~50 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=229e56c7bcf50a53ce3c0318aac583c574e39aa9;p=ceph.git mon/PGMap: cap health detail messages at 50 (configurable) There are two cases where we spew health detail warnings for potentially every pg. Cap those detail messages at 50 and, if we exceed that, include a message saying how many more there are. This avoids huge lists of detail messages going from the mgr to mon and also makes life better for users of the health detail api. Signed-off-by: Sage Weil --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 85552efba638..d538460fb59a 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -308,6 +308,7 @@ OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for cl OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds) OPTION(mon_timecheck_skew_interval, OPT_FLOAT, 30.0) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds) OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info) +OPTION(mon_health_max_detail, OPT_INT, 50) // max detailed pgs to report in health detail OPTION(mon_pg_min_inactive, OPT_U64, 1) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR. OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30) // min # pgs per (in) osd before we warn the admin OPTION(mon_pg_warn_max_per_osd, OPT_INT, 300) // max # pgs per (in) osd before we warn the admin diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 31e249a22f27..0dcb7083c21b 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -2320,8 +2320,10 @@ void PGMap::dump_filtered_pg_stats(ostream& ss, set& pgs) const static void note_stuck_detail( int what, mempool::pgmap::unordered_map& stuck_pgs, + int max_detail, list > *detail) { + int n = 0; for (auto p = stuck_pgs.begin(); p != stuck_pgs.end(); ++p) { @@ -2352,6 +2354,13 @@ static void note_stuck_detail( default: ceph_abort(); } + if (--max_detail == 0) { + ostringstream ss; + ss << (stuck_pgs.size() - n) << " more pgs are also stuck " << whatname; + detail->push_back(make_pair(HEALTH_WARN, ss.str())); + break; + } + ++n; ss << "pg " << p->first << " is stuck " << whatname; if (since == utime_t()) { ss << " since forever"; @@ -2380,11 +2389,12 @@ static int _warn_slow_request_histogram( float ub = (float)(1 << i) / 1000.0; if (ub < cct->_conf->mon_osd_max_op_age) break; - ostringstream ss; if (h.h[i]) { - ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix; - if (detail) + if (detail) { + ostringstream ss; + ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix; detail->push_back(make_pair(HEALTH_WARN, ss.str())); + } sum += h.h[i]; } } @@ -2528,28 +2538,32 @@ void PGMap::get_health( get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs); note["stuck inactive"] = stuck_pgs.size(); num_inactive_pgs += stuck_pgs.size(); - note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs, detail); + note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs, + cct->_conf->mon_health_max_detail, detail); stuck_pgs.clear(); } if (note.find("stuck unclean") != note.end()) { get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs); note["stuck unclean"] = stuck_pgs.size(); - note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs, detail); + note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs, + cct->_conf->mon_health_max_detail, detail); stuck_pgs.clear(); } if (note.find("stuck undersized") != note.end()) { get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs); note["stuck undersized"] = stuck_pgs.size(); - note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs, detail); + note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs, + cct->_conf->mon_health_max_detail, detail); stuck_pgs.clear(); } if (note.find("stuck degraded") != note.end()) { get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs); note["stuck degraded"] = stuck_pgs.size(); - note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs, detail); + note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs, + cct->_conf->mon_health_max_detail, detail); stuck_pgs.clear(); } @@ -2557,7 +2571,8 @@ void PGMap::get_health( get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs); note["stuck stale"] = stuck_pgs.size(); num_inactive_pgs += stuck_pgs.size(); - note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs, detail); + note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs, + cct->_conf->mon_health_max_detail, detail); } } } else { @@ -2584,6 +2599,8 @@ void PGMap::get_health( summary.push_back(make_pair(HEALTH_WARN, ss.str())); } if (detail) { + int n = 0, more = 0; + int max = cct->_conf->mon_health_max_detail; for (auto p = pg_stat.begin(); p != pg_stat.end(); ++p) { @@ -2602,6 +2619,13 @@ void PGMap::get_health( PG_STATE_BACKFILL | PG_STATE_BACKFILL_TOOFULL)) && stuck_pgs.count(p->first) == 0) { + if (max > 0) { + --max; + } else { + ++more; + continue; + } + ++n; ostringstream ss; ss << "pg " << p->first << " is " << pg_state_string(p->second.state); ss << ", acting " << p->second.acting; @@ -2611,12 +2635,18 @@ void PGMap::get_health( const pg_pool_t *pi = osdmap.get_pg_pool(p->first.pool()); if (pi && pi->min_size > 1) { ss << " (reducing pool " << osdmap.get_pool_name(p->first.pool()) - << " min_size from " << (int)pi->min_size << " may help; search ceph.com/docs for 'incomplete')"; + << " min_size from " << (int)pi->min_size + << " may help; search ceph.com/docs for 'incomplete')"; } } detail->push_back(make_pair(HEALTH_WARN, ss.str())); } } + if (more) { + ostringstream ss; + ss << more << " more pgs are also unhealthy"; + detail->push_back(make_pair(HEALTH_WARN, ss.str())); + } } }