From 9c95bb0454dbad781b6655f4c7116620005101f7 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Piotr=20Da=C5=82ek?=
Date: Tue, 2 Feb 2016 09:47:51 +0100
Subject: [PATCH] mon: reduce CPU and memory manager pressure of pg health check
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

When we don't need details about which PGs are stuck, we can just
iterate once over pg_stat map and increase appropriate counters,
no need to repeatedly spam memory manager with stuck_pgs map
population (we'll only use its size). Even if we need details,
first check if we actually have any PGs in any of requested states,
so in most cases we won't iterate needlessly five times over all
PG stat map.

This at least halves the time needed by pg health gathering (~3ms,
down from ~6-7ms) on cluster with 31832 PGs and Intel Xeon E5-2640
CPU.

Signed-off-by: Piotr Dałek
---
Reviewer note (not part of the commit message): get_stuck_counts() now
tests every stuck state independently instead of through one else-if
chain; hunk sizes and the diffstat below were updated to match.

 src/mon/PGMap.cc     | 59 ++++++++++++++++++++++++++++++++-
 src/mon/PGMap.h      |  3 +-
 src/mon/PGMonitor.cc | 86 +++++++++++++++++++++++++---------------------
 3 files changed, 110 insertions(+), 38 deletions(-)

diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index ab8f8a9e8b73..74888530956d 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -914,7 +914,7 @@ void PGMap::dump_osd_sum_stats(ostream& ss) const
      << std::endl;
 }
 
-void PGMap::get_stuck_stats(int types, utime_t cutoff,
+void PGMap::get_stuck_stats(int types, const utime_t cutoff,
                             ceph::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
 {
   assert(types != 0);
@@ -955,6 +955,63 @@ void PGMap::get_stuck_stats(int types, utime_t cutoff,
   }
 }
 
+// Count PGs stuck in each state (condition held since before @p cutoff)
+// in a single pass over pg_stat, without copying any pg_stat entries.
+//
+// Every non-zero count is stored in @p note under "stuck inactive",
+// "stuck unclean", "stuck undersized", "stuck degraded" or "stuck stale".
+// Returns true if at least one PG is stuck in at least one state.
+bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
+{
+  int inactive = 0;
+  int unclean = 0;
+  int degraded = 0;
+  int undersized = 0;
+  int stale = 0;
+
+  for (ceph::unordered_map<pg_t, pg_stat_t>::const_iterator i = pg_stat.begin();
+       i != pg_stat.end();
+       ++i) {
+    // Test every state on its own: a PG can be stuck in several states at
+    // once, and get_health() fetches details for each state with a separate
+    // get_stuck_stats() call, so the counts here must not be mutually
+    // exclusive or the detailed and non-detailed reports would disagree.
+    if (!(i->second.state & PG_STATE_ACTIVE) &&
+        i->second.last_active < cutoff)
+      ++inactive;
+    if (!(i->second.state & PG_STATE_CLEAN) &&
+        i->second.last_clean < cutoff)
+      ++unclean;
+    if ((i->second.state & PG_STATE_DEGRADED) &&
+        i->second.last_undegraded < cutoff)
+      ++degraded;
+    if ((i->second.state & PG_STATE_UNDERSIZED) &&
+        i->second.last_fullsized < cutoff)
+      ++undersized;
+    if ((i->second.state & PG_STATE_STALE) &&
+        i->second.last_unstale < cutoff)
+      ++stale;
+  }
+
+  // Only record categories that actually have stuck PGs.
+  if (inactive)
+    note["stuck inactive"] = inactive;
+
+  if (unclean)
+    note["stuck unclean"] = unclean;
+
+  if (undersized)
+    note["stuck undersized"] = undersized;
+
+  if (degraded)
+    note["stuck degraded"] = degraded;
+
+  if (stale)
+    note["stuck stale"] = stale;
+
+  return inactive || unclean || undersized || degraded || stale;
+}
+
 void PGMap::dump_stuck(Formatter *f, int types, utime_t cutoff) const
 {
   ceph::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
index d2b9e8af1db1..8c2b3cadea2d 100644
--- a/src/mon/PGMap.h
+++ b/src/mon/PGMap.h
@@ -282,8 +282,9 @@ public:
   void dump_pg_stats_plain(ostream& ss,
                            const ceph::unordered_map<pg_t, pg_stat_t>& pg_stats,
                            bool brief) const;
-  void get_stuck_stats(int types, utime_t cutoff,
+  void get_stuck_stats(int types, const utime_t cutoff,
                        ceph::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const;
+  bool get_stuck_counts(const utime_t cutoff, map<string, int>& note) const;
   void dump_stuck(Formatter *f, int types, utime_t cutoff) const;
   void dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const;
 
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index f9c61b7e9c27..6b78d044a643 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -2145,46 +2145,60 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
   utime_t now(ceph_clock_now(g_ceph_context));
   utime_t cutoff = now - utime_t(g_conf->mon_pg_stuck_threshold, 0);
   uint64_t num_inactive_pgs = 0;
+
+  if (detail) {
+
+    // we need to collect details of stuck pgs, first do a quick check
+    // whether this will yield any results
+    if (pg_map.get_stuck_counts(cutoff, note)) {
+
+      // there are stuck pgs. gather details for specified statuses
+      // only if we know that there are pgs stuck in that status
+
+      if (note.find("stuck inactive") != note.end()) {
+        pg_map.get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
+        note["stuck inactive"] = stuck_pgs.size();
+        num_inactive_pgs += stuck_pgs.size();
+        note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs, detail);
+        stuck_pgs.clear();
+      }
 
-  pg_map.get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
-  if (!stuck_pgs.empty()) {
-    note["stuck inactive"] = stuck_pgs.size();
-    num_inactive_pgs += stuck_pgs.size();
-    if (detail)
-      note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs, detail);
-  }
-  stuck_pgs.clear();
-
-  pg_map.get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
-  if (!stuck_pgs.empty()) {
-    note["stuck unclean"] = stuck_pgs.size();
-    if (detail)
-      note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs, detail);
-  }
-  stuck_pgs.clear();
+      if (note.find("stuck unclean") != note.end()) {
+        pg_map.get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
+        note["stuck unclean"] = stuck_pgs.size();
+        note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs, detail);
+        stuck_pgs.clear();
+      }
 
-  pg_map.get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
-  if (!stuck_pgs.empty()) {
-    note["stuck undersized"] = stuck_pgs.size();
-    if (detail)
-      note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs, detail);
-  }
-  stuck_pgs.clear();
+      if (note.find("stuck undersized") != note.end()) {
+        pg_map.get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
+        note["stuck undersized"] = stuck_pgs.size();
+        note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs, detail);
+        stuck_pgs.clear();
+      }
 
-  pg_map.get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
-  if (!stuck_pgs.empty()) {
-    note["stuck degraded"] = stuck_pgs.size();
-    if (detail)
-      note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs, detail);
-  }
-  stuck_pgs.clear();
+      if (note.find("stuck degraded") != note.end()) {
+        pg_map.get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
+        note["stuck degraded"] = stuck_pgs.size();
+        note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs, detail);
+        stuck_pgs.clear();
+      }
 
-  pg_map.get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
-  if (!stuck_pgs.empty()) {
-    note["stuck stale"] = stuck_pgs.size();
-    num_inactive_pgs += stuck_pgs.size();
-    if (detail)
-      note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs, detail);
+      if (note.find("stuck stale") != note.end()) {
+        pg_map.get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
+        note["stuck stale"] = stuck_pgs.size();
+        num_inactive_pgs += stuck_pgs.size();
+        note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs, detail);
+      }
+    }
+  } else {
+    pg_map.get_stuck_counts(cutoff, note);
+    map<string, int>::const_iterator p = note.find("stuck inactive");
+    if (p != note.end())
+      num_inactive_pgs += p->second;
+    p = note.find("stuck stale");
+    if (p != note.end())
+      num_inactive_pgs += p->second;
   }
 
   if (g_conf->mon_pg_min_inactive > 0 && num_inactive_pgs >= g_conf->mon_pg_min_inactive) {
-- 
2.47.3