]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mon: reduce CPU and memory manager pressure of pg health check
authorPiotr Dałek <piotr.dalek@ts.fujitsu.com>
Tue, 2 Feb 2016 08:47:51 +0000 (09:47 +0100)
committerPiotr Dałek <piotr.dalek@ts.fujitsu.com>
Tue, 2 Feb 2016 14:09:27 +0000 (15:09 +0100)
When we don't need details about which PGs are stuck, we can just iterate
once over the pg_stat map and increment the appropriate counters; there is no
need to repeatedly spam the memory manager by populating the stuck_pgs map
(we only use its size). Even if we need details, first check whether any PGs
are actually in any of the requested states, so that in most cases we won't
needlessly iterate five times over the whole PG stat map.

This at least halves the time needed by pg health gathering (~3ms,
down from ~6-7ms) on a cluster with 31832 PGs and an Intel Xeon E5-2640
CPU.

Signed-off-by: Piotr Dałek <piotr.dalek@ts.fujitsu.com>
src/mon/PGMap.cc
src/mon/PGMap.h
src/mon/PGMonitor.cc

index ab8f8a9e8b73beb057d5d135fb8ca2a34aaf1df7..74888530956d855afc72d0e36457285fed43e1c4 100644 (file)
@@ -914,7 +914,7 @@ void PGMap::dump_osd_sum_stats(ostream& ss) const
      << std::endl;
 }
 
-void PGMap::get_stuck_stats(int types, utime_t cutoff,
+void PGMap::get_stuck_stats(int types, const utime_t cutoff,
                             ceph::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
 {
   assert(types != 0);
@@ -955,6 +955,54 @@ void PGMap::get_stuck_stats(int types, utime_t cutoff,
   }
 }
 
+bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
+{
+  int inactive = 0;
+  int unclean = 0;
+  int degraded = 0;
+  int undersized = 0;
+  int stale = 0;
+
+  for (ceph::unordered_map<pg_t, pg_stat_t>::const_iterator i = pg_stat.begin();
+       i != pg_stat.end();
+       ++i) {
+
+    if (! (i->second.state & PG_STATE_ACTIVE)) {
+      if (i->second.last_active < cutoff)
+        ++inactive;
+    } else if (! (i->second.state & PG_STATE_CLEAN)) {
+      if (i->second.last_clean < cutoff)
+        ++unclean;
+    } else if (i->second.state & PG_STATE_DEGRADED) {
+      if (i->second.last_undegraded < cutoff)
+        ++degraded;
+    } else if (i->second.state & PG_STATE_UNDERSIZED) {
+      if (i->second.last_fullsized < cutoff)
+        ++undersized;
+    } else if (i->second.state & PG_STATE_STALE) {
+      if (i->second.last_unstale < cutoff)
+        ++stale;
+    }
+  }
+  
+  if (inactive)
+    note["stuck inactive"] = inactive;
+  
+  if (unclean)
+    note["stuck unclean"] = unclean;
+  
+  if (undersized)
+    note["stuck undersized"] = undersized;
+  
+  if (degraded)
+    note["stuck degraded"] = degraded;
+  
+  if (stale)
+    note["stuck stale"] = stale; 
+  
+  return inactive || unclean || undersized || degraded || stale;
+}
+
 void PGMap::dump_stuck(Formatter *f, int types, utime_t cutoff) const
 {
   ceph::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
index d2b9e8af1db13cabca7eba18732f38a5823acdc1..8c2b3cadea2dff106eb0fb2cb6cb1dcf2245db81 100644 (file)
@@ -282,8 +282,9 @@ public:
   void dump_pg_stats_plain(ostream& ss,
                           const ceph::unordered_map<pg_t, pg_stat_t>& pg_stats,
                           bool brief) const;
-  void get_stuck_stats(int types, utime_t cutoff,
+  void get_stuck_stats(int types, const utime_t cutoff,
                       ceph::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const;
+  bool get_stuck_counts(const utime_t cutoff, map<string, int>& note) const;
   void dump_stuck(Formatter *f, int types, utime_t cutoff) const;
   void dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const;
 
index f9c61b7e9c27c3ffa5048f85772e0f6d87f1f5d9..6b78d044a6433f68b966931d7e2f94f847f8b797 100644 (file)
@@ -2145,46 +2145,60 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
   utime_t now(ceph_clock_now(g_ceph_context));
   utime_t cutoff = now - utime_t(g_conf->mon_pg_stuck_threshold, 0);
   uint64_t num_inactive_pgs = 0;
+  
+  if (detail) {
+    
+    // we need to collect details of stuck pgs, first do a quick check
+    // whether this will yield any results
+    if (pg_map.get_stuck_counts(cutoff, note)) {
+      
+      // there are stuck pgs. gather details for specified statuses
+      // only if we know that there are pgs stuck in that status
+      
+      if (note.find("stuck inactive") != note.end()) {
+        pg_map.get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
+        note["stuck inactive"] = stuck_pgs.size();
+        num_inactive_pgs += stuck_pgs.size();
+        note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs, detail);
+        stuck_pgs.clear();
+      }
 
-  pg_map.get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
-  if (!stuck_pgs.empty()) {
-    note["stuck inactive"] = stuck_pgs.size();
-    num_inactive_pgs += stuck_pgs.size();
-    if (detail)
-      note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs, detail);
-  }
-  stuck_pgs.clear();
-
-  pg_map.get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
-  if (!stuck_pgs.empty()) {
-    note["stuck unclean"] = stuck_pgs.size();
-    if (detail)
-      note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs, detail);
-  }
-  stuck_pgs.clear();
+      if (note.find("stuck unclean") != note.end()) {
+        pg_map.get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
+        note["stuck unclean"] = stuck_pgs.size();
+        note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs, detail);
+        stuck_pgs.clear();
+      }
 
-  pg_map.get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
-  if (!stuck_pgs.empty()) {
-    note["stuck undersized"] = stuck_pgs.size();
-    if (detail)
-      note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs, detail);
-  }
-  stuck_pgs.clear();
+      if (note.find("stuck undersized") != note.end()) {
+        pg_map.get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
+        note["stuck undersized"] = stuck_pgs.size();
+        note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs, detail);
+        stuck_pgs.clear();
+      }
 
-  pg_map.get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
-  if (!stuck_pgs.empty()) {
-    note["stuck degraded"] = stuck_pgs.size();
-    if (detail)
-      note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs, detail);
-  }
-  stuck_pgs.clear();
+      if (note.find("stuck degraded") != note.end()) {
+        pg_map.get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
+        note["stuck degraded"] = stuck_pgs.size();
+        note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs, detail);
+        stuck_pgs.clear();
+      }
 
-  pg_map.get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
-  if (!stuck_pgs.empty()) {
-    note["stuck stale"] = stuck_pgs.size();
-    num_inactive_pgs += stuck_pgs.size();
-    if (detail)
-      note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs, detail);
+      if (note.find("stuck stale") != note.end()) {
+        pg_map.get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
+        note["stuck stale"] = stuck_pgs.size();
+        num_inactive_pgs += stuck_pgs.size();
+        note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs, detail);
+      }
+    }
+  } else {
+    pg_map.get_stuck_counts(cutoff, note);
+    map<string,int>::const_iterator p = note.find("stuck inactive");
+    if (p != note.end()) 
+      num_inactive_pgs += p->second;
+    p = note.find("stuck stale");
+    if (p != note.end()) 
+      num_inactive_pgs += p->second;
   }
 
   if (g_conf->mon_pg_min_inactive > 0 && num_inactive_pgs >= g_conf->mon_pg_min_inactive) {