]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mon: stuck peering since warning is misleading
author shreyanshjain7174 <ssanchet@redhat.com>
Thu, 8 Dec 2022 13:40:55 +0000 (07:40 -0600)
committer shreyanshjain7174 <ssanchet@redhat.com>
Thu, 8 Dec 2022 13:40:55 +0000 (07:40 -0600)
When OSDs restart or are manually marked down, it is common to see a HEALTH_WARN claiming that PGs have been stuck peering for a while, even though they were active. The warning should be issued only if they really have been stuck peering for longer than 60s.

Fixes: https://tracker.ceph.com/issues/51688
Signed-off-by: shreyanshjain7174 <ssanchet@redhat.com>
src/mon/PGMap.cc
src/mon/PGMap.h

index 190b93bb82405828db9edd07c436995056cccdfb..33613dd4a6a4cdd0dc5030d9675047480d8734c0 100644 (file)
@@ -1939,6 +1939,10 @@ void PGMap::get_stuck_stats(
        val = i->second.last_unstale;
     }
 
+    if ((types & STUCK_PEERING) && (i->second.state & PG_STATE_PEERING)) {
+      if (i->second.last_peered < val)
+       val = i->second.last_peered;
+    }
     // val is now the earliest any of the requested stuck states began
     if (val < cutoff) {
       stuck_pgs[i->first] = i->second;
@@ -1989,6 +1993,8 @@ int PGMap::dump_stuck_pg_stats(
       stuck_types |= PGMap::STUCK_DEGRADED;
     else if (*i == "stale")
       stuck_types |= PGMap::STUCK_STALE;
+    else if (*i == "peering")
+      stuck_types |= PGMap::STUCK_PEERING;
     else {
       ds << "Unknown type: " << *i << std::endl;
       return -EINVAL;
@@ -3850,6 +3856,33 @@ static void _try_mark_pg_stale(
     newstat->state |= PG_STATE_STALE;
     newstat->last_unstale = ceph_clock_now();
   }
+
+    if ((cur.state & PG_STATE_PEERING) == 0 &&
+      cur.acting_primary != -1 &&
+      osdmap.is_down(cur.acting_primary)) {
+    pg_stat_t *newstat;
+    auto q = pending_inc->pg_stat_updates.find(pgid);
+    if (q != pending_inc->pg_stat_updates.end()) {
+      if ((q->second.acting_primary == cur.acting_primary) ||
+         ((q->second.state & PG_STATE_PEERING) == 0 &&
+          q->second.acting_primary != -1 &&
+          osdmap.is_down(q->second.acting_primary))) {
+       newstat = &q->second;
+      } else {
+       // pending update is no longer down or already stale
+       return;
+      }
+    } else {
+      newstat = &pending_inc->pg_stat_updates[pgid];
+      *newstat = cur;
+    }
+    dout(10) << __func__ << " marking pg " << pgid
+            << " stale (acting_primary " << newstat->acting_primary
+            << ")" << dendl;
+    newstat->state |= PG_STATE_PEERING;
+    newstat->last_peered = ceph_clock_now();
+  }
+
 }
 
 void PGMapUpdater::check_down_pgs(
index a5e75ed58979b9462a296d319cd8be08bf57617a..dbbfce9d52aa693deb098cb68561c0d2dd7061fa 100644 (file)
@@ -359,7 +359,8 @@ public:
   static const int STUCK_UNDERSIZED = (1<<2);
   static const int STUCK_DEGRADED = (1<<3);
   static const int STUCK_STALE = (1<<4);
-  
+  static const int STUCK_PEERING = (1<<5);
+
   PGMap()
     : version(0),
       last_osdmap_epoch(0), last_pg_scan(0)