From: shreyanshjain7174 Date: Thu, 8 Dec 2022 13:40:55 +0000 (-0600) Subject: mon: stuck peering since warning is misleading X-Git-Tag: v17.2.8~152^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d7202a0d7f389db0284f42a7889712b2d744d3df;p=ceph.git mon: stuck peering since warning is misleading When OSDs restart or are manually marked down, it is common to see a HEALTH_WARN claiming that PGs have been stuck peering for a while, even though they were active. The warning should be issued only if they really are stuck peering for longer than 60s. Fixes: https://tracker.ceph.com/issues/51688 Signed-off-by: shreyanshjain7174 (cherry picked from commit 8202e722796824e6597f44ffb94e80dfb396bd96) --- diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 5f4e25ac1bfe..0712e0881f2a 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -1941,6 +1941,10 @@ void PGMap::get_stuck_stats( val = i->second.last_unstale; } + if ((types & STUCK_PEERING) && (i->second.state & PG_STATE_PEERING)) { + if (i->second.last_peered < val) + val = i->second.last_peered; + } // val is now the earliest any of the requested stuck states began if (val < cutoff) { stuck_pgs[i->first] = i->second; @@ -1991,6 +1995,8 @@ int PGMap::dump_stuck_pg_stats( stuck_types |= PGMap::STUCK_DEGRADED; else if (*i == "stale") stuck_types |= PGMap::STUCK_STALE; + else if (*i == "peering") + stuck_types |= PGMap::STUCK_PEERING; else { ds << "Unknown type: " << *i << std::endl; return -EINVAL; @@ -3842,6 +3848,33 @@ static void _try_mark_pg_stale( newstat->state |= PG_STATE_STALE; newstat->last_unstale = ceph_clock_now(); } + + if ((cur.state & PG_STATE_PEERING) == 0 && + cur.acting_primary != -1 && + osdmap.is_down(cur.acting_primary)) { + pg_stat_t *newstat; + auto q = pending_inc->pg_stat_updates.find(pgid); + if (q != pending_inc->pg_stat_updates.end()) { + if ((q->second.acting_primary == cur.acting_primary) || + ((q->second.state & PG_STATE_PEERING) == 0 && + q->second.acting_primary != 
-1 && + osdmap.is_down(q->second.acting_primary))) { + newstat = &q->second; + } else { + // pending update is no longer down or already stale + return; + } + } else { + newstat = &pending_inc->pg_stat_updates[pgid]; + *newstat = cur; + } + dout(10) << __func__ << " marking pg " << pgid + << " stale (acting_primary " << newstat->acting_primary + << ")" << dendl; + newstat->state |= PG_STATE_PEERING; + newstat->last_peered = ceph_clock_now(); + } + } void PGMapUpdater::check_down_pgs( diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index a5e75ed58979..dbbfce9d52aa 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -359,7 +359,8 @@ public: static const int STUCK_UNDERSIZED = (1<<2); static const int STUCK_DEGRADED = (1<<3); static const int STUCK_STALE = (1<<4); - + static const int STUCK_PEERING = (1<<5); + PGMap() : version(0), last_osdmap_epoch(0), last_pg_scan(0)