From 8c19227c817420f6a736a40bb101349e36e126e2 Mon Sep 17 00:00:00 2001 From: Naveen Naidu Date: Thu, 29 May 2025 14:28:32 +0530 Subject: [PATCH] osd/PeeringState: handle race condition of DeferBackfill event for Backfilling state Currently, when a PG in the `Backfilling` state receives a `DeferBackfill` event, there are cases when that event could race with `MOSDPGBackfill::OP_BACKFILL_FINISH` because the PG has already finished backfilling. In such a case, the following happens: 1. PG state set to `PG_STATE_BACKFILL_WAIT` 2. Suspend backfilling 3. Discard the event Notice that we do not reschedule backfill in the above steps; this can lead to a situation where the PG gets stuck in a `backfill_wait` state forever. This bug was introduced by the following commit: `865839f`: osd/PeeringState: check racing with OP_BACKFILL_FINISH when defering backfill PR Link: https://github.com/ceph/ceph/pull/60185 This commit fixes that by making sure that, in race conditions such as the above, we only discard the event. 
Fixes: https://tracker.ceph.com/issues/71010 Signed-off-by: Naveen Naidu (cherry picked from commit b2bd15b4485f367c3f599a3d233d6e506b3285d1) --- src/osd/PeeringState.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index de2d275ed87a..7bddf3460594 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -5411,13 +5411,12 @@ boost::statechart::result PeeringState::Backfilling::react(const DeferBackfill &c) { DECLARE_LOCALS; - - psdout(10) << "defer backfill, retry delay " << c.delay << dendl; - ps->state_set(PG_STATE_BACKFILL_WAIT); - ps->state_clear(PG_STATE_BACKFILLING); - suspend_backfill(); - if (ps->needs_backfill()) { + psdout(10) << "defer backfill, retry delay " << c.delay << dendl; + ps->state_set(PG_STATE_BACKFILL_WAIT); + ps->state_clear(PG_STATE_BACKFILLING); + suspend_backfill(); + pl->schedule_event_after( std::make_shared( ps->get_osdmap_epoch(), @@ -5427,6 +5426,9 @@ PeeringState::Backfilling::react(const DeferBackfill &c) return transit(); } else { // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore + psdout(10) << "discarding stale DeferBackfill event , pg does not need " + "backfill anymore" + << dendl; return discard_event(); } } -- 2.47.3