]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd/PeeringState: handle race condition of DeferBackfill event for Backfilling state
author: Naveen Naidu <naveennaidu479@gmail.com>
Thu, 29 May 2025 08:58:32 +0000 (14:28 +0530)
committer: Naveen Naidu <naveennaidu479@gmail.com>
Thu, 19 Jun 2025 11:19:18 +0000 (16:49 +0530)
Currently, when a PG in the `Backfilling` state receives a `DeferBackfill`
event, there are cases where that event can race with
`MOSDPGBackfill::OP_BACKFILL_FINISH` because the PG has already
finished backfilling. In such a case, the following
happens:
  1. PG state set to `PG_STATE_BACKFILL_WAIT`
  2. Suspend backfilling
  3. Discard the event

Notice that we do not reschedule backfill in the above steps; this can
lead to a situation where the PG gets stuck in the `backfill_wait` state
forever. This bug was introduced by the following commit:

`865839f`: osd/PeeringState: check racing with OP_BACKFILL_FINISH when defering
backfill
Link: https://github.com/ceph/ceph/pull/60185
This commit fixes that by making sure that, in race conditions such as
the one above, we only discard the event.

Fixes: https://tracker.ceph.com/issues/71010
Signed-off-by: Naveen Naidu <naveen.naidu@ibm.com>
(cherry picked from commit b2bd15b4485f367c3f599a3d233d6e506b3285d1)

src/osd/PeeringState.cc

index de2d275ed87a0d79d89b4ed1a03650226e7ba0a7..7bddf3460594a32f896ed772c4d1b8ca705a700d 100644 (file)
@@ -5411,13 +5411,12 @@ boost::statechart::result
 PeeringState::Backfilling::react(const DeferBackfill &c)
 {
   DECLARE_LOCALS;
-
-  psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
-  ps->state_set(PG_STATE_BACKFILL_WAIT);
-  ps->state_clear(PG_STATE_BACKFILLING);
-  suspend_backfill();
-
   if (ps->needs_backfill()) {
+    psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
+    ps->state_set(PG_STATE_BACKFILL_WAIT);
+    ps->state_clear(PG_STATE_BACKFILLING);
+    suspend_backfill();
+
     pl->schedule_event_after(
       std::make_shared<PGPeeringEvent>(
        ps->get_osdmap_epoch(),
@@ -5427,6 +5426,9 @@ PeeringState::Backfilling::react(const DeferBackfill &c)
     return transit<NotBackfilling>();
   } else {
     // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
+    psdout(10) << "discarding stale DeferBackfill event , pg does not need "
+                 "backfill anymore"
+              << dendl;
     return discard_event();
   }
 }