]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/osd: fix race between AllReplicasRecovered and DeferRecovery 68383/head
authorAishwarya Mathuria <amathuri@redhat.com>
Tue, 14 Apr 2026 07:59:36 +0000 (13:29 +0530)
committerAishwarya Mathuria <amathuri@redhat.com>
Wed, 15 Apr 2026 10:25:19 +0000 (10:25 +0000)
Fixes a crash where AllReplicasRecovered event arrives at NotRecovering
state due to async event delivery race with DeferRecovery preemption.

The issue occurs when:
1. Recovery completes and AllReplicasRecovered is queued asynchronously
2. A higher priority operation (e.g., client I/O) triggers AsyncReserver
   to preempt recovery, posting DeferRecovery event
3. DeferRecovery is processed first, transitioning PG to NotRecovering
4. AllReplicasRecovered arrives at wrong state → crash with "bad state
   machine event" because NotRecovering doesn't handle it

The fix follows Classic OSD's approach in PrimaryLogPG::start_recovery_ops():
clear PG_STATE_RECOVERING before posting recovery completion events. This
makes the existing safety check in PeeringState::Recovering::react() work:
when DeferRecovery arrives and sees !state_test(PG_STATE_RECOVERING), it
discards itself, preventing the state transition that would cause the crash.

Fixes:https://tracker.ceph.com/issues/73314
Signed-off-by: Aishwarya Mathuria <amathuri@redhat.com>
src/crimson/osd/pg_recovery.cc

index 0ff4abc8518f00055af3cc3867243ee25d2e1b76..cac3bfc53df2c999043e6c6ab09e462a9199fe4f 100644 (file)
@@ -83,15 +83,24 @@ PGRecovery::start_recovery_ops(
   ceph_assert(!pg->is_backfilling());
 
   if (!pg->get_peering_state().needs_recovery()) {
+    /* Clear PG_STATE_RECOVERING before posting recovery completion events
+     * to prevent race condition with DeferRecovery. 
+     * See https://tracker.ceph.com/issues/73314.
+     *
+     * This matches the Classic OSD approach in PrimaryLogPG::start_recovery_ops().
+     * When DeferRecovery arrives, it checks !state_test(PG_STATE_RECOVERING)
+     * and discards itself if the flag is already clear, preventing the crash
+     * that occurs when AllReplicasRecovered arrives at NotRecovering state. */
+    pg->get_peering_state().state_clear(PG_STATE_RECOVERING);
+    pg->get_peering_state().state_clear(PG_STATE_FORCED_RECOVERY);
+
     if (pg->get_peering_state().needs_backfill()) {
+      DEBUGDPP("recovery done, queuing backfill", *pg->get_dpp());
       request_backfill();
     } else {
+      DEBUGDPP("recovery done, no backfill", *pg->get_dpp());
       all_replicas_recovered();
     }
-    /* TODO: this is racy -- it's possible for a DeferRecovery
-     * event to be processed between this call and when the
-     * async RequestBackfill or AllReplicasRecovered events
-     * are processed --  see https://tracker.ceph.com/issues/71267 */
     pg->reset_pglog_based_recovery_op();
     co_return seastar::stop_iteration::yes;
   }