From 278ace1bc4abf7a91a7e0932091353b19da11222 Mon Sep 17 00:00:00 2001
From: Aishwarya Mathuria <amathuri@redhat.com>
Date: Tue, 14 Apr 2026 13:29:36 +0530
Subject: [PATCH] crimson/osd: fix race between AllReplicasRecovered and
 DeferRecovery
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Fixes a crash where AllReplicasRecovered event arrives at NotRecovering
state due to async event delivery race with DeferRecovery preemption.

The issue occurs when:
1. Recovery completes and AllReplicasRecovered is queued asynchronously
2. A higher priority operation (e.g., client I/O) triggers AsyncReserver
   to preempt recovery, posting DeferRecovery event
3. DeferRecovery is processed first, transitioning PG to NotRecovering
4. AllReplicasRecovered arrives at wrong state â crash with "bad state
   machine event" because NotRecovering doesn't handle it

The fix follows Classic OSD's approach in PrimaryLogPG::start_recovery_ops():
clear PG_STATE_RECOVERING before posting recovery completion events. This
makes the existing safety check in PeeringState::Recovering::react() work:
when DeferRecovery arrives and sees !state_test(PG_STATE_RECOVERING), it
discards itself, preventing the state transition that would cause the crash.

Fixes:https://tracker.ceph.com/issues/73314
Signed-off-by: Aishwarya Mathuria <amathuri@redhat.com>
---
 src/crimson/osd/pg_recovery.cc | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
index 0ff4abc8518f..cac3bfc53df2 100644
--- a/src/crimson/osd/pg_recovery.cc
+++ b/src/crimson/osd/pg_recovery.cc
@@ -83,15 +83,24 @@ PGRecovery::start_recovery_ops(
   ceph_assert(!pg->is_backfilling());
 
   if (!pg->get_peering_state().needs_recovery()) {
+    /* Clear PG_STATE_RECOVERING before posting recovery completion events
+     * to prevent race condition with DeferRecovery. 
+     * See https://tracker.ceph.com/issues/73314.
+     *
+     * This matches the Classic OSD approach in PrimaryLogPG::start_recovery_ops().
+     * When DeferRecovery arrives, it checks !state_test(PG_STATE_RECOVERING)
+     * and discards itself if the flag is already clear, preventing the crash
+     * that occurs when AllReplicasRecovered arrives at NotRecovering state. */
+    pg->get_peering_state().state_clear(PG_STATE_RECOVERING);
+    pg->get_peering_state().state_clear(PG_STATE_FORCED_RECOVERY);
+
     if (pg->get_peering_state().needs_backfill()) {
+      DEBUGDPP("recovery done, queuing backfill", *pg->get_dpp());
       request_backfill();
     } else {
+      DEBUGDPP("recovery done, no backfill", *pg->get_dpp());
       all_replicas_recovered();
     }
-    /* TODO: this is racy -- it's possible for a DeferRecovery
-     * event to be processed between this call and when the
-     * async RequestBackfill or AllReplicasRecovered events
-     * are processed --  see https://tracker.ceph.com/issues/71267 */
     pg->reset_pglog_based_recovery_op();
     co_return seastar::stop_iteration::yes;
   }
-- 
2.47.3