crimson/osd: fix race between AllReplicasRecovered and DeferRecovery

author Aishwarya Mathuria <amathuri@redhat.com>

Tue, 14 Apr 2026 07:59:36 +0000 (13:29 +0530)

committer Aishwarya Mathuria <amathuri@redhat.com>

Wed, 15 Apr 2026 10:25:19 +0000 (10:25 +0000)
author Aishwarya Mathuria <amathuri@redhat.com>
Tue, 14 Apr 2026 07:59:36 +0000 (13:29 +0530)
committer Aishwarya Mathuria <amathuri@redhat.com>
Wed, 15 Apr 2026 10:25:19 +0000 (10:25 +0000)
diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc

index 0ff4abc8518f00055af3cc3867243ee25d2e1b76..cac3bfc53df2c999043e6c6ab09e462a9199fe4f 100644 (file)
--- a/src/crimson/osd/pg_recovery.cc
+++ b/src/crimson/osd/pg_recovery.cc
@@ -83,15 +83,24 @@ PGRecovery::start_recovery_ops(
    ceph_assert(!pg->is_backfilling());
  
    if (!pg->get_peering_state().needs_recovery()) {
+    /* Clear PG_STATE_RECOVERING before posting recovery completion events
+     * to prevent race condition with DeferRecovery. 
+     * See https://tracker.ceph.com/issues/73314.
+     *
+     * This matches the Classic OSD approach in PrimaryLogPG::start_recovery_ops().
+     * When DeferRecovery arrives, it checks !state_test(PG_STATE_RECOVERING)
+     * and discards itself if the flag is already clear, preventing the crash
+     * that occurs when AllReplicasRecovered arrives at NotRecovering state. */
+    pg->get_peering_state().state_clear(PG_STATE_RECOVERING);
+    pg->get_peering_state().state_clear(PG_STATE_FORCED_RECOVERY);
+
      if (pg->get_peering_state().needs_backfill()) {
+      DEBUGDPP("recovery done, queuing backfill", *pg->get_dpp());
        request_backfill();
      } else {
+      DEBUGDPP("recovery done, no backfill", *pg->get_dpp());
        all_replicas_recovered();
      }
-    /* TODO: this is racy -- it's possible for a DeferRecovery
-     * event to be processed between this call and when the
-     * async RequestBackfill or AllReplicasRecovered events
-     * are processed --  see https://tracker.ceph.com/issues/71267 */
      pg->reset_pglog_based_recovery_op();
      co_return seastar::stop_iteration::yes;
    }
author	Aishwarya Mathuria <amathuri@redhat.com>
	Tue, 14 Apr 2026 07:59:36 +0000 (13:29 +0530)
committer	Aishwarya Mathuria <amathuri@redhat.com>
	Wed, 15 Apr 2026 10:25:19 +0000 (10:25 +0000)