ceph_assert(!pg->is_backfilling());
if (!pg->get_peering_state().needs_recovery()) {
+ /* Clear PG_STATE_RECOVERING (and PG_STATE_FORCED_RECOVERY) before posting
+ * the recovery completion events, to prevent a race condition with
+ * DeferRecovery.
+ * See https://tracker.ceph.com/issues/73314.
+ *
+ * This matches the Classic OSD approach in PrimaryLogPG::start_recovery_ops().
+ * When DeferRecovery arrives, it checks !state_test(PG_STATE_RECOVERING)
+ * and discards itself if the flag is already clear, preventing the crash
+ * that occurs when AllReplicasRecovered arrives in the NotRecovering state. */
+ pg->get_peering_state().state_clear(PG_STATE_RECOVERING);
+ pg->get_peering_state().state_clear(PG_STATE_FORCED_RECOVERY);
+
if (pg->get_peering_state().needs_backfill()) {
+ DEBUGDPP("recovery done, queuing backfill", *pg->get_dpp());
request_backfill();
} else {
+ DEBUGDPP("recovery done, no backfill", *pg->get_dpp());
all_replicas_recovered();
}
- /* TODO: this is racy -- it's possible for a DeferRecovery
- * event to be processed between this call and when the
- * async RequestBackfill or AllReplicasRecovered events
- * are processed -- see https://tracker.ceph.com/issues/71267 */
pg->reset_pglog_based_recovery_op();
co_return seastar::stop_iteration::yes;
}