From 4b7031586683009cb33e56e5a97b0deb34028aa3 Mon Sep 17 00:00:00 2001
From: Xuehan Xu
Date: Wed, 10 Apr 2024 14:51:47 +0800
Subject: [PATCH] crimson/osd/pg_recovery: backoff if the recovery/backfill is deferred

Fixes: https://tracker.ceph.com/issues/65399
Signed-off-by: Xuehan Xu
---
 src/crimson/osd/pg_recovery.cc | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
index 13ac069c63d..b2f813447b3 100644
--- a/src/crimson/osd/pg_recovery.cc
+++ b/src/crimson/osd/pg_recovery.cc
@@ -42,7 +42,12 @@ PGRecovery::start_recovery_ops(
 {
   assert(pg->is_primary());
   assert(pg->is_peered());
-  assert(pg->is_recovering());
+
+  if (!pg->is_recovering() && !pg->is_backfilling()) {
+    logger().debug("recovery raced and were queued twice, ignoring!");
+    return seastar::make_ready_future(false);
+  }
+
   // in ceph-osd the do_recovery() path handles both the pg log-based
   // recovery and the backfill, albeit they are separated at the layer
   // of PeeringState. In crimson-osd backfill has been cut from it, so
@@ -64,6 +69,12 @@ PGRecovery::start_recovery_ops(
     [] (auto&& ifut) {
     return std::move(ifut);
   }).then_interruptible([this] {
+    //TODO: maybe we should implement a recovery race interruptor in the future
+    if (!pg->is_recovering() && !pg->is_backfilling()) {
+      logger().debug("recovery raced and were queued twice, ignoring!");
+      return seastar::make_ready_future(false);
+    }
+
     bool done = !pg->get_peering_state().needs_recovery();
     if (done) {
       logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}",
-- 
2.39.5