From 29c43371b595f30d8840185eba21c09ee235f0f8 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Mon, 16 Dec 2024 11:05:11 +0800 Subject: [PATCH] osd/PeeringState: rename "cancel_backfill" to "suspend_backfill" PerringState events the are leading to `cancel_backfill()` are: * DeferBackfill - Called if local recovery reservation is revoked before it completes (See AsyncResever::request_reservation on_preempt) * UnfoundBackfill * RemoteReservationRevokedTooFull * RemoteReservationRevoked In each event, we merely suspend the the backfill. The primary will *continue* to keep trying to start this backfill as long as the up set for the current interval includes the osds that needs to be backfilled. Eventually, the backfill will either succeed and complete or will be made irrelevant due to an interval change (and essentially truly "cancelled") Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_state.cc | 6 +++--- src/crimson/osd/backfill_state.h | 18 +++++++++--------- src/crimson/osd/pg.h | 4 ++-- src/crimson/osd/pg_recovery.cc | 4 ++-- src/crimson/osd/pg_recovery.h | 2 +- src/osd/PG.cc | 6 +++++- src/osd/PG.h | 2 +- src/osd/PeeringState.cc | 12 ++++++------ src/osd/PeeringState.h | 4 ++-- src/osd/PrimaryLogPG.cc | 4 ++-- src/test/crimson/test_backfill.cc | 14 +++++++------- 11 files changed, 40 insertions(+), 36 deletions(-) diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 1392ee330ac2..0b26b03a1c8f 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -417,7 +417,7 @@ BackfillState::PrimaryScanning::react(PrimaryScanned evt) } boost::statechart::result -BackfillState::PrimaryScanning::react(CancelBackfill evt) +BackfillState::PrimaryScanning::react(SuspendBackfill evt) { LOG_PREFIX(BackfillState::PrimaryScanning::react::SuspendBackfill); DEBUGDPP("suspended within PrimaryScanning", pg()); @@ -513,7 +513,7 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt) } boost::statechart::result -BackfillState::ReplicasScanning::react(CancelBackfill evt) +BackfillState::ReplicasScanning::react(SuspendBackfill evt) { LOG_PREFIX(BackfillState::ReplicasScanning::react::SuspendBackfill); DEBUGDPP("suspended within ReplicasScanning", pg()); @@ -566,7 +566,7 @@ BackfillState::Waiting::react(ObjectPushed evt) } boost::statechart::result -BackfillState::Waiting::react(CancelBackfill evt) +BackfillState::Waiting::react(SuspendBackfill evt) { LOG_PREFIX(BackfillState::Waiting::react::SuspendBackfill); DEBUGDPP("suspended within Waiting", pg()); diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index 463be4a7a2eb..ebaf099b76fc 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -59,7 +59,7 @@ struct BackfillState { struct RequestDone : sc::event { }; - struct CancelBackfill : sc::event { + struct SuspendBackfill : sc::event { }; private: @@ -210,14 +210,14 @@ public: sc::custom_reaction, sc::custom_reaction, sc::transition, - sc::custom_reaction, + sc::custom_reaction, sc::custom_reaction, sc::transition>; explicit PrimaryScanning(my_context); sc::result react(ObjectPushed); // collect scanning result and transit to Enqueuing. sc::result react(PrimaryScanned); - sc::result react(CancelBackfill); + sc::result react(SuspendBackfill); sc::result react(Triggered); }; @@ -226,7 +226,7 @@ public: using reactions = boost::mpl::list< sc::custom_reaction, sc::custom_reaction, - sc::custom_reaction, + sc::custom_reaction, sc::custom_reaction, sc::transition, sc::transition>; @@ -235,7 +235,7 @@ public: // to Enqueuing will happen. sc::result react(ObjectPushed); sc::result react(ReplicaScanned); - sc::result react(CancelBackfill); + sc::result react(SuspendBackfill); sc::result react(Triggered); // indicate whether a particular peer should be scanned to retrieve @@ -255,22 +255,22 @@ public: using reactions = boost::mpl::list< sc::custom_reaction, sc::transition, - sc::custom_reaction, + sc::custom_reaction, sc::custom_reaction, sc::transition>; explicit Waiting(my_context); sc::result react(ObjectPushed); - sc::result react(CancelBackfill); + sc::result react(SuspendBackfill); sc::result react(Triggered); }; struct Done : sc::state, StateHelper { using reactions = boost::mpl::list< - sc::custom_reaction, + sc::custom_reaction, sc::transition>; explicit Done(my_context); - sc::result react(CancelBackfill) { + sc::result react(SuspendBackfill) { return discard_event(); } }; diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 15aeec0e4f35..67758c6b44b0 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -432,8 +432,8 @@ public: void on_backfill_reserved() final { recovery_handler->on_backfill_reserved(); } - void on_backfill_canceled() final { - recovery_handler->backfill_cancelled(); + void on_backfill_suspended() final { + recovery_handler->backfill_suspended(); } void on_recovery_cancelled() final { diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index ec3af0d2b000..5db347fc55e3 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -630,13 +630,13 @@ void PGRecovery::backfilled() PeeringState::Backfilled{}); } -void PGRecovery::backfill_cancelled() +void PGRecovery::backfill_suspended() { // We are not creating a new BackfillRecovery request here, as we // need to cancel the backfill synchronously (before this method returns). using BackfillState = crimson::osd::BackfillState; backfill_state->process_event( - BackfillState::CancelBackfill{}.intrusive_from_this()); + BackfillState::SuspendBackfill{}.intrusive_from_this()); } void PGRecovery::dispatch_backfill_event( diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h index 657e6d3e888c..9d4a4874402f 100644 --- a/src/crimson/osd/pg_recovery.h +++ b/src/crimson/osd/pg_recovery.h @@ -105,7 +105,7 @@ private: template void start_backfill_recovery( const EventT& evt); - void backfill_cancelled(); + void backfill_suspended(); void request_replica_scan( const pg_shard_t& target, const hobject_t& begin, diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 307651fd6272..eecf19e58989 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1562,8 +1562,12 @@ void PG::on_backfill_reserved() queue_recovery(); } -void PG::on_backfill_canceled() +void PG::on_backfill_suspended() { + // Scan replies asked before suspending this backfill should be ignored. + // See PrimaryLogPG::do_scan - case MOSDPGScan::OP_SCAN_DIGEST. + // `waiting_on_backfill` will be re-refilled after the suspended backfill + // is resumed/restarted. if (!waiting_on_backfill.empty()) { waiting_on_backfill.clear(); finish_recovery_op(hobject_t::get_max()); diff --git a/src/osd/PG.h b/src/osd/PG.h index 86e2e2fa3128..bb8caa36b954 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -604,7 +604,7 @@ public: void queue_snap_retrim(snapid_t snap); void on_backfill_reserved() override; - void on_backfill_canceled() override; + void on_backfill_suspended() override; void on_recovery_cancelled() override {} void on_recovery_reserved() override; diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 334d202d207a..c7f1aaebc216 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -5106,11 +5106,11 @@ void PeeringState::Backfilling::backfill_release_reservations() } } -void PeeringState::Backfilling::cancel_backfill() +void PeeringState::Backfilling::suspend_backfill() { DECLARE_LOCALS; backfill_release_reservations(); - pl->on_backfill_canceled(); + pl->on_backfill_suspended(); } boost::statechart::result @@ -5128,7 +5128,7 @@ PeeringState::Backfilling::react(const DeferBackfill &c) psdout(10) << "defer backfill, retry delay " << c.delay << dendl; ps->state_set(PG_STATE_BACKFILL_WAIT); ps->state_clear(PG_STATE_BACKFILLING); - cancel_backfill(); + suspend_backfill(); pl->schedule_event_after( std::make_shared( @@ -5146,7 +5146,7 @@ PeeringState::Backfilling::react(const UnfoundBackfill &c) psdout(10) << "backfill has unfound, can't continue" << dendl; ps->state_set(PG_STATE_BACKFILL_UNFOUND); ps->state_clear(PG_STATE_BACKFILLING); - cancel_backfill(); + suspend_backfill(); return transit(); } @@ -5157,7 +5157,7 @@ PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &) ps->state_set(PG_STATE_BACKFILL_TOOFULL); ps->state_clear(PG_STATE_BACKFILLING); - cancel_backfill(); + suspend_backfill(); pl->schedule_event_after( std::make_shared( @@ -5174,7 +5174,7 @@ PeeringState::Backfilling::react(const RemoteReservationRevoked &) { DECLARE_LOCALS; ps->state_set(PG_STATE_BACKFILL_WAIT); - cancel_backfill(); + suspend_backfill(); if (ps->needs_backfill()) { return transit(); } else { diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h index 4b5285b18786..54e8c89c9212 100644 --- a/src/osd/PeeringState.h +++ b/src/osd/PeeringState.h @@ -417,7 +417,7 @@ public: // ============ recovery reservation notifications ========== virtual void on_backfill_reserved() = 0; - virtual void on_backfill_canceled() = 0; + virtual void on_backfill_suspended() = 0; virtual void on_recovery_reserved() = 0; virtual void on_recovery_cancelled() = 0; @@ -963,7 +963,7 @@ public: boost::statechart::result react(const RemoteReservationRevoked& evt); boost::statechart::result react(const DeferBackfill& evt); boost::statechart::result react(const UnfoundBackfill& evt); - void cancel_backfill(); + void suspend_backfill(); void exit(); }; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 44f8e85b5ef6..74b2f31693e9 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -4485,7 +4485,7 @@ void PrimaryLogPG::do_scan( { auto dpp = get_dpp(); if (osd->check_backfill_full(dpp)) { - dout(1) << __func__ << ": Canceling backfill: Full." << dendl; + dout(1) << __func__ << ": Suspending backfill: Full." << dendl; queue_peering_event( PGPeeringEventRef( std::make_shared( @@ -4542,7 +4542,7 @@ void PrimaryLogPG::do_scan( } else { // we canceled backfill for a while due to a too full, and this // is an extra response from a non-too-full peer - dout(20) << __func__ << " canceled backfill (too full?)" << dendl; + dout(20) << __func__ << " suspended backfill (too full?)" << dendl; } } break; diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc index e0fc5821d089..b1c9c0575d54 100644 --- a/src/test/crimson/test_backfill.cc +++ b/src/test/crimson/test_backfill.cc @@ -193,7 +193,7 @@ public: struct PGFacade; void cancel() { - schedule_event_immediate(crimson::osd::BackfillState::CancelBackfill{}); + schedule_event_immediate(crimson::osd::BackfillState::SuspendBackfill{}); } void resume() { @@ -476,7 +476,7 @@ TEST(backfill, cancel_resume_middle_of_primaryscan) EXPECT_CALL(cluster_fixture, backfilled); cluster_fixture.cancel(); - cluster_fixture.next_round2(); + cluster_fixture.next_round2(); cluster_fixture.next_round2(); cluster_fixture.resume(); cluster_fixture.next_round2(); @@ -508,7 +508,7 @@ TEST(backfill, cancel_resume_middle_of_replicascan1) EXPECT_CALL(cluster_fixture, backfilled); cluster_fixture.next_round2(); cluster_fixture.cancel(); - cluster_fixture.next_round2(); + cluster_fixture.next_round2(); cluster_fixture.resume(); cluster_fixture.next_round2(); cluster_fixture.next_round2(); @@ -540,7 +540,7 @@ TEST(backfill, cancel_resume_middle_of_replicascan2) cluster_fixture.next_round2(); cluster_fixture.next_round2(); cluster_fixture.cancel(); - cluster_fixture.next_round2(); + cluster_fixture.next_round2(); cluster_fixture.resume(); cluster_fixture.next_round2(); cluster_fixture.next_round2(); @@ -572,7 +572,7 @@ TEST(backfill, cancel_resume_middle_of_push1) cluster_fixture.next_round2(); cluster_fixture.next_round2(); cluster_fixture.cancel(); - cluster_fixture.next_round2(); + cluster_fixture.next_round2(); cluster_fixture.resume(); cluster_fixture.next_round2(); cluster_fixture.next_round2(); @@ -604,7 +604,7 @@ TEST(backfill, cancel_resume_middle_of_push2) cluster_fixture.next_round2(); cluster_fixture.next_round2(); cluster_fixture.cancel(); - cluster_fixture.next_round2(); + cluster_fixture.next_round2(); cluster_fixture.resume(); cluster_fixture.next_round2(); cluster_fixture.next_round2(); @@ -635,7 +635,7 @@ TEST(backfill, cancel_resume_middle_of_push3) cluster_fixture.next_round2(); cluster_fixture.next_round2(); cluster_fixture.cancel(); - cluster_fixture.next_round2(); + cluster_fixture.next_round2(); cluster_fixture.next_round2(); cluster_fixture.next_round2(); cluster_fixture.resume(); -- 2.47.3