]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/PeeringState: rename "cancel_backfill" to "suspend_backfill" 61232/head
author: Xuehan Xu <xuxuehan@qianxin.com>
Mon, 16 Dec 2024 03:05:11 +0000 (11:05 +0800)
committer: Xuehan Xu <xuxuehan@qianxin.com>
Tue, 7 Jan 2025 03:06:29 +0000 (11:06 +0800)
The PeeringState events that lead to `cancel_backfill()` are:
* DeferBackfill - Called if local recovery reservation is revoked
  before it completes (See AsyncReserver::request_reservation on_preempt)
* UnfoundBackfill
* RemoteReservationRevokedTooFull
* RemoteReservationRevoked

In each event, we merely suspend the backfill.
The primary will *continue* to keep trying to start this backfill
as long as the up set for the current interval includes the osds
that need to be backfilled.

Eventually, the backfill will either succeed and complete or will
be made irrelevant due to an interval change (and essentially truly
"cancelled")

Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/crimson/osd/backfill_state.cc
src/crimson/osd/backfill_state.h
src/crimson/osd/pg.h
src/crimson/osd/pg_recovery.cc
src/crimson/osd/pg_recovery.h
src/osd/PG.cc
src/osd/PG.h
src/osd/PeeringState.cc
src/osd/PeeringState.h
src/osd/PrimaryLogPG.cc
src/test/crimson/test_backfill.cc

index 1392ee330ac2077f37772a62140edb6998c40b89..0b26b03a1c8f47fe7227acea3a419879e9032bd7 100644 (file)
@@ -417,7 +417,7 @@ BackfillState::PrimaryScanning::react(PrimaryScanned evt)
 }
 
 boost::statechart::result
-BackfillState::PrimaryScanning::react(CancelBackfill evt)
+BackfillState::PrimaryScanning::react(SuspendBackfill evt)
 {
   LOG_PREFIX(BackfillState::PrimaryScanning::react::SuspendBackfill);
   DEBUGDPP("suspended within PrimaryScanning", pg());
@@ -513,7 +513,7 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt)
 }
 
 boost::statechart::result
-BackfillState::ReplicasScanning::react(CancelBackfill evt)
+BackfillState::ReplicasScanning::react(SuspendBackfill evt)
 {
   LOG_PREFIX(BackfillState::ReplicasScanning::react::SuspendBackfill);
   DEBUGDPP("suspended within ReplicasScanning", pg());
@@ -566,7 +566,7 @@ BackfillState::Waiting::react(ObjectPushed evt)
 }
 
 boost::statechart::result
-BackfillState::Waiting::react(CancelBackfill evt)
+BackfillState::Waiting::react(SuspendBackfill evt)
 {
   LOG_PREFIX(BackfillState::Waiting::react::SuspendBackfill);
   DEBUGDPP("suspended within Waiting", pg());
index 463be4a7a2eb5ef1f7e8364427dd137352bceea0..ebaf099b76fcc31caf821f1b6bd45ab404938535 100644 (file)
@@ -59,7 +59,7 @@ struct BackfillState {
   struct RequestDone : sc::event<RequestDone> {
   };
 
-  struct CancelBackfill : sc::event<CancelBackfill> {
+  struct SuspendBackfill : sc::event<SuspendBackfill> {
   };
 
 private:
@@ -210,14 +210,14 @@ public:
       sc::custom_reaction<ObjectPushed>,
       sc::custom_reaction<PrimaryScanned>,
       sc::transition<RequestDone, Done>,
-      sc::custom_reaction<CancelBackfill>,
+      sc::custom_reaction<SuspendBackfill>,
       sc::custom_reaction<Triggered>,
       sc::transition<sc::event_base, Crashed>>;
     explicit PrimaryScanning(my_context);
     sc::result react(ObjectPushed);
     // collect scanning result and transit to Enqueuing.
     sc::result react(PrimaryScanned);
-    sc::result react(CancelBackfill);
+    sc::result react(SuspendBackfill);
     sc::result react(Triggered);
   };
 
@@ -226,7 +226,7 @@ public:
     using reactions = boost::mpl::list<
       sc::custom_reaction<ObjectPushed>,
       sc::custom_reaction<ReplicaScanned>,
-      sc::custom_reaction<CancelBackfill>,
+      sc::custom_reaction<SuspendBackfill>,
       sc::custom_reaction<Triggered>,
       sc::transition<RequestDone, Done>,
       sc::transition<sc::event_base, Crashed>>;
@@ -235,7 +235,7 @@ public:
     // to Enqueuing will happen.
     sc::result react(ObjectPushed);
     sc::result react(ReplicaScanned);
-    sc::result react(CancelBackfill);
+    sc::result react(SuspendBackfill);
     sc::result react(Triggered);
 
     // indicate whether a particular peer should be scanned to retrieve
@@ -255,22 +255,22 @@ public:
     using reactions = boost::mpl::list<
       sc::custom_reaction<ObjectPushed>,
       sc::transition<RequestDone, Done>,
-      sc::custom_reaction<CancelBackfill>,
+      sc::custom_reaction<SuspendBackfill>,
       sc::custom_reaction<Triggered>,
       sc::transition<sc::event_base, Crashed>>;
     explicit Waiting(my_context);
     sc::result react(ObjectPushed);
-    sc::result react(CancelBackfill);
+    sc::result react(SuspendBackfill);
     sc::result react(Triggered);
   };
 
   struct Done : sc::state<Done, BackfillMachine>,
                 StateHelper<Done> {
     using reactions = boost::mpl::list<
-      sc::custom_reaction<CancelBackfill>,
+      sc::custom_reaction<SuspendBackfill>,
       sc::transition<sc::event_base, Crashed>>;
     explicit Done(my_context);
-    sc::result react(CancelBackfill) {
+    sc::result react(SuspendBackfill) {
       return discard_event();
     }
   };
index 15aeec0e4f35c0fd93c3b73d3ec91a603cb7f6b3..67758c6b44b0663ff21242bdd0ad443c57adee9d 100644 (file)
@@ -432,8 +432,8 @@ public:
   void on_backfill_reserved() final {
     recovery_handler->on_backfill_reserved();
   }
-  void on_backfill_canceled() final {
-    recovery_handler->backfill_cancelled();
+  void on_backfill_suspended() final {
+    recovery_handler->backfill_suspended();
   }
 
   void on_recovery_cancelled() final {
index ec3af0d2b00061bde2b5801435d0109faecb83a3..5db347fc55e313cad29c9c587507846631c6b261 100644 (file)
@@ -630,13 +630,13 @@ void PGRecovery::backfilled()
     PeeringState::Backfilled{});
 }
 
-void PGRecovery::backfill_cancelled()
+void PGRecovery::backfill_suspended()
 {
   // We are not creating a new BackfillRecovery request here, as we
   // need to cancel the backfill synchronously (before this method returns).
   using BackfillState = crimson::osd::BackfillState;
   backfill_state->process_event(
-    BackfillState::CancelBackfill{}.intrusive_from_this());
+    BackfillState::SuspendBackfill{}.intrusive_from_this());
 }
 
 void PGRecovery::dispatch_backfill_event(
index 657e6d3e888c7385a15fae9f6799b8f158f046a4..9d4a4874402f4fc22c782c46d1a336226ca64098 100644 (file)
@@ -105,7 +105,7 @@ private:
   template <class EventT>
   void start_backfill_recovery(
     const EventT& evt);
-  void backfill_cancelled();
+  void backfill_suspended();
   void request_replica_scan(
     const pg_shard_t& target,
     const hobject_t& begin,
index 307651fd6272911b4057bb6b09eff53e57a520e4..eecf19e58989116b1ae51ae07c3e5eacc8f179e6 100644 (file)
@@ -1562,8 +1562,12 @@ void PG::on_backfill_reserved()
   queue_recovery();
 }
 
-void PG::on_backfill_canceled()
+void PG::on_backfill_suspended()
 {
+  // Scan replies asked before suspending this backfill should be ignored.
+  // See PrimaryLogPG::do_scan -  case MOSDPGScan::OP_SCAN_DIGEST.
+  // `waiting_on_backfill` will be re-refilled after the suspended backfill
+  // is resumed/restarted.
   if (!waiting_on_backfill.empty()) {
     waiting_on_backfill.clear();
     finish_recovery_op(hobject_t::get_max());
index 86e2e2fa3128fba49b378203d62926258445b8b1..bb8caa36b9544abb6390533e1ab9466ef2f4a461 100644 (file)
@@ -604,7 +604,7 @@ public:
   void queue_snap_retrim(snapid_t snap);
 
   void on_backfill_reserved() override;
-  void on_backfill_canceled() override;
+  void on_backfill_suspended() override;
   void on_recovery_cancelled() override {}
   void on_recovery_reserved() override;
 
index 334d202d207a9148f9be4607d38c0da71a51f706..c7f1aaebc216fef5103418821bea2a6d14f7e701 100644 (file)
@@ -5106,11 +5106,11 @@ void PeeringState::Backfilling::backfill_release_reservations()
   }
 }
 
-void PeeringState::Backfilling::cancel_backfill()
+void PeeringState::Backfilling::suspend_backfill()
 {
   DECLARE_LOCALS;
   backfill_release_reservations();
-  pl->on_backfill_canceled();
+  pl->on_backfill_suspended();
 }
 
 boost::statechart::result
@@ -5128,7 +5128,7 @@ PeeringState::Backfilling::react(const DeferBackfill &c)
   psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
   ps->state_set(PG_STATE_BACKFILL_WAIT);
   ps->state_clear(PG_STATE_BACKFILLING);
-  cancel_backfill();
+  suspend_backfill();
 
   pl->schedule_event_after(
     std::make_shared<PGPeeringEvent>(
@@ -5146,7 +5146,7 @@ PeeringState::Backfilling::react(const UnfoundBackfill &c)
   psdout(10) << "backfill has unfound, can't continue" << dendl;
   ps->state_set(PG_STATE_BACKFILL_UNFOUND);
   ps->state_clear(PG_STATE_BACKFILLING);
-  cancel_backfill();
+  suspend_backfill();
   return transit<NotBackfilling>();
 }
 
@@ -5157,7 +5157,7 @@ PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
 
   ps->state_set(PG_STATE_BACKFILL_TOOFULL);
   ps->state_clear(PG_STATE_BACKFILLING);
-  cancel_backfill();
+  suspend_backfill();
 
   pl->schedule_event_after(
     std::make_shared<PGPeeringEvent>(
@@ -5174,7 +5174,7 @@ PeeringState::Backfilling::react(const RemoteReservationRevoked &)
 {
   DECLARE_LOCALS;
   ps->state_set(PG_STATE_BACKFILL_WAIT);
-  cancel_backfill();
+  suspend_backfill();
   if (ps->needs_backfill()) {
     return transit<WaitLocalBackfillReserved>();
   } else {
index 4b5285b18786f03cc456ac6b4a729699313a3b4a..54e8c89c92125f239b2e5ceedf1e55a8a4b0c897 100644 (file)
@@ -417,7 +417,7 @@ public:
 
     // ============ recovery reservation notifications ==========
     virtual void on_backfill_reserved() = 0;
-    virtual void on_backfill_canceled() = 0;
+    virtual void on_backfill_suspended() = 0;
     virtual void on_recovery_reserved() = 0;
     virtual void on_recovery_cancelled() = 0;
 
@@ -963,7 +963,7 @@ public:
     boost::statechart::result react(const RemoteReservationRevoked& evt);
     boost::statechart::result react(const DeferBackfill& evt);
     boost::statechart::result react(const UnfoundBackfill& evt);
-    void cancel_backfill();
+    void suspend_backfill();
     void exit();
   };
 
index 44f8e85b5ef6c34a4333a78f9d629c0ae81ecaed..74b2f31693e92d881e190b6a185e5d3b03cdb3b6 100644 (file)
@@ -4485,7 +4485,7 @@ void PrimaryLogPG::do_scan(
     {
       auto dpp = get_dpp();
       if (osd->check_backfill_full(dpp)) {
-       dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
+       dout(1) << __func__ << ": Suspending backfill: Full." << dendl;
        queue_peering_event(
          PGPeeringEventRef(
            std::make_shared<PGPeeringEvent>(
@@ -4542,7 +4542,7 @@ void PrimaryLogPG::do_scan(
       } else {
        // we canceled backfill for a while due to a too full, and this
        // is an extra response from a non-too-full peer
-       dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
+       dout(20) << __func__ << " suspended backfill (too full?)" << dendl;
       }
     }
     break;
index e0fc5821d08968d86db5b41597bf3792880834e2..b1c9c0575d541c21bbbc182fd0952298673e09b1 100644 (file)
@@ -193,7 +193,7 @@ public:
   struct PGFacade;
 
   void cancel() {
-    schedule_event_immediate(crimson::osd::BackfillState::CancelBackfill{});
+    schedule_event_immediate(crimson::osd::BackfillState::SuspendBackfill{});
   }
 
   void resume() {
@@ -476,7 +476,7 @@ TEST(backfill, cancel_resume_middle_of_primaryscan)
 
   EXPECT_CALL(cluster_fixture, backfilled);
   cluster_fixture.cancel();
-  cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
+  cluster_fixture.next_round2<crimson::osd::BackfillState::SuspendBackfill>();
   cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>();
   cluster_fixture.resume();
   cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
@@ -508,7 +508,7 @@ TEST(backfill, cancel_resume_middle_of_replicascan1)
   EXPECT_CALL(cluster_fixture, backfilled);
   cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>();
   cluster_fixture.cancel();
-  cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
+  cluster_fixture.next_round2<crimson::osd::BackfillState::SuspendBackfill>();
   cluster_fixture.resume();
   cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
   cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
@@ -540,7 +540,7 @@ TEST(backfill, cancel_resume_middle_of_replicascan2)
   cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>();
   cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
   cluster_fixture.cancel();
-  cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
+  cluster_fixture.next_round2<crimson::osd::BackfillState::SuspendBackfill>();
   cluster_fixture.resume();
   cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
   cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
@@ -572,7 +572,7 @@ TEST(backfill, cancel_resume_middle_of_push1)
   cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
   cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
   cluster_fixture.cancel();
-  cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
+  cluster_fixture.next_round2<crimson::osd::BackfillState::SuspendBackfill>();
   cluster_fixture.resume();
   cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
   cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
@@ -604,7 +604,7 @@ TEST(backfill, cancel_resume_middle_of_push2)
   cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
   cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
   cluster_fixture.cancel();
-  cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
+  cluster_fixture.next_round2<crimson::osd::BackfillState::SuspendBackfill>();
   cluster_fixture.resume();
   cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
   cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
@@ -635,7 +635,7 @@ TEST(backfill, cancel_resume_middle_of_push3)
   cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
   cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
   cluster_fixture.cancel();
-  cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
+  cluster_fixture.next_round2<crimson::osd::BackfillState::SuspendBackfill>();
   cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
   cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
   cluster_fixture.resume();