git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/osd/backfill_state: treat Cancelled as a pause of the ongoing backfilling
author Xuehan Xu <xuxuehan@qianxin.com>
Tue, 8 Oct 2024 04:26:41 +0000 (12:26 +0800)
committer Xuehan Xu <xuxuehan@qianxin.com>
Mon, 16 Dec 2024 06:06:32 +0000 (14:06 +0800)
Fixes: https://tracker.ceph.com/issues/67888
Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/crimson/osd/backfill_state.cc
src/crimson/osd/backfill_state.h

index 62607c7fbebab0f1ae0cd3879bec178842691b64..1392ee330ac2077f37772a62140edb6998c40b89 100644 (file)
@@ -407,7 +407,34 @@ BackfillState::PrimaryScanning::react(PrimaryScanned evt)
   LOG_PREFIX(BackfillState::PrimaryScanning::react::PrimaryScanned);
   DEBUGDPP("", pg());
   backfill_state().backfill_info = std::move(evt.result);
-  return transit<Enqueuing>();
+  if (!backfill_state().is_suspended()) {
+    return transit<Enqueuing>();
+  } else {
+    DEBUGDPP("backfill suspended, not going Enqueuing", pg());
+    backfill_state().go_enqueuing_on_resume();
+  }
+  return discard_event();
+}
+
+boost::statechart::result
+BackfillState::PrimaryScanning::react(CancelBackfill evt)
+{
+  LOG_PREFIX(BackfillState::PrimaryScanning::react::SuspendBackfill);
+  DEBUGDPP("suspended within PrimaryScanning", pg());
+  backfill_state().on_suspended();
+  return discard_event();
+}
+
+boost::statechart::result
+BackfillState::PrimaryScanning::react(Triggered evt)
+{
+  LOG_PREFIX(BackfillState::PrimaryScanning::react::Triggered);
+  ceph_assert(backfill_state().is_suspended());
+  if (backfill_state().on_resumed()) {
+    DEBUGDPP("Backfill resumed, going Enqueuing", pg());
+    return transit<Enqueuing>();
+  }
+  return discard_event();
 }
 
 boost::statechart::result
@@ -470,12 +497,17 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt)
     if (waiting_on_backfill.empty()) {
       ceph_assert(backfill_state().peer_backfill_info.size() == \
                   peering_state().get_backfill_targets().size());
-      return transit<Enqueuing>();
+      if (!backfill_state().is_suspended()) {
+       return transit<Enqueuing>();
+      } else {
+       DEBUGDPP("backfill suspended, not going Enqueuing", pg());
+       backfill_state().go_enqueuing_on_resume();
+      }
     }
   } else {
-    // we canceled backfill for a while due to a too full, and this
+    // we suspended backfill for a while due to a too full, and this
     // is an extra response from a non-too-full peer
-    DEBUGDPP("canceled backfill (too full?)", pg());
+    DEBUGDPP("suspended backfill (too full?)", pg());
   }
   return discard_event();
 }
@@ -483,8 +515,22 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt)
 boost::statechart::result
 BackfillState::ReplicasScanning::react(CancelBackfill evt)
 {
-  LOG_PREFIX(BackfillState::ReplicasScanning::react::CancelBackfill);
-  DEBUGDPP("cancelled within ReplicasScanning", pg());
+  LOG_PREFIX(BackfillState::ReplicasScanning::react::SuspendBackfill);
+  DEBUGDPP("suspended within ReplicasScanning", pg());
+  backfill_state().on_suspended();
+  return discard_event();
+}
+
+boost::statechart::result
+BackfillState::ReplicasScanning::react(Triggered evt)
+{
+  LOG_PREFIX(BackfillState::ReplicasScanning::react::Triggered);
+  ceph_assert(backfill_state().is_suspended());
+  if (backfill_state().on_resumed()) {
+    DEBUGDPP("Backfill resumed, going Enqueuing", pg());
+    return transit<Enqueuing>();
+  }
+  return discard_event();
 }
 
 boost::statechart::result
@@ -510,7 +556,34 @@ BackfillState::Waiting::react(ObjectPushed evt)
   LOG_PREFIX(BackfillState::Waiting::react::ObjectPushed);
   DEBUGDPP("Waiting::react() on ObjectPushed; evt.object={}", pg(), evt.object);
   backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false);
-  return transit<Enqueuing>();
+  if (!backfill_state().is_suspended()) {
+    return transit<Enqueuing>();
+  } else {
+    DEBUGDPP("backfill suspended, not going Enqueuing", pg());
+    backfill_state().go_enqueuing_on_resume();
+  }
+  return discard_event();
+}
+
+boost::statechart::result
+BackfillState::Waiting::react(CancelBackfill evt)
+{
+  LOG_PREFIX(BackfillState::Waiting::react::SuspendBackfill);
+  DEBUGDPP("suspended within Waiting", pg());
+  backfill_state().on_suspended();
+  return discard_event();
+}
+
+boost::statechart::result
+BackfillState::Waiting::react(Triggered evt)
+{
+  LOG_PREFIX(BackfillState::Waiting::react::Triggered);
+  ceph_assert(backfill_state().is_suspended());
+  if (backfill_state().on_resumed()) {
+    DEBUGDPP("Backfill resumed, going Enqueuing", pg());
+    return transit<Enqueuing>();
+  }
+  return discard_event();
 }
 
 // -- Done
index 34400d930b2b9bb3dbc08d18463835763df710bd..463be4a7a2eb5ef1f7e8364427dd137352bceea0 100644 (file)
@@ -210,11 +210,15 @@ public:
       sc::custom_reaction<ObjectPushed>,
       sc::custom_reaction<PrimaryScanned>,
       sc::transition<RequestDone, Done>,
+      sc::custom_reaction<CancelBackfill>,
+      sc::custom_reaction<Triggered>,
       sc::transition<sc::event_base, Crashed>>;
     explicit PrimaryScanning(my_context);
     sc::result react(ObjectPushed);
     // collect scanning result and transit to Enqueuing.
     sc::result react(PrimaryScanned);
+    sc::result react(CancelBackfill);
+    sc::result react(Triggered);
   };
 
   struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>,
@@ -223,6 +227,7 @@ public:
       sc::custom_reaction<ObjectPushed>,
       sc::custom_reaction<ReplicaScanned>,
       sc::custom_reaction<CancelBackfill>,
+      sc::custom_reaction<Triggered>,
       sc::transition<RequestDone, Done>,
       sc::transition<sc::event_base, Crashed>>;
     explicit ReplicasScanning(my_context);
@@ -231,6 +236,7 @@ public:
     sc::result react(ObjectPushed);
     sc::result react(ReplicaScanned);
     sc::result react(CancelBackfill);
+    sc::result react(Triggered);
 
     // indicate whether a particular peer should be scanned to retrieve
     // BackfillInterval for new range of hobject_t namespace.
@@ -249,9 +255,13 @@ public:
     using reactions = boost::mpl::list<
       sc::custom_reaction<ObjectPushed>,
       sc::transition<RequestDone, Done>,
+      sc::custom_reaction<CancelBackfill>,
+      sc::custom_reaction<Triggered>,
       sc::transition<sc::event_base, Crashed>>;
     explicit Waiting(my_context);
     sc::result react(ObjectPushed);
+    sc::result react(CancelBackfill);
+    sc::result react(Triggered);
   };
 
   struct Done : sc::state<Done, BackfillMachine>,
@@ -296,6 +306,26 @@ public:
     }
   }
 private:
+  struct backfill_suspend_state_t {
+    bool suspended = false;
+    bool should_go_enqueuing = false;
+  } backfill_suspend_state;
+  bool is_suspended() const {
+    return backfill_suspend_state.suspended;
+  }
+  void on_suspended() {
+    ceph_assert(!is_suspended());
+    backfill_suspend_state = {true, false};
+  }
+  bool on_resumed() {
+    auto go_enqueuing = backfill_suspend_state.should_go_enqueuing;
+    backfill_suspend_state = {false, false};
+    return go_enqueuing;
+  }
+  void go_enqueuing_on_resume() {
+    ceph_assert(is_suspended());
+    backfill_suspend_state.should_go_enqueuing = true;
+  }
   hobject_t last_backfill_started;
   BackfillInterval backfill_info;
   std::map<pg_shard_t, BackfillInterval> peer_backfill_info;