]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/osd: cancel ongoing pglog-based recoveries on recovery defering 59066/head
authorXuehan Xu <xuxuehan@qianxin.com>
Wed, 7 Aug 2024 02:52:49 +0000 (10:52 +0800)
committerXuehan Xu <xuxuehan@qianxin.com>
Wed, 28 Aug 2024 08:56:05 +0000 (16:56 +0800)
Previously, we rely on checking `PG::is_recovery()` in
`PGRecevery::start_recovery_ops()` to determine whether it's still valid
to proceed the recovery. This turns out to be inefficient, for example:

1. PG `P` is under recovery, and `PGRecovery::start_recovery_ops()` is
   called;
2. PG `P`'s recovery is deferred, and `PGRecovery::start_recovery_ops()`
   is blocked on waiting for external replies;
3. Before the arrivals of external replies, PG `P` resumes recovery and
   a new round of `PGRecovery::start_recovery_ops()` starts;
4. The external replies arrives and the old
   `PGRecovery::start_recovery_ops()` continues as `PG:is_recovering()`
   returns true.

In the above case, we get two duplicated recovery ops, which is
incorrect.

Fixes: https://tracker.ceph.com/issues/67380
Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/crimson/osd/osd_operations/background_recovery.cc
src/crimson/osd/osd_operations/background_recovery.h
src/crimson/osd/pg.cc
src/crimson/osd/pg.h
src/crimson/osd/pg_recovery.cc
src/crimson/osd/pg_recovery.h
src/crimson/osd/pg_recovery_listener.h
src/osd/PG.h
src/osd/PeeringState.cc
src/osd/PeeringState.h

index 509d4c4a484cc27ab76f12b924c2eb09c8ff4ffe..c030c9d89708d5faeb02ad3f21b21b5d5a1fc827 100644 (file)
@@ -158,6 +158,8 @@ PglogBasedRecovery::PglogBasedRecovery(
 PglogBasedRecovery::interruptible_future<bool>
 PglogBasedRecovery::do_recovery()
 {
+  LOG_PREFIX(PglogBasedRecovery::do_recovery);
+  DEBUGDPPI("{}: {}", *pg, __func__, *this);
   if (pg->has_reset_since(epoch_started)) {
     return seastar::make_ready_future<bool>(false);
   }
@@ -167,6 +169,7 @@ PglogBasedRecovery::do_recovery()
                               interruptor>([this] (auto&& trigger) {
       return pg->get_recovery_handler()->start_recovery_ops(
        trigger,
+       *this,
        crimson::common::local_conf()->osd_recovery_max_single_start);
     });
   });
index 17f2cd57a305af359b27ac893c7d2a5d48481c0c..5ae0e1a9edb8503ab228c433be83235cb8b99479 100644 (file)
@@ -91,8 +91,20 @@ public:
     RecoveryBackend::RecoveryBlockingEvent
   > tracking_events;
 
+  void cancel() {
+    cancelled = true;
+  }
+
+  bool is_cancelled() const {
+    return cancelled;
+  }
+
+  epoch_t get_epoch_started() const {
+    return epoch_started;
+  }
 private:
   interruptible_future<bool> do_recovery() override;
+  bool cancelled = false;
 };
 
 class BackfillRecovery final : public BackgroundRecoveryT<BackfillRecovery> {
index cdaa1e8378fc9b6a5d138975f5561f90991bd00a..fb069617a1fcbec276faba3956ad50aa00712596 100644 (file)
@@ -1659,6 +1659,7 @@ void PG::on_change(ceph::os::Transaction &t) {
   peering_state.state_clear(PG_STATE_SNAPTRIM);
   peering_state.state_clear(PG_STATE_SNAPTRIM_ERROR);
   snap_mapper.reset_backend();
+  reset_pglog_based_recovery_op();
 }
 
 void PG::context_registry_on_change() {
@@ -1798,4 +1799,19 @@ void PG::PGLogEntryHandler::remove(const hobject_t &soid) {
   DEBUGDPP("remove {} on pglog rollback", *pg, soid);
   pg->remove_maybe_snapmapped_object(*t, soid);
 }
+
+void PG::set_pglog_based_recovery_op(PglogBasedRecovery *op) {
+  ceph_assert(!pglog_based_recovery_op);
+  pglog_based_recovery_op = op;
+}
+
+void PG::reset_pglog_based_recovery_op() {
+  pglog_based_recovery_op = nullptr;
+}
+
+void PG::cancel_pglog_based_recovery_op() {
+  ceph_assert(pglog_based_recovery_op);
+  pglog_based_recovery_op->cancel();
+  reset_pglog_based_recovery_op();
+}
 }
index 17b29aace2de0433aff04da53a063df395e05aae..37537a40c361e04f654e72c794cfe2b2d4b38dfd 100644 (file)
@@ -64,6 +64,7 @@ namespace crimson::osd {
 class OpsExecuter;
 class BackfillRecovery;
 class SnapTrimEvent;
+class PglogBasedRecovery;
 
 class PG : public boost::intrusive_ref_counter<
   PG,
@@ -433,6 +434,10 @@ public:
     recovery_handler->backfill_cancelled();
   }
 
+  void on_recovery_cancelled() final {
+    cancel_pglog_based_recovery_op();
+  }
+
   void on_recovery_reserved() final {
     recovery_handler->start_pglogbased_recovery();
   }
@@ -838,6 +843,10 @@ public:
     return can_discard_replica_op(m, m.get_map_epoch());
   }
 
+  void set_pglog_based_recovery_op(PglogBasedRecovery *op) final;
+  void reset_pglog_based_recovery_op() final;
+  void cancel_pglog_based_recovery_op();
+
 private:
   // instead of seastar::gate, we use a boolean flag to indicate
   // whether the system is shutting down, as we don't need to track
@@ -845,6 +854,7 @@ private:
   bool stopping = false;
 
   PGActivationBlocker wait_for_active_blocker;
+  PglogBasedRecovery* pglog_based_recovery_op = nullptr;
 
   friend std::ostream& operator<<(std::ostream&, const PG& pg);
   friend class ClientRequest;
index f4a7d8a63db9f4787578f07688085c7d1bb2b5b4..669df5eddbfa37ac78390fdb9631de23f4638d58 100644 (file)
@@ -24,29 +24,33 @@ namespace {
 
 using std::map;
 using std::set;
+using PglogBasedRecovery = crimson::osd::PglogBasedRecovery;
 
 void PGRecovery::start_pglogbased_recovery()
 {
-  using PglogBasedRecovery = crimson::osd::PglogBasedRecovery;
-  (void) pg->get_shard_services().start_operation<PglogBasedRecovery>(
+  auto [op, fut] = pg->get_shard_services().start_operation<PglogBasedRecovery>(
     static_cast<crimson::osd::PG*>(pg),
     pg->get_shard_services(),
     pg->get_osdmap_epoch(),
     float(0.001));
+  pg->set_pglog_based_recovery_op(op.get());
 }
 
 PGRecovery::interruptible_future<bool>
 PGRecovery::start_recovery_ops(
   RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+  PglogBasedRecovery &recover_op,
   size_t max_to_start)
 {
   assert(pg->is_primary());
   assert(pg->is_peered());
 
-  if (!pg->is_recovering() && !pg->is_backfilling()) {
-    logger().debug("recovery raced and were queued twice, ignoring!");
+  if (pg->has_reset_since(recover_op.get_epoch_started()) ||
+      recover_op.is_cancelled()) {
+    logger().debug("recovery {} cancelled.", recover_op);
     return seastar::make_ready_future<bool>(false);
   }
+  ceph_assert(pg->is_recovering());
 
   // in ceph-osd the do_recovery() path handles both the pg log-based
   // recovery and the backfill, albeit they are separated at the layer
@@ -68,12 +72,15 @@ PGRecovery::start_recovery_ops(
   return interruptor::parallel_for_each(started,
                                        [] (auto&& ifut) {
     return std::move(ifut);
-  }).then_interruptible([this] {
+  }).then_interruptible([this, &recover_op] {
     //TODO: maybe we should implement a recovery race interruptor in the future
-    if (!pg->is_recovering() && !pg->is_backfilling()) {
-      logger().debug("recovery raced and were queued twice, ignoring!");
+    if (pg->has_reset_since(recover_op.get_epoch_started()) ||
+       recover_op.is_cancelled()) {
+      logger().debug("recovery {} cancelled.", recover_op);
       return seastar::make_ready_future<bool>(false);
     }
+    ceph_assert(pg->is_recovering());
+    ceph_assert(!pg->is_backfilling());
 
     bool done = !pg->get_peering_state().needs_recovery();
     if (done) {
@@ -101,6 +108,7 @@ PGRecovery::start_recovery_ops(
           pg->get_osdmap_epoch(),
           PeeringState::RequestBackfill{});
       }
+      pg->reset_pglog_based_recovery_op();
     }
     return seastar::make_ready_future<bool>(!done);
   });
index f5b8632a3826372720f477a71394b9779b77e1e1..b1fb01dc12924419af13e0128e35895b1808ad8f 100644 (file)
@@ -17,6 +17,7 @@
 
 namespace crimson::osd {
 class UrgentRecovery;
+class PglogBasedRecovery;
 }
 
 class MOSDPGBackfillRemove;
@@ -32,6 +33,7 @@ public:
 
   interruptible_future<bool> start_recovery_ops(
     RecoveryBackend::RecoveryBlockingEvent::TriggerI&,
+    crimson::osd::PglogBasedRecovery &recover_op,
     size_t max_to_start);
   void on_backfill_reserved();
   void dispatch_backfill_event(
index a53221c4a69b94593cfa643183553887d6ce94e4..6c88b170b4fb981e59478f1ff588ee5116feecc4 100644 (file)
@@ -11,6 +11,7 @@
 
 namespace crimson::osd {
   class ShardServices;
+  class PglogBasedRecovery;
 };
 
 class RecoveryBackend;
@@ -38,4 +39,7 @@ public:
   virtual void publish_stats_to_osd() = 0;
   virtual OSDriver &get_osdriver() = 0;
   virtual SnapMapper &get_snap_mapper() = 0;
+  virtual void set_pglog_based_recovery_op(
+    crimson::osd::PglogBasedRecovery *op) = 0;
+  virtual void reset_pglog_based_recovery_op() = 0;
 };
index 6bcb77ee5a9d18ea13cb69a2813a6c56f81f8e77..ef4dd3fc4f0950af01032403e38b7ee4ec06a5bb 100644 (file)
@@ -612,6 +612,7 @@ public:
 
   void on_backfill_reserved() override;
   void on_backfill_canceled() override;
+  void on_recovery_cancelled() override {}
   void on_recovery_reserved() override;
 
   bool is_forced_recovery_or_backfill() const {
index 944b77c2812aeae57750315fdbc06bed478e1eb3..22222b7f7af581e30fb3df883ba473d5660ead69 100644 (file)
@@ -5827,6 +5827,7 @@ PeeringState::Recovering::react(const DeferRecovery &evt)
   ps->state_set(PG_STATE_RECOVERY_WAIT);
   pl->cancel_local_background_io_reservation();
   release_reservations(true);
+  pl->on_recovery_cancelled();
   pl->schedule_event_after(
     std::make_shared<PGPeeringEvent>(
       ps->get_osdmap_epoch(),
@@ -5844,6 +5845,7 @@ PeeringState::Recovering::react(const UnfoundRecovery &evt)
   ps->state_set(PG_STATE_RECOVERY_UNFOUND);
   pl->cancel_local_background_io_reservation();
   release_reservations(true);
+  pl->on_recovery_cancelled();
   return transit<NotRecovering>();
 }
 
index f036bb44b1159504606c3ce4d87fdac0a3d38401..11ac084a054b3327cc9af24e40f0714818128050 100644 (file)
@@ -419,6 +419,7 @@ public:
     virtual void on_backfill_reserved() = 0;
     virtual void on_backfill_canceled() = 0;
     virtual void on_recovery_reserved() = 0;
+    virtual void on_recovery_cancelled() = 0;
 
     // ================recovery space accounting ================
     virtual bool try_reserve_recovery_space(