From: Xuehan Xu Date: Wed, 7 Aug 2024 02:52:49 +0000 (+0800) Subject: crimson/osd: cancel ongoing pglog-based recoveries on recovery defering X-Git-Tag: v20.0.0~1085^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c43542f7b9dcbf52cde4f041256a356d83ac86e9;p=ceph.git crimson/osd: cancel ongoing pglog-based recoveries on recovery defering Previously, we rely on checking `PG::is_recovery()` in `PGRecevery::start_recovery_ops()` to determine whether it's still valid to proceed the recovery. This turns out to be inefficient, for example: 1. PG `P` is under recovery, and `PGRecovery::start_recovery_ops()` is called; 2. PG `P`'s recovery is deferred, and `PGRecovery::start_recovery_ops()` is blocked on waiting for external replies; 3. Before the arrivals of external replies, PG `P` resumes recovery and a new round of `PGRecovery::start_recovery_ops()` starts; 4. The external replies arrives and the old `PGRecovery::start_recovery_ops()` continues as `PG:is_recovering()` returns true. In the above case, we get two duplicated recovery ops, which is incorrect. Fixes: https://tracker.ceph.com/issues/67380 Signed-off-by: Xuehan Xu --- diff --git a/src/crimson/osd/osd_operations/background_recovery.cc b/src/crimson/osd/osd_operations/background_recovery.cc index 509d4c4a484c..c030c9d89708 100644 --- a/src/crimson/osd/osd_operations/background_recovery.cc +++ b/src/crimson/osd/osd_operations/background_recovery.cc @@ -158,6 +158,8 @@ PglogBasedRecovery::PglogBasedRecovery( PglogBasedRecovery::interruptible_future PglogBasedRecovery::do_recovery() { + LOG_PREFIX(PglogBasedRecovery::do_recovery); + DEBUGDPPI("{}: {}", *pg, __func__, *this); if (pg->has_reset_since(epoch_started)) { return seastar::make_ready_future(false); } @@ -167,6 +169,7 @@ PglogBasedRecovery::do_recovery() interruptor>([this] (auto&& trigger) { return pg->get_recovery_handler()->start_recovery_ops( trigger, + *this, crimson::common::local_conf()->osd_recovery_max_single_start); }); }); diff --git a/src/crimson/osd/osd_operations/background_recovery.h b/src/crimson/osd/osd_operations/background_recovery.h index 17f2cd57a305..5ae0e1a9edb8 100644 --- a/src/crimson/osd/osd_operations/background_recovery.h +++ b/src/crimson/osd/osd_operations/background_recovery.h @@ -91,8 +91,20 @@ public: RecoveryBackend::RecoveryBlockingEvent > tracking_events; + void cancel() { + cancelled = true; + } + + bool is_cancelled() const { + return cancelled; + } + + epoch_t get_epoch_started() const { + return epoch_started; + } private: interruptible_future do_recovery() override; + bool cancelled = false; }; class BackfillRecovery final : public BackgroundRecoveryT { diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index cdaa1e8378fc..fb069617a1fc 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -1659,6 +1659,7 @@ void PG::on_change(ceph::os::Transaction &t) { peering_state.state_clear(PG_STATE_SNAPTRIM); peering_state.state_clear(PG_STATE_SNAPTRIM_ERROR); snap_mapper.reset_backend(); + reset_pglog_based_recovery_op(); } void PG::context_registry_on_change() { @@ -1798,4 +1799,19 @@ void PG::PGLogEntryHandler::remove(const hobject_t &soid) { DEBUGDPP("remove {} on pglog rollback", *pg, soid); pg->remove_maybe_snapmapped_object(*t, soid); } + +void PG::set_pglog_based_recovery_op(PglogBasedRecovery *op) { + ceph_assert(!pglog_based_recovery_op); + pglog_based_recovery_op = op; +} + +void PG::reset_pglog_based_recovery_op() { + pglog_based_recovery_op = nullptr; +} + +void PG::cancel_pglog_based_recovery_op() { + ceph_assert(pglog_based_recovery_op); + pglog_based_recovery_op->cancel(); + reset_pglog_based_recovery_op(); +} } diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 17b29aace2de..37537a40c361 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -64,6 +64,7 @@ namespace crimson::osd { class OpsExecuter; class BackfillRecovery; class SnapTrimEvent; +class PglogBasedRecovery; class PG : public boost::intrusive_ref_counter< PG, @@ -433,6 +434,10 @@ public: recovery_handler->backfill_cancelled(); } + void on_recovery_cancelled() final { + cancel_pglog_based_recovery_op(); + } + void on_recovery_reserved() final { recovery_handler->start_pglogbased_recovery(); } @@ -838,6 +843,10 @@ public: return can_discard_replica_op(m, m.get_map_epoch()); } + void set_pglog_based_recovery_op(PglogBasedRecovery *op) final; + void reset_pglog_based_recovery_op() final; + void cancel_pglog_based_recovery_op(); + private: // instead of seastar::gate, we use a boolean flag to indicate // whether the system is shutting down, as we don't need to track @@ -845,6 +854,7 @@ private: bool stopping = false; PGActivationBlocker wait_for_active_blocker; + PglogBasedRecovery* pglog_based_recovery_op = nullptr; friend std::ostream& operator<<(std::ostream&, const PG& pg); friend class ClientRequest; diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index f4a7d8a63db9..669df5eddbfa 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -24,29 +24,33 @@ namespace { using std::map; using std::set; +using PglogBasedRecovery = crimson::osd::PglogBasedRecovery; void PGRecovery::start_pglogbased_recovery() { - using PglogBasedRecovery = crimson::osd::PglogBasedRecovery; - (void) pg->get_shard_services().start_operation( + auto [op, fut] = pg->get_shard_services().start_operation( static_cast(pg), pg->get_shard_services(), pg->get_osdmap_epoch(), float(0.001)); + pg->set_pglog_based_recovery_op(op.get()); } PGRecovery::interruptible_future PGRecovery::start_recovery_ops( RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + PglogBasedRecovery &recover_op, size_t max_to_start) { assert(pg->is_primary()); assert(pg->is_peered()); - if (!pg->is_recovering() && !pg->is_backfilling()) { - logger().debug("recovery raced and were queued twice, ignoring!"); + if (pg->has_reset_since(recover_op.get_epoch_started()) || + recover_op.is_cancelled()) { + logger().debug("recovery {} cancelled.", recover_op); return seastar::make_ready_future(false); } + ceph_assert(pg->is_recovering()); // in ceph-osd the do_recovery() path handles both the pg log-based // recovery and the backfill, albeit they are separated at the layer @@ -68,12 +72,15 @@ PGRecovery::start_recovery_ops( return interruptor::parallel_for_each(started, [] (auto&& ifut) { return std::move(ifut); - }).then_interruptible([this] { + }).then_interruptible([this, &recover_op] { //TODO: maybe we should implement a recovery race interruptor in the future - if (!pg->is_recovering() && !pg->is_backfilling()) { - logger().debug("recovery raced and were queued twice, ignoring!"); + if (pg->has_reset_since(recover_op.get_epoch_started()) || + recover_op.is_cancelled()) { + logger().debug("recovery {} cancelled.", recover_op); return seastar::make_ready_future(false); } + ceph_assert(pg->is_recovering()); + ceph_assert(!pg->is_backfilling()); bool done = !pg->get_peering_state().needs_recovery(); if (done) { @@ -101,6 +108,7 @@ PGRecovery::start_recovery_ops( pg->get_osdmap_epoch(), PeeringState::RequestBackfill{}); } + pg->reset_pglog_based_recovery_op(); } return seastar::make_ready_future(!done); }); diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h index f5b8632a3826..b1fb01dc1292 100644 --- a/src/crimson/osd/pg_recovery.h +++ b/src/crimson/osd/pg_recovery.h @@ -17,6 +17,7 @@ namespace crimson::osd { class UrgentRecovery; +class PglogBasedRecovery; } class MOSDPGBackfillRemove; @@ -32,6 +33,7 @@ public: interruptible_future start_recovery_ops( RecoveryBackend::RecoveryBlockingEvent::TriggerI&, + crimson::osd::PglogBasedRecovery &recover_op, size_t max_to_start); void on_backfill_reserved(); void dispatch_backfill_event( diff --git a/src/crimson/osd/pg_recovery_listener.h b/src/crimson/osd/pg_recovery_listener.h index a53221c4a69b..6c88b170b4fb 100644 --- a/src/crimson/osd/pg_recovery_listener.h +++ b/src/crimson/osd/pg_recovery_listener.h @@ -11,6 +11,7 @@ namespace crimson::osd { class ShardServices; + class PglogBasedRecovery; }; class RecoveryBackend; @@ -38,4 +39,7 @@ public: virtual void publish_stats_to_osd() = 0; virtual OSDriver &get_osdriver() = 0; virtual SnapMapper &get_snap_mapper() = 0; + virtual void set_pglog_based_recovery_op( + crimson::osd::PglogBasedRecovery *op) = 0; + virtual void reset_pglog_based_recovery_op() = 0; }; diff --git a/src/osd/PG.h b/src/osd/PG.h index 6bcb77ee5a9d..ef4dd3fc4f09 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -612,6 +612,7 @@ public: void on_backfill_reserved() override; void on_backfill_canceled() override; + void on_recovery_cancelled() override {} void on_recovery_reserved() override; bool is_forced_recovery_or_backfill() const { diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 944b77c2812a..22222b7f7af5 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -5827,6 +5827,7 @@ PeeringState::Recovering::react(const DeferRecovery &evt) ps->state_set(PG_STATE_RECOVERY_WAIT); pl->cancel_local_background_io_reservation(); release_reservations(true); + pl->on_recovery_cancelled(); pl->schedule_event_after( std::make_shared( ps->get_osdmap_epoch(), @@ -5844,6 +5845,7 @@ PeeringState::Recovering::react(const UnfoundRecovery &evt) ps->state_set(PG_STATE_RECOVERY_UNFOUND); pl->cancel_local_background_io_reservation(); release_reservations(true); + pl->on_recovery_cancelled(); return transit(); } diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h index f036bb44b115..11ac084a054b 100644 --- a/src/osd/PeeringState.h +++ b/src/osd/PeeringState.h @@ -419,6 +419,7 @@ public: virtual void on_backfill_reserved() = 0; virtual void on_backfill_canceled() = 0; virtual void on_recovery_reserved() = 0; + virtual void on_recovery_cancelled() = 0; // ================recovery space accounting ================ virtual bool try_reserve_recovery_space(