From a9c5839a0ce14f0dbe17a5f0adcbae265e01d3ca Mon Sep 17 00:00:00 2001
From: Ronen Friedman
Date: Mon, 28 Dec 2020 18:49:34 +0200
Subject: [PATCH] osd: scrubber: improved handling of new-interval events

Interval change events now trigger immediate scrub session termination
and resource release.
(based on code-review comments by @athanatos.)

Fixes: https://tracker.ceph.com/issues/48712
Signed-off-by: Ronen Friedman
---
 src/osd/PG.cc             |  26 ++----
 src/osd/PrimaryLogPG.cc   |   2 -
 src/osd/pg_scrubber.cc    | 170 +++++++++-----------------------------
 src/osd/pg_scrubber.h     |  27 ++++--
 src/osd/scrub_machine.cc  |  26 ------
 src/osd/scrub_machine.h   |  19 ++---
 src/osd/scrubber_common.h |  16 ++--
 7 files changed, 82 insertions(+), 204 deletions(-)

diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 972a5482841a7..2b97a6cc4eec2 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -360,7 +360,7 @@ void PG::clear_primary_state()
   finish_sync_event = 0;  // so that _finish_recovery doesn't go off in another thread
   release_pg_backoffs();
 
-  m_scrubber->unreserve_replicas();
+  m_scrubber->discard_replica_reservations();
   scrub_after_recovery = false;
 
   agent_clear();
@@ -2066,31 +2066,19 @@ void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle)
 void PG::scrub(epoch_t epoch_queued, ThreadPool::TPHandle& handle)
 {
   dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
 
-
-  scrub_queued = false;
-  ceph_assert(is_primary());
-  ceph_assert(!m_scrubber->is_scrub_active());
-  // a new scrub
-
-  m_scrubber->reset_epoch(epoch_queued);
-
-  // note: send_start_scrub() will verify 'epoch queued' against our current interval
-  m_scrubber->send_start_scrub(epoch_queued);
+  scrub_queued = false;
+  m_scrubber->initiate_regular_scrub(epoch_queued);
 }
 
 // note: no need to secure OSD resources for a recovery scrub
-void PG::recovery_scrub(epoch_t epoch_queued, ThreadPool::TPHandle& handle)
+void PG::recovery_scrub(epoch_t epoch_queued,
+                        [[maybe_unused]] ThreadPool::TPHandle& handle)
 {
   dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
 
-
-  scrub_queued = false;
-  ceph_assert(is_primary());
-  ceph_assert(!m_scrubber->is_scrub_active());
-  // a new scrub
-  m_scrubber->reset_epoch(epoch_queued);
-  m_scrubber->send_start_after_repair(epoch_queued);
+  scrub_queued = false;
+  m_scrubber->initiate_scrub_after_repair(epoch_queued);
 }
 
 void PG::replica_scrub(epoch_t epoch_queued,
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index 663b8fb011d5b..c0fbdd342a5c4 100644
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -12552,8 +12552,6 @@ void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
   requeue_ops(waiting_for_active);
   requeue_ops(waiting_for_readable);
 
-  m_scrubber->clear_scrub_reservations();
-
   vector tids;
   cancel_copy_ops(is_primary(), &tids);
   cancel_flush_ops(is_primary(), &tids);
diff --git a/src/osd/pg_scrubber.cc b/src/osd/pg_scrubber.cc
index 5c9d3b8e4492b..fe57a9f2594c2 100644
--- a/src/osd/pg_scrubber.cc
+++ b/src/osd/pg_scrubber.cc
@@ -72,75 +72,14 @@ ostream& operator<<(ostream& out, const requested_scrub_t& sf)
   return out;
 }
 
-// returns false if the message should be discarded. Handles the notification of interval
-// change, if not done already. called only for active scrub? not sure.
-
-// let's first make this a Primary-only function
+/*
+ * If the incoming message is from a previous interval, it must mean
+ * PrimaryLogPG::on_change() was called when that interval ended. We can safely discard
+ * the stale message.
+ */
 bool PgScrubber::check_interval(epoch_t epoch_to_verify)
 {
-  const auto current_interval = m_pg->get_same_interval_since();
-
-  if (epoch_to_verify < current_interval) {
-    // the event will not be delivered. If we have already noticed and handled
-    // the change of seasons, it will be silently discarded. Otherwise - we
-    // reset the scrubber and its FSM.
-    dout(10) << __func__ << " stale message. epoch: " << epoch_to_verify << " vs. "
-             << current_interval << " (handled: " << m_last_dead_interval << ")" << dendl;
-
-    if (epoch_to_verify > m_last_dead_interval) {
-
-      // we have not seen this interval change yet.
-      // The remote reservations are no longer relevant.
-
-      m_last_dead_interval = current_interval;
-
-      // clear the remote reservations. No need to send messages.
-      if (m_reservations) {
-        m_reservations->discard_all();
-      }
-
-      // stop the scrub and send a reset message to the FSM
-      scrub_clear_state();
-    }
-    return false;
-  }
-
-  return true;
-}
-
-bool PgScrubber::check_interval_replica(epoch_t epoch_to_verify)
-{
-  const auto current_interval = m_pg->get_same_interval_since();
-
-  if (epoch_to_verify < current_interval) {
-    // the event will not be delivered. If we have already noticed and handled
-    // the change of seasons, it will be silently discarded. Otherwise - we
-    // reset the scrubber and its FSM.
-    dout(10) << __func__ << " stale message. epoch: " << epoch_to_verify << " vs. "
-             << current_interval << " (handled: " << m_last_dead_interval << ")" << dendl;
-
-    if (epoch_to_verify > m_last_dead_interval) {
-
-      // we have not seen this interval change yet.
-      // The remote reservations are no longer relevant.
-
-      m_last_dead_interval = current_interval;
-
-      // clear the remote reservations. No need to send messages
-      m_remote_osd_resource.reset();
-
-      // stop the scrub and send a reset message to the FSM
-      // replica_handling_done();
-      send_interval_changed();
-    }
-    return false;
-  }
-
-  // verify that we are reserved by the primary
-  // not true anymore (see rapair scrubs) ceph_assert(m_remote_osd_resource &&
-  // m_remote_osd_resource->is_reserved());
-
-  return true;
+  return epoch_to_verify >= m_pg->get_same_interval_since();
 }
 
 bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify)
@@ -156,12 +95,10 @@ bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify)
     return false;
   }
 
-  // check for reasons to abort this scrub
-  // has a new interval started?
 
   if (!check_interval(epoch_to_verify)) {
-    // if this is a new interval, check_interval() just discarded
-    // remote resources and then killed the scrub
+    // if this is a new interval, on_change() has already terminated that
+    // old scrub.
     return false;
   }
 
@@ -171,9 +108,6 @@ bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify)
 
   return verify_against_abort(epoch_to_verify);
 }
-
-// false if the message was discarded because of an abort flag.
-// Reset everything if the abort was not handled before.
 bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify)
 {
   if (!should_abort()) {
@@ -214,7 +148,7 @@ bool PgScrubber::should_abort() const
   return false;
 }
 
-// sending (processing) state-machine events --------------------------------
+// initiating state-machine events --------------------------------
 
 /*
 * a note re the checks performed before sending scrub-initiating messages:
@@ -226,7 +160,7 @@ bool PgScrubber::should_abort() const
 * The check_interval() catches all major changes to the PG. As for the other conditions
 * we may check (and see is_message_relevant() above):
 *
- * - we are not 'active' yet, so must check against is_active(), andL
+ * - we are not 'active' yet, so must not check against is_active(), and:
 *
 * - the 'abort' flags were just verified (when the triggering message was queued). As
 *   those are only modified in human speeds - they need not be queried again.
@@ -235,25 +169,30 @@ bool PgScrubber::should_abort() const
 * ('StartReplica' & 'StartReplicaNoWait').
 */
 
-
-void PgScrubber::send_start_scrub(epoch_t epoch_queued)
+void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
 {
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
+  // we may have lost our Primary status while the message languished in the queue
   if (check_interval(epoch_queued)) {
+    dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued << dendl;
+    reset_epoch(epoch_queued);
     m_fsm->my_states();
     m_fsm->process_event(StartScrub{});
+    dout(10) << "scrubber event --<< StartScrub" << dendl;
   }
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
 
-void PgScrubber::send_start_after_repair(epoch_t epoch_queued)
+void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued)
 {
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
+  // we may have lost our Primary status while the message languished in the queue
   if (check_interval(epoch_queued)) {
+    dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued << dendl;
+    reset_epoch(epoch_queued);
     m_fsm->my_states();
     m_fsm->process_event(AfterRepairScrub{});
+    dout(10) << "scrubber event --<< AfterRepairScrub" << dendl;
   }
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
 
 void PgScrubber::send_scrub_unblock(epoch_t epoch_queued)
@@ -273,6 +212,7 @@ void PgScrubber::send_scrub_resched(epoch_t epoch_queued)
     m_fsm->my_states();
     m_fsm->process_event(InternalSchedScrub{});
   }
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
 
 void PgScrubber::send_start_replica(epoch_t epoch_queued)
@@ -283,9 +223,9 @@ void PgScrubber::send_start_replica(epoch_t epoch_queued)
     dout(1) << "got a replica scrub request while Primary!" << dendl;
     return;
   }
-  if (check_interval_replica(epoch_queued)) {
+  if (check_interval(epoch_queued)) {
     m_fsm->my_states();
-    // buy us some time by not waiting for updates if there are none
+    // save us some time by not waiting for updates if there are none
    // to wait for. Affects the transition from NotActive into either
    // ReplicaWaitUpdates or ActiveReplica.
    if (pending_active_pushes())
@@ -299,7 +239,7 @@ void PgScrubber::send_sched_replica(epoch_t epoch_queued)
 {
   dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-  if (check_interval_replica(epoch_queued)) {
+  if (check_interval(epoch_queued)) {
     m_fsm->my_states();
     m_fsm->process_event(SchedReplica{});  // retest for map availability
   }
@@ -339,15 +279,6 @@ void PgScrubber::digest_update_notification(epoch_t epoch_queued)
   dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
 
-// no checks should be performed here
-void PgScrubber::send_interval_changed()
-{
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  m_fsm->my_states();
-  m_fsm->process_event(IntervalChanged{});
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
 void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued)
 {
   dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
@@ -361,7 +292,7 @@ void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued)
 void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued)
 {
   dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-  if (check_interval_replica(epoch_queued)) {
+  if (check_interval(epoch_queued)) {
     m_fsm->my_states();
     m_fsm->process_event(ReplicaPushesUpd{});
   }
@@ -852,7 +783,6 @@ void PgScrubber::on_init()
   preemption_data.reset();
   m_pg->publish_stats_to_osd();
   m_interval_start = m_pg->get_history().same_interval_since;
-  // m_epoch_started = m_pg->get_osdmap_epoch();
 
   dout(10) << __func__ << " start same_interval:" << m_interval_start << dendl;
 
@@ -866,7 +796,6 @@ void PgScrubber::on_init()
   }
 
   m_start = m_pg->info.pgid.pgid.get_hobj_start();
-  m_last_dead_interval = get_osdmap_epoch();
   m_active = true;
 }
 
@@ -874,8 +803,6 @@ void PgScrubber::on_replica_init()
 {
   ceph_assert(!m_active);
   m_active = true;
-  m_last_dead_interval = get_osdmap_epoch();  // so that check_interval_replica() won't
-                                              // kill a scrub for stale messages
 }
 
 void PgScrubber::_scan_snaps(ScrubMap& smap)
@@ -1459,6 +1386,14 @@ void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
   m_remote_osd_resource.reset();
 }
 
+void PgScrubber::discard_replica_reservations()
+{
+  dout(10) << __func__ << dendl;
+  if (m_reservations.has_value()) {
+    m_reservations->discard_all();
+  }
+}
+
 void PgScrubber::clear_scrub_reservations()
 {
   dout(10) << __func__ << dendl;
@@ -1819,12 +1754,14 @@ void PgScrubber::cleanup_on_finish()
   state_clear(PG_STATE_DEEP_SCRUB);
   m_pg->publish_stats_to_osd();
 
-  m_reservations.reset();
-  m_local_osd_resource.reset();
+  clear_scrub_reservations();
+  m_pg->publish_stats_to_osd();
 
   requeue_waiting();
   reset_internal_state();
+  m_flags = scrub_flags_t{};
+
   // type-specific state clear
   _scrub_clear_state();
 }
@@ -1857,6 +1794,7 @@ void PgScrubber::clear_pgscrub_state()
 
   requeue_waiting();
   reset_internal_state();
+  m_flags = scrub_flags_t{};
 
   // type-specific state clear
   _scrub_clear_state();
@@ -1869,32 +1807,8 @@ void PgScrubber::replica_handling_done()
   state_clear(PG_STATE_SCRUBBING);
   state_clear(PG_STATE_DEEP_SCRUB);
 
-  preemption_data.reset();
-  m_maps_status.reset();
-  m_received_maps.clear();
-
-  m_start = hobject_t{};
-  m_end = hobject_t{};
-  m_max_end = hobject_t{};
-  m_subset_last_update = eversion_t{};
-  m_shallow_errors = 0;
-  m_deep_errors = 0;
-  m_fixed_count = 0;
-  m_omap_stats = (const struct omap_stat_t){0};
-
-  run_callbacks();
-  m_inconsistent.clear();
-  m_missing.clear();
-  m_authoritative.clear();
-
-  num_digest_updates_pending = 0;
-  replica_scrubmap = ScrubMap{};
-  replica_scrubmap_pos.reset();
-
-  m_cleaned_meta_map = ScrubMap{};
-  m_needs_sleep = true;
-  m_sleep_started_at = utime_t{};
+  reset_internal_state();
 
-  m_active = false;
   m_pg->publish_stats_to_osd();
 }
@@ -1933,8 +1847,6 @@ void PgScrubber::reset_internal_state()
   m_needs_sleep = true;
   m_sleep_started_at = utime_t{};
 
-  m_flags = scrub_flags_t{};
-
   m_active = false;
 }
 
diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h
index f6536dba6ef0b..e24ad422dfc41 100644
--- a/src/osd/pg_scrubber.h
+++ b/src/osd/pg_scrubber.h
@@ -191,9 +191,9 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
   /// are we waiting for resource reservation grants form our replicas?
   [[nodiscard]] bool is_reserving() const final;
 
-  void send_start_scrub(epoch_t epoch_queued) final;
+  void initiate_regular_scrub(epoch_t epoch_queued) final;
 
-  void send_start_after_repair(epoch_t epoch_queued) final;
+  void initiate_scrub_after_repair(epoch_t epoch_queued) final;
 
   void send_scrub_resched(epoch_t epoch_queued) final;
 
@@ -213,8 +213,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
 
   void send_replica_pushes_upd(epoch_t epoch_queued) final;
 
-  void reset_epoch(epoch_t epoch_queued) final;
-
   /**
    * we allow some number of preemptions of the scrub, which mean we do
    * not block. Then we start to block. Once we start blocking, we do
@@ -234,6 +232,7 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
   void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
   void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
   void handle_scrub_reserve_release(OpRequestRef op) final;
+  void discard_replica_reservations() final;
   void clear_scrub_reservations() final;  // PG::clear... fwds to here
   void unreserve_replicas() final;
 
@@ -422,9 +421,14 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
 
   ScrubMap clean_meta_map();
 
-  void run_callbacks();
+  /**
+   * record some parameters of the initiated scrub:
+   * - the epoch when started;
+   * - the depth of the scrub requested (from the PG_STATE variable)
+   */
+  void reset_epoch(epoch_t epoch_queued);
 
-  void send_interval_changed();
+  void run_callbacks();
 
   // ----- methods used to verify the relevance of incoming events:
 
@@ -457,12 +461,17 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
   * check the 'no scrub' configuration options.
   */
  [[nodiscard]] bool should_abort() const;
+
+  /**
+   * Check the 'no scrub' configuration flags.
+   *
+   * Reset everything if the abort was not handled before.
+   * @returns false if the message was discarded due to abort flag.
+   */
  [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify);
 
-  bool check_interval(epoch_t epoch_to_verify);
-  bool check_interval_replica(epoch_t epoch_to_verify);
+  [[nodiscard]] bool check_interval(epoch_t epoch_to_verify);
 
-  epoch_t m_last_dead_interval{};
  epoch_t m_last_aborted{};  // last time we've noticed a request to abort
 
diff --git a/src/osd/scrub_machine.cc b/src/osd/scrub_machine.cc
index 271e1362fc778..9ae3e873f9575 100644
--- a/src/osd/scrub_machine.cc
+++ b/src/osd/scrub_machine.cc
@@ -75,12 +75,6 @@ NotActive::NotActive(my_context ctx) : my_base(ctx)
   dout(10) << "-- state -->> NotActive" << dendl;
 }
 
-sc::result NotActive::react(const IntervalChanged&)
-{
-  dout(15) << "NotActive::react(const IntervalChanged&)" << dendl;
-  return discard_event();
-}
-
 // ----------------------- ReservingReplicas ---------------------------------
 
 ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx)
@@ -426,16 +420,6 @@ ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx)
   scrbr->on_replica_init();
 }
 
-sc::result ReplicaWaitUpdates::react(const IntervalChanged&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "ReplicaWaitUpdates::react(const IntervalChanged&)" << dendl;
-
-  // note: the master's reservation of us was just discarded by our caller
-  scrbr->replica_handling_done();
-  return transit();
-}
-
 /*
 * Triggered externally, by the entity that had an update re pushes
 */
@@ -517,16 +501,6 @@ sc::result ActiveReplica::react(const SchedReplica&)
   return transit();
 }
 
-sc::result ActiveReplica::react(const IntervalChanged&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "ActiveReplica::react(const IntervalChanged&) " << dendl;
-
-  scrbr->send_replica_map(PreemptionNoted::no_preemption);
-  scrbr->replica_handling_done();
-  return transit();
-}
-
 /**
 * the event poster is handling the scrubber reset
 */
diff --git a/src/osd/scrub_machine.h b/src/osd/scrub_machine.h
index 7bbb7db37fe18..95035389e9ab6 100644
--- a/src/osd/scrub_machine.h
+++ b/src/osd/scrub_machine.h
@@ -54,8 +54,6 @@ void on_event_discard(std::string_view nm);
 MEV(RemotesReserved)  ///< all replicas have granted our reserve request
 MEV(ReservationFailure)  ///< a reservation request has failed
 
-MEV(IntervalChanged)  ///< ... from what it was when this chunk started
-
 MEV(StartScrub)  ///< initiate a new scrubbing session (relevant if we are a Primary)
 MEV(AfterRepairScrub)  ///< initiate a new scrubbing session. Only triggered at Recovery
                        ///< completion.
@@ -130,15 +128,12 @@ class ScrubMachine : public sc::state_machine {
 struct NotActive : sc::state {
   explicit NotActive(my_context ctx);
 
-  using reactions = mpl::list,
-                    sc::transition,
+  using reactions = mpl::list,
                     // a scrubbing that was initiated at recovery completion,
                     // and requires no resource reservations:
                     sc::transition,
                     sc::transition,
                    sc::transition>;
-
-  sc::result react(const IntervalChanged&);
 };
 
 struct ReservingReplicas : sc::state {
@@ -299,24 +294,20 @@ struct WaitDigestUpdate : sc::state {
 */
 struct ReplicaWaitUpdates : sc::state {
   explicit ReplicaWaitUpdates(my_context ctx);
-  using reactions = mpl::list,
-                    sc::custom_reaction,
-                    sc::custom_reaction>;
+  using reactions =
+    mpl::list, sc::custom_reaction>;
 
   sc::result react(const ReplicaPushesUpd&);
-  sc::result react(const IntervalChanged&);
   sc::result react(const FullReset&);
 };
 
 
 struct ActiveReplica : sc::state {
   explicit ActiveReplica(my_context ctx);
-  using reactions = mpl::list,
-                    sc::custom_reaction,
-                    sc::custom_reaction>;
+  using reactions =
+    mpl::list, sc::custom_reaction>;
 
   sc::result react(const SchedReplica&);
-  sc::result react(const IntervalChanged&);
   sc::result react(const FullReset&);
 };
 
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h
index 510c428546948..15a6cdf4dede4 100644
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -116,9 +116,9 @@ struct ScrubPgIF {
 
   // --------------- triggering state-machine events:
 
-  virtual void send_start_scrub(epoch_t epoch_queued) = 0;
+  virtual void initiate_regular_scrub(epoch_t epoch_queued) = 0;
 
-  virtual void send_start_after_repair(epoch_t epoch_queued) = 0;
+  virtual void initiate_scrub_after_repair(epoch_t epoch_queued) = 0;
 
   virtual void send_scrub_resched(epoch_t epoch_queued) = 0;
 
@@ -140,8 +140,6 @@ struct ScrubPgIF {
 
   // --------------------------------------------------
 
-  virtual void reset_epoch(epoch_t epoch_queued) = 0;
-
   [[nodiscard]] virtual bool are_callbacks_pending() const = 0;  // currently only used for an assert
 
@@ -230,9 +228,17 @@ struct ScrubPgIF {
   */
  virtual void unreserve_replicas() = 0;
 
+  /**
+   * "forget" all replica reservations. No messages are sent to the
+   * previously-reserved replicas.
+   *
+   * Used upon interval change. The replicas' state is guaranteed to
+   * be reset separately by the interval-change event.
+   */
+  virtual void discard_replica_reservations() = 0;
+
   /**
   * clear both local and OSD-managed resource reservation flags
-   * (note: no replica un/reservation messages are involved!)
   */
  virtual void clear_scrub_reservations() = 0;
 
-- 
2.39.5
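
---
Editor's note (illustration only; not part of the patch): the refactor converges on two
small patterns -- a guard-then-dispatch shape in the new initiate_*() entry points
(verify the queued epoch against the current interval, reset per-scrub state, then feed
the FSM), and a "forget, don't message" discard of replica reservations on interval
change. The sketch below is a self-contained C++ mock of those two patterns. Every name
in it (Scrubber, Reservations, initiate(), m_interval_since, ...) is a placeholder
invented for the example; none of it is the Ceph code, and the real epoch/interval
bookkeeping is far richer than the plain integers used here.

#include <cstdint>
#include <iostream>
#include <optional>

using epoch_t = uint32_t;

struct Reservations {
  // the real object would track per-replica grants; here it only logs
  void discard_all() { std::cout << "reservations forgotten (no messages sent)\n"; }
};

class Scrubber {
 public:
  explicit Scrubber(epoch_t interval_start) : m_interval_since{interval_start} {}

  // mirrors the simplified check_interval(): a message is stale iff it was
  // queued before the current interval started
  bool check_interval(epoch_t queued) const { return queued >= m_interval_since; }

  // guard-then-dispatch, in the spirit of initiate_regular_scrub()
  void initiate(epoch_t queued) {
    if (!check_interval(queued)) {
      // the interval-change handler is assumed to have already torn the old scrub down
      std::cout << "stale event from epoch " << queued << " discarded\n";
      return;
    }
    m_epoch_start = queued;  // the 'reset_epoch()' step
    std::cout << "dispatching StartScrub for epoch " << queued << "\n";
  }

  // mirrors discard_replica_reservations(): forget the grants, send nothing
  void discard_replica_reservations() {
    if (m_reservations.has_value()) {
      m_reservations->discard_all();
    }
  }

 private:
  epoch_t m_interval_since;
  epoch_t m_epoch_start{0};
  std::optional<Reservations> m_reservations{Reservations{}};
};

int main() {
  Scrubber s{/*interval_start=*/100};
  s.initiate(95);   // queued before the current interval -> silently discarded
  s.initiate(102);  // current interval -> dispatched
  s.discard_replica_reservations();
}

The contrast the second pattern illustrates: unreserve_replicas() is the "polite" path
that informs the replicas, whereas discard_replica_reservations() is used on interval
change, where (per the comment added to scrubber_common.h) each replica's state is
guaranteed to be reset by the interval-change event itself, so no messages are needed.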