From 6d6cc7145636c4482b81af02993b9d83043d3ec0 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Thu, 28 Dec 2023 13:41:19 -0600 Subject: [PATCH] osd/scrub: avoid "over clearing" queued_or_active flag If two StartScrub messages are received in quick succession, the earlier one might clear the queued_or_active flag as it fails for being from an old interval. When that happens - a 3'rd scrub request will actually be allowed to go through, while the scrubber is still handling the second one. Signed-off-by: Ronen Friedman --- src/osd/scrubber/pg_scrubber.cc | 21 +++++++++++---------- src/osd/scrubber/pg_scrubber.h | 2 +- src/osd/scrubber/scrub_machine.cc | 7 +++++++ src/osd/scrubber/scrub_machine.h | 1 + src/osd/scrubber/scrub_machine_lstnr.h | 3 +++ 5 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index ab9b86e5b35c..f7e027a84f07 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -208,17 +208,18 @@ bool PgScrubber::should_abort() const void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued) { - dout(15) << __func__ << " epoch: " << epoch_queued << dendl; + dout(10) << fmt::format( + "{}: epoch:{} is PrimaryIdle:{}", __func__, epoch_queued, + m_fsm->is_primary_idle()) + << dendl; + // we may have lost our Primary status while the message languished in the // queue if (check_interval(epoch_queued)) { dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued << dendl; - reset_epoch(epoch_queued); m_fsm->process_event(StartScrub{}); dout(10) << "scrubber event --<< StartScrub" << dendl; - } else { - clear_queued_or_active(); // also restarts snap trimming } } @@ -229,17 +230,17 @@ void PgScrubber::advance_token() void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued) { - dout(15) << __func__ << " epoch: " << epoch_queued << dendl; + dout(10) << fmt::format( + "{}: epoch:{} is PrimaryIdle:{}", __func__, epoch_queued, + m_fsm->is_primary_idle()) + << dendl; // we may have lost our Primary status while the message languished in the // queue if (check_interval(epoch_queued)) { dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued << dendl; - reset_epoch(epoch_queued); m_fsm->process_event(AfterRepairScrub{}); dout(10) << "scrubber event --<< AfterRepairScrub" << dendl; - } else { - clear_queued_or_active(); // also restarts snap trimming } } @@ -403,13 +404,13 @@ bool PgScrubber::is_reserving() const return m_fsm->is_reserving(); } -void PgScrubber::reset_epoch(epoch_t epoch_queued) +void PgScrubber::reset_epoch() { dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl; m_fsm->assert_not_in_session(); - m_epoch_start = epoch_queued; + m_epoch_start = m_pg->get_same_interval_since(); ceph_assert(m_is_deep == state_test(PG_STATE_DEEP_SCRUB)); update_op_mode_text(); } diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index 8360b4c038f7..adf50f97d92a 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -556,7 +556,7 @@ class PgScrubber : public ScrubPgIF, * - the epoch when started; * - the depth of the scrub requested (from the PG_STATE variable) */ - void reset_epoch(epoch_t epoch_queued); + void reset_epoch() final; void run_callbacks(); diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index 26054bf3f764..9aa5842c4908 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -57,6 +57,11 @@ bool ScrubMachine::is_reserving() const return state_cast(); } +bool ScrubMachine::is_primary_idle() const +{ + return state_cast(); +} + bool ScrubMachine::is_accepting_updates() const { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases @@ -150,6 +155,7 @@ sc::result PrimaryIdle::react(const StartScrub&) { dout(10) << "PrimaryIdle::react(const StartScrub&)" << dendl; DECLARE_LOCALS; + scrbr->reset_epoch(); return transit(); } @@ -157,6 +163,7 @@ sc::result PrimaryIdle::react(const AfterRepairScrub&) { dout(10) << "PrimaryIdle::react(const AfterRepairScrub&)" << dendl; DECLARE_LOCALS; + scrbr->reset_epoch(); return transit(); } diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index fbd8f8b3a359..8fa96da14052 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -244,6 +244,7 @@ class ScrubMachine : public sc::state_machine { void assert_not_in_session() const; [[nodiscard]] bool is_reserving() const; [[nodiscard]] bool is_accepting_updates() const; + [[nodiscard]] bool is_primary_idle() const; // elapsed time for the currently active scrub.session ceph::timespan get_time_scrubbing() const; diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h index 086802ee813e..f2ddd9c87168 100644 --- a/src/osd/scrubber/scrub_machine_lstnr.h +++ b/src/osd/scrubber/scrub_machine_lstnr.h @@ -212,6 +212,9 @@ struct ScrubMachineListener { virtual void set_queued_or_active() = 0; virtual void clear_queued_or_active() = 0; + /// note the epoch when the scrub session started + virtual void reset_epoch() = 0; + /** * Our scrubbing is blocked, waiting for an excessive length of time for * our target chunk to be unlocked. We will set the corresponding flags, -- 2.47.3