From: Ronen Friedman Date: Thu, 28 Dec 2023 19:41:19 +0000 (-0600) Subject: osd/scrub: avoid "over clearing" queued_or_active flag X-Git-Tag: v19.3.0~274^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F54996%2Fhead;p=ceph.git osd/scrub: avoid "over clearing" queued_or_active flag If two StartScrub messages are received in quick succession, the earlier one might clear the queued_or_active flag as it fails for being from an old interval. When that happens - a 3'rd scrub request will actually be allowed to go through, while the scrubber is still handling the second one. Signed-off-by: Ronen Friedman --- diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index ab9b86e5b35..f7e027a84f0 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -208,17 +208,18 @@ bool PgScrubber::should_abort() const void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued) { - dout(15) << __func__ << " epoch: " << epoch_queued << dendl; + dout(10) << fmt::format( + "{}: epoch:{} is PrimaryIdle:{}", __func__, epoch_queued, + m_fsm->is_primary_idle()) + << dendl; + // we may have lost our Primary status while the message languished in the // queue if (check_interval(epoch_queued)) { dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued << dendl; - reset_epoch(epoch_queued); m_fsm->process_event(StartScrub{}); dout(10) << "scrubber event --<< StartScrub" << dendl; - } else { - clear_queued_or_active(); // also restarts snap trimming } } @@ -229,17 +230,17 @@ void PgScrubber::advance_token() void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued) { - dout(15) << __func__ << " epoch: " << epoch_queued << dendl; + dout(10) << fmt::format( + "{}: epoch:{} is PrimaryIdle:{}", __func__, epoch_queued, + m_fsm->is_primary_idle()) + << dendl; // we may have lost our Primary status while the message languished in the // queue if (check_interval(epoch_queued)) { dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued << dendl; - reset_epoch(epoch_queued); m_fsm->process_event(AfterRepairScrub{}); dout(10) << "scrubber event --<< AfterRepairScrub" << dendl; - } else { - clear_queued_or_active(); // also restarts snap trimming } } @@ -403,13 +404,13 @@ bool PgScrubber::is_reserving() const return m_fsm->is_reserving(); } -void PgScrubber::reset_epoch(epoch_t epoch_queued) +void PgScrubber::reset_epoch() { dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl; m_fsm->assert_not_in_session(); - m_epoch_start = epoch_queued; + m_epoch_start = m_pg->get_same_interval_since(); ceph_assert(m_is_deep == state_test(PG_STATE_DEEP_SCRUB)); update_op_mode_text(); } diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index 8360b4c038f..adf50f97d92 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -556,7 +556,7 @@ class PgScrubber : public ScrubPgIF, * - the epoch when started; * - the depth of the scrub requested (from the PG_STATE variable) */ - void reset_epoch(epoch_t epoch_queued); + void reset_epoch() final; void run_callbacks(); diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index 26054bf3f76..9aa5842c490 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -57,6 +57,11 @@ bool ScrubMachine::is_reserving() const return state_cast(); } +bool ScrubMachine::is_primary_idle() const +{ + return state_cast(); +} + bool ScrubMachine::is_accepting_updates() const { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases @@ -150,6 +155,7 @@ sc::result PrimaryIdle::react(const StartScrub&) { dout(10) << "PrimaryIdle::react(const StartScrub&)" << dendl; DECLARE_LOCALS; + scrbr->reset_epoch(); return transit(); } @@ -157,6 +163,7 @@ sc::result PrimaryIdle::react(const AfterRepairScrub&) { dout(10) << "PrimaryIdle::react(const AfterRepairScrub&)" << dendl; DECLARE_LOCALS; + scrbr->reset_epoch(); return transit(); } diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index fbd8f8b3a35..8fa96da1405 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -244,6 +244,7 @@ class ScrubMachine : public sc::state_machine { void assert_not_in_session() const; [[nodiscard]] bool is_reserving() const; [[nodiscard]] bool is_accepting_updates() const; + [[nodiscard]] bool is_primary_idle() const; // elapsed time for the currently active scrub.session ceph::timespan get_time_scrubbing() const; diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h index 086802ee813..f2ddd9c8716 100644 --- a/src/osd/scrubber/scrub_machine_lstnr.h +++ b/src/osd/scrubber/scrub_machine_lstnr.h @@ -212,6 +212,9 @@ struct ScrubMachineListener { virtual void set_queued_or_active() = 0; virtual void clear_queued_or_active() = 0; + /// note the epoch when the scrub session started + virtual void reset_epoch() = 0; + /** * Our scrubbing is blocked, waiting for an excessive length of time for * our target chunk to be unlocked. We will set the corresponding flags,