From c55c6479f966e0d1731c2ddcd401b0ec7275b4a8 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Sun, 16 Jun 2024 10:49:27 -0500 Subject: [PATCH] osd/scrub: passing the scrub-job copy through the scrubber moving the scrubbed sjob copy through the scrubber - from scrub session initiation to its termination (or abort - where we use the handed-over "old version" of the sjob to update the new one). Note that in this version - not all the information that was used to determine the specifics of the initiated scrub is passed to the scrubber and back. In this half-baked stage of the refactoring, the resulting handling of corner cases, which still relies on the "planned scrub" flags, is less than optimal. The next step (dual targets, replacing the 'planned scrub' flags with specific attributes in the scheduling target) fixes this. Signed-off-by: Ronen Friedman --- src/osd/PG.cc | 3 ++- src/osd/PG.h | 3 ++- src/osd/scrubber/osd_scrub.cc | 41 ++++++++++++++++++--------------- src/osd/scrubber/osd_scrub.h | 2 +- src/osd/scrubber/pg_scrubber.cc | 6 +++++ src/osd/scrubber/pg_scrubber.h | 12 +++++++++- src/osd/scrubber_common.h | 9 +++++++- 7 files changed, 53 insertions(+), 23 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index fd2a504ddaf98..a8b9d41e1dab2 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1325,6 +1325,7 @@ unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsig Scrub::schedule_result_t PG::start_scrubbing( + std::unique_ptr<Scrub::ScrubJob> candidate, Scrub::OSDRestrictions osd_restrictions) { dout(10) << fmt::format( @@ -1348,7 +1349,7 @@ Scrub::schedule_result_t PG::start_scrubbing( get_pgbackend()->auto_repair_supported()); return m_scrubber->start_scrub_session( - osd_restrictions, pg_cond, m_planned_scrub); + std::move(candidate), osd_restrictions, pg_cond, m_planned_scrub); } double PG::next_deepscrub_interval() const diff --git a/src/osd/PG.h b/src/osd/PG.h index 68aa160a94941..444f592548667 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ 
-701,7 +701,8 @@ public: bool get_must_scrub() const; Scrub::schedule_result_t start_scrubbing( - Scrub::OSDRestrictions osd_restrictions); + std::unique_ptr<Scrub::ScrubJob> candidate, + Scrub::OSDRestrictions osd_restrictions); unsigned int scrub_requeue_priority( Scrub::scrub_prio_t with_priority, diff --git a/src/osd/scrubber/osd_scrub.cc b/src/osd/scrubber/osd_scrub.cc index c2421c6f4f2da..a11777a040a32 100644 --- a/src/osd/scrubber/osd_scrub.cc +++ b/src/osd/scrubber/osd_scrub.cc @@ -97,12 +97,6 @@ void OsdScrub::debug_log_all_jobs() const void OsdScrub::initiate_scrub(bool is_recovery_active) { - const utime_t scrub_time = ceph_clock_now(); - dout(10) << fmt::format( - "time now:{:s}, recovery is active?:{}", scrub_time, - is_recovery_active) - << dendl; - if (auto blocked_pgs = get_blocked_pgs_count(); blocked_pgs > 0) { // some PGs managed by this OSD were blocked by a locked object during // scrub. This means we might not have the resources needed to scrub now. @@ -113,12 +107,19 @@ void OsdScrub::initiate_scrub(bool is_recovery_active) << dendl; } + const utime_t scrub_time = ceph_clock_now(); + // check the OSD-wide environment conditions (scrub resources, time, etc.). // These may restrict the type of scrubs we are allowed to start, or just // prevent us from starting any non-operator-initiated scrub at all. - auto env_restrictions = + const auto env_restrictions = restrictions_on_scrubbing(is_recovery_active, scrub_time); + dout(10) << fmt::format("scrub scheduling (@tick) starts. 
" "time now:{:s}, recovery is active?:{} restrictions:{}", + scrub_time, is_recovery_active, env_restrictions) + << dendl; + if (g_conf()->subsys.should_gather() && !env_restrictions.high_priority_only) { debug_log_all_jobs(); @@ -130,7 +131,8 @@ void OsdScrub::initiate_scrub(bool is_recovery_active) return; } - auto res = initiate_a_scrub(candidate->pgid, env_restrictions); + auto candidate_pg = candidate->pgid; + auto res = initiate_a_scrub(std::move(candidate), env_restrictions); switch (res) { case schedule_result_t::target_specific_failure: @@ -140,8 +142,8 @@ void OsdScrub::initiate_scrub(bool is_recovery_active) break; case schedule_result_t::scrub_initiated: - dout(20) << fmt::format("scrub initiated for pg[{}]", candidate->pgid) - << dendl; + dout(20) << fmt::format("scrub initiated for pg[{}]", candidate_pg) + << dendl; break; } } @@ -193,24 +195,27 @@ Scrub::OSDRestrictions OsdScrub::restrictions_on_scrubbing( Scrub::schedule_result_t OsdScrub::initiate_a_scrub( - spg_t pgid, + std::unique_ptr<Scrub::ScrubJob> candidate, Scrub::OSDRestrictions restrictions) { - dout(20) << fmt::format("trying pg[{}]", pgid) << dendl; + dout(20) << fmt::format("trying pg[{}]", candidate->pgid) << dendl; // we have a candidate to scrub. We need some PG information to // know if scrubbing is allowed - auto locked_pg = m_osd_svc.get_locked_pg(pgid); + auto locked_pg = m_osd_svc.get_locked_pg(candidate->pgid); if (!locked_pg) { - // the PG was dequeued in the short timespan between creating the - // candidates list (ready_to_scrub()) and here - dout(5) << fmt::format("pg[{}] not found", pgid) << dendl; + // the PG was dequeued in the short timespan between querying the + // scrub queue - and now. 
+ dout(5) << fmt::format("pg[{}] not found", candidate->pgid) << dendl; return Scrub::schedule_result_t::target_specific_failure; } - // later on, here is where the scrub target would be dequeued - return locked_pg->pg()->start_scrubbing(restrictions); + // note: the 'candidate', which in this step is a copy of the scrub job, + // was already dequeued. The "original" scrub job cannot be accessed from + // here directly. Thus - we leave it to start_scrubbing() (via a call + // to PgScrubber::start_scrub_session()) to mark it as dequeued. + return locked_pg->pg()->start_scrubbing(std::move(candidate), restrictions); } diff --git a/src/osd/scrubber/osd_scrub.h b/src/osd/scrubber/osd_scrub.h index e07641e0cef69..535282a2580f5 100644 --- a/src/osd/scrubber/osd_scrub.h +++ b/src/osd/scrubber/osd_scrub.h @@ -153,7 +153,7 @@ class OsdScrub { * initiated, and if not - why. */ Scrub::schedule_result_t initiate_a_scrub( - spg_t pgid, + std::unique_ptr<Scrub::ScrubJob> candidate, Scrub::OSDRestrictions restrictions); /// resource reservation management diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index fd64694b6490b..e0bc082f7af6b 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -569,6 +569,7 @@ void PgScrubber::on_primary_active_clean() m_fsm->process_event(PrimaryActivate{}); } + /* * A note re the call to publish_stats_to_osd() below: * - we are called from either request_rescrubbing() or scrub_requested(). 
@@ -2093,10 +2094,13 @@ void PgScrubber::requeue_penalized(Scrub::delay_cause_t cause) Scrub::schedule_result_t PgScrubber::start_scrub_session( + std::unique_ptr<Scrub::ScrubJob> candidate, Scrub::OSDRestrictions osd_restrictions, Scrub::ScrubPGPreconds pg_cond, const requested_scrub_t& requested_flags) { + m_scrub_job->target_queued = false; + if (is_queued_or_active()) { // not a real option when the queue entry is the whole ScrubJob, but // will be possible when using level-specific targets @@ -2104,6 +2108,8 @@ Scrub::schedule_result_t PgScrubber::start_scrub_session( return schedule_result_t::target_specific_failure; } + m_active_target = std::move(candidate); + // for all other failures - we must reinstate our entry in the Scrub Queue if (!is_primary() || !m_pg->is_active() || !m_pg->is_clean()) { dout(10) << __func__ << ": cannot scrub (not a clean and active primary)" diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index f2e5d847b1b9c..2255ad8ce0ec0 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -188,8 +188,9 @@ class PgScrubber : public ScrubPgIF, [[nodiscard]] bool is_reserving() const final; Scrub::schedule_result_t start_scrub_session( + std::unique_ptr<Scrub::ScrubJob> candidate, Scrub::OSDRestrictions osd_restrictions, - Scrub::ScrubPGPreconds, + Scrub::ScrubPGPreconds pg_cond, const requested_scrub_t& requested_flags) final; void initiate_regular_scrub(epoch_t epoch_queued) final; @@ -731,6 +732,15 @@ class PgScrubber : public ScrubPgIF, */ bool m_queued_or_active{false}; + /** + * A copy of the specific scheduling target (either shallow_target or + * deep_target in the scrub_job) that was selected for this active scrub + * session. + * \ATTN: in this initial step - a copy of the whole scrub-job is passed + * around. 
Later on this would be just a part of a Scrub::SchedTarget + */ + std::unique_ptr<Scrub::ScrubJob> m_active_target; + eversion_t m_subset_last_update{}; std::unique_ptr m_store; diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h index 6f53e95eaf63c..f7ac9a867fe73 100644 --- a/src/osd/scrubber_common.h +++ b/src/osd/scrubber_common.h @@ -23,6 +23,7 @@ using ScrubTimePoint = ScrubClock::time_point; namespace Scrub { class ReplicaReservations; struct ReplicaActive; + class ScrubJob; } /// reservation-related data sent by the primary to the replicas, @@ -421,6 +422,11 @@ struct ScrubPgIF { /** * attempt to initiate a scrub session. + * @param candidate the scrub job to start. Later on - this will be the + * specific queue entry (that carries the information about the level, + * priority, etc. of the scrub that should be initiated on this PG). + * This parameter is saved by the scrubber for the whole duration of + * the scrub session (to be used if the scrub is aborted). * @param osd_restrictions limitations on the types of scrubs that can * be initiated on this OSD at this time. * @param preconds the PG state re scrubbing at the time of the request, @@ -432,8 +438,9 @@ struct ScrubPgIF { * external reasons. */ virtual Scrub::schedule_result_t start_scrub_session( + std::unique_ptr<Scrub::ScrubJob> candidate, Scrub::OSDRestrictions osd_restrictions, - Scrub::ScrubPGPreconds, + Scrub::ScrubPGPreconds pg_cond, const requested_scrub_t& requested_flags) = 0; virtual void set_op_parameters(const requested_scrub_t&) = 0; -- 2.39.5