From: Ronen Friedman Date: Thu, 7 Dec 2023 14:55:30 +0000 (-0600) Subject: osd/scrub: fix scrub eligibility tests X-Git-Tag: v19.1.0~613^2~2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=fcfab283c0a2b5dcf22ef17b5469d2266f6887c6;p=ceph.git osd/scrub: fix scrub eligibility tests By: - removing duplicate checks; - moving most checks "down" to the PG; - allowing high-priority scrubs to override most limiting conditions. Signed-off-by: Ronen Friedman --- diff --git a/src/osd/PG.cc b/src/osd/PG.cc index ddef326e2a8ac..5ff1246427b7d 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1331,30 +1331,44 @@ unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsig // ========================================================================================== // SCRUB + /* * implementation note: - * PG::sched_scrub() is called only once per a specific scrub session. + * PG::start_scrubbing() is called only once per a specific scrub session. * That call commits us to the whatever choices are made (deep/shallow, etc'). * Unless failing to start scrubbing, the 'planned scrub' flag-set is 'frozen' into * PgScrubber's m_flags, then cleared. */ -Scrub::schedule_result_t PG::sched_scrub() +Scrub::schedule_result_t PG::start_scrubbing( + Scrub::OSDRestrictions osd_restrictions) { using Scrub::schedule_result_t; - dout(15) << __func__ << " pg(" << info.pgid - << (is_active() ? ") " : ") ") - << (is_clean() ? " " : " ") << dendl; + dout(10) << fmt::format( + "{}: {}+{} (env restrictions:{})", __func__, + (is_active() ? "" : ""), + (is_clean() ? "" : ""), osd_restrictions) + << dendl; ceph_assert(ceph_mutex_is_locked(_lock)); - ceph_assert(m_scrubber); - - if (is_scrub_queued_or_active()) { - dout(10) << __func__ << ": already scrubbing" << dendl; - return schedule_result_t::target_specific_failure; - } if (!is_primary() || !is_active() || !is_clean()) { dout(10) << __func__ << ": cannot scrub (not a clean and active primary)" - << dendl; + << dendl; + return schedule_result_t::target_specific_failure; + } + + ceph_assert(m_scrubber); + if (is_scrub_queued_or_active()) { + dout(10) << __func__ << ": scrub already in progress" << dendl; + return schedule_result_t::target_specific_failure; + } + // if only explicitly requested repairing is allowed - skip other types + // of scrubbing + if (osd_restrictions.allow_requested_repair_only && + !get_planned_scrub().must_repair) { + dout(10) << __func__ + << ": skipping this PG as repairing was not explicitly " + "requested for it" + << dendl; return schedule_result_t::target_specific_failure; } @@ -1366,9 +1380,9 @@ Scrub::schedule_result_t PG::sched_scrub() return schedule_result_t::target_specific_failure; } - // analyse the combination of the requested scrub flags, the osd/pool configuration - // and the PG status to determine whether we should scrub now, and what type of scrub - // should that be. + // analyze the combination of the requested scrub flags, the osd/pool + // configuration and the PG status to determine whether we should scrub + // now, and what type of scrub should that be. auto updated_flags = validate_scrub_mode(); if (!updated_flags) { // the stars do not align for starting a scrub for this PG at this time @@ -1391,15 +1405,17 @@ Scrub::schedule_result_t PG::sched_scrub() // An interrupted recovery repair could leave this set. state_clear(PG_STATE_REPAIR); - // Pass control to the scrubber. It is the scrubber that handles the replicas' - // resources reservations. + // Pass control to the scrubber. It is the scrubber that handles the + // replicas' resources reservations. m_scrubber->set_op_parameters(m_planned_scrub); + // using the OSD queue, as to not execute the scrub code as part of the tick. dout(10) << __func__ << ": queueing" << dendl; osd->queue_for_scrub(this, Scrub::scrub_prio_t::low_priority); return schedule_result_t::scrub_initiated; } + double PG::next_deepscrub_interval() const { double deep_scrub_interval = diff --git a/src/osd/PG.h b/src/osd/PG.h index 8713b1c8ae887..d9acfafd03285 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -709,11 +709,16 @@ public: virtual void on_shutdown() = 0; bool get_must_scrub() const; - Scrub::schedule_result_t sched_scrub(); - unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const; + Scrub::schedule_result_t start_scrubbing( + Scrub::OSDRestrictions osd_restrictions); + + unsigned int scrub_requeue_priority( + Scrub::scrub_prio_t with_priority, + unsigned int suggested_priority) const; /// the version that refers to flags_.priority unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const; + private: // auxiliaries used by sched_scrub(): double next_deepscrub_interval() const; diff --git a/src/osd/scrubber/osd_scrub.cc b/src/osd/scrubber/osd_scrub.cc index 536c4479b1d30..fb21e0e1c5ee9 100644 --- a/src/osd/scrubber/osd_scrub.cc +++ b/src/osd/scrubber/osd_scrub.cc @@ -71,7 +71,7 @@ void OsdScrub::initiate_scrub(bool is_recovery_active) { const utime_t scrub_time = ceph_clock_now(); dout(10) << fmt::format( - "time now:{}, recover is active?:{}", scrub_time, + "time now:{:s}, recovery is active?:{}", scrub_time, is_recovery_active) << dendl; @@ -116,8 +116,7 @@ void OsdScrub::initiate_scrub(bool is_recovery_active) // we have a candidate to scrub. But we may fail when trying to initiate that // scrub. For some failures - we can continue with the next candidate. For // others - we should stop trying to scrub at this tick. - auto res = initiate_a_scrub( - candidate, env_restrictions.allow_requested_repair_only); + auto res = initiate_a_scrub(candidate, env_restrictions); if (res == schedule_result_t::target_specific_failure) { // continue with the next job. @@ -190,7 +189,7 @@ Scrub::OSDRestrictions OsdScrub::restrictions_on_scrubbing( Scrub::schedule_result_t OsdScrub::initiate_a_scrub( spg_t pgid, - bool allow_requested_repair_only) + Scrub::OSDRestrictions restrictions) { dout(20) << fmt::format("trying pg[{}]", pgid) << dendl; @@ -205,23 +204,8 @@ Scrub::schedule_result_t OsdScrub::initiate_a_scrub( return Scrub::schedule_result_t::target_specific_failure; } - // This one is already scrubbing, so go on to the next scrub job - if (locked_pg->pg()->is_scrub_queued_or_active()) { - dout(10) << fmt::format("pg[{}]: scrub already in progress", pgid) << dendl; - return Scrub::schedule_result_t::target_specific_failure; - } - // Skip other kinds of scrubbing if only explicitly requested repairing is allowed - if (allow_requested_repair_only && - !locked_pg->pg()->get_planned_scrub().must_repair) { - dout(10) << fmt::format( - "skipping pg[{}] as repairing was not explicitly " - "requested for that pg", - pgid) - << dendl; - return Scrub::schedule_result_t::target_specific_failure; - } - - return locked_pg->pg()->sched_scrub(); + // later on, here is where the scrub target would be dequeued + return locked_pg->pg()->start_scrubbing(restrictions); } void OsdScrub::on_config_change() diff --git a/src/osd/scrubber/osd_scrub.h b/src/osd/scrubber/osd_scrub.h index fcc4fd3fe9c52..774076711e2d0 100644 --- a/src/osd/scrubber/osd_scrub.h +++ b/src/osd/scrubber/osd_scrub.h @@ -193,7 +193,7 @@ class OsdScrub { */ Scrub::schedule_result_t initiate_a_scrub( spg_t pgid, - bool allow_requested_repair_only); + Scrub::OSDRestrictions restrictions); /// resource reservation management Scrub::ScrubResources m_resource_bookkeeper;