From: Ronen Friedman Date: Thu, 7 Dec 2023 09:37:57 +0000 (-0600) Subject: osd/scrub: expose h.p. scrub jobs in the queue X-Git-Tag: v19.3.0~331^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ea15ee08c3c7a4ed591ee74e8943017d079bc060;p=ceph.git osd/scrub: expose h.p. scrub jobs in the queue - a high-priority bit is added to the scrub job - a scrub scheduling attempt will be performed even if only high-priority jobs should be allowed to run Note: some of the changes in this PR are temporary, in the sense that they would be made obsolete when the scrub scheduler is refactored. Signed-off-by: Ronen Friedman --- diff --git a/src/osd/scrubber/osd_scrub.cc b/src/osd/scrubber/osd_scrub.cc index 99367170dbac..536c4479b1d3 100644 --- a/src/osd/scrubber/osd_scrub.cc +++ b/src/osd/scrubber/osd_scrub.cc @@ -69,10 +69,11 @@ bool OsdScrub::scrub_random_backoff() const void OsdScrub::initiate_scrub(bool is_recovery_active) { - if (scrub_random_backoff()) { - // dice-roll says we should not scrub now - return; - } + const utime_t scrub_time = ceph_clock_now(); + dout(10) << fmt::format( + "time now:{}, recover is active?:{}", scrub_time, + is_recovery_active) + << dendl; if (auto blocked_pgs = get_blocked_pgs_count(); blocked_pgs > 0) { // some PGs managed by this OSD were blocked by a locked object during @@ -84,35 +85,14 @@ void OsdScrub::initiate_scrub(bool is_recovery_active) << dendl; } - // fail fast if no resources are available - if (!m_resource_bookkeeper.can_inc_scrubs()) { - dout(20) << "too many scrubs already running on this OSD" << dendl; - return; - } - - // if there is a PG that is just now trying to reserve scrub replica resources - - // we should wait and not initiate a new scrub - if (m_queue.is_reserving_now()) { - dout(10) << "scrub resources reservation in progress" << dendl; - return; - } - - utime_t scrub_time = ceph_clock_now(); - dout(10) << fmt::format( - "time now:{}, recover is active?:{}", scrub_time, - 
is_recovery_active) - << dendl; - // check the OSD-wide environment conditions (scrub resources, time, etc.). // These may restrict the type of scrubs we are allowed to start, or just - // prevent us from starting any scrub at all. + // prevent us from starting any non-operator-initiated scrub at all. auto env_restrictions = restrictions_on_scrubbing(is_recovery_active, scrub_time); - if (!env_restrictions) { - return; - } - if (g_conf()->subsys.should_gather()) { + if (g_conf()->subsys.should_gather() && + !env_restrictions.high_priority_only) { dout(20) << "scrub scheduling (@tick) starts" << dendl; auto all_jobs = m_queue.list_registered_jobs(); for (const auto& sj : all_jobs) { @@ -124,7 +104,7 @@ void OsdScrub::initiate_scrub(bool is_recovery_active) // queue interface used here: we ask for a list of // eligible targets (based on the known restrictions). // We try all elements of this list until a (possibly temporary) success. - auto candidates = m_queue.ready_to_scrub(*env_restrictions, scrub_time); + auto candidates = m_queue.ready_to_scrub(env_restrictions, scrub_time); if (candidates.empty()) { dout(20) << "no PGs are ready for scrubbing" << dendl; return; @@ -137,7 +117,7 @@ void OsdScrub::initiate_scrub(bool is_recovery_active) // scrub. For some failures - we can continue with the next candidate. For // others - we should stop trying to scrub at this tick. auto res = initiate_a_scrub( - candidate, env_restrictions->allow_requested_repair_only); + candidate, env_restrictions.allow_requested_repair_only); if (res == schedule_result_t::target_specific_failure) { // continue with the next job. 
@@ -157,39 +137,51 @@ void OsdScrub::initiate_scrub(bool is_recovery_active) } -std::optional<Scrub::OSDRestrictions> OsdScrub::restrictions_on_scrubbing( +Scrub::OSDRestrictions OsdScrub::restrictions_on_scrubbing( bool is_recovery_active, utime_t scrub_clock_now) const { - // our local OSD may already be running too many scrubs + Scrub::OSDRestrictions env_conditions; + + // some environmental conditions prevent all but high priority scrubs + if (!m_resource_bookkeeper.can_inc_scrubs()) { - dout(10) << "OSD cannot inc scrubs" << dendl; - return std::nullopt; - } + // our local OSD is already running too many scrubs + dout(15) << "OSD cannot inc scrubs" << dendl; + env_conditions.high_priority_only = true; - // if there is a PG that is just now trying to reserve scrub replica resources - // - we should wait and not initiate a new scrub - if (m_queue.is_reserving_now()) { + } else if (scrub_random_backoff()) { + // dice-roll says we should not scrub now + dout(15) << "Lost in dice. Only high priority scrubs allowed." 
+ << dendl; + env_conditions.high_priority_only = true; + + } else if (m_queue.is_reserving_now()) { + // if there is a PG that is just now trying to reserve scrub replica + // resources - we should wait and not initiate a new scrub dout(10) << "scrub resources reservation in progress" << dendl; - return std::nullopt; - } + env_conditions.high_priority_only = true; - Scrub::OSDRestrictions env_conditions; - env_conditions.time_permit = scrub_time_permit(scrub_clock_now); - env_conditions.load_is_low = m_load_tracker.scrub_load_below_threshold(); - env_conditions.only_deadlined = - !env_conditions.time_permit || !env_conditions.load_is_low; - - if (is_recovery_active && !conf->osd_scrub_during_recovery) { - if (!conf->osd_repair_during_recovery) { - dout(15) << "not scheduling scrubs due to active recovery" << dendl; - return std::nullopt; + } else if (is_recovery_active && !conf->osd_scrub_during_recovery) { + if (conf->osd_repair_during_recovery) { + dout(15) + << "will only schedule explicitly requested repair due to active " + "recovery" + << dendl; + env_conditions.allow_requested_repair_only = true; + + } else { + dout(15) << "recovery in progress. Only high priority scrubs allowed." + << dendl; + env_conditions.high_priority_only = true; } + } else { - dout(10) << "will only schedule explicitly requested repair due to active " - "recovery" - << dendl; - env_conditions.allow_requested_repair_only = true; + // regular, i.e. 
non-high-priority scrubs are allowed + env_conditions.time_permit = scrub_time_permit(scrub_clock_now); + env_conditions.load_is_low = m_load_tracker.scrub_load_below_threshold(); + env_conditions.only_deadlined = + !env_conditions.time_permit || !env_conditions.load_is_low; } return env_conditions; diff --git a/src/osd/scrubber/osd_scrub.h b/src/osd/scrubber/osd_scrub.h index 56167df2ee6e..fcc4fd3fe9c5 100644 --- a/src/osd/scrubber/osd_scrub.h +++ b/src/osd/scrubber/osd_scrub.h @@ -168,20 +168,17 @@ class OsdScrub { /** * check the OSD-wide environment conditions (scrub resources, time, etc.). - * These may restrict the type of scrubs we are allowed to start, or just - * prevent us from starting any scrub at all. + * These may restrict the type of scrubs we are allowed to start, maybe + * down to allowing only high-priority scrubs * * Specifically: - * a nullopt is returned if we are not allowed to scrub at all, for either of + * 'only high priority' flag is set for either of * the following reasons: no local resources (too many scrubs on this OSD); * a dice roll says we will not scrub in this tick; * a recovery is in progress, and we are not allowed to scrub while recovery; * a PG is trying to acquire replica resources. - * - * If we are allowed to scrub, the returned value specifies whether the only - * high priority scrubs or only overdue ones are allowed to go on. 
*/ - std::optional<Scrub::OSDRestrictions> restrictions_on_scrubbing( + Scrub::OSDRestrictions restrictions_on_scrubbing( bool is_recovery_active, utime_t scrub_clock_now) const; diff --git a/src/osd/scrubber/osd_scrub_sched.cc b/src/osd/scrubber/osd_scrub_sched.cc index 324899f29ab8..1b3506a35e50 100644 --- a/src/osd/scrubber/osd_scrub_sched.cc +++ b/src/osd/scrubber/osd_scrub_sched.cc @@ -149,6 +149,7 @@ void ScrubQueue::update_job(Scrub::ScrubJobRef scrub_job, // adjust the suggested scrub time according to OSD-wide status auto adjusted = adjust_target_time(suggested); scrub_job->update_schedule(adjusted); + scrub_job->high_priority = suggested.is_must == must_scrub_t::mandatory; } sched_params_t ScrubQueue::determine_scrub_time( @@ -299,11 +300,13 @@ void ScrubQueue::rm_unregistered_jobs(ScrubQContainer& group) } namespace { -struct cmp_sched_time_t { - bool operator()(const Scrub::ScrubJobRef& lhs, - const Scrub::ScrubJobRef& rhs) const +struct cmp_time_n_priority_t { + bool operator()(const Scrub::ScrubJobRef& lhs, const Scrub::ScrubJobRef& rhs) + const { - return lhs->schedule.scheduled_at < rhs->schedule.scheduled_at; + return lhs->is_high_priority() > rhs->is_high_priority() || + (lhs->is_high_priority() == rhs->is_high_priority() && + lhs->schedule.scheduled_at < rhs->schedule.scheduled_at); } }; } // namespace @@ -314,11 +317,11 @@ ScrubQContainer ScrubQueue::collect_ripe_jobs( OSDRestrictions restrictions, utime_t time_now) { - auto filtr = [time_now, restrictions](const auto& jobref) -> bool { + auto filtr = [time_now, rst = restrictions](const auto& jobref) -> bool { return jobref->schedule.scheduled_at <= time_now && - (!restrictions.only_deadlined || - (!jobref->schedule.deadline.is_zero() && - jobref->schedule.deadline <= time_now)); + (!rst.high_priority_only || jobref->high_priority) && + (!rst.only_deadlined || (!jobref->schedule.deadline.is_zero() && + jobref->schedule.deadline <= time_now)); }; rm_unregistered_jobs(group); @@ -327,7 +330,7 @@ ScrubQContainer 
ScrubQueue::collect_ripe_jobs( ripes.reserve(group.size()); std::copy_if(group.begin(), group.end(), std::back_inserter(ripes), filtr); - std::sort(ripes.begin(), ripes.end(), cmp_sched_time_t{}); + std::sort(ripes.begin(), ripes.end(), cmp_time_n_priority_t{}); if (g_conf()->subsys.should_gather()) { for (const auto& jobref : group) { diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h index ae29c8ebab44..11e7388f6362 100644 --- a/src/osd/scrubber/scrub_job.h +++ b/src/osd/scrubber/scrub_job.h @@ -93,6 +93,8 @@ class ScrubJob final : public RefCountedObject { CephContext* cct; + bool high_priority{false}; + ScrubJob(CephContext* cct, const spg_t& pg, int node_id); utime_t get_sched_time() const { return schedule.scheduled_at; } @@ -130,6 +132,12 @@ class ScrubJob final : public RefCountedObject { */ bool is_state_registered() const { return state == qu_state_t::registered; } + /** + * is this a high priority scrub job? + * High priority - (usually) a scrub that was initiated by the operator + */ + bool is_high_priority() const { return high_priority; } + /** * a text description of the "scheduling intentions" of this PG: * are we already scheduled for a scrub/deep scrub? when? diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h index d24bb79b801e..f39907f3f6b3 100644 --- a/src/osd/scrubber_common.h +++ b/src/osd/scrubber_common.h @@ -49,12 +49,16 @@ enum class scrub_prio_t : bool { low_priority = false, high_priority = true }; using act_token_t = uint32_t; /// "environment" preconditions affecting which PGs are eligible for scrubbing +/// (note: struct size should be kept small, as it is copied around) struct OSDRestrictions { + /// high local OSD concurrency. 
Thus - only high priority scrubs are allowed + bool high_priority_only{false}; bool allow_requested_repair_only{false}; - bool load_is_low{true}; - bool time_permit{true}; bool only_deadlined{false}; + bool load_is_low:1{true}; + bool time_permit:1{true}; }; +static_assert(sizeof(Scrub::OSDRestrictions) <= sizeof(uint32_t)); } // namespace Scrub @@ -68,7 +72,8 @@ struct formatter { { return fmt::format_to( ctx.out(), - "overdue-only:{} load:{} time:{} repair-only:{}", + "priority-only:{} overdue-only:{} load:{} time:{} repair-only:{}", + conds.high_priority_only, conds.only_deadlined, conds.load_is_low ? "ok" : "high", conds.time_permit ? "ok" : "no",