From d294ea80cc018dc3f893fcb2c4df9a1ad8bb6a04 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Sun, 31 Dec 2023 10:18:09 -0600 Subject: [PATCH] osd/scrub: introduce a 'not before' attribute for scrub jobs The NB enables the OSD to delay the next attempt to schedule a specific scrub job. This is useful for jobs that have failed for whatever reason, especially if the primary has failed to acquire the replicas. Signed-off-by: Ronen Friedman --- src/osd/scrubber/osd_scrub.cc | 5 +++-- src/osd/scrubber/osd_scrub.h | 10 +++++---- src/osd/scrubber/osd_scrub_sched.cc | 22 +++++++++++--------- src/osd/scrubber/osd_scrub_sched.h | 12 +++++++---- src/osd/scrubber/pg_scrubber.cc | 6 +++--- src/osd/scrubber/scrub_job.cc | 32 ++++++++++++++++++++--------- src/osd/scrubber/scrub_job.h | 19 ++++++++++++----- 7 files changed, 68 insertions(+), 38 deletions(-) diff --git a/src/osd/scrubber/osd_scrub.cc b/src/osd/scrubber/osd_scrub.cc index e5abe03feaad6..a721dc1cb9c04 100644 --- a/src/osd/scrubber/osd_scrub.cc +++ b/src/osd/scrubber/osd_scrub.cc @@ -429,9 +429,10 @@ Scrub::sched_params_t OsdScrub::determine_scrub_time( void OsdScrub::update_job( Scrub::ScrubJobRef sjob, - const Scrub::sched_params_t& suggested) + const Scrub::sched_params_t& suggested, + bool reset_notbefore) { - m_queue.update_job(sjob, suggested); + m_queue.update_job(sjob, suggested, reset_notbefore); } void OsdScrub::register_with_osd( diff --git a/src/osd/scrubber/osd_scrub.h b/src/osd/scrubber/osd_scrub.h index ce7b8524d69cb..407eae059079f 100644 --- a/src/osd/scrubber/osd_scrub.h +++ b/src/osd/scrubber/osd_scrub.h @@ -90,21 +90,23 @@ class OsdScrub { * the registration will be with "beginning of time" target, making the * scrub-job eligible to immediate scrub (given that external conditions * do not prevent scrubbing) - * * - 'must' is asserted, and the suggested time is 'now': * This happens if our stats are unknown. The results are similar to the * previous scenario. - * * - not a 'must': we take the suggested time as a basis, and add to it some * configuration / random delays. - * * ('must' is Scrub::sched_params_t.is_must) * + * 'reset_notbefore' is used to reset the 'not_before' time to the updated + * 'scheduled_at' time. This is used whenever the scrub-job schedule is + * updated not as a result of a scrub attempt failure. + * * locking: not using the jobs_lock */ void update_job( Scrub::ScrubJobRef sjob, - const Scrub::sched_params_t& suggested); + const Scrub::sched_params_t& suggested, + bool reset_notbefore); /** * Add the scrub job to the list of jobs (i.e. list of PGs) to be periodically diff --git a/src/osd/scrubber/osd_scrub_sched.cc b/src/osd/scrubber/osd_scrub_sched.cc index bb72ae4cb3fb9..836bf740ec087 100644 --- a/src/osd/scrubber/osd_scrub_sched.cc +++ b/src/osd/scrubber/osd_scrub_sched.cc @@ -94,7 +94,7 @@ void ScrubQueue::register_with_osd( switch (state_at_entry) { case qu_state_t::registered: // just updating the schedule? - update_job(scrub_job, suggested); + update_job(scrub_job, suggested, false /* keep n.b. delay */); break; case qu_state_t::not_registered: @@ -110,7 +110,7 @@ void ScrubQueue::register_with_osd( break; } - update_job(scrub_job, suggested); + update_job(scrub_job, suggested, true /* resets not_before */); to_scrub.push_back(scrub_job); scrub_job->in_queues = true; scrub_job->state = qu_state_t::registered; @@ -124,7 +124,7 @@ void ScrubQueue::register_with_osd( // at any minute std::lock_guard lck{jobs_lock}; - update_job(scrub_job, suggested); + update_job(scrub_job, suggested, true /* resets not_before */); if (scrub_job->state == qu_state_t::not_registered) { dout(5) << " scrub job state changed to 'not registered'" << dendl; to_scrub.push_back(scrub_job); @@ -138,18 +138,19 @@ void ScrubQueue::register_with_osd( dout(10) << fmt::format( "pg[{}] sched-state changed from <{:.14}> to <{:.14}> (@{:s})", scrub_job->pgid, state_at_entry, scrub_job->state.load(), - scrub_job->schedule.scheduled_at) + scrub_job->schedule.not_before) << dendl; } -// look mommy - no locks! + void ScrubQueue::update_job(Scrub::ScrubJobRef scrub_job, - const sched_params_t& suggested) + const sched_params_t& suggested, + bool reset_nb) { // adjust the suggested scrub time according to OSD-wide status auto adjusted = adjust_target_time(suggested); - scrub_job->update_schedule(adjusted); scrub_job->high_priority = suggested.is_must == must_scrub_t::mandatory; + scrub_job->update_schedule(adjusted, reset_nb); } sched_params_t ScrubQueue::determine_scrub_time( @@ -262,7 +263,7 @@ ScrubQContainer ScrubQueue::collect_ripe_jobs( utime_t time_now) { auto filtr = [time_now, rst = restrictions](const auto& jobref) -> bool { - return jobref->schedule.scheduled_at <= time_now && + return jobref->schedule.not_before <= time_now && (!rst.high_priority_only || jobref->high_priority) && (!rst.only_deadlined || (!jobref->schedule.deadline.is_zero() && jobref->schedule.deadline <= time_now)); @@ -280,7 +281,8 @@ ScrubQContainer ScrubQueue::collect_ripe_jobs( for (const auto& jobref : group) { if (!filtr(jobref)) { dout(20) << fmt::format( - " not ripe: {} @ {:s}", jobref->pgid, + " not ripe: {} @ {:s} ({:s})", jobref->pgid, + jobref->schedule.not_before, jobref->schedule.scheduled_at) << dendl; } @@ -295,7 +297,7 @@ Scrub::scrub_schedule_t ScrubQueue::adjust_target_time( const sched_params_t& times) const { Scrub::scrub_schedule_t sched_n_dead{ - times.proposed_time, times.proposed_time}; + times.proposed_time, times.proposed_time, times.proposed_time}; if (times.is_must == Scrub::must_scrub_t::not_mandatory) { // unless explicitly requested, postpone the scrub with a random delay diff --git a/src/osd/scrubber/osd_scrub_sched.h b/src/osd/scrubber/osd_scrub_sched.h index 83578dff7d935..d94ca8cf75799 100644 --- a/src/osd/scrubber/osd_scrub_sched.h +++ b/src/osd/scrubber/osd_scrub_sched.h @@ -202,19 +202,23 @@ class ScrubQueue { * the registration will be with "beginning of time" target, making the * scrub-job eligible to immediate scrub (given that external conditions * do not prevent scrubbing) - * * - 'must' is asserted, and the suggested time is 'now': * This happens if our stats are unknown. The results are similar to the * previous scenario. - * * - not a 'must': we take the suggested time as a basis, and add to it some * configuration / random delays. - * * ('must' is sched_params_t.is_must) * + * 'reset_notbefore' is used to reset the 'not_before' time to the updated + * 'scheduled_at' time. This is used whenever the scrub-job schedule is + * updated not as a result of a scrub attempt failure. + * * locking: not using the jobs_lock */ - void update_job(Scrub::ScrubJobRef sjob, const sched_params_t& suggested); + void update_job( + Scrub::ScrubJobRef sjob, + const sched_params_t& suggested, + bool reset_notbefore); sched_params_t determine_scrub_time(const requested_scrub_t& request_flags, const pg_info_t& pg_info, diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 2dae53273a705..9cd082a879470 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -547,7 +547,7 @@ void PgScrubber::update_scrub_job(const requested_scrub_t& request_flags) ceph_assert(m_pg->is_locked()); auto suggested = m_osds->get_scrub_services().determine_scrub_time( request_flags, m_pg->info, m_pg->get_pgpool().info.opts); - m_osds->get_scrub_services().update_job(m_scrub_job, suggested); + m_osds->get_scrub_services().update_job(m_scrub_job, suggested, true); m_pg->publish_stats_to_osd(); } @@ -2126,7 +2126,7 @@ pg_scrubbing_status_t PgScrubber::get_schedule() const !m_planned_scrub.must_deep_scrub; // are we ripe for scrubbing? - if (now_is > m_scrub_job->schedule.scheduled_at) { + if (now_is > m_scrub_job->schedule.not_before) { // we are waiting for our turn at the OSD. return pg_scrubbing_status_t{m_scrub_job->schedule.scheduled_at, 0, @@ -2136,7 +2136,7 @@ pg_scrubbing_status_t PgScrubber::get_schedule() const periodic}; } - return pg_scrubbing_status_t{m_scrub_job->schedule.scheduled_at, + return pg_scrubbing_status_t{m_scrub_job->schedule.not_before, 0, pg_scrub_sched_status_t::scheduled, false, diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc index 35071af5fd510..1875641e754a3 100644 --- a/src/osd/scrubber/scrub_job.cc +++ b/src/osd/scrubber/scrub_job.cc @@ -43,18 +43,30 @@ ostream& operator<<(ostream& out, const ScrubJob& sjob) } } // namespace std -void ScrubJob::update_schedule(const Scrub::scrub_schedule_t& adjusted) +void ScrubJob::update_schedule( + const Scrub::scrub_schedule_t& adjusted, + bool reset_nb) { - schedule = adjusted; - penalty_timeout = utime_t(0, 0); // helps with debugging + dout(15) + << fmt::format( + "was: nb:{:s}({:s}). Called with: rest?{} nb:{:s} ({:s}) ({})", + schedule.not_before, schedule.scheduled_at, reset_nb, + adjusted.not_before, adjusted.scheduled_at, registration_state()) + << dendl; + schedule.scheduled_at = adjusted.scheduled_at; + schedule.deadline = adjusted.deadline; + + if (reset_nb || schedule.not_before < schedule.scheduled_at) { + schedule.not_before = schedule.scheduled_at; + } // 'updated' is changed here while not holding jobs_lock. That's OK, as // the (atomic) flag will only be cleared by select_pg_and_scrub() after // scan_penalized() is called and the job was moved to the to_scrub queue. updated = true; dout(10) << fmt::format( - "adjusted: {:s} ({})", schedule.scheduled_at, - registration_state()) + "adjusted: nb:{:s} ({:s}) ({})", schedule.not_before, + schedule.scheduled_at, registration_state()) << dendl; } @@ -67,15 +79,14 @@ std::string ScrubJob::scheduling_state(utime_t now_is, bool is_deep_expected) } // if the time has passed, we are surely in the queue - // (note that for now we do not tell client if 'penalized') - if (now_is > schedule.scheduled_at) { + if (now_is > schedule.not_before) { // we are never sure that the next scrub will indeed be shallow: return fmt::format("queued for {}scrub", (is_deep_expected ? "deep " : "")); } return fmt::format( - "{}scrub scheduled @ {:s}", (is_deep_expected ? "deep " : ""), - schedule.scheduled_at); + "{}scrub scheduled @ {:s} ({:s})", (is_deep_expected ? "deep " : ""), + schedule.not_before, schedule.scheduled_at); } std::ostream& ScrubJob::gen_prefix(std::ostream& out, std::string_view fn) const @@ -100,7 +111,8 @@ void ScrubJob::dump(ceph::Formatter* f) const { f->open_object_section("scrub"); f->dump_stream("pgid") << pgid; - f->dump_stream("sched_time") << schedule.scheduled_at; + f->dump_stream("sched_time") << schedule.not_before; + f->dump_stream("orig_sched_time") << schedule.scheduled_at; f->dump_stream("deadline") << schedule.deadline; f->dump_bool("forced", schedule.scheduled_at == PgScrubber::scrub_must_stamp()); diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h index b75141ea179c3..e6765c767b968 100644 --- a/src/osd/scrubber/scrub_job.h +++ b/src/osd/scrubber/scrub_job.h @@ -38,6 +38,7 @@ enum class qu_state_t { struct scrub_schedule_t { utime_t scheduled_at{}; utime_t deadline{0, 0}; + utime_t not_before{utime_t::max()}; }; struct sched_params_t { @@ -66,7 +67,7 @@ class ScrubJob final : public RefCountedObject { /** * the old 'is_registered'. Set whenever the job is registered with the OSD, - * i.e. is in either the 'to_scrub' or the 'penalized' vectors. + * i.e. is in 'to_scrub'. */ std::atomic_bool in_queues{false}; @@ -93,7 +94,7 @@ class ScrubJob final : public RefCountedObject { ScrubJob(CephContext* cct, const spg_t& pg, int node_id); - utime_t get_sched_time() const { return schedule.scheduled_at; } + utime_t get_sched_time() const { return schedule.not_before; } static std::string_view qu_state_text(qu_state_t st); @@ -107,7 +108,15 @@ class ScrubJob final : public RefCountedObject { return qu_state_text(state.load(std::memory_order_relaxed)); } - void update_schedule(const scrub_schedule_t& adjusted); + /** + * 'reset_failure_penalty' is used to reset the 'not_before' jo attribute to + * the updated 'scheduled_at' time. This is used whenever the scrub-job + * schedule is updated, and the update is not a result of a scrub attempt + * failure. + */ + void update_schedule( + const scrub_schedule_t& adjusted, + bool reset_failure_penalty); void dump(ceph::Formatter* f) const; @@ -227,9 +236,9 @@ struct formatter { { return fmt::format_to( ctx.out(), - "pg[{}] @ {:s} (dl:{:s}) - <{}> / failure: {} / queue state: " + "pg[{}] @ {:s} ({:s}) (dl:{:s}) - <{}> / failure: {} / queue state: " "{:.7}", - sjob.pgid, sjob.schedule.scheduled_at, + sjob.pgid, sjob.schedule.not_before, sjob.schedule.scheduled_at, sjob.schedule.deadline, sjob.registration_state(), sjob.resources_failure, sjob.state.load(std::memory_order_relaxed)); } -- 2.39.5