From 8a8fa48b8b0602f3bd646a64a58993cf6793ac85 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Thu, 20 Jun 2024 08:04:41 -0500 Subject: [PATCH] osd/scrub: scheduling the next scrub following scrub completion or after an aborted scrub. To note: one of the important changes in this commit: merging the functionality of adjust_target_time() & update_schedule() into a single function - adjust_schedule(). Regarding the handling of aborts: Most of the time - all that is required following a scrub abort is to requeue the scrub job - the one that triggered the aborted scrub - with just a delay added to its n.b.. But we must take into account scenarios where "something" caused the parameters prepared for the *next* scrub to show higher urgency or priority. "Something" - as in an operator command requiring immediate scrubbing, or a change in the pool/cluster configuration. In such cases - the current requested flags and the parameters of the aborted scrub must be merged. Note that the current implementation is a temporary solution, to be replaced by a per-level updating of the relevant target. Signed-off-by: Ronen Friedman --- src/osd/scrubber/pg_scrubber.cc | 85 ++++++++++++++------ src/osd/scrubber/pg_scrubber.h | 2 +- src/osd/scrubber/scrub_job.cc | 105 +++++++++++++------------ src/osd/scrubber/scrub_job.h | 56 +++++++------ src/osd/scrubber/scrub_machine.cc | 2 +- src/osd/scrubber/scrub_machine_lstnr.h | 3 +- 6 files changed, 145 insertions(+), 108 deletions(-) diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 8984d00f45d..637b4586ab3 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -534,30 +534,16 @@ sched_params_t PgScrubber::determine_initial_schedule( } -/* - * Note: referring to m_planned_scrub here is temporary, as this set of - * scheduling flags will be removed in a followup PR. - */ void PgScrubber::schedule_scrub_with_osd() { ceph_assert(is_primary()); ceph_assert(m_scrub_job); - auto pre_reg = registration_state(); - m_scrub_job->registered = true; - - const auto applicable_conf = populate_config_params(); - const auto scrub_clock_now = ceph_clock_now(); - auto suggested = determine_initial_schedule(applicable_conf, scrub_clock_now); - m_scrub_job->init_targets( - suggested, m_pg->info, applicable_conf, scrub_clock_now); - - m_osds->get_scrub_services().enqueue_target(*m_scrub_job); - - dout(10) << fmt::format( - "{}: <{:.5}> --> <{:.5}>", __func__, - m_planned_scrub, pre_reg, registration_state()) + dout(20) << fmt::format( + "{}: state at entry: {}", __func__, m_scrub_job->state_desc()) << dendl; + m_scrub_job->registered = true; + update_scrub_job(delay_ready_t::delay_ready); } @@ -613,8 +599,12 @@ void PgScrubber::update_scrub_job(Scrub::delay_ready_t delay_ready) ceph_assert(m_pg->is_locked()); const auto applicable_conf = populate_config_params(); const auto scrub_clock_now = ceph_clock_now(); - const auto suggested = determine_initial_schedule(applicable_conf, scrub_clock_now); - m_scrub_job->on_periods_change(suggested, applicable_conf, scrub_clock_now); + const auto suggested = + determine_initial_schedule(applicable_conf, scrub_clock_now); + + ceph_assert(m_scrub_job->is_registered()); + m_scrub_job->adjust_schedule( + suggested, applicable_conf, scrub_clock_now, delay_ready); m_osds->get_scrub_services().enqueue_target(*m_scrub_job); m_scrub_job->target_queued = true; m_pg->publish_stats_to_osd(); @@ -761,10 +751,7 @@ void PgScrubber::on_operator_periodic_cmd( << dendl; // move the relevant time-stamp backwards - enough to trigger a scrub - - utime_t now_is = ceph_clock_now(); - utime_t stamp = now_is; - + utime_t stamp = ceph_clock_now(); if (offset > 0) { stamp -= offset; } else { @@ -2083,6 +2070,56 @@ void PgScrubber::on_digest_updates() } +/** + * The scrub session was aborted. We are left with two sets of parameters + * as to when the next scrub of this PG should take place, and what should + * it be like. One set of parameters is the one that was used to start the + * scrub, and that was 'frozen' by set_op_parameters(). It has its own + * scheduling target, priority, not-before, etc'. + * The other set is the updated state of the current scrub-job. It may + * have had its priority, flags, or schedule modified in the meantime. + * And - it does not (at least initially, i.e. immediately after + * set_op_parameters()), have high priority. + * + * Alas, the scrub session that was initiated was aborted. We must now + * merge the two sets of parameters, using the highest priority and the + * nearest target time for the next scrub. + * + * Note: only half-functioning in this commit. As the scrub-job copy + * (the one that was in the scheduling queue, and was passed to the scrubber) + * does not have the 'urgency' parameter, we are missing some information + * that is still encoded in the 'planned scrub' flags. This will be fixed in + * the next step. + */ +void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue) +{ + // assuming we can still depend on the 'scrubbing' flag being set; + // Also on Queued&Active. + + // note again: this is not how merging should work in the final version: + // e.g. - the 'aborted_schedule' data should be passed thru the scrubber. + // In this current patchworik, for example, we are only guessing at + // the original value of 'must_deep_scrub'. + m_planned_scrub.must_deep_scrub = + m_planned_scrub.must_deep_scrub || (m_flags.required && m_is_deep); + m_planned_scrub.must_scrub = m_planned_scrub.must_deep_scrub || + m_planned_scrub.must_scrub || m_flags.required; + m_planned_scrub.must_repair = m_planned_scrub.must_repair || m_is_repair; + m_planned_scrub.need_auto = m_planned_scrub.need_auto || m_flags.auto_repair; + m_planned_scrub.deep_scrub_on_error = + m_planned_scrub.deep_scrub_on_error || m_flags.deep_scrub_on_error; + m_planned_scrub.check_repair = + m_planned_scrub.check_repair || m_flags.check_repair; + + m_scrub_job->merge_and_delay( + m_active_target->schedule, issue, m_planned_scrub, ceph_clock_now()); + ceph_assert(m_scrub_job->is_registered()); + ceph_assert(!m_scrub_job->target_queued); + m_osds->get_scrub_services().enqueue_target(*m_scrub_job); + m_scrub_job->target_queued = true; +} + + void PgScrubber::requeue_penalized(Scrub::delay_cause_t cause) { /// \todo fix the 5s' to use a cause-specific delay parameter diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index 0106edae54a..e970bd7219d 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -437,7 +437,7 @@ class PgScrubber : public ScrubPgIF, void scrub_finish() final; - void penalize_next_scrub(Scrub::delay_cause_t cause) final; + void on_mid_scrub_abort(Scrub::delay_cause_t issue) final; ScrubMachineListener::MsgAndEpoch prep_replica_map_msg( Scrub::PreemptionNoted was_preempted) final; diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc index 1770cd95cb3..911ecc66ddb 100644 --- a/src/osd/scrubber/scrub_job.cc +++ b/src/osd/scrubber/scrub_job.cc @@ -11,6 +11,7 @@ using OSDRestrictions = Scrub::OSDRestrictions; using sched_conf_t = Scrub::sched_conf_t; using scrub_schedule_t = Scrub::scrub_schedule_t; using ScrubJob = Scrub::ScrubJob; +using delay_ready_t = Scrub::delay_ready_t; // ////////////////////////////////////////////////////////////////////////// // @@ -32,7 +33,7 @@ ScrubJob::ScrubJob(CephContext* cct, const spg_t& pg, int node_id) : pgid{pg} , whoami{node_id} , cct{cct} - , log_msg_prefix{fmt::format("osd.{}: scrub-job:pg[{}]:", node_id, pgid)} + , log_msg_prefix{fmt::format("osd.{} scrub-job:pg[{}]:", node_id, pgid)} {} // debug usage only @@ -44,74 +45,76 @@ ostream& operator<<(ostream& out, const ScrubJob& sjob) } // namespace std -Scrub::scrub_schedule_t ScrubJob::adjust_target_time( - const sched_conf_t& app_conf, - const sched_params_t& suggested) const +void ScrubJob::adjust_schedule( + const Scrub::sched_params_t& suggested, + const Scrub::sched_conf_t& app_conf, + utime_t scrub_clock_now, + delay_ready_t modify_ready_targets) { - Scrub::scrub_schedule_t adjusted{ - suggested.proposed_time, suggested.proposed_time, suggested.proposed_time}; + dout(10) << fmt::format( + "{} current h.p.:{:c} conf:{} also-ready?{:c} " + "sjob@entry:{}", + suggested, high_priority ? 'y' : 'n', app_conf, + (modify_ready_targets == delay_ready_t::delay_ready) ? 'y' + : 'n', + *this) + << dendl; - if (suggested.is_must == Scrub::must_scrub_t::not_mandatory) { - // unless explicitly requested, postpone the scrub with a random delay - adjusted.scheduled_at += app_conf.shallow_interval; - double r = rand() / (double)RAND_MAX; - adjusted.scheduled_at += - app_conf.shallow_interval * app_conf.interval_randomize_ratio * r; + high_priority = (suggested.is_must == must_scrub_t::mandatory); + utime_t adj_not_before = suggested.proposed_time; + utime_t adj_target = suggested.proposed_time; + schedule.deadline = adj_target; + + if (!high_priority) { + // add a random delay to the proposed scheduled time - but only for periodic + // scrubs that are not already eligible for scrubbing. + if ((modify_ready_targets == delay_ready_t::delay_ready) || + adj_not_before > scrub_clock_now) { + adj_target += app_conf.shallow_interval; + double r = rand() / (double)RAND_MAX; + adj_target += + app_conf.shallow_interval * app_conf.interval_randomize_ratio * r; + } + // the deadline can be updated directly into the scrub-job if (app_conf.max_shallow) { - adjusted.deadline += *app_conf.max_shallow; + schedule.deadline += *app_conf.max_shallow; } else { - adjusted.deadline = utime_t{}; + schedule.deadline = utime_t{}; } - if (adjusted.not_before < adjusted.scheduled_at) { - adjusted.not_before = adjusted.scheduled_at; + if (adj_not_before < adj_target) { + adj_not_before = adj_target; } - - dout(20) << fmt::format( - "not-must. Was:{:s} config:{} adjusted:{}", - suggested.proposed_time, app_conf, adjusted) << dendl; } - // else - no log is needed. All relevant data will be logged by the caller - return adjusted; + schedule.scheduled_at = adj_target; + schedule.not_before = adj_not_before; + dout(10) << fmt::format( + "adjusted: nb:{:s} target:{:s} deadline:{:s} ({})", + schedule.not_before, schedule.scheduled_at, schedule.deadline, + state_desc()) + << dendl; } -void ScrubJob::init_targets( - const sched_params_t& suggested, - const pg_info_t& info, - const Scrub::sched_conf_t& aconf, +void ScrubJob::merge_and_delay( + const scrub_schedule_t& aborted_schedule, + delay_cause_t issue, + requested_scrub_t updated_flags, utime_t scrub_clock_now) { - auto adjusted = adjust_target_time(aconf, suggested); - high_priority = suggested.is_must == must_scrub_t::mandatory; - update_schedule(adjusted, true); + // merge the schedule targets: + schedule.scheduled_at = + std::min(aborted_schedule.scheduled_at, schedule.scheduled_at); + high_priority = high_priority || updated_flags.must_scrub; + delay_on_failure(5s, issue, scrub_clock_now); + + // the new deadline is the minimum of the two + schedule.deadline = std::min(aborted_schedule.deadline, schedule.deadline); } -void ScrubJob::update_schedule( - const Scrub::scrub_schedule_t& adjusted, - bool reset_failure_penalty) -{ - dout(15) << fmt::format( - "was: nb:{:s}({:s}). Called with: rest?{} {:s} ({})", - schedule.not_before, schedule.scheduled_at, - reset_failure_penalty, adjusted.scheduled_at, - state_desc()) - << dendl; - schedule.scheduled_at = adjusted.scheduled_at; - schedule.deadline = adjusted.deadline; - - if (reset_failure_penalty || (schedule.not_before < schedule.scheduled_at)) { - schedule.not_before = schedule.scheduled_at; - } - dout(10) << fmt::format( - "adjusted: nb:{:s} ({:s}) ({})", schedule.not_before, - schedule.scheduled_at, state_desc()) - << dendl; -} - void ScrubJob::delay_on_failure( std::chrono::seconds delay, Scrub::delay_cause_t delay_cause, diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h index ce2e66023ed..ef30bcb4fe5 100644 --- a/src/osd/scrubber/scrub_job.h +++ b/src/osd/scrubber/scrub_job.h @@ -155,27 +155,23 @@ class ScrubJob { } /** - * 'reset_failure_penalty' is used to reset the 'not_before' jo attribute to - * the updated 'scheduled_at' time. This is used whenever the scrub-job - * schedule is updated, and the update is not a result of a scrub attempt - * failure. - */ - void update_schedule( - const scrub_schedule_t& adjusted, - bool reset_failure_penalty); - - /** - * If the scrub job was not explicitly requested, we postpone it by some - * random length of time. - * And if delaying the scrub - we calculate, based on pool parameters, a - * deadline we should scrub before. + * Given a proposed time for the next scrub, and the relevant + * configuration, adjust_schedule() determines the actual target time, + * the deadline, and the 'not_before' time for the scrub. + * The new values are updated into the scrub-job. * - * @return updated (i.e. - possibly delayed) scrub schedule (schedule, - * deadline, not_before) + * Specifically: + * - for high-priority scrubs: n.b. & deadline are set equal to the + * (untouched) proposed target time. + * - for regular scrubs: the proposed time is adjusted (delayed) based + * on the configuration; the deadline is set further out (if configured) + * and the n.b. is reset to the target. */ - Scrub::scrub_schedule_t adjust_target_time( - const Scrub::sched_conf_t& app_conf, - const Scrub::sched_params_t& proposed_schedule) const; + void adjust_schedule( + const Scrub::sched_params_t& suggested, + const Scrub::sched_conf_t& aconf, + utime_t scrub_clock_now, + Scrub::delay_ready_t modify_ready_targets); /** * push the 'not_before' time out by 'delay' seconds, so that this scrub target @@ -187,14 +183,18 @@ class ScrubJob { utime_t scrub_clock_now); /** - * initial setting of the scheduling parameters of a newly registered - * PG. The scrub targets (in this stage of the refactoring - the whole - * scrub job) is initialized as for a regular periodic scrub. + * Recalculating any possible updates to the scrub schedule, following an + * aborted scrub attempt. + * Usually - we can use the same schedule that triggered the aborted scrub. + * But we must take into account scenarios where "something" caused the + * parameters prepared for the *next* scrub to show higher urgency or + * priority. "Something" - as in an operator command requiring immediate + * scrubbing, or a change in the pool/cluster configuration. */ - void init_targets( - const sched_params_t& suggested, - const pg_info_t& info, - const Scrub::sched_conf_t& aconf, + void merge_and_delay( + const scrub_schedule_t& aborted_schedule, + Scrub::delay_cause_t issue, + requested_scrub_t updated_flags, utime_t scrub_clock_now); /** @@ -213,7 +213,6 @@ class ScrubJob { void dump(ceph::Formatter* f) const; - bool is_registered() const { return registered; } /** @@ -263,7 +262,6 @@ struct formatter { } }; - template <> struct formatter { constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); } @@ -272,7 +270,7 @@ struct formatter { auto format(const Scrub::ScrubJob& sjob, FormatContext& ctx) const { return fmt::format_to( - ctx.out(), "pg[{}] @ nb:{:s} ({:s}) (dl:{:s}) - <{}>", + ctx.out(), "pg[{}]:nb:{:s} / trg:{:s} / dl:{:s} <{}>", sjob.pgid, sjob.schedule.not_before, sjob.schedule.scheduled_at, sjob.schedule.deadline, sjob.state_desc()); } diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index ca0ff522278..547a046d5bd 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -308,7 +308,7 @@ ActiveScrubbing::~ActiveScrubbing() // completed successfully), we use it now to set the 'failed scrub' duration. if (session.m_session_started_at != ScrubTimePoint{}) { // delay the next invocation of the scrubber on this target - scrbr->penalize_next_scrub(Scrub::delay_cause_t::aborted); + scrbr->on_mid_scrub_abort(Scrub::delay_cause_t::aborted); auto logged_duration = ScrubClock::now() - session.m_session_started_at; session.m_perf_set->tinc(scrbcnt_failed_elapsed, logged_duration); diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h index 85c518c402f..2177c7e85ab 100644 --- a/src/osd/scrubber/scrub_machine_lstnr.h +++ b/src/osd/scrubber/scrub_machine_lstnr.h @@ -163,8 +163,7 @@ struct ScrubMachineListener { virtual void scrub_finish() = 0; /// notify the scrubber about a scrub failure - /// (note: temporary implementation) - virtual void penalize_next_scrub(Scrub::delay_cause_t cause) = 0; + virtual void on_mid_scrub_abort(Scrub::delay_cause_t cause) = 0; /** * Prepare a MOSDRepScrubMap message carrying the requested scrub map -- 2.39.5