From d24a02f67a26f1bf8f13efffe2fa213c744f096f Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Thu, 4 Dec 2025 08:49:29 -0600 Subject: [PATCH] osd/scrub: support an operator-abort command The new explicit command aborts any ongoing scrub of the target PG, including operator-initiated scrubs. That additional capability is needed now that operator-initiated scrubs are no longer blocked by 'no-scrub' settings. The scenario we are trying to help the operator with is: - an operator issues a set of operator-initiated scrubs (e.g., via a script), then realizes the mistake and wants to abort them all. The abort command also downgrades the urgency level of the scrub target (as otherwise the target would immediately restart, against the operator's wishes). This commit implements the changes to the state machine and to the abort logic, assuming the operator command was translated into an event. Signed-off-by: Ronen Friedman --- src/osd/scrubber/pg_scrubber.cc | 86 ++++++++++++++++++++++++------- src/osd/scrubber/pg_scrubber.h | 11 ++++ src/osd/scrubber/scrub_machine.cc | 9 ++++ src/osd/scrubber/scrub_machine.h | 11 +++- src/osd/scrubber_common.h | 2 + 5 files changed, 98 insertions(+), 21 deletions(-) diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 1d6b8898c92..1bd1ffc309f 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -2065,6 +2065,26 @@ void PgScrubber::on_digest_updates() } } +bool PgScrubber::downgrade_on_operator_abort( + Scrub::SchedTarget& targ, + utime_t scrub_clock_now) +{ + if (targ.urgency() != urgency_t::operator_requested && + targ.urgency() != urgency_t::must_repair) { + return false; // no need to downgrade + } + + targ.sched_info.urgency = urgency_t::periodic_regular; + targ.sched_info.schedule.scheduled_at = scrub_clock_now; + targ.sched_info.schedule.not_before = scrub_clock_now; + dout(10) + << fmt::format( + "{}: removing operator-requested urgency from target. 
Updated: {}", + __func__, targ) + << dendl; + return true; +} + /** * The scrub session was aborted. We are left with two sets of parameters @@ -2076,6 +2096,10 @@ void PgScrubber::on_digest_updates() * have had its priority, flags, or schedule modified in the meantime. * And - it does not (at least initially, i.e. immediately after * set_op_parameters()), have high priority. + * + * Updated functionality ('Tentacle', 2025): if the abort cause was an explicit + * operator request - make sure we are not left with a high-priority + * target (one that would immediately restart, against the operator's wishes). */ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue) { @@ -2090,38 +2114,54 @@ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue) dout(10) << fmt::format( "{}: executing target: {}. Session flags: {} up-to-date job: " - "{}", - __func__, *m_active_target, m_flags, *m_scrub_job) + "{}. Abort cause: {}", + __func__, *m_active_target, m_flags, *m_scrub_job, issue) << dendl; // copy the aborted target - const auto aborted_target = *m_active_target; + auto aborted_target = *m_active_target; m_active_target.reset(); const auto scrub_clock_now = ceph_clock_now(); auto& current_targ = m_scrub_job->get_target(aborted_target.level()); ceph_assert(!current_targ.queued); - // merge the aborted target with the current one - auto& curr_sched = current_targ.sched_info.schedule; - auto& abrt_sched = aborted_target.sched_info.schedule; - - current_targ.sched_info.urgency = - std::max(current_targ.urgency(), aborted_target.urgency()); - curr_sched.scheduled_at = - std::min(curr_sched.scheduled_at, abrt_sched.scheduled_at); - curr_sched.not_before = - std::min(curr_sched.not_before, abrt_sched.not_before); - - dout(10) << fmt::format( - "{}: merged target (before delay): {}", __func__, - current_targ) - << dendl; + // if the abort trigger was an explicit operator abort command, and the + // aborted target had operator-initiated urgency: + // - do not 
perform a 'merge' of the aborted target and the 'next' + // target in the scrub-job. Instead - just reinstate the 'next' target. + // (the aborted target has more than its urgency attribute wrong. The + // scheduled-at was also made irrelevant by the original operator + // command that initiated the aborted scrub). + bool should_merge = true; + if (issue == delay_cause_t::operator_abort) { + should_merge = !downgrade_on_operator_abort(aborted_target, scrub_clock_now); + } + + if (should_merge) { + // the regular case. merge the aborted target with the current one + auto& curr_sched = current_targ.sched_info.schedule; + auto& abrt_sched = aborted_target.sched_info.schedule; + + current_targ.sched_info.urgency = + std::max(current_targ.urgency(), aborted_target.urgency()); + curr_sched.scheduled_at = + std::min(curr_sched.scheduled_at, abrt_sched.scheduled_at); + curr_sched.not_before = + std::min(curr_sched.not_before, abrt_sched.not_before); + dout(10) << fmt::format( + "{}: merged target (before delay): {}", __func__, + current_targ) + << dendl; + } else { + dout(10) << fmt::format( + "{}: aborted oper-urgency target discarded: {}", + __func__, current_targ) + << dendl; + } // affect a delay, as there was a failure mid-scrub m_scrub_job->delay_on_failure(current_targ.level(), issue, scrub_clock_now); - - // reinstate both targets in the queue m_osds->get_scrub_services().enqueue_target(current_targ); current_targ.queued = true; @@ -2129,7 +2169,13 @@ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue) auto& sister = m_scrub_job->get_target( aborted_target.level() == scrub_level_t::deep ? scrub_level_t::shallow : scrub_level_t::deep); + // if 'operator-aborted' - that one should be downgraded, too (the scenario + // we are trying to help the operator with: trying to recover from a set of + // scrub requests issued by mistake). 
if (!sister.queued) { + if (issue == delay_cause_t::operator_abort) { + downgrade_on_operator_abort(sister, scrub_clock_now); + } m_osds->get_scrub_services().enqueue_target(sister); sister.queued = true; } diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index e88e604083e..1f5ec95e841 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -516,6 +516,17 @@ class PgScrubber : public ScrubPgIF, void on_mid_scrub_abort(Scrub::delay_cause_t issue) final; + /** + * an auxiliary used by on_mid_scrub_abort() + * If the target has operator-initiated urgency (either 'must_repair' - + * operator-requested repair or 'operator_requested' - operator-requested + * scrub) - downgrade it to regular periodic. + * \retval true: the urgency was downgraded + */ + bool downgrade_on_operator_abort( + Scrub::SchedTarget& targ, + utime_t scrub_clock_now); + ScrubMachineListener::MsgAndEpoch prep_replica_map_msg( Scrub::PreemptionNoted was_preempted) final; diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index 3400641b428..9f9c50d7289 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -224,6 +224,15 @@ sc::result Session::react(const IntervalChanged&) return transit(); } +sc::result Session::react(const OperatorAbort&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "Session::react(const OperatorAbort&)" << dendl; + ceph_assert(m_reservations); + m_abort_reason = delay_cause_t::operator_abort; + return transit(); +} + std::optional Session::get_reservation_status() const { if (!m_reservations) { diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index 8c2d99275be..e55d051360c 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -248,6 +248,12 @@ MEV(IntervalChanged) */ MEV(FullReset) +/** + * (Primary only) stops the running scrub. 
Removes any higher-than-periodic + * 'urgency' attributes. + */ +MEV(OperatorAbort) + /// finished handling this chunk. Go get the next one MEV(NextChunk) @@ -558,10 +564,13 @@ struct Session : sc::state, ~Session(); using reactions = mpl::list, - sc::custom_reaction>; + sc::custom_reaction, + sc::custom_reaction>; sc::result react(const IntervalChanged&); + sc::result react(const OperatorAbort&); + /// managing the scrub session's reservations (optional, as /// it's an RAII wrapper around the state of 'holding reservations') std::optional m_reservations{std::nullopt}; diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h index 9b054f7ab31..b57eec5d81c 100644 --- a/src/osd/scrubber_common.h +++ b/src/osd/scrubber_common.h @@ -219,6 +219,7 @@ enum class delay_cause_t { aborted, ///< scrub was aborted w/ unspecified reason interval, ///< the interval had ended mid-scrub scrub_params, ///< the specific scrub type is not allowed + operator_abort ///< operator-requested abort }; } // namespace Scrub @@ -242,6 +243,7 @@ struct formatter : ::fmt::formatter { case aborted: desc = "aborted"; break; case interval: desc = "interval"; break; case scrub_params: desc = "scrub-mode"; break; + case operator_abort: desc = "operator-abort"; break; // better to not have a default case, so that the compiler will warn } return ::fmt::formatter::format(desc, ctx); -- 2.47.3