From 787f81a705d55579ca2799962ad88811404b7a99 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Sun, 7 Dec 2025 08:34:05 -0600 Subject: [PATCH] osd/scrub: added the scrub-abort command and its handling in the PgScrubber. Fixes: https://tracker.ceph.com/issues/74133 Signed-off-by: Ronen Friedman (cherry picked from commit 0884eec3cae7ac7d81e0dce6b12526262c12047e) --- src/osd/OSD.cc | 7 +++ src/osd/PrimaryLogPG.cc | 10 +++++ src/osd/scrubber/pg_scrubber.cc | 79 +++++++++++++++++++++++++++++++++ src/osd/scrubber/pg_scrubber.h | 3 ++ src/osd/scrubber_common.h | 4 ++ 5 files changed, 103 insertions(+) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 3586b509146..fbf1f6db96b 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2780,6 +2780,7 @@ void OSD::asok_command( prefix == "list_unfound" || prefix == "scrub" || prefix == "deep-scrub" || + prefix == "scrub-abort" || prefix == "schedule-scrub" || ///< dev/tests only! prefix == "schedule-deep-scrub" ///< dev/tests only! ) { @@ -4529,6 +4530,12 @@ void OSD::final_init() asok_hook, "Trigger a deep scrub"); ceph_assert(r == 0); + r = admin_socket->register_command( + "scrub-abort " + "name=pgid,type=CephPgid,req=false", + asok_hook, + "Abort an ongoing scrub. Cancel any operator-initiated scrub"); + ceph_assert(r == 0); // debug/test commands (faking the timestamps) r = admin_socket->register_command( "schedule-scrub " diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 9ef012eb930..d4c40d59855 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -1201,6 +1201,16 @@ void PrimaryLogPG::do_command( outbl.append(ss.str()); } + else if (prefix == "scrub-abort") { + if (is_primary()) { + m_scrubber->on_operator_abort_scrub(f.get()); + } else { + ss << "Not primary"; + ret = -EPERM; + outbl.append(ss.str()); + } + } + // the test/debug commands that schedule a scrub by modifying timestamps else if (prefix == "schedule-scrub" || prefix == "schedule-deep-scrub") { if (is_primary()) { diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 320292dda17..f7a9d53ae61 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -771,6 +771,85 @@ void PgScrubber::on_operator_forced_scrub( } +/** + * Operation: + * - if the PG is being scrubbed - just send the operator-abort event to + * the FSM. That would stop the ongoing scrub session, and remove the + * (possible) operator-requested priority from both PG targets (shallow + * and deep). + * - otherwise - manually manipulate the two urgencies. + */ +void PgScrubber::on_operator_abort_scrub(ceph::Formatter* f) +{ + Formatter::ObjectSection asok_resp_section{*f, "result"sv}; + if (!is_primary() || !m_scrub_job) { + dout(10) << fmt::format( + "{}: pg[{}]: not Primary or no scrub-job", __func__, + m_pg_id.pgid) + << dendl; + f->dump_bool("applicable", false); + f->dump_bool("active", false); + return; + } + + dout(5) << fmt::format( + "{}: pg[{}]: job on entry: {}", __func__, m_pg_id.pgid, + *m_scrub_job) + << dendl; + ceph_assert(m_pg->is_locked()); + if (is_scrub_active()) { + m_fsm->process_event(OperatorAbort{}); + f->dump_bool("applicable", true); + f->dump_bool("active", true); + + } else if (!m_scrub_job->is_registered()) { + const auto err_text = fmt::format( + "{}: pg[{}] is not registered for scrubbing", __func__, m_pg_id.pgid); + dout(5) << err_text << dendl; + f->dump_bool("applicable", false); + f->dump_bool("active", false); + f->dump_string("error", err_text); + + } else { + // not scrubbing now. Remove any operator-requested priority from + // both targets. + + if (m_scrub_job->is_queued()) { + // one or both of the targets are in the queue. Remove them. + m_osds->get_scrub_services().remove_from_osd_queue(m_pg_id); + m_scrub_job->clear_both_targets_queued(); + dout(20) << fmt::format( + "{}: pg[{}] dequeuing for an update", __func__, + m_pg_id.pgid) + << dendl; + } + + // if any of the targets was set to operator-initiated urgency - + // remove that designation, and reschedule both. + const auto scrub_time_now = ceph_clock_now(); + const bool adj_shallow = downgrade_on_operator_abort( + m_scrub_job->get_target(scrub_level_t::shallow), scrub_time_now); + // note: must not short-circuit! + const bool adj_deep = downgrade_on_operator_abort( + m_scrub_job->get_target(scrub_level_t::deep), scrub_time_now); + if (adj_shallow || adj_deep) { + update_targets(scrub_time_now); + dout(10) << fmt::format("{}: adjusted job: {}", __func__, *m_scrub_job) + << dendl; + } + m_osds->get_scrub_services().enqueue_scrub_job(*m_scrub_job); + m_scrub_job->set_both_targets_queued(); + f->dump_bool("applicable", true); + f->dump_bool("active", false); + } + dout(5) << fmt::format( + "{}: pg[{}] job at exit: {}", __func__, m_pg_id.pgid, + *m_scrub_job) + << dendl; + m_pg->publish_stats_to_osd(); +} + + // ---------------------------------------------------------------------------- bool PgScrubber::has_pg_marked_new_updates() const diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index f3e2d1c5d2f..ec65fe61380 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -340,6 +340,9 @@ class PgScrubber : public ScrubPgIF, scrub_level_t scrub_level, scrub_type_t scrub_type) final; + void on_operator_abort_scrub( + ceph::Formatter* f) final; + /** * let the scrubber know that a recovery operation has completed. * This might trigger an 'after repair' scrub. diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h index 9b2e03ee468..a9899f14230 100644 --- a/src/osd/scrubber_common.h +++ b/src/osd/scrubber_common.h @@ -458,6 +458,10 @@ struct ScrubPgIF { ceph::Formatter* f, scrub_level_t scrub_level) = 0; + /// abort an ongoing scrub, and cancel any pending operator scrub request + virtual void on_operator_abort_scrub( + ceph::Formatter* f) = 0; + virtual void dump_scrubber(ceph::Formatter* f) const = 0; /** -- 2.47.3