From beeb0edf4ea2473ebb10a15c28766404eea2baaa Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Wed, 6 Aug 2025 00:38:07 -0500 Subject: [PATCH] osd/scrub: do not limit operator-initiated repairs 'auto-repair' scrubs are limited to a maximum of 'osd_scrub_auto_repair_num_errors' damaged objects. However, operator-initiated repairs should not be limited by that number. Alas, a previous commit (97de817ad1c253ee1c7c9c9302981ad2435301b9) introduced a bug: it modified the code in such a way that the 'osd_scrub_auto_repair_num_errors' limit was applied to all repairs, including operator-initiated ones. This commit fixes that. Fixes: https://tracker.ceph.com/issues/72420 Signed-off-by: Ronen Friedman (cherry picked from commit 5afc446fdafe3f2e168f9846f324d1d6a71f0f77) --- src/osd/scrubber/pg_scrubber.cc | 11 +++++++++-- src/osd/scrubber/scrub_job.cc | 5 +++++ src/osd/scrubber/scrub_job.h | 7 +++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 1e1a33fac84..1d0af0ebfa1 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1828,10 +1828,17 @@ void PgScrubber::scrub_finish() // if the repair request comes from auto-repair and there is a large // number of objects known to be damaged, we cancel the auto-repair if (m_is_repair && m_flags.auto_repair && + ScrubJob::is_repairs_count_limited(m_active_target->urgency()) && m_be->authoritative_peers_count() > - static_cast<int>(m_pg->cct->_conf->osd_scrub_auto_repair_num_errors)) { + static_cast<int>( + m_pg->cct->_conf->osd_scrub_auto_repair_num_errors)) { - dout(10) << __func__ << " undoing the repair" << dendl; + dout(5) << fmt::format( + "{}: undoing the repair. 
Damaged objects count ({}) is " + "above configured limit ({})", + __func__, m_be->authoritative_peers_count(), + m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) + << dendl; state_clear(PG_STATE_REPAIR); // not expected to be set, anyway m_is_repair = false; update_op_mode_text(); diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc index da9cbb01820..81f45ffca80 100644 --- a/src/osd/scrubber/scrub_job.cc +++ b/src/osd/scrubber/scrub_job.cc @@ -426,3 +426,8 @@ bool ScrubJob::is_autorepair_allowed(urgency_t urgency) urgency == urgency_t::operator_requested || urgency == urgency_t::repairing || urgency == urgency_t::must_repair; } + +bool ScrubJob::is_repairs_count_limited(urgency_t urgency) +{ + return urgency < urgency_t::operator_requested; +} diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h index 5673c20bb90..3aef5a874a4 100644 --- a/src/osd/scrubber/scrub_job.h +++ b/src/osd/scrubber/scrub_job.h @@ -375,6 +375,13 @@ class ScrubJob { static bool is_repair_implied(urgency_t urgency); static bool is_autorepair_allowed(urgency_t urgency); + + /** + * should we cancel the repair if the number of damaged objects + * exceeds the configured limit ('osd_scrub_auto_repair_num_errors')? + * This does not apply to any repair that was operator-initiated. + */ + static bool is_repairs_count_limited(urgency_t urgency); }; } // namespace Scrub -- 2.39.5