From: Ronen Friedman Date: Wed, 6 Aug 2025 05:38:07 +0000 (-0500) Subject: osd/scrub: do not limit operator-initiated repairs X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=5afc446fdafe3f2e168f9846f324d1d6a71f0f77;p=ceph.git osd/scrub: do not limit operator-initiated repairs 'auto-repair' scrubs are limited to a maximum of 'scrub_auto_repair_num_errors' damaged objects. However, operator-initiated repairs should not be limited by that number. Alas, a bug in a previous commit (97de817ad1c253ee1c7c9c9302981ad2435301b9) modified the code in such a way that it applied the 'scrub_auto_repair_num_errors' limit to all repairs, including operator-initiated ones. This commit fixes that. Fixes: https://tracker.ceph.com/issues/72420 Signed-off-by: Ronen Friedman --- diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 515b53fed18be..eb9c0c697e373 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1828,10 +1828,17 @@ void PgScrubber::scrub_finish() // if the repair request comes from auto-repair and there is a large // number of objects known to be damaged, we cancel the auto-repair if (m_is_repair && m_flags.auto_repair && + ScrubJob::is_repairs_count_limited(m_active_target->urgency()) && m_be->authoritative_peers_count() > - static_cast<int>(m_pg->cct->_conf->osd_scrub_auto_repair_num_errors)) { + static_cast<int>( + m_pg->cct->_conf->osd_scrub_auto_repair_num_errors)) { - dout(10) << __func__ << " undoing the repair" << dendl; + dout(5) << fmt::format( + "{}: undoing the repair. 
Damaged objects count ({}) is " + "above configured limit ({})", + __func__, m_be->authoritative_peers_count(), + m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) + << dendl; state_clear(PG_STATE_REPAIR); // not expected to be set, anyway m_is_repair = false; update_op_mode_text(); diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc index da9cbb01820f6..81f45ffca804d 100644 --- a/src/osd/scrubber/scrub_job.cc +++ b/src/osd/scrubber/scrub_job.cc @@ -426,3 +426,8 @@ bool ScrubJob::is_autorepair_allowed(urgency_t urgency) urgency == urgency_t::operator_requested || urgency == urgency_t::repairing || urgency == urgency_t::must_repair; } + +bool ScrubJob::is_repairs_count_limited(urgency_t urgency) +{ + return urgency < urgency_t::operator_requested; +} diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h index 5673c20bb903a..3aef5a874a48a 100644 --- a/src/osd/scrubber/scrub_job.h +++ b/src/osd/scrubber/scrub_job.h @@ -375,6 +375,13 @@ class ScrubJob { static bool is_repair_implied(urgency_t urgency); static bool is_autorepair_allowed(urgency_t urgency); + + /** + * should we cancel the repair if the number of damaged objects + * exceeds the configured limit ('osd_scrub_auto_repair_num_errors')? + * This does not apply to any repair that was operator-initiated. + */ + static bool is_repairs_count_limited(urgency_t urgency); }; } // namespace Scrub