git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/scrub: do not limit operator-initiated repairs 64849/head
author Ronen Friedman <rfriedma@redhat.com>
Wed, 6 Aug 2025 05:38:07 +0000 (00:38 -0500)
committer Ronen Friedman <rfriedma@redhat.com>
Wed, 6 Aug 2025 08:49:20 +0000 (03:49 -0500)
'auto-repair' scrubs are limited to a maximum of
'scrub_auto_repair_num_errors' damaged objects.
However, operator-initiated repairs should not be limited
by that number. Alas, a bug in a previous commit
(97de817ad1c253ee1c7c9c9302981ad2435301b9) modified the
code in such a way that it applied the
'scrub_auto_repair_num_errors' limit to all repairs,
including operator-initiated ones. This commit fixes that.

Fixes: https://tracker.ceph.com/issues/72420
Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
src/osd/scrubber/pg_scrubber.cc
src/osd/scrubber/scrub_job.cc
src/osd/scrubber/scrub_job.h

index 515b53fed18be54db352544bea0f81ebcbadbf89..eb9c0c697e373490f3ccc9f74efd8e41eb68a086 100644 (file)
@@ -1828,10 +1828,17 @@ void PgScrubber::scrub_finish()
   // if the repair request comes from auto-repair and there is a large
   // number of objects known to be damaged, we cancel the auto-repair
   if (m_is_repair && m_flags.auto_repair &&
+      ScrubJob::is_repairs_count_limited(m_active_target->urgency()) &&
       m_be->authoritative_peers_count() >
-       static_cast<int>(m_pg->cct->_conf->osd_scrub_auto_repair_num_errors)) {
+         static_cast<int>(
+             m_pg->cct->_conf->osd_scrub_auto_repair_num_errors)) {
 
-    dout(10) << __func__ << " undoing the repair" << dendl;
+    dout(5) << fmt::format(
+                  "{}: undoing the repair. Damaged objects count ({}) is "
+                  "above configured limit ({})",
+                  __func__, m_be->authoritative_peers_count(),
+                  m_pg->cct->_conf->osd_scrub_auto_repair_num_errors)
+           << dendl;
     state_clear(PG_STATE_REPAIR);  // not expected to be set, anyway
     m_is_repair = false;
     update_op_mode_text();
index da9cbb01820f6ea2e97183e502fd8dce4b542fb0..81f45ffca804d4735583d39d590a1274b1d8e771 100644 (file)
@@ -426,3 +426,8 @@ bool ScrubJob::is_autorepair_allowed(urgency_t urgency)
         urgency == urgency_t::operator_requested ||
         urgency == urgency_t::repairing || urgency == urgency_t::must_repair;
 }
+
+bool ScrubJob::is_repairs_count_limited(urgency_t urgency)
+{
+  return urgency < urgency_t::operator_requested;
+}
index 5673c20bb903a50d53b13148c560916cd0bb7587..3aef5a874a48a93a25e757531cad560017026db4 100644 (file)
@@ -375,6 +375,13 @@ class ScrubJob {
   static bool is_repair_implied(urgency_t urgency);
 
   static bool is_autorepair_allowed(urgency_t urgency);
+
+  /**
+   * should we cancel the repair if the number of damaged objects
+   * exceeds the configured limit ('osd_scrub_auto_repair_num_errors')?
+   * This does not apply to any repair that was operator-initiated.
+   */
+  static bool is_repairs_count_limited(urgency_t urgency);
 };
 }  // namespace Scrub