git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/scrub: limit scrubbing under snap-trimming overload 68737/head
author Ronen Friedman <rfriedma@redhat.com>
Wed, 29 Apr 2026 04:55:02 +0000 (04:55 +0000)
committer Ronen Friedman <rfriedma@redhat.com>
Thu, 21 May 2026 18:21:55 +0000 (18:21 +0000)
When the snap-trim queues are long, scrubbing is likely to
make things worse. This change adds a new scrubbing restriction
for that case, and prevents periodic scrubs from starting when
the total snap-trim queue length across all PGs exceeds a
configurable threshold.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
src/common/options/osd.yaml.in
src/osd/scrubber/osd_scrub.cc
src/osd/scrubber/scrub_job.cc
src/osd/scrubber/scrub_job.h
src/osd/scrubber_common.h

index eb343e33f065525390fd39592f3166e8b36fdda5..34dc096477e0b3358e4acf08129ce9d07a03a3c2 100644 (file)
@@ -329,6 +329,13 @@ options:
     under most load conditions.
   default: 10.0
   with_legacy: true
+- name: osd_scrub_queued_snaptrims_limit
+  type: uint
+  level: advanced
+  desc: Do not initiate periodic scrubs when the total length of the snap-trim
+    queues across all PGs exceeds this value. A value of '0' disables this limit.
+  default: 500
+  with_legacy: false
 - name: osd_scrub_min_interval
   type: float
   level: advanced
index f8e2e40a31eab982caa2da0bd96b57f1f7881805..bba723a314f7ee4bd64270e2680d04789aede142 100644 (file)
@@ -181,6 +181,10 @@ bool OsdScrub::is_sched_target_eligible(
   if (r.cpu_overloaded && ScrubJob::observes_load_limit(e.urgency)) {
     return false;
   }
+  if (r.overload_of_snap_trimming &&
+      ScrubJob::observes_trims_load(e.urgency)) {
+    return false;
+  }
   if (r.recovery_in_progress && ScrubJob::observes_recovery(e.urgency)) {
     return false;
   }
@@ -217,6 +221,11 @@ Scrub::OSDRestrictions OsdScrub::restrictions_on_scrubbing(
 
   env_conditions.restricted_time = !scrub_time_permit(scrub_clock_now);
   env_conditions.cpu_overloaded = !scrub_load_below_threshold();
+  const auto snaptrims_limit =
+      cct->_conf.get_val<uint64_t>("osd_scrub_queued_snaptrims_limit");
+  env_conditions.overload_of_snap_trimming =
+      (snaptrims_limit > 0) &&
+      (m_osd_svc.get_snap_trim_queue_total() > snaptrims_limit);
 
   return env_conditions;
 }
index b260bcc3291c1f9b53a5f7cfa2b2b3325c420c6d..541da5306ad0cddd444258a24bb831b2fdf05797 100644 (file)
@@ -389,6 +389,11 @@ bool ScrubJob::observes_load_limit(urgency_t urgency)
   return urgency < urgency_t::after_repair;
 }
 
+bool ScrubJob::observes_trims_load(urgency_t urgency)  // does a scrub of this urgency honor the snap-trim overload restriction?
+{
+  return urgency < urgency_t::repairing;  // only urgency levels ordered below 'repairing' observe it
+}
+
 bool ScrubJob::requires_reservation(urgency_t urgency)
 {
   return urgency < urgency_t::after_repair;
index f0f7f0a3268465246466c35c2ac56999242307b4..1427a7dced0db0a40809ebe06ecf36ed16513249 100644 (file)
@@ -316,6 +316,8 @@ class ScrubJob {
  *   if continued into the forbidden times, by having a longer sleep time;
  *   (note that this is only applicable to the wq scheduler).
  * - load: the scrub must not be initiated if the OSD is under heavy CPU load;
+ * - trims: the scrub must not be initiated if the OSD has too many snap-trim
+ *   jobs pending;
  * - noscrub: the scrub is aborted if the 'noscrub' flag (or the
  *  'nodeep-scrub' flag for deep scrubs) is set;
  * - randomization: the scrub's target time is extended by a random
@@ -335,10 +337,11 @@ class ScrubJob {
  *  | limitation |  must-  | after-repair |repairing| operator | must-repair |
  *  |            |  scrub  |(aft recovery)|(errors) | request  |             |
  *  +------------+---------+--------------+---------+----------+-------------+
- *  | reservation|    yes! |      no      |    no?  |     no   |      no     |
- *  | dow/time   |    yes  |     yes      |    no   |     no   |      no     |
+ *  | reservation|    yes! |      no      |    no   |     no   |      no     |
+ *  | dow/time   |    yes  |      yes     |    no X |     no   |      no     |
  *  | ext-sleep  |    no   |      no      |    no   |     no   |      no     |
  *  | load       |    yes  |      no      |    no   |     no   |      no     |
+ *  | trims      |    yes  |      yes     |    no   |     no   |      no     |
  *  | noscrub    |    yes  |      no      |    Yes  |     no   |      no     |
  *  | max-scrubs |    yes  |      yes     |    Yes  |     no   |      no     |
  *  | backoff    |    yes  |      no      |    no   |     no   |      no     |
@@ -357,6 +360,8 @@ class ScrubJob {
 
   static bool observes_load_limit(urgency_t urgency);
 
+  static bool observes_trims_load(urgency_t urgency);
+
   static bool requires_reservation(urgency_t urgency);
 
   static bool requires_randomization(urgency_t urgency);
index 844c6943588e23ae736e809a49208f07d6e415dd..26a36b808b67e15d84485b26331a6e0c54fb1e41 100644 (file)
@@ -102,6 +102,9 @@ struct OSDRestrictions {
   /// the CPU load is high. No regular scrubs are allowed.
   bool cpu_overloaded:1{false};
 
+  /// the total snap-trim queue length is above the configured limit.
+  bool overload_of_snap_trimming:1{false};
+
   /// outside of allowed scrubbing hours/days
   bool restricted_time:1{false};
 
@@ -181,10 +184,11 @@ struct formatter<Scrub::OSDRestrictions> {
   template <typename FormatContext>
   auto format(const Scrub::OSDRestrictions& conds, FormatContext& ctx) const {
     return fmt::format_to(
-       ctx.out(), "<{}.{}.{}.{}.{}>",
+       ctx.out(), "<{}.{}.{}.{}.{}.{}>",
        conds.max_concurrency_reached ? "max-scrubs" : "",
        conds.random_backoff_active ? "backoff" : "",
        conds.cpu_overloaded ? "high-load" : "",
+       conds.overload_of_snap_trimming ? "trim-overload" : "",
        conds.restricted_time ? "time-restrict" : "",
        conds.recovery_in_progress ? "recovery" : "");
   }