From 27794c343ddd8a9fa0dfa000028dc4a107c484f0 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Wed, 29 Apr 2026 04:55:02 +0000 Subject: [PATCH] osd/scrub: limit scrubbing under snap-trimming overload When the snap-trim queues are long, scrubbing is likely to make things worse. This change adds a new scrubbing restriction for that case, and prevents periodic scrubs from starting when the total snap-trim queue length across all PGs exceeds a configurable threshold. Signed-off-by: Ronen Friedman --- src/common/options/osd.yaml.in | 7 +++++++ src/osd/scrubber/osd_scrub.cc | 9 +++++++++ src/osd/scrubber/scrub_job.cc | 5 +++++ src/osd/scrubber/scrub_job.h | 9 +++++++-- src/osd/scrubber_common.h | 6 +++++- 5 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in index eb343e33f065..34dc096477e0 100644 --- a/src/common/options/osd.yaml.in +++ b/src/common/options/osd.yaml.in @@ -329,6 +329,13 @@ options: under most load conditions. default: 10.0 with_legacy: true +- name: osd_scrub_queued_snaptrims_limit + type: uint + level: advanced + desc: Do not initiate periodic scrubs when the total length of the snap-trim queues across all + PGs exceeds this value. A value of '0' disables this limit. 
+ default: 500 + with_legacy: false - name: osd_scrub_min_interval type: float level: advanced diff --git a/src/osd/scrubber/osd_scrub.cc b/src/osd/scrubber/osd_scrub.cc index f8e2e40a31ea..bba723a314f7 100644 --- a/src/osd/scrubber/osd_scrub.cc +++ b/src/osd/scrubber/osd_scrub.cc @@ -181,6 +181,10 @@ bool OsdScrub::is_sched_target_eligible( if (r.cpu_overloaded && ScrubJob::observes_load_limit(e.urgency)) { return false; } + if (r.overload_of_snap_trimming && + ScrubJob::observes_trims_load(e.urgency)) { + return false; + } if (r.recovery_in_progress && ScrubJob::observes_recovery(e.urgency)) { return false; } @@ -217,6 +221,11 @@ Scrub::OSDRestrictions OsdScrub::restrictions_on_scrubbing( env_conditions.restricted_time = !scrub_time_permit(scrub_clock_now); env_conditions.cpu_overloaded = !scrub_load_below_threshold(); + const auto snaptrims_limit = + cct->_conf.get_val<uint64_t>("osd_scrub_queued_snaptrims_limit"); + env_conditions.overload_of_snap_trimming = + (snaptrims_limit > 0) && + (m_osd_svc.get_snap_trim_queue_total() > snaptrims_limit); return env_conditions; } diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc index b260bcc3291c..541da5306ad0 100644 --- a/src/osd/scrubber/scrub_job.cc +++ b/src/osd/scrubber/scrub_job.cc @@ -389,6 +389,11 @@ bool ScrubJob::observes_load_limit(urgency_t urgency) return urgency < urgency_t::after_repair; } +bool ScrubJob::observes_trims_load(urgency_t urgency) +{ + return urgency < urgency_t::repairing; +} + bool ScrubJob::requires_reservation(urgency_t urgency) { return urgency < urgency_t::after_repair; diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h index f0f7f0a32684..1427a7dced0d 100644 --- a/src/osd/scrubber/scrub_job.h +++ b/src/osd/scrubber/scrub_job.h @@ -316,6 +316,8 @@ class ScrubJob { * if continued into the forbidden times, by having a longer sleep time; * (note that this is only applicable to the wq scheduler). 
* - load: the scrub must not be initiated if the OSD is under heavy CPU load; + * - trims: the scrub must not be initiated if the OSD has too many snap-trim + * jobs pending; * - noscrub: the scrub is aborted if the 'noscrub' flag (or the * 'nodeep-scrub' flag for deep scrubs) is set; * - randomization: the scrub's target time is extended by a random @@ -335,10 +337,11 @@ class ScrubJob { * | limitation | must- | after-repair |repairing| operator | must-repair | * | | scrub |(aft recovery)|(errors) | request | | * +------------+---------+--------------+---------+----------+-------------+ - * | reservation| yes! | no | no? | no | no | - * | dow/time | yes | yes | no | no | no | + * | reservation| yes! | no | no | no | no | + * | dow/time | yes | yes | no X | no | no | * | ext-sleep | no | no | no | no | no | * | load | yes | no | no | no | no | + * | trims | yes | yes | no | no | no | * | noscrub | yes | no | Yes | no | no | * | max-scrubs | yes | yes | Yes | no | no | * | backoff | yes | no | no | no | no | @@ -357,6 +360,8 @@ class ScrubJob { static bool observes_load_limit(urgency_t urgency); + static bool observes_trims_load(urgency_t urgency); + static bool requires_reservation(urgency_t urgency); static bool requires_randomization(urgency_t urgency); diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h index 844c6943588e..26a36b808b67 100644 --- a/src/osd/scrubber_common.h +++ b/src/osd/scrubber_common.h @@ -102,6 +102,9 @@ struct OSDRestrictions { /// the CPU load is high. No regular scrubs are allowed. bool cpu_overloaded:1{false}; + /// the total length of the snap-trim queues exceeds the configured limit. + bool overload_of_snap_trimming:1{false}; + /// outside of allowed scrubbing hours/days bool restricted_time:1{false}; @@ -181,10 +184,11 @@ struct formatter<Scrub::OSDRestrictions> { template <typename FormatContext> auto format(const Scrub::OSDRestrictions& conds, FormatContext& ctx) const { return fmt::format_to( - ctx.out(), "<{}.{}.{}.{}.{}>", + ctx.out(), "<{}.{}.{}.{}.{}.{}>", conds.max_concurrency_reached ? 
"max-scrubs" : "", conds.random_backoff_active ? "backoff" : "", conds.cpu_overloaded ? "high-load" : "", + conds.overload_of_snap_trimming ? "trim-overload" : "", conds.restricted_time ? "time-restrict" : "", conds.recovery_in_progress ? "recovery" : ""); } -- 2.47.3