osd/scrub: fix deadline calculations

author Ronen Friedman <rfriedma@redhat.com>

Thu, 8 May 2025 13:45:23 +0000 (08:45 -0500)

committer Ronen Friedman <rfriedma@redhat.com>

Thu, 8 May 2025 14:33:57 +0000 (09:33 -0500)
author Ronen Friedman <rfriedma@redhat.com>
Thu, 8 May 2025 13:45:23 +0000 (08:45 -0500)
committer Ronen Friedman <rfriedma@redhat.com>
Thu, 8 May 2025 14:33:57 +0000 (09:33 -0500)
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc

index ef9818a0fef0af8eac8aa12379aa99eb0304ee11..5f7e2733263b2786a7b817fecb61e9c7b1fe3502 100644 (file)
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -674,46 +674,68 @@ Scrub::sched_conf_t PgScrubber::populate_config_params() const
    const auto& conf = get_pg_cct()->_conf;  // for brevity
    Scrub::sched_conf_t configs;
  
-  // deep-scrub optimal interval
-  configs.deep_interval =
-      pool_conf.value_or(pool_opts_t::DEEP_SCRUB_INTERVAL, 0.0);
-  if (configs.deep_interval <= 0.0) {
-    configs.deep_interval = conf->osd_deep_scrub_interval;
-  }
-
-  // shallow-scrub interval
-  configs.shallow_interval =
+  // shallow scrubs interval
+  const auto shallow_pool =
        pool_conf.value_or(pool_opts_t::SCRUB_MIN_INTERVAL, 0.0);
-  if (configs.shallow_interval <= 0.0) {
-    configs.shallow_interval = conf->osd_scrub_min_interval;
-  }
-
-  // the max allowed delay between scrubs.
-  // For deep scrubs - there is no equivalent of scrub_max_interval. Per the
-  // documentation, once deep_scrub_interval has passed, we are already
-  // "overdue", at least as far as the "ignore allowed load" window is
-  // concerned.
+  configs.shallow_interval =
+      shallow_pool > 0.0 ? shallow_pool : conf->osd_scrub_min_interval;
  
-  configs.max_deep = configs.deep_interval + configs.shallow_interval;
+  // deep scrubs optimal interval
+  const auto deep_pool =
+      pool_conf.value_or(pool_opts_t::DEEP_SCRUB_INTERVAL, 0.0);
+  configs.deep_interval =
+      deep_pool > 0.0 ? deep_pool : conf->osd_deep_scrub_interval;
  
+  /**
+   * 'max_deep' and 'max_shallow' are set to the maximum allowed delay between
+   * scrubs. These deadlines have almost no effect on scrub scheduling
+   * (the only minor exception: when sorting two scrub jobs that are
+   * equivalent in all but the deadline).
+   *
+   * 'max_shallow' is controlled by a pool option and a configuration
+   * parameter. Note that if the value configured is less than the
+   * shallow interval, the max_shallow is disabled.
+   */
    auto max_shallow = pool_conf.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0);
    if (max_shallow <= 0.0) {
      max_shallow = conf->osd_scrub_max_interval;
    }
+
    if (max_shallow > 0.0) {
-    configs.max_shallow = max_shallow;
-    // otherwise - we're left with the default nullopt
+    const auto min_accepted_deadline =
+       configs.shallow_interval *
+       (1 + conf->osd_scrub_interval_randomize_ratio);
+
+    if (max_shallow >= min_accepted_deadline) {
+      configs.max_shallow = max_shallow;
+    } else {
+      // this is a bit odd, but the pool option is set to a value
+      // less than the interval. Keep the nullopt in max_shallow,
+      dout(10) << fmt::format(
+                     "{}: configured 'max shallow' rejected as too low ({}/{} "
+                     "< {})",
+                     __func__,
+                     pool_conf.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0),
+                     conf->osd_scrub_max_interval, min_accepted_deadline)
+              << dendl;
+    }
    }
  
-  // but seems like our tests require: \todo fix!
-  configs.max_deep =
-      std::max(configs.max_shallow.value_or(0.0), configs.deep_interval);
+  // There are no comparable options for max_deep. We set it here to
+  // 4X the deep interval, as a reasonable default.
+  configs.max_deep = 4 * configs.deep_interval;
  
    configs.interval_randomize_ratio = conf->osd_scrub_interval_randomize_ratio;
-  configs.deep_randomize_ratio = conf.get_val<double>("osd_deep_scrub_interval_cv");
+  configs.deep_randomize_ratio =
+      conf.get_val<double>("osd_deep_scrub_interval_cv");
    configs.mandatory_on_invalid = conf->osd_scrub_invalid_stats;
  
-  dout(15) << fmt::format("{}: updated config:{}", __func__, configs) << dendl;
+  dout(15) << fmt::format(
+                 "{}: inputs: intervals: sh:{}(pl:{}),dp:{}(pl:{})",
+                 __func__, configs.shallow_interval, shallow_pool,
+                 configs.deep_interval, deep_pool)
+          << dendl;
+  dout(10) << fmt::format("{}: updated config:{}", __func__, configs) << dendl;
    return configs;
  }
  
@@ -2080,7 +2102,7 @@ void PgScrubber::on_digest_updates()
  void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue)
  {
    if (!m_scrub_job->is_registered()) {
-    dout(10) << fmt::format(
+    dout(5) << fmt::format(
                     "{}: PG not registered for scrubbing on this OSD. Won't "
                     "requeue!",
                     __func__)
diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc

index f303a47096e2ef8c9c70056f4a5c32fc3651348d..ffd0381d96cc59154318398b98b97c72b555c687 100644 (file)
--- a/src/osd/scrubber/scrub_job.cc
+++ b/src/osd/scrubber/scrub_job.cc
@@ -122,7 +122,7 @@ void ScrubJob::adjust_shallow_schedule(
      if (app_conf.max_shallow) {
        sh_times.deadline += *app_conf.max_shallow;
      } else {
-      sh_times.deadline = utime_t{};
+      sh_times.deadline = utime_t::max();
      }
      if (adj_not_before < adj_target) {
        adj_not_before = adj_target;
@@ -135,7 +135,7 @@ void ScrubJob::adjust_shallow_schedule(
      // the target time is already set. Make sure to reset the n.b. and
      // the (irrelevant) deadline
      sh_times.not_before = sh_times.scheduled_at;
-    sh_times.deadline = sh_times.scheduled_at;
+    sh_times.deadline = utime_t::max();
    }
  
    dout(10) << fmt::format(
@@ -257,12 +257,8 @@ void ScrubJob::adjust_deep_schedule(
                      app_conf.deep_randomize_ratio, adj_target)
               << dendl;
  
-    // the deadline can be updated directly into the scrub-job
-    if (app_conf.max_shallow) {
-      dp_times.deadline += *app_conf.max_shallow;  // RRR fix
-    } else {
-      dp_times.deadline = utime_t{};
-    }
+    dp_times.deadline += app_conf.max_deep;
+
      if (adj_not_before < adj_target) {
        adj_not_before = adj_target;
      }
@@ -272,7 +268,7 @@ void ScrubJob::adjust_deep_schedule(
      // the target time is already set. Make sure to reset the n.b. and
      // the (irrelevant) deadline
      dp_times.not_before = dp_times.scheduled_at;
-    dp_times.deadline = dp_times.scheduled_at;
+    dp_times.deadline = utime_t::max();
    }
  
    dout(10) << fmt::format(
diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h

index 1b9ada4dad21edc716ac9abf469c12d7102f3ea0..2c2d38ec5380ad2dfd591b470d002524570036ff 100644 (file)
--- a/src/osd/scrubber/scrub_job.h
+++ b/src/osd/scrubber/scrub_job.h
@@ -44,15 +44,13 @@ struct sched_conf_t {
    std::optional<double> max_shallow;
  
    /**
-   * the maximum interval between deep scrubs.
-   * For deep scrubs - there is no equivalent of scrub_max_interval. Per the
-   * documentation, once deep_scrub_interval has passed, we are already
-   * "overdue", at least as far as the "ignore allowed load" window is
-   * concerned. \todo based on users complaints (and the fact that the
-   * interaction between the configuration parameters is clear to no one),
-   * this will be revised shortly.
+   * the maximum interval between deep scrubs, after which the
+   * (info-only) "overdue" field in the scheduler dump is set.
+   * There is no specific configuration parameter to control the
+   * deep scrubs max. Instead - we set it to 4 times the average
+   * interval.
     */
-  double max_deep{0.0};
+  double max_deep{std::numeric_limits<double>::max()};
  
    /**
     * interval_randomize_ratio
@@ -226,7 +224,7 @@ class ScrubJob {
     * The new values are updated into the scrub-job.
     *
     * Specifically:
-   * - for high-priority scrubs: n.b. & deadline are set equal to the
+   * - for high-priority scrubs: the 'not_before' is set to the
     *   (untouched) proposed target time.
     * - for regular scrubs: the proposed time is adjusted (delayed) based
     *   on the configuration; the deadline is set further out (if configured)
diff --git a/src/osd/scrubber/scrub_queue_entry.h b/src/osd/scrubber/scrub_queue_entry.h

index aeb76c104fed9fed79c594c36025de7f344ff706..4cca192431ec354d24a5a71499b26938243e80db 100644 (file)
--- a/src/osd/scrubber/scrub_queue_entry.h
+++ b/src/osd/scrubber/scrub_queue_entry.h
@@ -61,9 +61,6 @@ enum class urgency_t {
   * the 'urgency' attribute of the scheduled scrub (which determines most of
   * its behavior and scheduling decisions) and the actual time attributes
   * for scheduling (target, deadline, not_before).
- *
- * In this commit - the 'urgency' attribute is not fully used yet, and some
- * of the scrub behavior is still controlled by the 'planned scrub' flags.
   */
  struct SchedEntry {
    constexpr SchedEntry(spg_t pgid, scrub_level_t level)
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h

index 917b99f98a192718d30e614ba5d7a789802a5ccb..ab8a45044fef7f573b7d59d7b034223639aa8549 100644 (file)
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -136,9 +136,11 @@ struct scrub_schedule_t {
     * the 'deadline' is the time by which we expect the periodic scrub to
     * complete. It is determined by the SCRUB_MAX_INTERVAL pool configuration
     * and by osd_scrub_max_interval;
-   * Once passed, the scrub will be allowed to run even if the OSD is
-   * overloaded.It would also have higher priority than other
-   * auto-scheduled scrubs.
+   * Note: the 'deadline' has only a limited effect on scheduling: when
+   * comparing jobs having identical urgency and target time (scheduled_at'),
+   * the job with the earlier 'deadline' is preferred.
+   * Being past deadline also sets the 'overdue' flag in scrub
+   * scheduling dumps.
     */
    utime_t deadline{utime_t::max()};
author	Ronen Friedman <rfriedma@redhat.com>
	Thu, 8 May 2025 13:45:23 +0000 (08:45 -0500)
committer	Ronen Friedman <rfriedma@redhat.com>
	Thu, 8 May 2025 14:33:57 +0000 (09:33 -0500)
src/osd/scrubber/pg_scrubber.cc		patch \| blob \| history
src/osd/scrubber/scrub_job.cc		patch \| blob \| history
src/osd/scrubber/scrub_job.h		patch \| blob \| history
src/osd/scrubber/scrub_queue_entry.h		patch \| blob \| history
src/osd/scrubber_common.h		patch \| blob \| history