git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd/scrub: add configuration parameters to control length of delay
author Ronen Friedman <rfriedma@redhat.com>
Sun, 1 Sep 2024 07:07:47 +0000 (02:07 -0500)
committer Ronen Friedman <rfriedma@redhat.com>
Wed, 4 Sep 2024 12:07:46 +0000 (07:07 -0500)
to apply to a scrub target following a scrub failure

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
src/common/options/osd.yaml.in
src/osd/scrubber/pg_scrubber.cc
src/osd/scrubber/scrub_job.cc
src/osd/scrubber/scrub_job.h

index 268a89154de5acef38d93ff696bfd239c6c4d92c..946d1bfa1e9426ad3577bda01c26ec3b7c150329 100644 (file)
@@ -520,6 +520,46 @@ options:
     stats (inc. scrub/block duration) every this many seconds.
   default: 120
   with_legacy: false
+- name: osd_scrub_retry_delay
+  type: int
+  level: advanced
+  desc: Period (in seconds) before retrying a PG that has failed a prior scrub.
+  long_desc: Minimum delay after a failed attempt to scrub a PG. The delay is
+    either applied to one of the scheduled scrubs for the PG (the next shallow
+    scrub or the next deep scrub), or to both.
+    This is a default value, used when the cause of the delay does not have an
+    associated configuration option. See the 'see also' list for the delay
+    causes that have their own configuration options.
+  default: 30
+  min: 1
+  see_also:
+  - osd_scrub_retry_pg_state
+  - osd_scrub_retry_after_noscrub
+  with_legacy: false
+- name: osd_scrub_retry_after_noscrub
+  type: int
+  level: advanced
+  desc: Period (in seconds) before retrying to scrub a PG at a specific level
+    after detecting a no-scrub or no-deep-scrub flag
+  long_desc: Minimum delay after a failed attempt to scrub a PG at a level
+    (shallow or deep) that is disabled by cluster or pool no-scrub or no-deep-scrub
+    flags.
+  default: 60
+  min: 1
+  see_also:
+  - osd_scrub_retry_delay
+  with_legacy: false
+- name: osd_scrub_retry_pg_state
+  type: int
+  level: advanced
+  desc: Period (in seconds) before retrying to scrub a previously inactive/not-clean PG
+  long_desc: Minimum delay after a failed attempt to scrub a PG that is not
+    active and clean.
+  default: 60
+  min: 1
+  see_also:
+  - osd_scrub_retry_delay
+  with_legacy: false
 - name: osd_scrub_disable_reservation_queuing
   type: bool
   level: advanced
index 0e4253b339afece319a79ccf0b81477e699a0589..0e044810fcd673db8e11f7cd456eb45e2930767d 100644 (file)
@@ -2220,7 +2220,7 @@ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue)
   // that made any of the targets into a high-priority one. All that's left:
   // delay the specific target that was aborted.
 
-  auto& trgt = m_scrub_job->delay_on_failure(aborted_target.level(), 5s, issue,
+  auto& trgt = m_scrub_job->delay_on_failure(aborted_target.level(), issue,
       scrub_clock_now);
 
   /// \todo complete the merging of the deadline & target for non-hp targets
@@ -2251,8 +2251,7 @@ void PgScrubber::requeue_penalized(
     return;
   }
   /// \todo fix the 5s' to use a cause-specific delay parameter
-  auto& trgt =
-      m_scrub_job->delay_on_failure(s_or_d, 5s, cause, scrub_clock_now);
+  auto& trgt = m_scrub_job->delay_on_failure(s_or_d, cause, scrub_clock_now);
   ceph_assert(!trgt.queued);
   m_osds->get_scrub_services().enqueue_target(trgt);
   trgt.queued = true;
@@ -2274,7 +2273,7 @@ void PgScrubber::requeue_penalized(
       m_osds->get_scrub_services().dequeue_target(m_pg_id, sister_level);
       trgt2.queued = false;
     }
-    m_scrub_job->delay_on_failure(sister_level, 5s, cause, scrub_clock_now);
+    m_scrub_job->delay_on_failure(sister_level, cause, scrub_clock_now);
     m_osds->get_scrub_services().enqueue_target(trgt2);
     trgt2.queued = true;
   }
index ee33ee06706f7aa94474f56fb65919bfd1f06176..dd9f8b56de03803711d9e7184da84dc3dcaf6ef0 100644 (file)
@@ -12,6 +12,7 @@ using sched_conf_t = Scrub::sched_conf_t;
 using scrub_schedule_t = Scrub::scrub_schedule_t;
 using ScrubJob = Scrub::ScrubJob;
 using delay_ready_t = Scrub::delay_ready_t;
+using namespace std::chrono;
 
 namespace {
 utime_t add_double(utime_t t, double d)
@@ -300,16 +301,35 @@ void ScrubJob::adjust_deep_schedule(
 
 SchedTarget& ScrubJob::delay_on_failure(
     scrub_level_t level,
-    std::chrono::seconds delay,
     Scrub::delay_cause_t delay_cause,
     utime_t scrub_clock_now)
 {
+  seconds delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_delay"));
+  switch (delay_cause) {
+    case delay_cause_t::flags:
+      delay =
+         seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_after_noscrub"));
+      break;
+    case delay_cause_t::pg_state:
+      delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_pg_state"));
+      break;
+    case delay_cause_t::local_resources:
+    default:
+      // for all other possible delay causes: use the default delay
+      break;
+  }
+
   auto& delayed_target =
       (level == scrub_level_t::deep) ? deep_target : shallow_target;
   delayed_target.sched_info.schedule.not_before =
       std::max(scrub_clock_now, delayed_target.sched_info.schedule.not_before) +
       utime_t{delay};
   delayed_target.sched_info.last_issue = delay_cause;
+  dout(20) << fmt::format(
+                 "delayed {}scrub due to {} for {}s. Updated: {}",
+                 (level == scrub_level_t::deep ? "deep " : ""), delay_cause,
+                 delay.count(), delayed_target)
+          << dendl;
   return delayed_target;
 }
 
index 98a3e101f9bfa030f3d4873a58f8549bfd9087ba..b037084db6b5df3b3394a6bf6ac62d7d5662d911 100644 (file)
@@ -239,14 +239,14 @@ class ScrubJob {
 
   /**
    * For the level specified, set the 'not-before' time to 'now+delay',
-   * so that this scrub target
-   * would not be retried before 'delay' seconds have passed.
+   * so that this scrub target will not be retried before the required
+   * delay (in seconds) has elapsed.
+   * The delay is determined based on the 'cause' parameter.
    * The 'last_issue' is updated to the cause of the delay.
    * \returns a reference to the target that was modified.
    */
   [[maybe_unused]] SchedTarget& delay_on_failure(
       scrub_level_t level,
-      std::chrono::seconds delay,
       delay_cause_t delay_cause,
       utime_t scrub_clock_now);