git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/scrub: implement existing 'interval' as a distinct delay reason 59590/head
author Ronen Friedman <rfriedma@redhat.com>
Tue, 3 Sep 2024 09:25:36 +0000 (04:25 -0500)
committer Ronen Friedman <rfriedma@redhat.com>
Wed, 4 Sep 2024 12:09:42 +0000 (07:09 -0500)
Allow setting a specific delay time for scrubs that were aborted
because the PG interval changed. The specified delay should be
lower than the default delay used for the other types of
mid-scrub aborts.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
src/common/options/osd.yaml.in
src/osd/scrubber/scrub_job.cc
src/osd/scrubber/scrub_machine.cc
src/osd/scrubber/scrub_machine.h

index 226e22e18ff3327abf237741bf0b9dae1b95a218..8dae546521d7be30712f6a7e5c36ed713413f891 100644 (file)
@@ -535,6 +535,8 @@ options:
   see_also:
   - osd_scrub_retry_pg_state
   - osd_scrub_retry_after_noscrub
+  - osd_scrub_retry_new_interval
+  - osd_scrub_retry_trimming
   with_legacy: false
 - name: osd_scrub_retry_after_noscrub
   type: int
@@ -571,6 +573,17 @@ options:
   see_also:
   - osd_scrub_retry_delay
   with_legacy: false
+- name: osd_scrub_retry_new_interval
+  type: int
+  level: advanced
+  desc: Period (in seconds) before retrying a scrub aborted on a new interval
+  long_desc: Minimum delay before retrying, after a scrub was aborted as the
+    PG interval changed.
+  default: 10
+  min: 1
+  see_also:
+  - osd_scrub_retry_delay
+  with_legacy: false
 - name: osd_scrub_disable_reservation_queuing
   type: bool
   level: advanced
index c74648bae2578ce354c7ed9c818abfe4369bf2bb..7b05eea39419a30fc31da1237d3ec3843cae759c 100644 (file)
@@ -301,7 +301,7 @@ void ScrubJob::adjust_deep_schedule(
 
 SchedTarget& ScrubJob::delay_on_failure(
     scrub_level_t level,
-    Scrub::delay_cause_t delay_cause,
+    delay_cause_t delay_cause,
     utime_t scrub_clock_now)
 {
   seconds delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_delay"));
@@ -316,7 +316,11 @@ SchedTarget& ScrubJob::delay_on_failure(
     case delay_cause_t::snap_trimming:
       delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_trimming"));
       break;
+    case delay_cause_t::interval:
+      delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_new_interval"));
+      break;
     case delay_cause_t::local_resources:
+    case delay_cause_t::aborted:
     default:
       // for all other possible delay causes: use the default delay
       break;
index 81e9bd7b6d8b1b249b1baaf6437e5d882b9c2f85..da9466758f468fa0e7174d86575753e6ac51bf34 100644 (file)
@@ -199,6 +199,7 @@ sc::result Session::react(const IntervalChanged&)
 
   ceph_assert(m_reservations);
   m_reservations->discard_remote_reservations();
+  m_abort_reason = delay_cause_t::interval;
   return transit<NotActive>();
 }
 
@@ -300,7 +301,8 @@ ActiveScrubbing::~ActiveScrubbing()
   // completed successfully), we use it now to set the 'failed scrub' duration.
   if (session.m_session_started_at != ScrubTimePoint{}) {
     // delay the next invocation of the scrubber on this target
-    scrbr->on_mid_scrub_abort(Scrub::delay_cause_t::aborted);
+    scrbr->on_mid_scrub_abort(
+       session.m_abort_reason.value_or(Scrub::delay_cause_t::aborted));
 
     auto logged_duration = ScrubClock::now() - session.m_session_started_at;
     session.m_perf_set->tinc(scrbcnt_failed_elapsed, logged_duration);
index d1edfd37c9859270a2343f71b86e93bfb77f54c6..ad0d3bfba3807b9247937c208f5e14dbb07392d1 100644 (file)
@@ -551,6 +551,10 @@ struct Session : sc::state<Session, PrimaryActive, ReservingReplicas>,
 
   /// the time when the session was initiated
   ScrubTimePoint m_session_started_at{ScrubClock::now()};
+
+  /// abort reason - if known. Determines the delay time imposed on the
+  /// failed scrub target.
+  std::optional<Scrub::delay_cause_t> m_abort_reason{std::nullopt};
 };
 
 struct ReservingReplicas : sc::state<ReservingReplicas, Session>, NamedSimply {