From d7c7aa7328d223d51f21368da6c8c069fdde818c Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Tue, 3 Sep 2024 04:25:36 -0500 Subject: [PATCH] osd/scrub: implement existing 'interval' as a distinct delay reason allowing setting specific delay times for scrubs that were aborted due to the interval being changed. The specified delay should be lower than the default delay used for the other types of mid-scrub aborts. Signed-off-by: Ronen Friedman --- src/common/options/osd.yaml.in | 13 +++++++++++++ src/osd/scrubber/scrub_job.cc | 6 +++++- src/osd/scrubber/scrub_machine.cc | 4 +++- src/osd/scrubber/scrub_machine.h | 4 ++++ 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in index 226e22e18ff..8dae546521d 100644 --- a/src/common/options/osd.yaml.in +++ b/src/common/options/osd.yaml.in @@ -535,6 +535,8 @@ options: see_also: - osd_scrub_retry_pg_state - osd_scrub_retry_after_noscrub + - osd_scrub_retry_new_interval + - osd_scrub_retry_trimming with_legacy: false - name: osd_scrub_retry_after_noscrub type: int @@ -571,6 +573,17 @@ options: see_also: - osd_scrub_retry_delay with_legacy: false +- name: osd_scrub_retry_new_interval + type: int + level: advanced + desc: Period (in seconds) before retrying a scrub aborted on a new interval + long_desc: Minimum delay before retrying, after a scrub was aborted as the + PG interval changed. + default: 10 + min: 1 + see_also: + - osd_scrub_retry_delay + with_legacy: false - name: osd_scrub_disable_reservation_queuing type: bool level: advanced diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc index c74648bae25..7b05eea3941 100644 --- a/src/osd/scrubber/scrub_job.cc +++ b/src/osd/scrubber/scrub_job.cc @@ -301,7 +301,7 @@ void ScrubJob::adjust_deep_schedule( SchedTarget& ScrubJob::delay_on_failure( scrub_level_t level, - Scrub::delay_cause_t delay_cause, + delay_cause_t delay_cause, utime_t scrub_clock_now) { seconds delay = seconds(cct->_conf.get_val("osd_scrub_retry_delay")); @@ -316,7 +316,11 @@ SchedTarget& ScrubJob::delay_on_failure( case delay_cause_t::snap_trimming: delay = seconds(cct->_conf.get_val("osd_scrub_retry_trimming")); break; + case delay_cause_t::interval: + delay = seconds(cct->_conf.get_val("osd_scrub_retry_new_interval")); + break; case delay_cause_t::local_resources: + case delay_cause_t::aborted: default: // for all other possible delay causes: use the default delay break; diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index 81e9bd7b6d8..da9466758f4 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -199,6 +199,7 @@ sc::result Session::react(const IntervalChanged&) ceph_assert(m_reservations); m_reservations->discard_remote_reservations(); + m_abort_reason = delay_cause_t::interval; return transit(); } @@ -300,7 +301,8 @@ ActiveScrubbing::~ActiveScrubbing() // completed successfully), we use it now to set the 'failed scrub' duration. if (session.m_session_started_at != ScrubTimePoint{}) { // delay the next invocation of the scrubber on this target - scrbr->on_mid_scrub_abort(Scrub::delay_cause_t::aborted); + scrbr->on_mid_scrub_abort( + session.m_abort_reason.value_or(Scrub::delay_cause_t::aborted)); auto logged_duration = ScrubClock::now() - session.m_session_started_at; session.m_perf_set->tinc(scrbcnt_failed_elapsed, logged_duration); diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index d1edfd37c98..ad0d3bfba38 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -551,6 +551,10 @@ struct Session : sc::state, /// the time when the session was initiated ScrubTimePoint m_session_started_at{ScrubClock::now()}; + + /// abort reason - if known. Determines the delay time imposed on the + /// failed scrub target. + std::optional m_abort_reason{std::nullopt}; }; struct ReservingReplicas : sc::state, NamedSimply { -- 2.39.5