From 0de916dc04a356bf0aa57296b699edd050f027a0 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Sun, 5 May 2024 10:42:18 -0500 Subject: [PATCH] osd/scrub: modify deep scrub interval randomization The interaction between the various configuration parameters controlling the scheduling of deep scrubs is neither clearly defined nor clearly documented. The existing set of parameters creates unnecessary code complexity, is surprising to the operators, and does not provide the level of control desired by Ceph users. This is a proposed change to the deep scrub interval randomization: Prior to this PR, deep scrub scheduling is controlled by the following set of parameters: The desired interval between deep scrubs is determined by osd_deep_scrub_interval. To prevent a "thundering herd" problem if multiple PGs were created at the same time, a randomization effect was added: at a configurable frequency (osd_deep_scrub_randomize_ratio), a shallow scrub is "upgraded" to a deep scrub. As mentioned above, the interaction between these parameters isn't always clear to the operators. But the main issue is its effect on code complexity and design choices (as it is never known in advance whether the next scrub will be deep or shallow). Here we change the randomization method, decoupling it from shallow scrub scheduling. In the new method, deep scrubs are scheduled, on average, at the desired interval. The actual time is randomized to a normal distribution with a coefficient of variation (CV) of osd_deep_scrub_interval_cv (clamped to within 2 standard deviations of the mean). 
Signed-off-by: Ronen Friedman --- src/common/options/osd.yaml.in | 27 +++++++++++++++++++++------ src/osd/scrubber/pg_scrubber.cc | 8 +++----- src/osd/scrubber/scrub_job.cc | 31 ++++++++++++++++++++----------- src/osd/scrubber/scrub_job.h | 11 +++++++++-- 4 files changed, 53 insertions(+), 24 deletions(-) diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in index 268a89154de..5b6377a8797 100644 --- a/src/common/options/osd.yaml.in +++ b/src/common/options/osd.yaml.in @@ -440,17 +440,32 @@ options: type: float level: advanced desc: Deep scrub each PG (i.e., verify data checksums) at least this often - fmt_desc: The interval for "deep" scrubbing (fully reading all data). The - ``osd_scrub_load_threshold`` does not affect this setting. + fmt_desc: The interval for "deep" scrubbing (fully reading all data). default: 7_day with_legacy: true +- name: osd_deep_scrub_interval_cv + type: float + level: advanced + desc: determining the amount of variation in the deep scrub interval + long_desc: deep scrub intervals are varied by a random amount to prevent + stampedes. This parameter determines the amount of variation. + Technically - osd_deep_scrub_interval_cv is the coefficient of variation for + the deep scrub interval. + fmt_desc: The coefficient of variation for the deep scrub interval, specified as a + ratio. On average, the next deep scrub for a PG is scheduled osd_deep_scrub_interval + after the last deep scrub . The actual time is randomized to a normal distribution + with a standard deviation of osd_deep_scrub_interval * osd_deep_scrub_interval_cv + (clamped to within 2 standard deviations). + The default value guarantees that 95% of the deep scrubs will be scheduled in the range + [0.8 * osd_deep_scrub_interval, 1.2 * osd_deep_scrub_interval]. 
+ min: 0 + max: 0.4 + default: 0.2 + with_legacy: false - name: osd_deep_scrub_randomize_ratio type: float level: advanced - desc: Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs - are deep) - long_desc: This prevents a deep scrub 'stampede' by spreading deep scrubs so they - are uniformly distributed over the week + desc: deprecated. Has no effect. default: 0.15 with_legacy: true - name: osd_deep_scrub_stride diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 0e4253b339a..cb763994204 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -753,7 +753,7 @@ bool PgScrubber::reserve_local(const Scrub::SchedTarget& trgt) Scrub::sched_conf_t PgScrubber::populate_config_params() const { const pool_opts_t& pool_conf = m_pg->get_pgpool().info.opts; - auto& conf = get_pg_cct()->_conf; // for brevity + const auto& conf = get_pg_cct()->_conf; // for brevity Scrub::sched_conf_t configs; // deep-scrub optimal interval @@ -792,7 +792,7 @@ Scrub::sched_conf_t PgScrubber::populate_config_params() const std::max(configs.max_shallow.value_or(0.0), configs.deep_interval); configs.interval_randomize_ratio = conf->osd_scrub_interval_randomize_ratio; - configs.deep_randomize_ratio = conf->osd_deep_scrub_randomize_ratio; + configs.deep_randomize_ratio = conf.get_val("osd_deep_scrub_interval_cv"); configs.mandatory_on_invalid = conf->osd_scrub_invalid_stats; dout(15) << fmt::format("{}: updated config:{}", __func__, configs) << dendl; @@ -2608,9 +2608,7 @@ PgScrubber::PgScrubber(PG* pg) { m_fsm = std::make_unique(m_pg, this); m_fsm->initiate(); - - m_scrub_job = std::make_optional( - m_osds->cct, m_pg->pg_id, m_osds->get_nodeid()); + m_scrub_job.emplace(m_osds->cct, m_pg->pg_id, m_osds->get_nodeid()); } void PgScrubber::set_scrub_duration(std::chrono::milliseconds duration) diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc index ee33ee06706..8c490deac4c 100644 --- 
a/src/osd/scrubber/scrub_job.cc +++ b/src/osd/scrubber/scrub_job.cc @@ -65,6 +65,7 @@ ScrubJob::ScrubJob(CephContext* cct, const spg_t& pg, int node_id) , shallow_target{pg, scrub_level_t::shallow} , deep_target{pg, scrub_level_t::deep} , cct{cct} + , random_gen{random_dev()} , log_msg_prefix{fmt::format("osd.{} scrub-job:pg[{}]:", node_id, pgid)} {} @@ -240,6 +241,7 @@ utime_t ScrubJob::get_sched_time() const return earliest_target().sched_info.schedule.not_before; } + void ScrubJob::adjust_deep_schedule( utime_t last_deep, const Scrub::sched_conf_t& app_conf, @@ -256,13 +258,7 @@ void ScrubJob::adjust_deep_schedule( auto& dp_times = deep_target.sched_info.schedule; // shorthand - if (!ScrubJob::requires_randomization(deep_target.urgency())) { - // the target time is already set. Make sure to reset the n.b. and - // the (irrelevant) deadline - dp_times.not_before = dp_times.scheduled_at; - dp_times.deadline = dp_times.scheduled_at; - - } else { + if (ScrubJob::requires_randomization(deep_target.urgency())) { utime_t adj_not_before = last_deep; utime_t adj_target = last_deep; dp_times.deadline = adj_target; @@ -271,10 +267,18 @@ void ScrubJob::adjust_deep_schedule( // scrubs that are not already eligible for scrubbing. 
if ((modify_ready_targets == delay_ready_t::delay_ready) || adj_not_before > scrub_clock_now) { - adj_target += app_conf.deep_interval; - double r = rand() / (double)RAND_MAX; - adj_target += app_conf.deep_interval * app_conf.interval_randomize_ratio * - r; // RRR fix + double sdv = app_conf.deep_interval * app_conf.deep_randomize_ratio; + std::normal_distribution normal_dist{app_conf.deep_interval, sdv}; + auto next_delay = std::clamp( + normal_dist(random_gen), app_conf.deep_interval - 2 * sdv, + app_conf.deep_interval + 2 * sdv); + adj_target += next_delay; + dout(20) << fmt::format( + "deep scrubbing: next_delay={:.0f} (interval={:.0f}, " + "ratio={:.3f}), adjusted:{:s}", + next_delay, app_conf.deep_interval, + app_conf.deep_randomize_ratio, adj_target) + << dendl; } // the deadline can be updated directly into the scrub-job @@ -288,6 +292,11 @@ void ScrubJob::adjust_deep_schedule( } dp_times.scheduled_at = adj_target; dp_times.not_before = adj_not_before; + } else { + // the target time is already set. Make sure to reset the n.b. and + // the (irrelevant) deadline + dp_times.not_before = dp_times.scheduled_at; + dp_times.deadline = dp_times.scheduled_at; } dout(10) << fmt::format( diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h index 98a3e101f9b..8b0d7dede3f 100644 --- a/src/osd/scrubber/scrub_job.h +++ b/src/osd/scrubber/scrub_job.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "common/ceph_atomic.h" @@ -65,8 +66,9 @@ struct sched_conf_t { /** * a randomization factor aimed at preventing 'thundering herd' problems - * upon deep-scrubs common intervals. If polling a random number smaller - * than that percentage, the next shallow scrub is upgraded to deep. + * upon deep-scrubs common intervals. The actual deep scrub interval will + * be selected with a normal distribution around the configured interval, + * with a standard deviation of * . 
*/ double deep_randomize_ratio{0.0}; @@ -168,6 +170,11 @@ class ScrubJob { CephContext* cct; + /// random generator for the randomization of the scrub times + /// \todo consider using one common generator in the OSD service + std::random_device random_dev; + std::mt19937 random_gen; + ScrubJob(CephContext* cct, const spg_t& pg, int node_id); /** -- 2.47.3