osd/scrub: scheduling the next scrub following scrub completion

author Ronen Friedman <rfriedma@redhat.com>

Thu, 20 Jun 2024 13:04:41 +0000 (08:04 -0500)

committer Ronen Friedman <rfriedma@redhat.com>

Tue, 16 Jul 2024 14:19:33 +0000 (09:19 -0500)
author Ronen Friedman <rfriedma@redhat.com>
Thu, 20 Jun 2024 13:04:41 +0000 (08:04 -0500)
committer Ronen Friedman <rfriedma@redhat.com>
Tue, 16 Jul 2024 14:19:33 +0000 (09:19 -0500)
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc

index 8984d00f45da030407563f88fc1daa98b4579b82..637b4586ab3b4dad52b1e889b52fc9c1824bc57a 100644 (file)
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -534,30 +534,16 @@ sched_params_t PgScrubber::determine_initial_schedule(
  }
  
  
-/*
- * Note: referring to m_planned_scrub here is temporary, as this set of
- * scheduling flags will be removed in a followup PR.
- */
  void PgScrubber::schedule_scrub_with_osd()
  {
    ceph_assert(is_primary());
    ceph_assert(m_scrub_job);
  
-  auto pre_reg = registration_state();
-  m_scrub_job->registered = true;
-
-  const auto applicable_conf = populate_config_params();
-  const auto scrub_clock_now = ceph_clock_now();
-  auto suggested = determine_initial_schedule(applicable_conf, scrub_clock_now);
-  m_scrub_job->init_targets(
-      suggested, m_pg->info, applicable_conf, scrub_clock_now);
-
-  m_osds->get_scrub_services().enqueue_target(*m_scrub_job);
-
-  dout(10) << fmt::format(
-                 "{}: <flags:{}> <{:.5}> --> <{:.5}>", __func__,
-                 m_planned_scrub, pre_reg, registration_state())
+  dout(20) << fmt::format(
+                 "{}: state at entry: {}", __func__, m_scrub_job->state_desc())
            << dendl;
+  m_scrub_job->registered = true;
+  update_scrub_job(delay_ready_t::delay_ready);
  }
  
  
@@ -613,8 +599,12 @@ void PgScrubber::update_scrub_job(Scrub::delay_ready_t delay_ready)
    ceph_assert(m_pg->is_locked());
    const auto applicable_conf = populate_config_params();
    const auto scrub_clock_now = ceph_clock_now();
-  const auto suggested = determine_initial_schedule(applicable_conf, scrub_clock_now);
-  m_scrub_job->on_periods_change(suggested, applicable_conf, scrub_clock_now);
+  const auto suggested =
+      determine_initial_schedule(applicable_conf, scrub_clock_now);
+
+  ceph_assert(m_scrub_job->is_registered());
+  m_scrub_job->adjust_schedule(
+      suggested, applicable_conf, scrub_clock_now, delay_ready);
    m_osds->get_scrub_services().enqueue_target(*m_scrub_job);
    m_scrub_job->target_queued = true;
    m_pg->publish_stats_to_osd();
@@ -761,10 +751,7 @@ void PgScrubber::on_operator_periodic_cmd(
            << dendl;
  
    // move the relevant time-stamp backwards - enough to trigger a scrub
-
-  utime_t now_is = ceph_clock_now();
-  utime_t stamp = now_is;
-
+  utime_t stamp = ceph_clock_now();
    if (offset > 0) {
      stamp -= offset;
    } else {
@@ -2083,6 +2070,56 @@ void PgScrubber::on_digest_updates()
  }
  
  
+/**
+ * The scrub session was aborted. We are left with two sets of parameters
+ * as to when the next scrub of this PG should take place, and what should
+ * it be like. One set of parameters is the one that was used to start the
+ * scrub, and that was 'frozen' by set_op_parameters(). It has its own
+ * scheduling target, priority, not-before, etc'.
+ * The other set is the updated state of the current scrub-job. It may
+ * have had its priority, flags, or schedule modified in the meantime.
+ * And - it does not (at least initially, i.e. immediately after
+ * set_op_parameters()), have high priority.
+ *
+ * Alas, the scrub session that was initiated was aborted. We must now
+ * merge the two sets of parameters, using the highest priority and the
+ * nearest target time for the next scrub.
+ *
+ * Note: only half-functioning in this commit. As the scrub-job copy
+ * (the one that was in the scheduling queue, and was passed to the scrubber)
+ * does not have the 'urgency' parameter, we are missing some information
+ * that is still encoded in the 'planned scrub' flags. This will be fixed in
+ * the next step.
+ */
+void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue)
+{
+  // assuming we can still depend on the 'scrubbing' flag being set;
+  // Also on Queued&Active.
+
+  // note again: this is not how merging should work in the final version:
+  // e.g. - the 'aborted_schedule' data should be passed thru the scrubber.
+  // In this current patchworik, for example, we are only guessing at
+  // the original value of 'must_deep_scrub'.
+  m_planned_scrub.must_deep_scrub =
+      m_planned_scrub.must_deep_scrub || (m_flags.required && m_is_deep);
+  m_planned_scrub.must_scrub = m_planned_scrub.must_deep_scrub ||
+                              m_planned_scrub.must_scrub || m_flags.required;
+  m_planned_scrub.must_repair = m_planned_scrub.must_repair || m_is_repair;
+  m_planned_scrub.need_auto = m_planned_scrub.need_auto || m_flags.auto_repair;
+  m_planned_scrub.deep_scrub_on_error =
+      m_planned_scrub.deep_scrub_on_error || m_flags.deep_scrub_on_error;
+  m_planned_scrub.check_repair =
+      m_planned_scrub.check_repair || m_flags.check_repair;
+
+  m_scrub_job->merge_and_delay(
+      m_active_target->schedule, issue, m_planned_scrub, ceph_clock_now());
+  ceph_assert(m_scrub_job->is_registered());
+  ceph_assert(!m_scrub_job->target_queued);
+  m_osds->get_scrub_services().enqueue_target(*m_scrub_job);
+  m_scrub_job->target_queued = true;
+}
+
+
  void PgScrubber::requeue_penalized(Scrub::delay_cause_t cause)
  {
    /// \todo fix the 5s' to use a cause-specific delay parameter
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h

index 0106edae54ae841a7a9377a76c69dfd8c7b9689d..e970bd7219db52554ff388157a98306543d2d566 100644 (file)
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -437,7 +437,7 @@ class PgScrubber : public ScrubPgIF,
  
    void scrub_finish() final;
  
-  void penalize_next_scrub(Scrub::delay_cause_t cause) final;
+  void on_mid_scrub_abort(Scrub::delay_cause_t issue) final;
  
    ScrubMachineListener::MsgAndEpoch prep_replica_map_msg(
      Scrub::PreemptionNoted was_preempted) final;
diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc

index 1770cd95cb3b30ab4331a299c29d3bade0e70a7a..911ecc66ddb6f3966fb44842eea4f400957dc874 100644 (file)
--- a/src/osd/scrubber/scrub_job.cc
+++ b/src/osd/scrubber/scrub_job.cc
@@ -11,6 +11,7 @@ using OSDRestrictions = Scrub::OSDRestrictions;
  using sched_conf_t = Scrub::sched_conf_t;
  using scrub_schedule_t = Scrub::scrub_schedule_t;
  using ScrubJob = Scrub::ScrubJob;
+using delay_ready_t = Scrub::delay_ready_t;
  
  
  // ////////////////////////////////////////////////////////////////////////// //
@@ -32,7 +33,7 @@ ScrubJob::ScrubJob(CephContext* cct, const spg_t& pg, int node_id)
      : pgid{pg}
      , whoami{node_id}
      , cct{cct}
-    , log_msg_prefix{fmt::format("osd.{}: scrub-job:pg[{}]:", node_id, pgid)}
+    , log_msg_prefix{fmt::format("osd.{} scrub-job:pg[{}]:", node_id, pgid)}
  {}
  
  // debug usage only
@@ -44,74 +45,76 @@ ostream& operator<<(ostream& out, const ScrubJob& sjob)
  }  // namespace std
  
  
-Scrub::scrub_schedule_t ScrubJob::adjust_target_time(
-    const sched_conf_t& app_conf,
-    const sched_params_t& suggested) const
+void ScrubJob::adjust_schedule(
+    const Scrub::sched_params_t& suggested,
+    const Scrub::sched_conf_t& app_conf,
+    utime_t scrub_clock_now,
+    delay_ready_t modify_ready_targets)
  {
-  Scrub::scrub_schedule_t adjusted{
-      suggested.proposed_time, suggested.proposed_time, suggested.proposed_time};
+  dout(10) << fmt::format(
+                 "{} current h.p.:{:c} conf:{} also-ready?{:c} "
+                 "sjob@entry:{}",
+                 suggested, high_priority ? 'y' : 'n', app_conf,
+                 (modify_ready_targets == delay_ready_t::delay_ready) ? 'y'
+                                                                      : 'n',
+                 *this)
+          << dendl;
  
-  if (suggested.is_must == Scrub::must_scrub_t::not_mandatory) {
-    // unless explicitly requested, postpone the scrub with a random delay
-    adjusted.scheduled_at += app_conf.shallow_interval;
-    double r = rand() / (double)RAND_MAX;
-    adjusted.scheduled_at +=
-       app_conf.shallow_interval * app_conf.interval_randomize_ratio * r;
+  high_priority = (suggested.is_must == must_scrub_t::mandatory);
+  utime_t adj_not_before = suggested.proposed_time;
+  utime_t adj_target = suggested.proposed_time;
+  schedule.deadline = adj_target;
+
+  if (!high_priority) {
+    // add a random delay to the proposed scheduled time - but only for periodic
+    // scrubs that are not already eligible for scrubbing.
+    if ((modify_ready_targets == delay_ready_t::delay_ready) ||
+       adj_not_before > scrub_clock_now) {
+      adj_target += app_conf.shallow_interval;
+      double r = rand() / (double)RAND_MAX;
+      adj_target +=
+         app_conf.shallow_interval * app_conf.interval_randomize_ratio * r;
+    }
  
+    // the deadline can be updated directly into the scrub-job
      if (app_conf.max_shallow) {
-      adjusted.deadline += *app_conf.max_shallow;
+      schedule.deadline += *app_conf.max_shallow;
      } else {
-      adjusted.deadline = utime_t{};
+      schedule.deadline = utime_t{};
      }
  
-    if (adjusted.not_before < adjusted.scheduled_at) {
-      adjusted.not_before = adjusted.scheduled_at;
+    if (adj_not_before < adj_target) {
+      adj_not_before = adj_target;
      }
-
-    dout(20) << fmt::format(
-                   "not-must. Was:{:s} config:{} adjusted:{}",
-                   suggested.proposed_time, app_conf, adjusted) << dendl;
    }
-  // else - no log is needed. All relevant data will be logged by the caller
  
-  return adjusted;
+  schedule.scheduled_at = adj_target;
+  schedule.not_before = adj_not_before;
+  dout(10) << fmt::format(
+                 "adjusted: nb:{:s} target:{:s} deadline:{:s} ({})",
+                 schedule.not_before, schedule.scheduled_at, schedule.deadline,
+                 state_desc())
+          << dendl;
  }
  
  
-void ScrubJob::init_targets(
-    const sched_params_t& suggested,
-    const pg_info_t& info,
-    const Scrub::sched_conf_t& aconf,
+void ScrubJob::merge_and_delay(
+    const scrub_schedule_t& aborted_schedule,
+    delay_cause_t issue,
+    requested_scrub_t updated_flags,
      utime_t scrub_clock_now)
  {
-  auto adjusted = adjust_target_time(aconf, suggested);
-  high_priority = suggested.is_must == must_scrub_t::mandatory;
-  update_schedule(adjusted, true);
+  // merge the schedule targets:
+  schedule.scheduled_at =
+      std::min(aborted_schedule.scheduled_at, schedule.scheduled_at);
+  high_priority = high_priority || updated_flags.must_scrub;
+  delay_on_failure(5s, issue, scrub_clock_now);
+
+  // the new deadline is the minimum of the two
+  schedule.deadline = std::min(aborted_schedule.deadline, schedule.deadline);
  }
  
  
-void ScrubJob::update_schedule(
-    const Scrub::scrub_schedule_t& adjusted,
-    bool reset_failure_penalty)
-{
-  dout(15) << fmt::format(
-                 "was: nb:{:s}({:s}). Called with: rest?{} {:s} ({})",
-                 schedule.not_before, schedule.scheduled_at,
-                 reset_failure_penalty, adjusted.scheduled_at,
-                 state_desc())
-          << dendl;
-  schedule.scheduled_at = adjusted.scheduled_at;
-  schedule.deadline = adjusted.deadline;
-
-  if (reset_failure_penalty || (schedule.not_before < schedule.scheduled_at)) {
-    schedule.not_before = schedule.scheduled_at;
-  }
-  dout(10) << fmt::format(
-                 "adjusted: nb:{:s} ({:s}) ({})", schedule.not_before,
-                 schedule.scheduled_at, state_desc())
-          << dendl;
-}
-
  void ScrubJob::delay_on_failure(
      std::chrono::seconds delay,
      Scrub::delay_cause_t delay_cause,
diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h

index ce2e66023ede3573dbcfb23e5cbb18d8be861554..ef30bcb4fe572b4b6956edb168bf958e74c206ee 100644 (file)
--- a/src/osd/scrubber/scrub_job.h
+++ b/src/osd/scrubber/scrub_job.h
@@ -155,27 +155,23 @@ class ScrubJob {
    }
  
    /**
-   * 'reset_failure_penalty' is used to reset the 'not_before' jo attribute to
-   * the updated 'scheduled_at' time. This is used whenever the scrub-job
-   * schedule is updated, and the update is not a result of a scrub attempt
-   * failure.
-   */
-  void update_schedule(
-      const scrub_schedule_t& adjusted,
-      bool reset_failure_penalty);
-
-  /**
-   * If the scrub job was not explicitly requested, we postpone it by some
-   * random length of time.
-   * And if delaying the scrub - we calculate, based on pool parameters, a
-   * deadline we should scrub before.
+   * Given a proposed time for the next scrub, and the relevant
+   * configuration, adjust_schedule() determines the actual target time,
+   * the deadline, and the 'not_before' time for the scrub.
+   * The new values are updated into the scrub-job.
     *
-   * @return updated (i.e. - possibly delayed) scrub schedule (schedule,
-   * deadline, not_before)
+   * Specifically:
+   * - for high-priority scrubs: n.b. & deadline are set equal to the
+   *   (untouched) proposed target time.
+   * - for regular scrubs: the proposed time is adjusted (delayed) based
+   *   on the configuration; the deadline is set further out (if configured)
+   *   and the n.b. is reset to the target.
     */
-  Scrub::scrub_schedule_t adjust_target_time(
-    const Scrub::sched_conf_t& app_conf,
-    const Scrub::sched_params_t& proposed_schedule) const;
+  void adjust_schedule(
+    const Scrub::sched_params_t& suggested,
+    const Scrub::sched_conf_t& aconf,
+    utime_t scrub_clock_now,
+    Scrub::delay_ready_t modify_ready_targets);
  
    /**
     * push the 'not_before' time out by 'delay' seconds, so that this scrub target
@@ -187,14 +183,18 @@ class ScrubJob {
        utime_t scrub_clock_now);
  
    /**
-   * initial setting of the scheduling parameters of a newly registered
-   * PG. The scrub targets (in this stage of the refactoring - the whole
-   * scrub job) is initialized as for a regular periodic scrub.
+   *  Recalculating any possible updates to the scrub schedule, following an
+   *  aborted scrub attempt.
+   *  Usually - we can use the same schedule that triggered the aborted scrub.
+   *  But we must take into account scenarios where "something" caused the
+   *  parameters prepared for the *next* scrub to show higher urgency or
+   *  priority. "Something" - as in an operator command requiring immediate
+   *  scrubbing, or a change in the pool/cluster configuration.
     */
-  void init_targets(
-      const sched_params_t& suggested,
-      const pg_info_t& info,
-      const Scrub::sched_conf_t& aconf,
+  void merge_and_delay(
+      const scrub_schedule_t& aborted_schedule,
+      Scrub::delay_cause_t issue,
+      requested_scrub_t updated_flags,
        utime_t scrub_clock_now);
  
   /**
@@ -213,7 +213,6 @@ class ScrubJob {
  
    void dump(ceph::Formatter* f) const;
  
-
    bool is_registered() const { return registered; }
  
    /**
@@ -263,7 +262,6 @@ struct formatter<Scrub::sched_params_t> {
    }
  };
  
-
  template <>
  struct formatter<Scrub::ScrubJob> {
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
@@ -272,7 +270,7 @@ struct formatter<Scrub::ScrubJob> {
    auto format(const Scrub::ScrubJob& sjob, FormatContext& ctx) const
    {
      return fmt::format_to(
-       ctx.out(), "pg[{}] @ nb:{:s} ({:s}) (dl:{:s}) - <{}>",
+       ctx.out(), "pg[{}]:nb:{:s} / trg:{:s} / dl:{:s} <{}>",
         sjob.pgid, sjob.schedule.not_before, sjob.schedule.scheduled_at,
         sjob.schedule.deadline, sjob.state_desc());
    }
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc

index ca0ff52227839a32a22374d306810f5e8d9504be..547a046d5bd6d15c0ac05f31f61e5f0e231d7674 100644 (file)
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -308,7 +308,7 @@ ActiveScrubbing::~ActiveScrubbing()
    // completed successfully), we use it now to set the 'failed scrub' duration.
    if (session.m_session_started_at != ScrubTimePoint{}) {
      // delay the next invocation of the scrubber on this target
-    scrbr->penalize_next_scrub(Scrub::delay_cause_t::aborted);
+    scrbr->on_mid_scrub_abort(Scrub::delay_cause_t::aborted);
  
      auto logged_duration = ScrubClock::now() - session.m_session_started_at;
      session.m_perf_set->tinc(scrbcnt_failed_elapsed, logged_duration);
diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h

index 85c518c402f82bc1fe92a144c417719cc7613fa7..2177c7e85ab427ea0c0466a60afc49e934533526 100644 (file)
--- a/src/osd/scrubber/scrub_machine_lstnr.h
+++ b/src/osd/scrubber/scrub_machine_lstnr.h
@@ -163,8 +163,7 @@ struct ScrubMachineListener {
    virtual void scrub_finish() = 0;
  
    /// notify the scrubber about a scrub failure
-  /// (note: temporary implementation)
-  virtual void penalize_next_scrub(Scrub::delay_cause_t cause) = 0;
+  virtual void on_mid_scrub_abort(Scrub::delay_cause_t cause) = 0;
  
    /**
     * Prepare a MOSDRepScrubMap message carrying the requested scrub map
author	Ronen Friedman <rfriedma@redhat.com>
	Thu, 20 Jun 2024 13:04:41 +0000 (08:04 -0500)
committer	Ronen Friedman <rfriedma@redhat.com>
	Tue, 16 Jul 2024 14:19:33 +0000 (09:19 -0500)
src/osd/scrubber/pg_scrubber.cc		patch \| blob \| history
src/osd/scrubber/pg_scrubber.h		patch \| blob \| history
src/osd/scrubber/scrub_job.cc		patch \| blob \| history
src/osd/scrubber/scrub_job.h		patch \| blob \| history
src/osd/scrubber/scrub_machine.cc		patch \| blob \| history
src/osd/scrubber/scrub_machine_lstnr.h		patch \| blob \| history