From 8a8fa48b8b0602f3bd646a64a58993cf6793ac85 Mon Sep 17 00:00:00 2001
From: Ronen Friedman <rfriedma@redhat.com>
Date: Thu, 20 Jun 2024 08:04:41 -0500
Subject: [PATCH] osd/scrub: scheduling the next scrub following scrub
 completion

or after an aborted scrub.

To note: one of the important changes in this commit:
merging the functionality of adjust_target_time() &
update_schedule() into a single function - adjust_schedule().

Regarding the handling of aborts:

Most of the time - all that is required following a scrub abort is to
requeue the scrub job - the one that triggered the aborted scrub -
with just a delay added to its n.b..

But we must take into account scenarios where "something" caused the
parameters prepared for the *next* scrub to show higher urgency or
priority. "Something" - as in an operator command requiring immediate
scrubbing, or a change in the pool/cluster configuration.
In such cases - the current requested flags and the parameters of
the aborted scrub must be merged.

Note that the current implementation is a temporary solution, to be
replaced by a per-level updating of the relevant target.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
---
 src/osd/scrubber/pg_scrubber.cc        |  85 ++++++++++++++------
 src/osd/scrubber/pg_scrubber.h         |   2 +-
 src/osd/scrubber/scrub_job.cc          | 105 +++++++++++++------------
 src/osd/scrubber/scrub_job.h           |  56 +++++++------
 src/osd/scrubber/scrub_machine.cc      |   2 +-
 src/osd/scrubber/scrub_machine_lstnr.h |   3 +-
 6 files changed, 145 insertions(+), 108 deletions(-)

diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 8984d00f45d..637b4586ab3 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -534,30 +534,16 @@ sched_params_t PgScrubber::determine_initial_schedule(
 }
 
 
-/*
- * Note: referring to m_planned_scrub here is temporary, as this set of
- * scheduling flags will be removed in a followup PR.
- */
 void PgScrubber::schedule_scrub_with_osd()
 {
   ceph_assert(is_primary());
   ceph_assert(m_scrub_job);
 
-  auto pre_reg = registration_state();
-  m_scrub_job->registered = true;
-
-  const auto applicable_conf = populate_config_params();
-  const auto scrub_clock_now = ceph_clock_now();
-  auto suggested = determine_initial_schedule(applicable_conf, scrub_clock_now);
-  m_scrub_job->init_targets(
-      suggested, m_pg->info, applicable_conf, scrub_clock_now);
-
-  m_osds->get_scrub_services().enqueue_target(*m_scrub_job);
-
-  dout(10) << fmt::format(
-		  "{}: <flags:{}> <{:.5}> --> <{:.5}>", __func__,
-		  m_planned_scrub, pre_reg, registration_state())
+  dout(20) << fmt::format(
+		  "{}: state at entry: {}", __func__, m_scrub_job->state_desc())
 	   << dendl;
+  m_scrub_job->registered = true;
+  update_scrub_job(delay_ready_t::delay_ready);
 }
 
 
@@ -613,8 +599,12 @@ void PgScrubber::update_scrub_job(Scrub::delay_ready_t delay_ready)
   ceph_assert(m_pg->is_locked());
   const auto applicable_conf = populate_config_params();
   const auto scrub_clock_now = ceph_clock_now();
-  const auto suggested = determine_initial_schedule(applicable_conf, scrub_clock_now);
-  m_scrub_job->on_periods_change(suggested, applicable_conf, scrub_clock_now);
+  const auto suggested =
+      determine_initial_schedule(applicable_conf, scrub_clock_now);
+
+  ceph_assert(m_scrub_job->is_registered());
+  m_scrub_job->adjust_schedule(
+      suggested, applicable_conf, scrub_clock_now, delay_ready);
   m_osds->get_scrub_services().enqueue_target(*m_scrub_job);
   m_scrub_job->target_queued = true;
   m_pg->publish_stats_to_osd();
@@ -761,10 +751,7 @@ void PgScrubber::on_operator_periodic_cmd(
 	   << dendl;
 
   // move the relevant time-stamp backwards - enough to trigger a scrub
-
-  utime_t now_is = ceph_clock_now();
-  utime_t stamp = now_is;
-
+  utime_t stamp = ceph_clock_now();
   if (offset > 0) {
     stamp -= offset;
   } else {
@@ -2083,6 +2070,56 @@ void PgScrubber::on_digest_updates()
 }
 
 
+/**
+ * The scrub session was aborted. We are left with two sets of parameters
+ * as to when the next scrub of this PG should take place, and what should
+ * it be like. One set of parameters is the one that was used to start the
+ * scrub, and that was 'frozen' by set_op_parameters(). It has its own
+ * scheduling target, priority, not-before, etc'.
+ * The other set is the updated state of the current scrub-job. It may
+ * have had its priority, flags, or schedule modified in the meantime.
+ * And - it does not (at least initially, i.e. immediately after
+ * set_op_parameters()), have high priority.
+ *
+ * Alas, the scrub session that was initiated was aborted. We must now
+ * merge the two sets of parameters, using the highest priority and the
+ * nearest target time for the next scrub.
+ *
+ * Note: only half-functioning in this commit. As the scrub-job copy
+ * (the one that was in the scheduling queue, and was passed to the scrubber)
+ * does not have the 'urgency' parameter, we are missing some information
+ * that is still encoded in the 'planned scrub' flags. This will be fixed in
+ * the next step.
+ */
+void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue)
+{
+  // assuming we can still depend on the 'scrubbing' flag being set;
+  // Also on Queued&Active.
+
+  // note again: this is not how merging should work in the final version:
+  // e.g. - the 'aborted_schedule' data should be passed thru the scrubber.
+  // In this current patchworik, for example, we are only guessing at
+  // the original value of 'must_deep_scrub'.
+  m_planned_scrub.must_deep_scrub =
+      m_planned_scrub.must_deep_scrub || (m_flags.required && m_is_deep);
+  m_planned_scrub.must_scrub = m_planned_scrub.must_deep_scrub ||
+			       m_planned_scrub.must_scrub || m_flags.required;
+  m_planned_scrub.must_repair = m_planned_scrub.must_repair || m_is_repair;
+  m_planned_scrub.need_auto = m_planned_scrub.need_auto || m_flags.auto_repair;
+  m_planned_scrub.deep_scrub_on_error =
+      m_planned_scrub.deep_scrub_on_error || m_flags.deep_scrub_on_error;
+  m_planned_scrub.check_repair =
+      m_planned_scrub.check_repair || m_flags.check_repair;
+
+  m_scrub_job->merge_and_delay(
+      m_active_target->schedule, issue, m_planned_scrub, ceph_clock_now());
+  ceph_assert(m_scrub_job->is_registered());
+  ceph_assert(!m_scrub_job->target_queued);
+  m_osds->get_scrub_services().enqueue_target(*m_scrub_job);
+  m_scrub_job->target_queued = true;
+}
+
+
 void PgScrubber::requeue_penalized(Scrub::delay_cause_t cause)
 {
   /// \todo fix the 5s' to use a cause-specific delay parameter
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
index 0106edae54a..e970bd7219d 100644
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -437,7 +437,7 @@ class PgScrubber : public ScrubPgIF,
 
   void scrub_finish() final;
 
-  void penalize_next_scrub(Scrub::delay_cause_t cause) final;
+  void on_mid_scrub_abort(Scrub::delay_cause_t issue) final;
 
   ScrubMachineListener::MsgAndEpoch prep_replica_map_msg(
     Scrub::PreemptionNoted was_preempted) final;
diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc
index 1770cd95cb3..911ecc66ddb 100644
--- a/src/osd/scrubber/scrub_job.cc
+++ b/src/osd/scrubber/scrub_job.cc
@@ -11,6 +11,7 @@ using OSDRestrictions = Scrub::OSDRestrictions;
 using sched_conf_t = Scrub::sched_conf_t;
 using scrub_schedule_t = Scrub::scrub_schedule_t;
 using ScrubJob = Scrub::ScrubJob;
+using delay_ready_t = Scrub::delay_ready_t;
 
 
 // ////////////////////////////////////////////////////////////////////////// //
@@ -32,7 +33,7 @@ ScrubJob::ScrubJob(CephContext* cct, const spg_t& pg, int node_id)
     : pgid{pg}
     , whoami{node_id}
     , cct{cct}
-    , log_msg_prefix{fmt::format("osd.{}: scrub-job:pg[{}]:", node_id, pgid)}
+    , log_msg_prefix{fmt::format("osd.{} scrub-job:pg[{}]:", node_id, pgid)}
 {}
 
 // debug usage only
@@ -44,74 +45,76 @@ ostream& operator<<(ostream& out, const ScrubJob& sjob)
 }  // namespace std
 
 
-Scrub::scrub_schedule_t ScrubJob::adjust_target_time(
-    const sched_conf_t& app_conf,
-    const sched_params_t& suggested) const
+void ScrubJob::adjust_schedule(
+    const Scrub::sched_params_t& suggested,
+    const Scrub::sched_conf_t& app_conf,
+    utime_t scrub_clock_now,
+    delay_ready_t modify_ready_targets)
 {
-  Scrub::scrub_schedule_t adjusted{
-      suggested.proposed_time, suggested.proposed_time, suggested.proposed_time};
+  dout(10) << fmt::format(
+		  "{} current h.p.:{:c} conf:{} also-ready?{:c} "
+		  "sjob@entry:{}",
+		  suggested, high_priority ? 'y' : 'n', app_conf,
+		  (modify_ready_targets == delay_ready_t::delay_ready) ? 'y'
+								       : 'n',
+		  *this)
+	   << dendl;
 
-  if (suggested.is_must == Scrub::must_scrub_t::not_mandatory) {
-    // unless explicitly requested, postpone the scrub with a random delay
-    adjusted.scheduled_at += app_conf.shallow_interval;
-    double r = rand() / (double)RAND_MAX;
-    adjusted.scheduled_at +=
-	app_conf.shallow_interval * app_conf.interval_randomize_ratio * r;
+  high_priority = (suggested.is_must == must_scrub_t::mandatory);
+  utime_t adj_not_before = suggested.proposed_time;
+  utime_t adj_target = suggested.proposed_time;
+  schedule.deadline = adj_target;
+
+  if (!high_priority) {
+    // add a random delay to the proposed scheduled time - but only for periodic
+    // scrubs that are not already eligible for scrubbing.
+    if ((modify_ready_targets == delay_ready_t::delay_ready) ||
+	adj_not_before > scrub_clock_now) {
+      adj_target += app_conf.shallow_interval;
+      double r = rand() / (double)RAND_MAX;
+      adj_target +=
+	  app_conf.shallow_interval * app_conf.interval_randomize_ratio * r;
+    }
 
+    // the deadline can be updated directly into the scrub-job
     if (app_conf.max_shallow) {
-      adjusted.deadline += *app_conf.max_shallow;
+      schedule.deadline += *app_conf.max_shallow;
     } else {
-      adjusted.deadline = utime_t{};
+      schedule.deadline = utime_t{};
     }
 
-    if (adjusted.not_before < adjusted.scheduled_at) {
-      adjusted.not_before = adjusted.scheduled_at;
+    if (adj_not_before < adj_target) {
+      adj_not_before = adj_target;
     }
-
-    dout(20) << fmt::format(
-		    "not-must. Was:{:s} config:{} adjusted:{}",
-		    suggested.proposed_time, app_conf, adjusted) << dendl;
   }
-  // else - no log is needed. All relevant data will be logged by the caller
 
-  return adjusted;
+  schedule.scheduled_at = adj_target;
+  schedule.not_before = adj_not_before;
+  dout(10) << fmt::format(
+		  "adjusted: nb:{:s} target:{:s} deadline:{:s} ({})",
+		  schedule.not_before, schedule.scheduled_at, schedule.deadline,
+		  state_desc())
+	   << dendl;
 }
 
 
-void ScrubJob::init_targets(
-    const sched_params_t& suggested,
-    const pg_info_t& info,
-    const Scrub::sched_conf_t& aconf,
+void ScrubJob::merge_and_delay(
+    const scrub_schedule_t& aborted_schedule,
+    delay_cause_t issue,
+    requested_scrub_t updated_flags,
     utime_t scrub_clock_now)
 {
-  auto adjusted = adjust_target_time(aconf, suggested);
-  high_priority = suggested.is_must == must_scrub_t::mandatory;
-  update_schedule(adjusted, true);
+  // merge the schedule targets:
+  schedule.scheduled_at =
+      std::min(aborted_schedule.scheduled_at, schedule.scheduled_at);
+  high_priority = high_priority || updated_flags.must_scrub;
+  delay_on_failure(5s, issue, scrub_clock_now);
+
+  // the new deadline is the minimum of the two
+  schedule.deadline = std::min(aborted_schedule.deadline, schedule.deadline);
 }
 
 
-void ScrubJob::update_schedule(
-    const Scrub::scrub_schedule_t& adjusted,
-    bool reset_failure_penalty)
-{
-  dout(15) << fmt::format(
-		  "was: nb:{:s}({:s}). Called with: rest?{} {:s} ({})",
-		  schedule.not_before, schedule.scheduled_at,
-		  reset_failure_penalty, adjusted.scheduled_at,
-		  state_desc())
-	   << dendl;
-  schedule.scheduled_at = adjusted.scheduled_at;
-  schedule.deadline = adjusted.deadline;
-
-  if (reset_failure_penalty || (schedule.not_before < schedule.scheduled_at)) {
-    schedule.not_before = schedule.scheduled_at;
-  }
-  dout(10) << fmt::format(
-		  "adjusted: nb:{:s} ({:s}) ({})", schedule.not_before,
-		  schedule.scheduled_at, state_desc())
-	   << dendl;
-}
-
 void ScrubJob::delay_on_failure(
     std::chrono::seconds delay,
     Scrub::delay_cause_t delay_cause,
diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h
index ce2e66023ed..ef30bcb4fe5 100644
--- a/src/osd/scrubber/scrub_job.h
+++ b/src/osd/scrubber/scrub_job.h
@@ -155,27 +155,23 @@ class ScrubJob {
   }
 
   /**
-   * 'reset_failure_penalty' is used to reset the 'not_before' jo attribute to
-   * the updated 'scheduled_at' time. This is used whenever the scrub-job
-   * schedule is updated, and the update is not a result of a scrub attempt
-   * failure.
-   */
-  void update_schedule(
-      const scrub_schedule_t& adjusted,
-      bool reset_failure_penalty);
-
-  /**
-   * If the scrub job was not explicitly requested, we postpone it by some
-   * random length of time.
-   * And if delaying the scrub - we calculate, based on pool parameters, a
-   * deadline we should scrub before.
+   * Given a proposed time for the next scrub, and the relevant
+   * configuration, adjust_schedule() determines the actual target time,
+   * the deadline, and the 'not_before' time for the scrub.
+   * The new values are updated into the scrub-job.
    *
-   * @return updated (i.e. - possibly delayed) scrub schedule (schedule,
-   * deadline, not_before)
+   * Specifically:
+   * - for high-priority scrubs: n.b. & deadline are set equal to the
+   *   (untouched) proposed target time.
+   * - for regular scrubs: the proposed time is adjusted (delayed) based
+   *   on the configuration; the deadline is set further out (if configured)
+   *   and the n.b. is reset to the target.
    */
-  Scrub::scrub_schedule_t adjust_target_time(
-    const Scrub::sched_conf_t& app_conf,
-    const Scrub::sched_params_t& proposed_schedule) const;
+  void adjust_schedule(
+    const Scrub::sched_params_t& suggested,
+    const Scrub::sched_conf_t& aconf,
+    utime_t scrub_clock_now,
+    Scrub::delay_ready_t modify_ready_targets);
 
   /**
    * push the 'not_before' time out by 'delay' seconds, so that this scrub target
@@ -187,14 +183,18 @@ class ScrubJob {
       utime_t scrub_clock_now);
 
   /**
-   * initial setting of the scheduling parameters of a newly registered
-   * PG. The scrub targets (in this stage of the refactoring - the whole
-   * scrub job) is initialized as for a regular periodic scrub.
+   *  Recalculating any possible updates to the scrub schedule, following an
+   *  aborted scrub attempt.
+   *  Usually - we can use the same schedule that triggered the aborted scrub.
+   *  But we must take into account scenarios where "something" caused the
+   *  parameters prepared for the *next* scrub to show higher urgency or
+   *  priority. "Something" - as in an operator command requiring immediate
+   *  scrubbing, or a change in the pool/cluster configuration.
    */
-  void init_targets(
-      const sched_params_t& suggested,
-      const pg_info_t& info,
-      const Scrub::sched_conf_t& aconf,
+  void merge_and_delay(
+      const scrub_schedule_t& aborted_schedule,
+      Scrub::delay_cause_t issue,
+      requested_scrub_t updated_flags,
       utime_t scrub_clock_now);
 
  /**
@@ -213,7 +213,6 @@ class ScrubJob {
 
   void dump(ceph::Formatter* f) const;
 
-
   bool is_registered() const { return registered; }
 
   /**
@@ -263,7 +262,6 @@ struct formatter<Scrub::sched_params_t> {
   }
 };
 
-
 template <>
 struct formatter<Scrub::ScrubJob> {
   constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
@@ -272,7 +270,7 @@ struct formatter<Scrub::ScrubJob> {
   auto format(const Scrub::ScrubJob& sjob, FormatContext& ctx) const
   {
     return fmt::format_to(
-	ctx.out(), "pg[{}] @ nb:{:s} ({:s}) (dl:{:s}) - <{}>",
+	ctx.out(), "pg[{}]:nb:{:s} / trg:{:s} / dl:{:s} <{}>",
 	sjob.pgid, sjob.schedule.not_before, sjob.schedule.scheduled_at,
 	sjob.schedule.deadline, sjob.state_desc());
   }
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc
index ca0ff522278..547a046d5bd 100644
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -308,7 +308,7 @@ ActiveScrubbing::~ActiveScrubbing()
   // completed successfully), we use it now to set the 'failed scrub' duration.
   if (session.m_session_started_at != ScrubTimePoint{}) {
     // delay the next invocation of the scrubber on this target
-    scrbr->penalize_next_scrub(Scrub::delay_cause_t::aborted);
+    scrbr->on_mid_scrub_abort(Scrub::delay_cause_t::aborted);
 
     auto logged_duration = ScrubClock::now() - session.m_session_started_at;
     session.m_perf_set->tinc(scrbcnt_failed_elapsed, logged_duration);
diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h
index 85c518c402f..2177c7e85ab 100644
--- a/src/osd/scrubber/scrub_machine_lstnr.h
+++ b/src/osd/scrubber/scrub_machine_lstnr.h
@@ -163,8 +163,7 @@ struct ScrubMachineListener {
   virtual void scrub_finish() = 0;
 
   /// notify the scrubber about a scrub failure
-  /// (note: temporary implementation)
-  virtual void penalize_next_scrub(Scrub::delay_cause_t cause) = 0;
+  virtual void on_mid_scrub_abort(Scrub::delay_cause_t cause) = 0;
 
   /**
    * Prepare a MOSDRepScrubMap message carrying the requested scrub map
-- 
2.39.5