osd/scrub: update job's NB on failure

author Ronen Friedman <rfriedma@redhat.com>

Tue, 2 Jan 2024 16:09:06 +0000 (10:09 -0600)

committer Ronen Friedman <rfriedma@redhat.com>

Mon, 22 Jan 2024 14:29:35 +0000 (08:29 -0600)
author Ronen Friedman <rfriedma@redhat.com>
Tue, 2 Jan 2024 16:09:06 +0000 (10:09 -0600)
committer Ronen Friedman <rfriedma@redhat.com>
Mon, 22 Jan 2024 14:29:35 +0000 (08:29 -0600)
diff --git a/src/osd/PG.cc b/src/osd/PG.cc

index eb7a332f6c2fb36b0b2336dd4c2711f9efccd05a..cb12dae7c5b100977d2a4760b99155d81b1b6da9 100644 (file)
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1342,9 +1342,12 @@ Scrub::schedule_result_t PG::start_scrubbing(
            << dendl;
    ceph_assert(ceph_mutex_is_locked(_lock));
  
+  // recheck PG status (as the PG was unlocked for a time after being selected
+  // for scrubbing)
    if (!is_primary() || !is_active() || !is_clean()) {
      dout(10) << __func__ << ": cannot scrub (not a clean and active primary)"
              << dendl;
+    m_scrubber->penalize_next_scrub(Scrub::delay_cause_t::pg_state);
      return schedule_result_t::target_specific_failure;
    }
  
@@ -1361,6 +1364,7 @@ Scrub::schedule_result_t PG::start_scrubbing(
              << ": skipping this PG as repairing was not explicitly "
                 "requested for it"
              << dendl;
+    m_scrubber->penalize_next_scrub(Scrub::delay_cause_t::scrub_params);
      return schedule_result_t::target_specific_failure;
    }
  
@@ -1369,6 +1373,7 @@ Scrub::schedule_result_t PG::start_scrubbing(
      // (on the transition from NotTrimming to Trimming/WaitReservation),
      // i.e. some time before setting 'snaptrim'.
      dout(10) << __func__ << ": cannot scrub while snap-trimming" << dendl;
+    m_scrubber->penalize_next_scrub(Scrub::delay_cause_t::pg_state);
      return schedule_result_t::target_specific_failure;
    }
  
@@ -1381,6 +1386,7 @@ Scrub::schedule_result_t PG::start_scrubbing(
      // (due to configuration or priority issues)
      // The reason was already reported by the callee.
      dout(10) << __func__ << ": failed to initiate a scrub" << dendl;
+    m_scrubber->penalize_next_scrub(Scrub::delay_cause_t::scrub_params);
      return schedule_result_t::target_specific_failure;
    }
  
@@ -1388,6 +1394,7 @@ Scrub::schedule_result_t PG::start_scrubbing(
    // be retried by the OSD later on.
    if (!m_scrubber->reserve_local()) {
      dout(10) << __func__ << ": failed to reserve locally" << dendl;
+    m_scrubber->penalize_next_scrub(Scrub::delay_cause_t::local_resources);
      return schedule_result_t::osd_wide_failure;
    }
  
diff --git a/src/osd/scrubber/osd_scrub.cc b/src/osd/scrubber/osd_scrub.cc

index a721dc1cb9c040e7a9b400c412e1fcff0c03dc6e..bc65d64bb422ea41462438e4e1288442ef058df9 100644 (file)
--- a/src/osd/scrubber/osd_scrub.cc
+++ b/src/osd/scrubber/osd_scrub.cc
@@ -435,6 +435,16 @@ void OsdScrub::update_job(
    m_queue.update_job(sjob, suggested, reset_notbefore);
  }
  
+void OsdScrub::delay_on_failure(
+      Scrub::ScrubJobRef sjob,
+      std::chrono::seconds delay,
+      Scrub::delay_cause_t delay_cause,
+      utime_t now_is)
+{
+  m_queue.delay_on_failure(sjob, delay, delay_cause, now_is);
+}
+
+
  void OsdScrub::register_with_osd(
      Scrub::ScrubJobRef sjob,
      const Scrub::sched_params_t& suggested)
diff --git a/src/osd/scrubber/osd_scrub.h b/src/osd/scrubber/osd_scrub.h

index 407eae059079f60d773536b1072c661180168a56..12e9db312312d568df89fb9925719965551e4e24 100644 (file)
--- a/src/osd/scrubber/osd_scrub.h
+++ b/src/osd/scrubber/osd_scrub.h
@@ -149,6 +149,17 @@ class OsdScrub {
  
    void clear_reserving_now(spg_t reserving_id);
  
+  /**
+   * push the 'not_before' time out by 'delay' seconds, so that this scrub target
+   * would not be retried before 'delay' seconds have passed.
+   */
+  void delay_on_failure(
+      Scrub::ScrubJobRef sjob,
+      std::chrono::seconds delay,
+      Scrub::delay_cause_t delay_cause,
+      utime_t now_is);
+
+
    /**
     * \returns true if the current time is within the scrub time window
     */
diff --git a/src/osd/scrubber/osd_scrub_sched.cc b/src/osd/scrubber/osd_scrub_sched.cc

index 836bf740ec08769641e0bebc0d2071fe4ebe8453..36644211ec39b6caf8b8b996d34abb20314a71c1 100644 (file)
--- a/src/osd/scrubber/osd_scrub_sched.cc
+++ b/src/osd/scrubber/osd_scrub_sched.cc
@@ -153,6 +153,21 @@ void ScrubQueue::update_job(Scrub::ScrubJobRef scrub_job,
    scrub_job->update_schedule(adjusted, reset_nb);
  }
  
+
+void ScrubQueue::delay_on_failure(
+    Scrub::ScrubJobRef sjob,
+    std::chrono::seconds delay,
+    Scrub::delay_cause_t delay_cause,
+    utime_t now_is)
+{
+  dout(10) << fmt::format(
+                 "pg[{}] delay_on_failure: delay:{} now:{:s}",
+                 sjob->pgid, delay, now_is)
+          << dendl;
+  sjob->delay_on_failure(delay, delay_cause, now_is);
+}
+
+
  sched_params_t ScrubQueue::determine_scrub_time(
    const requested_scrub_t& request_flags,
    const pg_info_t& pg_info,
@@ -281,9 +296,9 @@ ScrubQContainer ScrubQueue::collect_ripe_jobs(
      for (const auto& jobref : group) {
        if (!filtr(jobref)) {
         dout(20) << fmt::format(
-                       " not ripe: {} @ {:s} ({:s})", jobref->pgid,
-                        jobref->schedule.not_before,
-                       jobref->schedule.scheduled_at)
+                       " not eligible: {} @ {:s} ({:s},{:s})", jobref->pgid,
+                       jobref->schedule.not_before,
+                       jobref->schedule.scheduled_at, jobref->last_issue)
                  << dendl;
        }
      }
diff --git a/src/osd/scrubber/osd_scrub_sched.h b/src/osd/scrubber/osd_scrub_sched.h

index d94ca8cf7579962d7fe241ceae5ed01b090ca366..0eddb22ea7dd54d68df9a3d7a5b7f9f05aa039b6 100644 (file)
--- a/src/osd/scrubber/osd_scrub_sched.h
+++ b/src/osd/scrubber/osd_scrub_sched.h
@@ -220,6 +220,12 @@ class ScrubQueue {
        const sched_params_t& suggested,
        bool reset_notbefore);
  
+  void delay_on_failure(
+      Scrub::ScrubJobRef sjob,
+      std::chrono::seconds delay,
+      Scrub::delay_cause_t delay_cause,
+      utime_t now_is);
+
    sched_params_t determine_scrub_time(const requested_scrub_t& request_flags,
                                       const pg_info_t& pg_info,
                                       const pool_opts_t& pool_conf) const;
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc

index 55fd7b18dcdea0e886ee30ab73a73947d41585f7..3ef5a2ef567dc25be9ef16c125a3490c52655607 100644 (file)
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -1756,7 +1756,9 @@ void PgScrubber::clear_scrub_blocked()
  
  void PgScrubber::flag_reservations_failure()
  {
-  m_scrub_job->resources_failure = true;
+  dout(10) << __func__ << dendl;
+  // delay the next invocation of the scrubber on this target
+  penalize_next_scrub(Scrub::delay_cause_t::replicas);
  }
  
  /*
@@ -1968,6 +1970,16 @@ void PgScrubber::scrub_finish()
    }
  }
  
+/*
+ * note: arbitrary delay used in this early version of the
+ * scheduler refactoring.
+ */
+void PgScrubber::penalize_next_scrub(Scrub::delay_cause_t cause)
+{
+  m_osds->get_scrub_services().delay_on_failure(
+      m_scrub_job, 5s, cause, ceph_clock_now());
+}
+
  void PgScrubber::on_digest_updates()
  {
    dout(10) << __func__ << " #pending: " << num_digest_updates_pending << " "
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h

index 5911b67263f49540521c40ec7cdfc0f292e09f4b..9c29d5fdedb9a1a9fa4fa73715f8247cea5ea1c7 100644 (file)
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -365,8 +365,7 @@ class PgScrubber : public ScrubPgIF,
    PG* get_pg() const final { return m_pg; }
    PerfCounters& get_counters_set() const final;
  
-  // temporary interface (to be discarded in a follow-up PR)
-  /// set the 'resources_failure' flag in the scrub-job object
+  /// delay next retry of this PG after a replica reservation failure
    void flag_reservations_failure();
  
    scrubber_callback_cancel_token_t schedule_callback_after(
@@ -430,6 +429,8 @@ class PgScrubber : public ScrubPgIF,
  
    void scrub_finish() final;
  
+  void penalize_next_scrub(Scrub::delay_cause_t cause) final;
+
    ScrubMachineListener::MsgAndEpoch prep_replica_map_msg(
      Scrub::PreemptionNoted was_preempted) final;
  
diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc

index 1875641e754a3d9df4cf4d3764d41b4df3f29f12..6564b129b0ef4fbcfb275b0c769fe2fc85ef6497 100644 (file)
--- a/src/osd/scrubber/scrub_job.cc
+++ b/src/osd/scrubber/scrub_job.cc
@@ -45,24 +45,21 @@ ostream& operator<<(ostream& out, const ScrubJob& sjob)
  
  void ScrubJob::update_schedule(
      const Scrub::scrub_schedule_t& adjusted,
-    bool reset_nb)
+    bool reset_failure_penalty)
  {
-  dout(15)
-      << fmt::format(
-            "was: nb:{:s}({:s}). Called with: rest?{} nb:{:s} ({:s}) ({})",
-            schedule.not_before, schedule.scheduled_at, reset_nb,
-            adjusted.not_before, adjusted.scheduled_at, registration_state())
-      << dendl;
+  dout(15) << fmt::format(
+                 "was: nb:{:s}({:s}). Called with: rest?{} {:s} ({})",
+                 schedule.not_before, schedule.scheduled_at,
+                 reset_failure_penalty, adjusted.scheduled_at,
+                 registration_state())
+          << dendl;
    schedule.scheduled_at = adjusted.scheduled_at;
    schedule.deadline = adjusted.deadline;
  
-  if (reset_nb || schedule.not_before < schedule.scheduled_at) {
+  if (reset_failure_penalty || (schedule.not_before < schedule.scheduled_at)) {
      schedule.not_before = schedule.scheduled_at;
    }
  
-  // 'updated' is changed here while not holding jobs_lock. That's OK, as
-  // the (atomic) flag will only be cleared by select_pg_and_scrub() after
-  // scan_penalized() is called and the job was moved to the to_scrub queue.
    updated = true;
    dout(10) << fmt::format(
                   "adjusted: nb:{:s} ({:s}) ({})", schedule.not_before,
@@ -70,6 +67,16 @@ void ScrubJob::update_schedule(
            << dendl;
  }
  
+void ScrubJob::delay_on_failure(
+    std::chrono::seconds delay,
+    Scrub::delay_cause_t delay_cause,
+    utime_t scrub_clock_now)
+{
+  schedule.not_before =
+      std::max(scrub_clock_now, schedule.not_before) + utime_t{delay};
+  last_issue = delay_cause;
+}
+
  std::string ScrubJob::scheduling_state(utime_t now_is, bool is_deep_expected)
      const
  {
diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h

index e6765c767b9685c12cd6696c4856fcbfe0ff821b..98a73477b092772288e71f54a461319d793bd47c 100644 (file)
--- a/src/osd/scrubber/scrub_job.h
+++ b/src/osd/scrubber/scrub_job.h
@@ -71,8 +71,8 @@ class ScrubJob final : public RefCountedObject {
     */
    std::atomic_bool in_queues{false};
  
-  /// last scrub attempt failed to secure replica resources
-  bool resources_failure{false};
+  /// how the last attempt to scrub this PG ended
+  delay_cause_t last_issue{delay_cause_t::none};
  
    /**
     * 'updated' is a temporary flag, used to create a barrier after
@@ -118,6 +118,15 @@ class ScrubJob final : public RefCountedObject {
        const scrub_schedule_t& adjusted,
        bool reset_failure_penalty);
  
+  /**
+   * push the 'not_before' time out by 'delay' seconds, so that this scrub target
+   * would not be retried before 'delay' seconds have passed.
+   */
+  void delay_on_failure(
+      std::chrono::seconds delay,
+      delay_cause_t delay_cause,
+      utime_t scrub_clock_now);
+
    void dump(ceph::Formatter* f) const;
  
    /*
@@ -227,6 +236,20 @@ struct formatter<Scrub::qu_state_t> : formatter<std::string_view> {
    }
  };
  
+template <>
+struct formatter<Scrub::sched_params_t> {
+  constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
+  template <typename FormatContext>
+  auto format(const Scrub::sched_params_t& pm, FormatContext& ctx)
+  {
+    return fmt::format_to(
+       ctx.out(), "(proposed:{:s} min/max:{:.3f}/{:.3f} must:{:2s})",
+        utime_t{pm.proposed_time}, pm.min_interval, pm.max_interval,
+        pm.is_must == Scrub::must_scrub_t::mandatory ? "true" : "false");
+  }
+};
+
+
  template <>
  struct formatter<Scrub::ScrubJob> {
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
@@ -235,12 +258,10 @@ struct formatter<Scrub::ScrubJob> {
    auto format(const Scrub::ScrubJob& sjob, FormatContext& ctx)
    {
      return fmt::format_to(
-       ctx.out(),
-       "pg[{}] @ {:s} ({:s}) (dl:{:s}) - <{}> / failure: {} / queue state: "
-       "{:.7}",
+       ctx.out(), "pg[{}] @ nb:{:s} ({:s}) (dl:{:s}) - <{}> queue state:{:.7}",
         sjob.pgid, sjob.schedule.not_before, sjob.schedule.scheduled_at,
         sjob.schedule.deadline, sjob.registration_state(),
-       sjob.resources_failure, sjob.state.load(std::memory_order_relaxed));
+       sjob.state.load(std::memory_order_relaxed));
    }
  };
  
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc

index eac26a85f8511f4931937c3493a3d49ff3b65169..d0ab07fe1dfae5087f06965be3e18d5527ece0ec 100644 (file)
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -339,6 +339,9 @@ ActiveScrubbing::~ActiveScrubbing()
    // if the begin-time stamp was not set 'off' (as done if the scrubbing
    // completed successfully), we use it now to set the 'failed scrub' duration.
    if (session.m_session_started_at != ScrubTimePoint{}) {
+    // delay the next invocation of the scrubber on this target
+    scrbr->penalize_next_scrub(Scrub::delay_cause_t::aborted);
+
      auto logged_duration = ScrubClock::now() - session.m_session_started_at;
      session.m_perf_set->tinc(scrbcnt_failed_elapsed, logged_duration);
      session.m_perf_set->inc(scrbcnt_failed);
diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h

index 7849d14e4e4147a0dad3de06e4af4be9e4a70c9c..ea893ba81f01c9ba4c408fc641baaefab26d9d2f 100644 (file)
--- a/src/osd/scrubber/scrub_machine_lstnr.h
+++ b/src/osd/scrubber/scrub_machine_lstnr.h
@@ -162,6 +162,10 @@ struct ScrubMachineListener {
    /// the part that actually finalizes a scrub
    virtual void scrub_finish() = 0;
  
+  /// notify the scrubber about a scrub failure
+  /// (note: temporary implementation)
+  virtual void penalize_next_scrub(Scrub::delay_cause_t cause) = 0;
+
    /**
     * Prepare a MOSDRepScrubMap message carrying the requested scrub map
     * @param was_preempted - were we preempted?
@@ -252,8 +256,7 @@ struct ScrubMachineListener {
    /// sending cluster-log warnings
    virtual void log_cluster_warning(const std::string& msg) const = 0;
  
-  // temporary interface (to be discarded in a follow-up PR)
-  /// set the 'resources_failure' flag in the scrub-job object
+  /// delay next retry of this PG after a replica reservation failure
    virtual void flag_reservations_failure() = 0;
  
    /// is this scrub more than just regular periodic scrub?
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h

index 5323b14677b0b0d97b7b2debafd8312e4d4448d3..067b9754c1108ff47e94cf0c93714b4a1869c6d1 100644 (file)
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -85,6 +85,53 @@ struct formatter<Scrub::OSDRestrictions> {
  };
  }  // namespace fmt
  
+namespace Scrub {
+
+/**
+ * the result of the last attempt to schedule a scrub for a specific PG.
+ * The enum value itself is mostly used for logging purposes.
+ */
+enum class delay_cause_t {
+  none,                    ///< scrub attempt was successful
+  replicas,        ///< failed to reserve replicas
+  flags,           ///< noscrub or nodeep-scrub
+  pg_state,        ///< e.g. snap-trimming
+  restricted_time,  ///< time restrictions or busy CPU
+  local_resources,  ///< too many scrubbing PGs
+  aborted,         ///< scrub was aborted w/ unspecified reason
+  interval,        ///< the interval had ended mid-scrub
+  scrub_params,     ///< the specific scrub type is not allowed
+};
+}  // namespace Scrub
+
+namespace fmt {
+// clang-format off
+template <>
+struct formatter<Scrub::delay_cause_t> : ::fmt::formatter<std::string_view> {
+  template <typename FormatContext>
+  auto format(Scrub::delay_cause_t cause, FormatContext& ctx)
+  {
+    using enum Scrub::delay_cause_t;
+    std::string_view desc;
+    switch (cause) {
+      case none:                desc = "ok"; break;
+      case replicas:            desc = "replicas"; break;
+      case flags:               desc = "noscrub"; break;
+      case pg_state:            desc = "pg-state"; break;
+      case restricted_time:     desc = "time/load"; break;
+      case local_resources:     desc = "local-cnt"; break;
+      case aborted:             desc = "aborted"; break;
+      case interval:            desc = "interval"; break;
+      case scrub_params:        desc = "scrub-mode"; break;
+      // better to not have a default case, so that the compiler will warn
+    }
+    return ::fmt::formatter<string_view>::format(desc, ctx);
+  }
+};
+// clang-format on
+}  // namespace fmt
+
+
  namespace Scrub {
  
  /// PG services used by the scrubber backend
@@ -330,6 +377,8 @@ struct ScrubPgIF {
  
    virtual pg_scrubbing_status_t get_schedule() const = 0;
  
+  /// notify the scrubber about a scrub failure
+  virtual void penalize_next_scrub(Scrub::delay_cause_t cause) = 0;
  
    // // perform 'scrub'/'deep_scrub' asok commands
author	Ronen Friedman <rfriedma@redhat.com>
	Tue, 2 Jan 2024 16:09:06 +0000 (10:09 -0600)
committer	Ronen Friedman <rfriedma@redhat.com>
	Mon, 22 Jan 2024 14:29:35 +0000 (08:29 -0600)
src/osd/PG.cc		patch \| blob \| history
src/osd/scrubber/osd_scrub.cc		patch \| blob \| history
src/osd/scrubber/osd_scrub.h		patch \| blob \| history
src/osd/scrubber/osd_scrub_sched.cc		patch \| blob \| history
src/osd/scrubber/osd_scrub_sched.h		patch \| blob \| history
src/osd/scrubber/pg_scrubber.cc		patch \| blob \| history
src/osd/scrubber/pg_scrubber.h		patch \| blob \| history
src/osd/scrubber/scrub_job.cc		patch \| blob \| history
src/osd/scrubber/scrub_job.h		patch \| blob \| history
src/osd/scrubber/scrub_machine.cc		patch \| blob \| history
src/osd/scrubber/scrub_machine_lstnr.h		patch \| blob \| history
src/osd/scrubber_common.h		patch \| blob \| history