run_mon $dir a --osd_pool_default_size=3 || return 1
run_mgr $dir x || return 1
+ local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
+ ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
+ ceph_osd_args+="--osd_stats_update_period_scrubbing=2"
for osd in $(seq 0 $(expr $OSDS - 1))
do
- run_osd $dir $osd || return 1
+ run_osd $dir $osd $ceph_osd_args || return 1
done
# Create a pool with a single pg
run_mon $dir a --osd_pool_default_size=3 || return 1
run_mgr $dir x || return 1
+
+ local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
+ ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
+ ceph_osd_args+="--osd_stats_update_period_scrubbing=2 --osd_scrub_sleep=0 "
+ ceph_osd_args+="--osd_scrub_extended_sleep=20 --osd_scrub_begin_week_day=$DAY_START "
+ ceph_osd_args+="--osd_op_queue=wpq --osd_scrub_end_week_day=$DAY_END "
+ ceph_osd_args+="--bluestore_cache_autotune=false" # why needed?
+
for osd in $(seq 0 $(expr $OSDS - 1))
do
- run_osd $dir $osd --osd_scrub_sleep=0 \
- --osd_scrub_extended_sleep=20 \
- --bluestore_cache_autotune=false \
- --osd_deep_scrub_randomize_ratio=0.0 \
- --osd_scrub_interval_randomize_ratio=0 \
- --osd_scrub_begin_week_day=$DAY_START \
- --osd_scrub_end_week_day=$DAY_END \
- || return 1
+ run_osd $dir $osd $ceph_osd_args || return 1
done
# Create a pool with a single pg
--osd_scrub_interval_randomize_ratio=0 \
--osd_scrub_backoff_ratio=0.0 \
--osd_op_queue=wpq \
+ --osd_stats_update_period_not_scrubbing=3 \
+ --osd_stats_update_period_scrubbing=2 \
--osd_scrub_sleep=0.2"
for osd in $(seq 0 $(expr $OSDS - 1))
, m_resource_bookkeeper{[this](std::string msg) { log_fwd(msg); }, conf}
, m_queue{cct, m_osd_svc}
, m_log_prefix{fmt::format("osd.{}: osd-scrub:", m_osd_svc.get_nodeid())}
+ , m_load_tracker{cct, conf, m_osd_svc.get_nodeid()}
{}
std::ostream& OsdScrub::gen_prefix(std::ostream& out, std::string_view fn) const
// if there is a PG that is just now trying to reserve scrub replica resources -
// we should wait and not initiate a new scrub
- if (is_reserving_now()) {
+ if (m_queue.is_reserving_now()) {
dout(10) << "scrub resources reservation in progress" << dendl;
return;
}
- Scrub::OSDRestrictions env_conditions;
+ utime_t scrub_time = ceph_clock_now();
+ dout(10) << fmt::format(
+ "{}: time now:{}, recover is active?:{}", __func__,
+ scrub_time, is_recovery_active)
+ << dendl;
- if (is_recovery_active && !conf->osd_scrub_during_recovery) {
- if (!conf->osd_repair_during_recovery) {
- dout(15) << "not scheduling scrubs due to active recovery" << dendl;
- return;
- }
- dout(10) << "will only schedule explicitly requested repair due to active "
- "recovery"
- << dendl;
- env_conditions.allow_requested_repair_only = true;
+ // check the OSD-wide environment conditions (scrub resources, time, etc.).
+ // These may restrict the type of scrubs we are allowed to start, or just
+ // prevent us from starting any scrub at all.
+ auto env_restrictions =
+ restrictions_on_scrubbing(is_recovery_active, scrub_time);
+ if (!env_restrictions) {
+ return;
}
if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
}
}
- auto was_started = select_pg_and_scrub(env_conditions);
+ auto was_started = select_pg_and_scrub(*env_restrictions);
dout(20) << fmt::format(
"scrub scheduling done ({})",
ScrubQueue::attempt_res_text(was_started))
<< dendl;
}
+
+std::optional<Scrub::OSDRestrictions> OsdScrub::restrictions_on_scrubbing(
+ bool is_recovery_active,
+ utime_t scrub_clock_now) const
+{
+ // our local OSD may already be running too many scrubs
+ if (!m_resource_bookkeeper.can_inc_scrubs()) {
+ dout(10) << "OSD cannot inc scrubs" << dendl;
+ return std::nullopt;
+ }
+
+ // if there is a PG that is just now trying to reserve scrub replica resources
+ // - we should wait and not initiate a new scrub
+ if (m_queue.is_reserving_now()) {
+ dout(10) << "scrub resources reservation in progress" << dendl;
+ return std::nullopt;
+ }
+
+ Scrub::OSDRestrictions env_conditions;
+ env_conditions.time_permit = scrub_time_permit(scrub_clock_now);
+ env_conditions.load_is_low = m_load_tracker.scrub_load_below_threshold();
+ env_conditions.only_deadlined =
+ !env_conditions.time_permit || !env_conditions.load_is_low;
+
+ if (is_recovery_active && !conf->osd_scrub_during_recovery) {
+ if (!conf->osd_repair_during_recovery) {
+ dout(15) << "not scheduling scrubs due to active recovery" << dendl;
+ return std::nullopt;
+ }
+
+ dout(10) << "will only schedule explicitly requested repair due to active "
+ "recovery"
+ << dendl;
+ env_conditions.allow_requested_repair_only = true;
+ }
+
+ return env_conditions;
+}
+
+
// ////////////////////////////////////////////////////////////////////////// //
// scrub initiation - OSD code temporarily moved here from OSD.cc
* - try that one. If not suitable, discard from 'to_scrub_copy'
*/
Scrub::schedule_result_t ScrubQueue::select_pg_and_scrub(
- Scrub::OSDRestrictions& preconds)
+ Scrub::OSDRestrictions preconds)
{
dout(10) << " reg./pen. sizes: " << to_scrub.size() << " / "
<< penalized.size() << dendl;
utime_t now_is = time_now();
- preconds.time_permit = scrub_time_permit(now_is);
- preconds.load_is_low = scrub_load_below_threshold();
- preconds.only_deadlined = !preconds.time_permit || !preconds.load_is_low;
-
// create a list of candidates (copying, as otherwise we risk a deadlock):
// - possibly restore penalized
// - (if we didn't handle directly) remove invalid jobs
// not holding jobs_lock. 'group' is a copy of the actual list.
Scrub::schedule_result_t ScrubQueue::select_from_group(
Scrub::ScrubQContainer& group,
- const Scrub::OSDRestrictions& preconds,
+ Scrub::OSDRestrictions preconds,
utime_t now_is)
{
dout(15) << "jobs #: " << group.size() << dendl;
// ////////////////////////////////////////////////////////////////////////// //
// CPU load tracking and related
+OsdScrub::LoadTracker::LoadTracker(
+ CephContext* cct,
+ const ceph::common::ConfigProxy& config,
+ int node_id)
+ : cct{cct}
+ , conf{config}
+ , log_prefix{fmt::format("osd.{} scrub-queue::load-tracker::", node_id)}
+{
+ // initialize the daily loadavg with the current 15-minute loadavg (loadavgs[2])
+ if (double loadavgs[3]; getloadavg(loadavgs, 3) == 3) {
+ daily_loadavg = loadavgs[2];
+ } else {
+ derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
+ daily_loadavg = 1.0;
+ }
+}
/// \todo replace with Knuth's incremental-mean algorithm, to reduce the
/// numerical error (see the sketch after this function)
-std::optional<double> ScrubQueue::update_load_average()
+std::optional<double> OsdScrub::LoadTracker::update_load_average()
{
- int hb_interval = conf()->osd_heartbeat_interval;
+ int hb_interval = conf->osd_heartbeat_interval;
int n_samples = std::chrono::duration_cast<seconds>(24h).count();
if (hb_interval > 1) {
n_samples = std::max(n_samples / hb_interval, 1);
}
- // get CPU load avg
double loadavg;
if (getloadavg(&loadavg, 1) == 1) {
daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavg) / n_samples;
return 100 * loadavg;
}
- return std::nullopt;
+ return std::nullopt; // getloadavg() failed
}
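The \todo above refers to Knuth's incremental-mean update (TAOCP vol. 2, also
known as Welford's method). It is algebraically identical to the averaging line
above, but avoids forming the large intermediate product where the
floating-point error accumulates. A minimal sketch of the replacement (not part
of this patch):

  // Knuth-style incremental mean: algebraically equal to
  //   daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavg) / n_samples;
  // but without the large intermediate product
  daily_loadavg += (loadavg - daily_loadavg) / n_samples;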
-bool ScrubQueue::scrub_load_below_threshold() const
+bool OsdScrub::LoadTracker::scrub_load_below_threshold() const
{
double loadavgs[3];
if (getloadavg(loadavgs, 3) != 3) {
// allow scrub if below configured threshold
long cpus = sysconf(_SC_NPROCESSORS_ONLN);
double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
- if (loadavg_per_cpu < conf()->osd_scrub_load_threshold) {
+ if (loadavg_per_cpu < conf->osd_scrub_load_threshold) {
dout(20) << fmt::format(
"loadavg per cpu {:.3f} < max {:.3f} = yes",
- loadavg_per_cpu, conf()->osd_scrub_load_threshold)
+ loadavg_per_cpu, conf->osd_scrub_load_threshold)
<< dendl;
return true;
}
dout(10) << fmt::format(
"loadavg {:.3f} >= max {:.3f} and ( >= daily_loadavg {:.3f} "
"or >= 15m avg {:.3f} ) = no",
- loadavgs[0], conf()->osd_scrub_load_threshold, daily_loadavg,
+ loadavgs[0], conf->osd_scrub_load_threshold, daily_loadavg,
loadavgs[2])
<< dendl;
return false;
}
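A worked example of the per-CPU gate (hypothetical numbers; 0.5 is the default
osd_scrub_load_threshold): with 8 online CPUs and a 1-minute loadavg of 3.2,
loadavg_per_cpu = 3.2 / 8 = 0.4 < 0.5, so scrubbing is allowed immediately. The
same loadavg on a 4-CPU node gives 0.8, and the decision falls through to the
comparison against the daily and 15-minute averages (elided in this excerpt).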
+std::ostream& OsdScrub::LoadTracker::gen_prefix(
+ std::ostream& out,
+ std::string_view fn) const
+{
+ return out << log_prefix << fn << ": ";
+}
std::optional<double> OsdScrub::update_load_average()
{
- return m_queue.update_load_average();
+ return m_load_tracker.update_load_average();
}
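For context: OsdScrub::update_load_average() keeps the contract documented on
the (now removed) ScrubQueue method below, i.e. it is called every heartbeat
and returns a value for the OSD logger. A sketch of the assumed heartbeat-side
usage, where 'm_osd_scrub' and the perf-counter update are illustrative only:

  // assumed caller (OSD heartbeat); not part of this patch
  if (auto load = m_osd_scrub.update_load_average(); load) {
    logger->set(l_osd_loadavg, static_cast<uint64_t>(*load));
  }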
-
-
// ////////////////////////////////////////////////////////////////////////// //
// checks for half-closed ranges. Modify the (p < till) to '<=' to check for
}
Scrub::schedule_result_t OsdScrub::select_pg_and_scrub(
- Scrub::OSDRestrictions& preconds)
+ Scrub::OSDRestrictions preconds)
{
return m_queue.select_pg_and_scrub(preconds);
}
Scrub::ScrubSchedListener& m_osd_svc;
const ceph::common::ConfigProxy& conf;
+ /**
+ * check the OSD-wide environment conditions (scrub resources, time, etc.).
+ * These may restrict the type of scrubs we are allowed to start, or just
+ * prevent us from starting any scrub at all.
+ *
+ * Specifically:
+ * a nullopt is returned if we are not allowed to scrub at all, for any of
+ * the following reasons: no local resources (too many scrubs on this OSD);
+ * a dice roll says we will not scrub in this tick;
+ * a recovery is in progress, and we are not allowed to scrub during
+ * recovery; or a PG is trying to acquire replica resources.
+ *
+ * If we are allowed to scrub, the returned value specifies whether only
+ * high-priority scrubs, or only overdue ones, are allowed to proceed.
+ * (A usage sketch follows the class declaration below.)
+ */
+ std::optional<Scrub::OSDRestrictions> restrictions_on_scrubbing(
+ bool is_recovery_active,
+ utime_t scrub_clock_now) const;
+
/// resource reservation management
Scrub::ScrubResources m_resource_bookkeeper;
ScrubQueue m_queue;
public:
- // for this transitory commit only - to be removed
- bool can_inc_scrubs() { return m_resource_bookkeeper.can_inc_scrubs(); }
-
// for this transitory commit only - to be removed
Scrub::schedule_result_t select_pg_and_scrub(
- Scrub::OSDRestrictions& preconds);
+ Scrub::OSDRestrictions preconds);
// for this transitory commit only - to be moved elsewhere
/**
* \returns true with probability of osd_scrub_backoff_ratio.
*/
bool scrub_random_backoff() const;
+
+ /**
+ * Tracks the average CPU load. Used both by the OSD logger and by the
+ * scrub queue (no scrubbing is allowed when the load is too high).
+ */
+ class LoadTracker {
+ CephContext* cct;
+ const ceph::common::ConfigProxy& conf;
+ const std::string log_prefix;
+ double daily_loadavg{0.0};
+
+ public:
+ explicit LoadTracker(
+ CephContext* cct,
+ const ceph::common::ConfigProxy& config,
+ int node_id);
+
+ std::optional<double> update_load_average();
+
+ [[nodiscard]] bool scrub_load_below_threshold() const;
+
+ std::ostream& gen_prefix(std::ostream& out, std::string_view fn) const;
+ };
+ LoadTracker m_load_tracker;
};
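Taken together, the expected call pattern of the scheduling tick looks as
follows; a sketch assembled from the fragments shown earlier. The enclosing
function name (initiate_scrub) and the scrub_random_backoff() gating are
assumptions, not spelled out in this excerpt:

  void OsdScrub::initiate_scrub(bool is_recovery_active)
  {
    if (scrub_random_backoff()) {
      return;  // the dice roll says: not in this tick
    }
    // nullopt here means no scrubbing is allowed at all right now
    auto env_restrictions =
        restrictions_on_scrubbing(is_recovery_active, ceph_clock_now());
    if (!env_restrictions) {
      return;
    }
    auto was_started = select_pg_and_scrub(*env_restrictions);
    dout(20) << fmt::format(
                    "scrub scheduling done ({})",
                    ScrubQueue::attempt_res_text(was_started))
             << dendl;
  }

Note that OSDRestrictions is now passed by value throughout: it is a small
snapshot of the environment, computed once per tick, and no callee is expected
to modify the caller's copy.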
*
* locking: locks jobs_lock
*/
- Scrub::schedule_result_t select_pg_and_scrub(Scrub::OSDRestrictions& preconds);
+ Scrub::schedule_result_t select_pg_and_scrub(Scrub::OSDRestrictions preconds);
/**
* Translate attempt_ values into readable text
void clear_reserving_now();
bool is_reserving_now() const;
-
- bool can_inc_scrubs() const;
- bool inc_scrubs_local();
- void dec_scrubs_local();
- bool inc_scrubs_remote();
- void dec_scrubs_remote();
- void dump_scrub_reservations(ceph::Formatter* f) const;
-
/// counting the number of PGs stuck while scrubbing, waiting for objects
void mark_pg_scrub_blocked(spg_t blocked_pg);
void clear_pg_scrub_blocked(spg_t blocked_pg);
*/
std::chrono::milliseconds scrub_sleep_time(bool must_scrub) const;
- /**
- * called every heartbeat to update the "daily" load average
- *
- * @returns a load value for the logger
- */
- [[nodiscard]] std::optional<double> update_load_average();
-
private:
CephContext* cct;
Scrub::ScrubSchedListener& osd_service;
Scrub::ScrubQContainer penalized; ///< those that failed to reserve remote resources
bool restore_penalized{false};
- double daily_loadavg{0.0};
-
static inline constexpr auto registered_job = [](const auto& jobref) -> bool {
return jobref->state == Scrub::qu_state_t::registered;
};
std::atomic_bool a_pg_is_reserving{false};
- [[nodiscard]] bool scrub_load_below_threshold() const;
[[nodiscard]] bool scrub_time_permit(utime_t now) const;
/**
Scrub::schedule_result_t select_from_group(
Scrub::ScrubQContainer& group,
- const Scrub::OSDRestrictions& preconds,
+ Scrub::OSDRestrictions preconds,
utime_t now_is);
protected: // used by the unit-tests