From 0af4fc42615b452b91a16233802916e99d417c79 Mon Sep 17 00:00:00 2001
From: Ronen Friedman
Date: Mon, 18 Sep 2023 06:00:34 -0500
Subject: [PATCH] osd/scrub: extract restrictions_on_scrubbing() from
 ScrubQueue::select_pg_and_scrub()

Clearing the path for moving some ScrubQueue methods into OsdScrub.
Starting here with the CPU load tracker.

Signed-off-by: Ronen Friedman
---
 qa/standalone/scrub/osd-scrub-test.sh |  24 ++++--
 src/osd/scrubber/osd_scrub.cc         | 118 +++++++++++++++++++-------
 src/osd/scrubber/osd_scrub.h          |  49 ++++++++++-
 src/osd/scrubber/osd_scrub_sched.cc   |  10 +--
 src/osd/scrubber/osd_scrub_sched.h    |  22 +----
 5 files changed, 151 insertions(+), 72 deletions(-)

diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh
index c88e0bec1e046..354bd22880e36 100755
--- a/qa/standalone/scrub/osd-scrub-test.sh
+++ b/qa/standalone/scrub/osd-scrub-test.sh
@@ -49,9 +49,12 @@ function TEST_scrub_test() {
     run_mon $dir a --osd_pool_default_size=3 || return 1
     run_mgr $dir x || return 1
 
+    local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
+    ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
+    ceph_osd_args+="--osd_stats_update_period_scrubbing=2"
     for osd in $(seq 0 $(expr $OSDS - 1))
    do
-      run_osd $dir $osd || return 1
+      run_osd $dir $osd $ceph_osd_args || return 1
    done
 
    # Create a pool with a single pg
@@ -211,16 +214,17 @@ function TEST_scrub_extended_sleep() {
     run_mon $dir a --osd_pool_default_size=3 || return 1
     run_mgr $dir x || return 1
+
+    local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
+    ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
+    ceph_osd_args+="--osd_stats_update_period_scrubbing=2 --osd_scrub_sleep=0 "
+    ceph_osd_args+="--osd_scrub_extended_sleep=20 --osd_scrub_begin_week_day=$DAY_START "
+    ceph_osd_args+="--osd_op_queue=wpq --osd_scrub_end_week_day=$DAY_END "
+    ceph_osd_args+="--bluestore_cache_autotune=false" # why needed?
+
     for osd in $(seq 0 $(expr $OSDS - 1))
    do
-      run_osd $dir $osd --osd_scrub_sleep=0 \
-              --osd_scrub_extended_sleep=20 \
-              --bluestore_cache_autotune=false \
-              --osd_deep_scrub_randomize_ratio=0.0 \
-              --osd_scrub_interval_randomize_ratio=0 \
-              --osd_scrub_begin_week_day=$DAY_START \
-              --osd_scrub_end_week_day=$DAY_END \
-              || return 1
+      run_osd $dir $osd $ceph_osd_args || return 1
    done
 
    # Create a pool with a single pg
@@ -527,6 +531,8 @@ function TEST_dump_scrub_schedule() {
         --osd_scrub_interval_randomize_ratio=0 \
         --osd_scrub_backoff_ratio=0.0 \
         --osd_op_queue=wpq \
+        --osd_stats_update_period_not_scrubbing=3 \
+        --osd_stats_update_period_scrubbing=2 \
         --osd_scrub_sleep=0.2"
 
     for osd in $(seq 0 $(expr $OSDS - 1))
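
Note, ahead of the osd_scrub.cc hunks below: the per-tick environment checks move
into a single helper returning std::optional<Scrub::OSDRestrictions>, where
nullopt means "start no scrub on this tick". A condensed, self-contained model of
that gating logic, with plain bools standing in for the Ceph config and clock
types (the names here are illustrative, not the actual interface):

#include <optional>

struct Restrictions {  // stand-in for Scrub::OSDRestrictions
  bool time_permit{false};
  bool load_is_low{false};
  bool only_deadlined{false};
  bool allow_requested_repair_only{false};
};

// nullopt means "do not start any scrub on this tick"
std::optional<Restrictions> gate_scrub(
    bool can_inc_scrubs, bool reserving_now, bool time_permit,
    bool load_is_low, bool recovery_active,
    bool scrub_during_recovery, bool repair_during_recovery)
{
  if (!can_inc_scrubs || reserving_now) {
    return std::nullopt;  // OSD-wide scrub resources are not available
  }
  Restrictions r;
  r.time_permit = time_permit;
  r.load_is_low = load_is_low;
  // outside the allowed hours, or under high CPU load: only jobs already
  // past their deadline may proceed
  r.only_deadlined = !time_permit || !load_is_low;
  if (recovery_active && !scrub_during_recovery) {
    if (!repair_during_recovery) {
      return std::nullopt;  // no scrubbing at all during recovery
    }
    r.allow_requested_repair_only = true;
  }
  return r;
}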
diff --git a/src/osd/scrubber/osd_scrub.cc b/src/osd/scrubber/osd_scrub.cc
index 3c03b489eb0d6..5533f3bd6c3dd 100644
--- a/src/osd/scrubber/osd_scrub.cc
+++ b/src/osd/scrubber/osd_scrub.cc
@@ -35,6 +35,7 @@ OsdScrub::OsdScrub(
     , m_resource_bookkeeper{[this](std::string msg) { log_fwd(msg); }, conf}
     , m_queue{cct, m_osd_svc}
     , m_log_prefix{fmt::format("osd.{}: osd-scrub:", m_osd_svc.get_nodeid())}
+    , m_load_tracker{cct, conf, m_osd_svc.get_nodeid()}
 {}
 
 std::ostream& OsdScrub::gen_prefix(std::ostream& out, std::string_view fn) const
@@ -89,22 +90,24 @@ void OsdScrub::initiate_scrub(bool is_recovery_active)
 
   // if there is a PG that is just now trying to reserve scrub replica resources -
   // we should wait and not initiate a new scrub
-  if (is_reserving_now()) {
+  if (m_queue.is_reserving_now()) {
     dout(10) << "scrub resources reservation in progress" << dendl;
     return;
   }
 
-  Scrub::OSDRestrictions env_conditions;
+  utime_t scrub_time = ceph_clock_now();
+  dout(10) << fmt::format(
+                  "{}: time now:{}, recovery is active?:{}", __func__,
+                  scrub_time, is_recovery_active)
+           << dendl;
 
-  if (is_recovery_active && !conf->osd_scrub_during_recovery) {
-    if (!conf->osd_repair_during_recovery) {
-      dout(15) << "not scheduling scrubs due to active recovery" << dendl;
-      return;
-    }
-    dout(10) << "will only schedule explicitly requested repair due to active "
-                "recovery"
-             << dendl;
-    env_conditions.allow_requested_repair_only = true;
+  // check the OSD-wide environment conditions (scrub resources, time, etc.).
+  // These may restrict the type of scrubs we are allowed to start, or just
+  // prevent us from starting any scrub at all.
+  auto env_restrictions =
+      restrictions_on_scrubbing(is_recovery_active, scrub_time);
+  if (!env_restrictions) {
+    return;
   }
 
   if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
@@ -115,13 +118,53 @@
     }
   }
 
-  auto was_started = select_pg_and_scrub(env_conditions);
+  auto was_started = select_pg_and_scrub(*env_restrictions);
   dout(20) << fmt::format(
                   "scrub scheduling done ({})",
                   ScrubQueue::attempt_res_text(was_started))
            << dendl;
 }
 
+
+std::optional<Scrub::OSDRestrictions> OsdScrub::restrictions_on_scrubbing(
+    bool is_recovery_active,
+    utime_t scrub_clock_now) const
+{
+  // our local OSD may already be running too many scrubs
+  if (!m_resource_bookkeeper.can_inc_scrubs()) {
+    dout(10) << "OSD cannot inc scrubs" << dendl;
+    return std::nullopt;
+  }
+
+  // if there is a PG that is just now trying to reserve scrub replica resources
+  // - we should wait and not initiate a new scrub
+  if (m_queue.is_reserving_now()) {
+    dout(10) << "scrub resources reservation in progress" << dendl;
+    return std::nullopt;
+  }
+
+  Scrub::OSDRestrictions env_conditions;
+  env_conditions.time_permit = scrub_time_permit(scrub_clock_now);
+  env_conditions.load_is_low = m_load_tracker.scrub_load_below_threshold();
+  env_conditions.only_deadlined =
+      !env_conditions.time_permit || !env_conditions.load_is_low;
+
+  if (is_recovery_active && !conf->osd_scrub_during_recovery) {
+    if (!conf->osd_repair_during_recovery) {
+      dout(15) << "not scheduling scrubs due to active recovery" << dendl;
+      return std::nullopt;
+    }
+
+    dout(10) << "will only schedule explicitly requested repair due to active "
+                "recovery"
+             << dendl;
+    env_conditions.allow_requested_repair_only = true;
+  }
+
+  return env_conditions;
+}
+
+
 // //////////////////////////////////////////////////////////////////////////
 //
 // scrub initiation - OSD code temporarily moved here from OSD.cc
@@ -223,17 +266,13 @@ void ScrubQueue::dump_scrubs(ceph::Formatter* f) const
  *  - try that one. If not suitable, discard from 'to_scrub_copy'
  */
 Scrub::schedule_result_t ScrubQueue::select_pg_and_scrub(
-    Scrub::OSDRestrictions& preconds)
+    Scrub::OSDRestrictions preconds)
 {
   dout(10) << " reg./pen. sizes: " << to_scrub.size() << " / "
            << penalized.size() << dendl;
 
   utime_t now_is = time_now();
 
-  preconds.time_permit = scrub_time_permit(now_is);
-  preconds.load_is_low = scrub_load_below_threshold();
-  preconds.only_deadlined = !preconds.time_permit || !preconds.load_is_low;
-
   // create a list of candidates (copying, as otherwise creating a deadlock):
   // - possibly restore penalized
   // - (if we didn't handle directly) remove invalid jobs
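
Note: both select_pg_and_scrub() above and select_from_group() below switch
'preconds' from a const reference to a by-value parameter. Now that the caller
computes the load/time flags, the struct is a read-only bundle of booleans, so
copying it is at least as cheap as passing a reference. A sketch of the property
relied on (the struct shown is an illustrative mirror of Scrub::OSDRestrictions,
not the real definition):

#include <type_traits>

struct OSDRestrictions {  // illustrative mirror of Scrub::OSDRestrictions
  bool time_permit{false};
  bool load_is_low{false};
  bool only_deadlined{false};
  bool allow_requested_repair_only{false};
};

// trivially copyable and no larger than a pointer: passing by value is
// at least as cheap as indirecting through a reference
static_assert(std::is_trivially_copyable_v<OSDRestrictions>);
static_assert(sizeof(OSDRestrictions) <= sizeof(void*));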
@@ -283,7 +322,7 @@
 // not holding jobs_lock. 'group' is a copy of the actual list.
 Scrub::schedule_result_t ScrubQueue::select_from_group(
     Scrub::ScrubQContainer& group,
-    const Scrub::OSDRestrictions& preconds,
+    Scrub::OSDRestrictions preconds,
     utime_t now_is)
 {
   dout(15) << "jobs #: " << group.size() << dendl;
@@ -347,27 +386,42 @@ Scrub::schedule_result_t ScrubQueue::select_from_group(
 // //////////////////////////////////////////////////////////////////////////
 //
 // CPU load tracking and related
 
+OsdScrub::LoadTracker::LoadTracker(
+    CephContext* cct,
+    const ceph::common::ConfigProxy& config,
+    int node_id)
+    : cct{cct}
+    , conf{config}
+    , log_prefix{fmt::format("osd.{} scrub-queue::load-tracker::", node_id)}
+{
+  // initialize the daily loadavg with current 15min loadavg
+  if (double loadavgs[3]; getloadavg(loadavgs, 3) == 3) {
+    daily_loadavg = loadavgs[2];
+  } else {
+    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
+    daily_loadavg = 1.0;
+  }
+}
 
 ///\todo replace with Knuth's algo (to reduce the numerical error)
-std::optional<double> ScrubQueue::update_load_average()
+std::optional<double> OsdScrub::LoadTracker::update_load_average()
 {
-  int hb_interval = conf()->osd_heartbeat_interval;
+  int hb_interval = conf->osd_heartbeat_interval;
   int n_samples = std::chrono::duration_cast<seconds>(24h).count();
   if (hb_interval > 1) {
     n_samples = std::max(n_samples / hb_interval, 1);
   }
 
-  // get CPU load avg
   double loadavg;
   if (getloadavg(&loadavg, 1) == 1) {
     daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavg) / n_samples;
     return 100 * loadavg;
   }
 
-  return std::nullopt;
+  return std::nullopt;  // getloadavg() failed
 }
 
-bool ScrubQueue::scrub_load_below_threshold() const
+bool OsdScrub::LoadTracker::scrub_load_below_threshold() const
 {
   double loadavgs[3];
   if (getloadavg(loadavgs, 3) != 3) {
@@ -378,10 +432,10 @@ bool ScrubQueue::scrub_load_below_threshold() const
   // allow scrub if below configured threshold
   long cpus = sysconf(_SC_NPROCESSORS_ONLN);
   double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
-  if (loadavg_per_cpu < conf()->osd_scrub_load_threshold) {
+  if (loadavg_per_cpu < conf->osd_scrub_load_threshold) {
     dout(20) << fmt::format(
                     "loadavg per cpu {:.3f} < max {:.3f} = yes",
-                    loadavg_per_cpu, conf()->osd_scrub_load_threshold)
+                    loadavg_per_cpu, conf->osd_scrub_load_threshold)
              << dendl;
     return true;
   }
@@ -399,20 +453,24 @@ bool ScrubQueue::scrub_load_below_threshold() const
   dout(10) << fmt::format(
                   "loadavg {:.3f} >= max {:.3f} and ( >= daily_loadavg {:.3f} "
                   "or >= 15m avg {:.3f} ) = no",
-                  loadavgs[0], conf()->osd_scrub_load_threshold, daily_loadavg,
+                  loadavgs[0], conf->osd_scrub_load_threshold, daily_loadavg,
                   loadavgs[2])
            << dendl;
   return false;
 }
 
+std::ostream& OsdScrub::LoadTracker::gen_prefix(
+    std::ostream& out,
+    std::string_view fn) const
+{
+  return out << log_prefix << fn << ": ";
+}
 
 std::optional<double> OsdScrub::update_load_average()
 {
-  return m_queue.update_load_average();
+  return m_load_tracker.update_load_average();
 }
 
-
-
 // //////////////////////////////////////////////////////////////////////////
 //
 // checks for half-closed ranges. Modify the (p < endpoint) check to
 // (p <= endpoint) to cover closed ranges.
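
Note on the \todo above: it refers to Knuth's incremental running-mean
recurrence (TAOCP vol. 2; the same idea underlies Welford's algorithm), which
avoids the rounding error that accumulates in the
(daily_loadavg * (n - 1) + sample) / n update. A minimal sketch of that building
block; adapting it to the fixed 24-hour sample window remains the open part of
the todo:

#include <cstdint>

class RunningMean {
  double mean{0.0};
  std::uint64_t n{0};

 public:
  // Knuth's recurrence: m_k = m_{k-1} + (x_k - m_{k-1}) / k
  void add_sample(double x) {
    ++n;
    mean += (x - mean) / static_cast<double>(n);
  }
  double value() const { return mean; }
};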
diff --git a/src/osd/scrubber/osd_scrub_sched.h b/src/osd/scrubber/osd_scrub_sched.h
--- a/src/osd/scrubber/osd_scrub_sched.h
+++ b/src/osd/scrubber/osd_scrub_sched.h
@@ ... @@ class ScrubQueue {
-  std::optional<double> update_load_average();
-
  private:
   CephContext* cct;
   Scrub::ScrubSchedListener& osd_service;
@@ -334,8 +319,6 @@ class ScrubQueue {
   Scrub::ScrubQContainer penalized;  ///< those that failed to reserve remote resources
   bool restore_penalized{false};
 
-  double daily_loadavg{0.0};
-
   static inline constexpr auto registered_job = [](const auto& jobref) -> bool {
     return jobref->state == Scrub::qu_state_t::registered;
   };
@@ -382,7 +365,6 @@ class ScrubQueue {
 
   std::atomic_bool a_pg_is_reserving{false};
 
-  [[nodiscard]] bool scrub_load_below_threshold() const;
   [[nodiscard]] bool scrub_time_permit(utime_t now) const;
 
   /**
@@ -408,7 +390,7 @@ class ScrubQueue {
 
   Scrub::schedule_result_t select_from_group(
       Scrub::ScrubQContainer& group,
-      const Scrub::OSDRestrictions& preconds,
+      Scrub::OSDRestrictions preconds,
       utime_t now_is);
 
  protected:  // used by the unit-tests
-- 
2.39.5
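
For reference, the threshold comparison performed by scrub_load_below_threshold()
can be exercised standalone. The sketch below uses the same getloadavg() and
sysconf() calls as the patched code; the threshold value is a hard-coded stand-in
for the osd_scrub_load_threshold option:

#include <cstdio>
#include <cstdlib>   // getloadavg() (BSD/glibc)
#include <unistd.h>  // sysconf()

int main() {
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) != 3) {
    std::fprintf(stderr, "couldn't read loadavgs\n");
    return 1;
  }
  long cpus = sysconf(_SC_NPROCESSORS_ONLN);
  double per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
  double threshold = 0.5;  // stand-in for osd_scrub_load_threshold
  std::printf("1min loadavg/cpu %.3f vs threshold %.3f -> scrub %s\n",
              per_cpu, threshold,
              per_cpu < threshold ? "allowed" : "deferred");
  return 0;
}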