]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/scrub: expose h.p. scrub jobs in the queue
author Ronen Friedman <rfriedma@redhat.com>
Thu, 7 Dec 2023 09:37:57 +0000 (03:37 -0600)
committer Ronen Friedman <rfriedma@redhat.com>
Mon, 11 Dec 2023 07:26:02 +0000 (01:26 -0600)
- a high-priority bit is added to the scrub job
- a scrub scheduling attempt will be performed even if only
  high-priority jobs should be allowed to run

Note: some of the changes in this PR are temporary, in the sense that
they would be made obsolete when the scrub scheduler is refactored.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
src/osd/scrubber/osd_scrub.cc
src/osd/scrubber/osd_scrub.h
src/osd/scrubber/osd_scrub_sched.cc
src/osd/scrubber/scrub_job.h
src/osd/scrubber_common.h

index 99367170dbac3e0ccd35f3b8da79a9a85d907a56..536c4479b1d30ebd3aca8707257e034f9b9c519f 100644 (file)
@@ -69,10 +69,11 @@ bool OsdScrub::scrub_random_backoff() const
 
 void OsdScrub::initiate_scrub(bool is_recovery_active)
 {
-  if (scrub_random_backoff()) {
-    // dice-roll says we should not scrub now
-    return;
-  }
+  const utime_t scrub_time = ceph_clock_now();
+  dout(10) << fmt::format(
+                 "time now:{}, recover is active?:{}", scrub_time,
+                 is_recovery_active)
+          << dendl;
 
   if (auto blocked_pgs = get_blocked_pgs_count(); blocked_pgs > 0) {
     // some PGs managed by this OSD were blocked by a locked object during
@@ -84,35 +85,14 @@ void OsdScrub::initiate_scrub(bool is_recovery_active)
        << dendl;
   }
 
-  // fail fast if no resources are available
-  if (!m_resource_bookkeeper.can_inc_scrubs()) {
-    dout(20) << "too many scrubs already running on this OSD" << dendl;
-    return;
-  }
-
-  // if there is a PG that is just now trying to reserve scrub replica resources -
-  // we should wait and not initiate a new scrub
-  if (m_queue.is_reserving_now()) {
-    dout(10) << "scrub resources reservation in progress" << dendl;
-    return;
-  }
-
-  utime_t scrub_time = ceph_clock_now();
-  dout(10) << fmt::format(
-                 "time now:{}, recover is active?:{}", scrub_time,
-                 is_recovery_active)
-          << dendl;
-
   // check the OSD-wide environment conditions (scrub resources, time, etc.).
   // These may restrict the type of scrubs we are allowed to start, or just
-  // prevent us from starting any scrub at all.
+  // prevent us from starting any non-operator-initiated scrub at all.
   auto env_restrictions =
       restrictions_on_scrubbing(is_recovery_active, scrub_time);
-  if (!env_restrictions) {
-    return;
-  }
 
-  if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+  if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>() &&
+      !env_restrictions.high_priority_only) {
     dout(20) << "scrub scheduling (@tick) starts" << dendl;
     auto all_jobs = m_queue.list_registered_jobs();
     for (const auto& sj : all_jobs) {
@@ -124,7 +104,7 @@ void OsdScrub::initiate_scrub(bool is_recovery_active)
   // queue interface used here: we ask for a list of
   // eligible targets (based on the known restrictions).
   // We try all elements of this list until a (possibly temporary) success.
-  auto candidates = m_queue.ready_to_scrub(*env_restrictions, scrub_time);
+  auto candidates = m_queue.ready_to_scrub(env_restrictions, scrub_time);
   if (candidates.empty()) {
     dout(20) << "no PGs are ready for scrubbing" << dendl;
     return;
@@ -137,7 +117,7 @@ void OsdScrub::initiate_scrub(bool is_recovery_active)
     // scrub. For some failures - we can continue with the next candidate. For
     // others - we should stop trying to scrub at this tick.
     auto res = initiate_a_scrub(
-       candidate, env_restrictions->allow_requested_repair_only);
+       candidate, env_restrictions.allow_requested_repair_only);
 
     if (res == schedule_result_t::target_specific_failure) {
       // continue with the next job.
@@ -157,39 +137,51 @@ void OsdScrub::initiate_scrub(bool is_recovery_active)
 }
 
 
-std::optional<Scrub::OSDRestrictions> OsdScrub::restrictions_on_scrubbing(
+Scrub::OSDRestrictions OsdScrub::restrictions_on_scrubbing(
     bool is_recovery_active,
     utime_t scrub_clock_now) const
 {
-  // our local OSD may already be running too many scrubs
+  Scrub::OSDRestrictions env_conditions;
+
+  // some environmental conditions prevent all but high priority scrubs
+
   if (!m_resource_bookkeeper.can_inc_scrubs()) {
-    dout(10) << "OSD cannot inc scrubs" << dendl;
-    return std::nullopt;
-  }
+    // our local OSD is already running too many scrubs
+    dout(15) << "OSD cannot inc scrubs" << dendl;
+    env_conditions.high_priority_only = true;
 
-  // if there is a PG that is just now trying to reserve scrub replica resources
-  // - we should wait and not initiate a new scrub
-  if (m_queue.is_reserving_now()) {
+  } else if (scrub_random_backoff()) {
+    // dice-roll says we should not scrub now
+      dout(15) << "Lost in dice. Only high priority scrubs allowed."
+              << dendl;
+      env_conditions.high_priority_only = true;
+
+  } else if (m_queue.is_reserving_now()) {
+    // if there is a PG that is just now trying to reserve scrub replica
+    // resources - we should wait and not initiate a new scrub
     dout(10) << "scrub resources reservation in progress" << dendl;
-    return std::nullopt;
-  }
+    env_conditions.high_priority_only = true;
 
-  Scrub::OSDRestrictions env_conditions;
-  env_conditions.time_permit = scrub_time_permit(scrub_clock_now);
-  env_conditions.load_is_low = m_load_tracker.scrub_load_below_threshold();
-  env_conditions.only_deadlined =
-      !env_conditions.time_permit || !env_conditions.load_is_low;
-
-  if (is_recovery_active && !conf->osd_scrub_during_recovery) {
-    if (!conf->osd_repair_during_recovery) {
-      dout(15) << "not scheduling scrubs due to active recovery" << dendl;
-      return std::nullopt;
+  } else if (is_recovery_active && !conf->osd_scrub_during_recovery) {
+    if (conf->osd_repair_during_recovery) {
+      dout(15)
+         << "will only schedule explicitly requested repair due to active "
+            "recovery"
+         << dendl;
+      env_conditions.allow_requested_repair_only = true;
+
+    } else {
+      dout(15) << "recovery in progress. Only high priority scrubs allowed."
+              << dendl;
+      env_conditions.high_priority_only = true;
     }
+  } else {
 
-    dout(10) << "will only schedule explicitly requested repair due to active "
-               "recovery"
-            << dendl;
-    env_conditions.allow_requested_repair_only = true;
+    // regular, i.e. non-high-priority scrubs are allowed
+    env_conditions.time_permit = scrub_time_permit(scrub_clock_now);
+    env_conditions.load_is_low = m_load_tracker.scrub_load_below_threshold();
+    env_conditions.only_deadlined =
+       !env_conditions.time_permit || !env_conditions.load_is_low;
   }
 
   return env_conditions;
index 56167df2ee6e4f0fe2f87efc1103db9708f5951e..fcc4fd3fe9c52ed6d36d85e3000b33fe8a9a1b0d 100644 (file)
@@ -168,20 +168,17 @@ class OsdScrub {
 
   /**
    * check the OSD-wide environment conditions (scrub resources, time, etc.).
-   * These may restrict the type of scrubs we are allowed to start, or just
-   * prevent us from starting any scrub at all.
+   * These may restrict the type of scrubs we are allowed to start, maybe
+   * down to allowing only high-priority scrubs
    *
    * Specifically:
-   * a nullopt is returned if we are not allowed to scrub at all, for either of
+   * 'only high priority' flag is set for any of
    * the following reasons: no local resources (too many scrubs on this OSD);
    * a dice roll says we will not scrub in this tick;
    * a recovery is in progress, and we are not allowed to scrub while recovery;
    * a PG is trying to acquire replica resources.
-   *
-   * If we are allowed to scrub, the returned value specifies whether the only
-   * high priority scrubs or only overdue ones are allowed to go on.
    */
-  std::optional<Scrub::OSDRestrictions> restrictions_on_scrubbing(
+  Scrub::OSDRestrictions restrictions_on_scrubbing(
       bool is_recovery_active,
       utime_t scrub_clock_now) const;
 
index 324899f29ab89eca462f583927a600af15cf4237..1b3506a35e5021653d0ca2660043f05904403c44 100644 (file)
@@ -149,6 +149,7 @@ void ScrubQueue::update_job(Scrub::ScrubJobRef scrub_job,
   // adjust the suggested scrub time according to OSD-wide status
   auto adjusted = adjust_target_time(suggested);
   scrub_job->update_schedule(adjusted);
+  scrub_job->high_priority = suggested.is_must == must_scrub_t::mandatory;
 }
 
 sched_params_t ScrubQueue::determine_scrub_time(
@@ -299,11 +300,13 @@ void ScrubQueue::rm_unregistered_jobs(ScrubQContainer& group)
 }
 
 namespace {
-struct cmp_sched_time_t {
-  bool operator()(const Scrub::ScrubJobRef& lhs,
-                 const Scrub::ScrubJobRef& rhs) const
+struct cmp_time_n_priority_t {
+  bool operator()(const Scrub::ScrubJobRef& lhs, const Scrub::ScrubJobRef& rhs)
+      const
   {
-    return lhs->schedule.scheduled_at < rhs->schedule.scheduled_at;
+    return lhs->is_high_priority() > rhs->is_high_priority() ||
+          (lhs->is_high_priority() == rhs->is_high_priority() &&
+           lhs->schedule.scheduled_at < rhs->schedule.scheduled_at);
   }
 };
 }  // namespace
@@ -314,11 +317,11 @@ ScrubQContainer ScrubQueue::collect_ripe_jobs(
     OSDRestrictions restrictions,
     utime_t time_now)
 {
-  auto filtr = [time_now, restrictions](const auto& jobref) -> bool {
+  auto filtr = [time_now, rst = restrictions](const auto& jobref) -> bool {
     return jobref->schedule.scheduled_at <= time_now &&
-          (!restrictions.only_deadlined ||
-           (!jobref->schedule.deadline.is_zero() &&
-            jobref->schedule.deadline <= time_now));
+          (!rst.high_priority_only || jobref->high_priority) &&
+          (!rst.only_deadlined || (!jobref->schedule.deadline.is_zero() &&
+                                   jobref->schedule.deadline <= time_now));
   };
 
   rm_unregistered_jobs(group);
@@ -327,7 +330,7 @@ ScrubQContainer ScrubQueue::collect_ripe_jobs(
   ripes.reserve(group.size());
 
   std::copy_if(group.begin(), group.end(), std::back_inserter(ripes), filtr);
-  std::sort(ripes.begin(), ripes.end(), cmp_sched_time_t{});
+  std::sort(ripes.begin(), ripes.end(), cmp_time_n_priority_t{});
 
   if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
     for (const auto& jobref : group) {
index ae29c8ebab44da5e285558d0722ab33bed0b2311..11e7388f6362469ee51d3bb167841dd3be4e4422 100644 (file)
@@ -93,6 +93,8 @@ class ScrubJob final : public RefCountedObject {
 
   CephContext* cct;
 
+  bool high_priority{false};
+
   ScrubJob(CephContext* cct, const spg_t& pg, int node_id);
 
   utime_t get_sched_time() const { return schedule.scheduled_at; }
@@ -130,6 +132,12 @@ class ScrubJob final : public RefCountedObject {
    */
   bool is_state_registered() const { return state == qu_state_t::registered; }
 
+  /**
+   * is this a high priority scrub job?
+   * High priority - (usually) a scrub that was initiated by the operator
+   */
+  bool is_high_priority() const { return high_priority; }
+
   /**
    * a text description of the "scheduling intentions" of this PG:
    * are we already scheduled for a scrub/deep scrub? when?
index d24bb79b801e367f15ac461887ceee9fcc82e49e..f39907f3f6b3fb3ee1d5f1337836de05c38d9628 100644 (file)
@@ -49,12 +49,16 @@ enum class scrub_prio_t : bool { low_priority = false, high_priority = true };
 using act_token_t = uint32_t;
 
 /// "environment" preconditions affecting which PGs are eligible for scrubbing
+/// (note: struct size should be kept small, as it is copied around)
 struct OSDRestrictions {
+  /// high local OSD concurrency. Thus - only high priority scrubs are allowed
+  bool high_priority_only{false};
   bool allow_requested_repair_only{false};
-  bool load_is_low{true};
-  bool time_permit{true};
   bool only_deadlined{false};
+  bool load_is_low:1{true};
+  bool time_permit:1{true};
 };
+static_assert(sizeof(Scrub::OSDRestrictions) <= sizeof(uint32_t));
 
 }  // namespace Scrub
 
@@ -68,7 +72,8 @@ struct formatter<Scrub::OSDRestrictions> {
   {
     return fmt::format_to(
       ctx.out(),
-      "overdue-only:{} load:{} time:{} repair-only:{}",
+      "priority-only:{} overdue-only:{} load:{} time:{} repair-only:{}",
+        conds.high_priority_only,
         conds.only_deadlined,
         conds.load_is_low ? "ok" : "high",
         conds.time_permit ? "ok" : "no",