osd/scrub: modify schedule_result_t to report error class

author Ronen Friedman <rfriedma@redhat.com>

Thu, 21 Sep 2023 12:34:52 +0000 (07:34 -0500)

committer Ronen Friedman <rfriedma@redhat.com>

Thu, 21 Sep 2023 12:57:58 +0000 (07:57 -0500)
author Ronen Friedman <rfriedma@redhat.com>
Thu, 21 Sep 2023 12:34:52 +0000 (07:34 -0500)
committer Ronen Friedman <rfriedma@redhat.com>
Thu, 21 Sep 2023 12:57:58 +0000 (07:57 -0500)
diff --git a/src/osd/PG.cc b/src/osd/PG.cc

index 25a8ac7197845b1bc1559f102f7d926a5a08c7da..0d1f8d44e1c447cf57b8bfb0e9ffd19a985693ea 100644 (file)
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1348,11 +1348,14 @@ Scrub::schedule_result_t PG::sched_scrub()
    ceph_assert(m_scrubber);
  
    if (is_scrub_queued_or_active()) {
-    return schedule_result_t::already_started;
+     dout(10) << __func__ << ": already scrubbing" << dendl;
+     return schedule_result_t::target_specific_failure;
    }
  
    if (!is_primary() || !is_active() || !is_clean()) {
-    return schedule_result_t::bad_pg_state;
+    dout(10) << __func__ << ": cannot scrub (not a clean and active primary)"
+      << dendl;
+    return schedule_result_t::target_specific_failure;
    }
  
    if (state_test(PG_STATE_SNAPTRIM) || state_test(PG_STATE_SNAPTRIM_WAIT)) {
@@ -1360,7 +1363,7 @@ Scrub::schedule_result_t PG::sched_scrub()
      // (on the transition from NotTrimming to Trimming/WaitReservation),
      // i.e. some time before setting 'snaptrim'.
      dout(10) << __func__ << ": cannot scrub while snap-trimming" << dendl;
-    return schedule_result_t::bad_pg_state;
+    return schedule_result_t::target_specific_failure;
    }
  
    // analyse the combination of the requested scrub flags, the osd/pool configuration
@@ -1372,14 +1375,14 @@ Scrub::schedule_result_t PG::sched_scrub()
      // (due to configuration or priority issues)
      // The reason was already reported by the callee.
      dout(10) << __func__ << ": failed to initiate a scrub" << dendl;
-    return schedule_result_t::preconditions;
+    return schedule_result_t::target_specific_failure;
    }
  
    // try to reserve the local OSD resources. If failing: no harm. We will
    // be retried by the OSD later on.
    if (!m_scrubber->reserve_local()) {
      dout(10) << __func__ << ": failed to reserve locally" << dendl;
-    return schedule_result_t::no_local_resources;
+    return schedule_result_t::osd_wide_failure;
    }
  
    // can commit to the updated flags now, as nothing will stop the scrub
diff --git a/src/osd/scrubber/osd_scrub.cc b/src/osd/scrubber/osd_scrub.cc

index 1d1e9eb2fe88e7464cbc7bdacc1856089431e1be..8da75233ebb9cbcde804996945b4f46414aa2e53 100644 (file)
--- a/src/osd/scrubber/osd_scrub.cc
+++ b/src/osd/scrubber/osd_scrub.cc
@@ -125,60 +125,32 @@ void OsdScrub::initiate_scrub(bool is_recovery_active)
    // eligible targets (based on the known restrictions).
    // We try all elements of this list until a (possibly temporary) success.
    auto candidates = m_queue.ready_to_scrub(*env_restrictions, scrub_time);
-  auto res{schedule_result_t::none_ready};
+  if (candidates.empty()) {
+    dout(20) << "no PGs are ready for scrubbing" << dendl;
+    return;
+  }
+
    for (const auto& candidate : candidates) {
      dout(20) << fmt::format("initiating scrub on pg[{}]", candidate) << dendl;
  
      // we have a candidate to scrub. But we may fail when trying to initiate that
      // scrub. For some failures - we can continue with the next candidate. For
      // others - we should stop trying to scrub at this tick.
-    res = initiate_a_scrub(
+    auto res = initiate_a_scrub(
         candidate, env_restrictions->allow_requested_repair_only);
-    switch (res) {
-      case schedule_result_t::scrub_initiated:
-       // the happy path. We are done
-       dout(20) << fmt::format("scrub initiated for pg[{}]", candidate.pgid)
-                << dendl;
-       break;
-
-      case schedule_result_t::already_started:
-      case schedule_result_t::preconditions:
-      case schedule_result_t::bad_pg_state:
-       // continue with the next job
-       dout(20) << fmt::format(
-                       "pg[{}] failed (state/cond/started)", candidate.pgid)
-                << dendl;
-       break;
-
-      case schedule_result_t::no_such_pg:
-       // The pg is no longer there
-       dout(20) << fmt::format("pg[{}] failed (no PG)", candidate.pgid)
-                << dendl;
-       // \todo better handling of this case
-       break;
-
-      case schedule_result_t::no_local_resources:
-       // failure to secure local resources. No point in trying the other
-       // PGs at this time. Note that this is not the same as replica resources
-       // failure!
-       dout(20) << "failed (local resources)" << dendl;
-       break;
-
-      case schedule_result_t::none_ready:
-       // can't happen. Just for the compiler.
-       dout(5) << fmt::format(
-                      "failed!! (possible bug. pg[{}])", candidate.pgid)
-               << dendl;
-       break;
-    }
  
-    if (res == schedule_result_t::no_local_resources) {
+    if (res == schedule_result_t::target_specific_failure) {
+      // continue with the next job.
+      // \todo: consider separate handling of "no such PG", as - later on -
+      // we should be removing both related targets.
+      continue;
+    } else if (res == schedule_result_t::osd_wide_failure) {
+      // no point in trying the other candidates at this time
        break;
-    }
-
-    if (res == schedule_result_t::scrub_initiated) {
-      // note: in the full implementation: we need to dequeue the target
-      // at this time
+    } else {
+      // the happy path. We are done
+      dout(20) << fmt::format("scrub initiated for pg[{}]", candidate.pgid)
+               << dendl;
        break;
      }
    }
@@ -238,13 +210,13 @@ Scrub::schedule_result_t OsdScrub::initiate_a_scrub(
      // the PG was dequeued in the short timespan between creating the
      // candidates list (ready_to_scrub()) and here
      dout(5) << fmt::format("pg[{}] not found", pgid) << dendl;
-    return Scrub::schedule_result_t::no_such_pg;
+    return Scrub::schedule_result_t::target_specific_failure;
    }
  
    // This one is already scrubbing, so go on to the next scrub job
    if (locked_pg->pg()->is_scrub_queued_or_active()) {
      dout(10) << fmt::format("pg[{}]: scrub already in progress", pgid) << dendl;
-    return Scrub::schedule_result_t::already_started;
+    return Scrub::schedule_result_t::target_specific_failure;
    }
    // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
    if (allow_requested_repair_only &&
@@ -254,7 +226,7 @@ Scrub::schedule_result_t OsdScrub::initiate_a_scrub(
                     "requested for that pg",
                     pgid)
              << dendl;
-    return Scrub::schedule_result_t::preconditions;
+    return Scrub::schedule_result_t::target_specific_failure;
    }
  
    return locked_pg->pg()->sched_scrub();
diff --git a/src/osd/scrubber/osd_scrub_sched.cc b/src/osd/scrubber/osd_scrub_sched.cc

index 5378864f120f35d1135c90b9e56a62d5ac95e4c1..691461860148140558da39dadb8a9ebb2e46ab2d 100644 (file)
--- a/src/osd/scrubber/osd_scrub_sched.cc
+++ b/src/osd/scrubber/osd_scrub_sched.cc
@@ -226,29 +226,6 @@ void ScrubQueue::move_failed_pgs(utime_t now_is)
    }
  }
  
-// clang-format off
-/*
- * Implementation note:
- * Clang (10 & 11) produces here efficient table-based code, comparable to using
- * a direct index into an array of strings.
- * Gcc (11, trunk) is almost as efficient.
- */
-std::string_view ScrubQueue::attempt_res_text(Scrub::schedule_result_t v)
-{
-  switch (v) {
-    case Scrub::schedule_result_t::scrub_initiated: return "scrubbing"sv;
-    case Scrub::schedule_result_t::none_ready: return "no ready job"sv;
-    case Scrub::schedule_result_t::no_local_resources: return "local resources shortage"sv;
-    case Scrub::schedule_result_t::already_started: return "denied as already started"sv;
-    case Scrub::schedule_result_t::no_such_pg: return "pg not found"sv;
-    case Scrub::schedule_result_t::bad_pg_state: return "prevented by pg state"sv;
-    case Scrub::schedule_result_t::preconditions: return "preconditions not met"sv;
-  }
-  // g++ (unlike CLANG), requires an extra 'return' here
-  return "(unknown)"sv;
-}
-// clang-format on
-
  std::vector<ScrubTargetId> ScrubQueue::ready_to_scrub(
      OSDRestrictions restrictions,  // note: 4B in size! (copy)
      utime_t scrub_tick)
diff --git a/src/osd/scrubber/osd_scrub_sched.h b/src/osd/scrubber/osd_scrub_sched.h

index a2a19f084f1793bc4c363d19fde41faa04bb5a4b..9e222718c509a11992109540fe83267938bdf51a 100644 (file)
--- a/src/osd/scrubber/osd_scrub_sched.h
+++ b/src/osd/scrubber/osd_scrub_sched.h
@@ -117,15 +117,11 @@ namespace Scrub {
  
  using namespace ::std::literals;
  
-// possible outcome when trying to select a PG and scrub it
+/// possible outcome when trying to select a PG and scrub it
  enum class schedule_result_t {
-  scrub_initiated,     // successfully started a scrub
-  none_ready,         // no pg to scrub
-  no_local_resources,  // failure to secure local OSD scrub resource
-  already_started,     // failed, as already started scrubbing this pg
-  no_such_pg,         // can't find this pg
-  bad_pg_state,               // pg state (clean, active, etc.)
-  preconditions               // time, configuration, etc.
+  scrub_initiated,         // successfully started a scrub
+  target_specific_failure,  // failed to scrub this specific target
+  osd_wide_failure         // failed to scrub any target
  };
  
  // the OSD services provided to the scrub scheduler
@@ -180,11 +176,6 @@ class ScrubQueue {
        Scrub::OSDRestrictions restrictions, // 4B! copy
        utime_t scrub_tick);
  
-  /**
-   * Translate attempt_ values into readable text
-   */
-  static std::string_view attempt_res_text(Scrub::schedule_result_t v);
-
    /**
     * remove the pg from set of PGs to be scanned for scrubbing.
     * To be used if we are no longer the PG's primary, or if the PG is removed.
author	Ronen Friedman <rfriedma@redhat.com>
	Thu, 21 Sep 2023 12:34:52 +0000 (07:34 -0500)
committer	Ronen Friedman <rfriedma@redhat.com>
	Thu, 21 Sep 2023 12:57:58 +0000 (07:57 -0500)
src/osd/PG.cc		patch \| blob \| history
src/osd/scrubber/osd_scrub.cc		patch \| blob \| history
src/osd/scrubber/osd_scrub_sched.cc		patch \| blob \| history
src/osd/scrubber/osd_scrub_sched.h		patch \| blob \| history