From f7ddca67b6a148248f52707ca2c2435fcda64b34 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Thu, 21 Sep 2023 07:34:52 -0500 Subject: [PATCH] osd/scrub: modify schedule_result_t to report error class (which directly translates to the required followup action) instead of reporting the exact failure. The specific of the failure were never used by the scrub scheduler. Signed-off-by: Ronen Friedman --- src/osd/PG.cc | 13 +++--- src/osd/scrubber/osd_scrub.cc | 68 +++++++++-------------------- src/osd/scrubber/osd_scrub_sched.cc | 23 ---------- src/osd/scrubber/osd_scrub_sched.h | 17 ++------ 4 files changed, 32 insertions(+), 89 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 25a8ac7197845..0d1f8d44e1c44 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1348,11 +1348,14 @@ Scrub::schedule_result_t PG::sched_scrub() ceph_assert(m_scrubber); if (is_scrub_queued_or_active()) { - return schedule_result_t::already_started; + dout(10) << __func__ << ": already scrubbing" << dendl; + return schedule_result_t::target_specific_failure; } if (!is_primary() || !is_active() || !is_clean()) { - return schedule_result_t::bad_pg_state; + dout(10) << __func__ << ": cannot scrub (not a clean and active primary)" + << dendl; + return schedule_result_t::target_specific_failure; } if (state_test(PG_STATE_SNAPTRIM) || state_test(PG_STATE_SNAPTRIM_WAIT)) { @@ -1360,7 +1363,7 @@ Scrub::schedule_result_t PG::sched_scrub() // (on the transition from NotTrimming to Trimming/WaitReservation), // i.e. some time before setting 'snaptrim'. dout(10) << __func__ << ": cannot scrub while snap-trimming" << dendl; - return schedule_result_t::bad_pg_state; + return schedule_result_t::target_specific_failure; } // analyse the combination of the requested scrub flags, the osd/pool configuration @@ -1372,14 +1375,14 @@ Scrub::schedule_result_t PG::sched_scrub() // (due to configuration or priority issues) // The reason was already reported by the callee. dout(10) << __func__ << ": failed to initiate a scrub" << dendl; - return schedule_result_t::preconditions; + return schedule_result_t::target_specific_failure; } // try to reserve the local OSD resources. If failing: no harm. We will // be retried by the OSD later on. if (!m_scrubber->reserve_local()) { dout(10) << __func__ << ": failed to reserve locally" << dendl; - return schedule_result_t::no_local_resources; + return schedule_result_t::osd_wide_failure; } // can commit to the updated flags now, as nothing will stop the scrub diff --git a/src/osd/scrubber/osd_scrub.cc b/src/osd/scrubber/osd_scrub.cc index 1d1e9eb2fe88e..8da75233ebb9c 100644 --- a/src/osd/scrubber/osd_scrub.cc +++ b/src/osd/scrubber/osd_scrub.cc @@ -125,60 +125,32 @@ void OsdScrub::initiate_scrub(bool is_recovery_active) // eligible targets (based on the known restrictions). // We try all elements of this list until a (possibly temporary) success. auto candidates = m_queue.ready_to_scrub(*env_restrictions, scrub_time); - auto res{schedule_result_t::none_ready}; + if (candidates.empty()) { + dout(20) << "no PGs are ready for scrubbing" << dendl; + return; + } + for (const auto& candidate : candidates) { dout(20) << fmt::format("initiating scrub on pg[{}]", candidate) << dendl; // we have a candidate to scrub. But we may fail when trying to initiate that // scrub. For some failures - we can continue with the next candidate. For // others - we should stop trying to scrub at this tick. - res = initiate_a_scrub( + auto res = initiate_a_scrub( candidate, env_restrictions->allow_requested_repair_only); - switch (res) { - case schedule_result_t::scrub_initiated: - // the happy path. We are done - dout(20) << fmt::format("scrub initiated for pg[{}]", candidate.pgid) - << dendl; - break; - - case schedule_result_t::already_started: - case schedule_result_t::preconditions: - case schedule_result_t::bad_pg_state: - // continue with the next job - dout(20) << fmt::format( - "pg[{}] failed (state/cond/started)", candidate.pgid) - << dendl; - break; - - case schedule_result_t::no_such_pg: - // The pg is no longer there - dout(20) << fmt::format("pg[{}] failed (no PG)", candidate.pgid) - << dendl; - // \todo better handling of this case - break; - - case schedule_result_t::no_local_resources: - // failure to secure local resources. No point in trying the other - // PGs at this time. Note that this is not the same as replica resources - // failure! - dout(20) << "failed (local resources)" << dendl; - break; - - case schedule_result_t::none_ready: - // can't happen. Just for the compiler. - dout(5) << fmt::format( - "failed!! (possible bug. pg[{}])", candidate.pgid) - << dendl; - break; - } - if (res == schedule_result_t::no_local_resources) { + if (res == schedule_result_t::target_specific_failure) { + // continue with the next job. + // \todo: consider separate handling of "no such PG", as - later on - + // we should be removing both related targets. + continue; + } else if (res == schedule_result_t::osd_wide_failure) { + // no point in trying the other candidates at this time break; - } - - if (res == schedule_result_t::scrub_initiated) { - // note: in the full implementation: we need to dequeue the target - // at this time + } else { + // the happy path. We are done + dout(20) << fmt::format("scrub initiated for pg[{}]", candidate.pgid) + << dendl; break; } } @@ -238,13 +210,13 @@ Scrub::schedule_result_t OsdScrub::initiate_a_scrub( // the PG was dequeued in the short timespan between creating the // candidates list (ready_to_scrub()) and here dout(5) << fmt::format("pg[{}] not found", pgid) << dendl; - return Scrub::schedule_result_t::no_such_pg; + return Scrub::schedule_result_t::target_specific_failure; } // This one is already scrubbing, so go on to the next scrub job if (locked_pg->pg()->is_scrub_queued_or_active()) { dout(10) << fmt::format("pg[{}]: scrub already in progress", pgid) << dendl; - return Scrub::schedule_result_t::already_started; + return Scrub::schedule_result_t::target_specific_failure; } // Skip other kinds of scrubbing if only explicitly requested repairing is allowed if (allow_requested_repair_only && @@ -254,7 +226,7 @@ Scrub::schedule_result_t OsdScrub::initiate_a_scrub( "requested for that pg", pgid) << dendl; - return Scrub::schedule_result_t::preconditions; + return Scrub::schedule_result_t::target_specific_failure; } return locked_pg->pg()->sched_scrub(); diff --git a/src/osd/scrubber/osd_scrub_sched.cc b/src/osd/scrubber/osd_scrub_sched.cc index 5378864f120f3..6914618601481 100644 --- a/src/osd/scrubber/osd_scrub_sched.cc +++ b/src/osd/scrubber/osd_scrub_sched.cc @@ -226,29 +226,6 @@ void ScrubQueue::move_failed_pgs(utime_t now_is) } } -// clang-format off -/* - * Implementation note: - * Clang (10 & 11) produces here efficient table-based code, comparable to using - * a direct index into an array of strings. - * Gcc (11, trunk) is almost as efficient. - */ -std::string_view ScrubQueue::attempt_res_text(Scrub::schedule_result_t v) -{ - switch (v) { - case Scrub::schedule_result_t::scrub_initiated: return "scrubbing"sv; - case Scrub::schedule_result_t::none_ready: return "no ready job"sv; - case Scrub::schedule_result_t::no_local_resources: return "local resources shortage"sv; - case Scrub::schedule_result_t::already_started: return "denied as already started"sv; - case Scrub::schedule_result_t::no_such_pg: return "pg not found"sv; - case Scrub::schedule_result_t::bad_pg_state: return "prevented by pg state"sv; - case Scrub::schedule_result_t::preconditions: return "preconditions not met"sv; - } - // g++ (unlike CLANG), requires an extra 'return' here - return "(unknown)"sv; -} -// clang-format on - std::vector ScrubQueue::ready_to_scrub( OSDRestrictions restrictions, // note: 4B in size! (copy) utime_t scrub_tick) diff --git a/src/osd/scrubber/osd_scrub_sched.h b/src/osd/scrubber/osd_scrub_sched.h index a2a19f084f179..9e222718c509a 100644 --- a/src/osd/scrubber/osd_scrub_sched.h +++ b/src/osd/scrubber/osd_scrub_sched.h @@ -117,15 +117,11 @@ namespace Scrub { using namespace ::std::literals; -// possible outcome when trying to select a PG and scrub it +/// possible outcome when trying to select a PG and scrub it enum class schedule_result_t { - scrub_initiated, // successfully started a scrub - none_ready, // no pg to scrub - no_local_resources, // failure to secure local OSD scrub resource - already_started, // failed, as already started scrubbing this pg - no_such_pg, // can't find this pg - bad_pg_state, // pg state (clean, active, etc.) - preconditions // time, configuration, etc. + scrub_initiated, // successfully started a scrub + target_specific_failure, // failed to scrub this specific target + osd_wide_failure // failed to scrub any target }; // the OSD services provided to the scrub scheduler @@ -180,11 +176,6 @@ class ScrubQueue { Scrub::OSDRestrictions restrictions, // 4B! copy utime_t scrub_tick); - /** - * Translate attempt_ values into readable text - */ - static std::string_view attempt_res_text(Scrub::schedule_result_t v); - /** * remove the pg from set of PGs to be scanned for scrubbing. * To be used if we are no longer the PG's primary, or if the PG is removed. -- 2.39.5