ceph_assert(m_scrubber);
if (is_scrub_queued_or_active()) {
- return schedule_result_t::already_started;
+ dout(10) << __func__ << ": already scrubbing" << dendl;
+ return schedule_result_t::target_specific_failure;
}
if (!is_primary() || !is_active() || !is_clean()) {
- return schedule_result_t::bad_pg_state;
+ dout(10) << __func__ << ": cannot scrub (not a clean and active primary)"
+ << dendl;
+ return schedule_result_t::target_specific_failure;
}
if (state_test(PG_STATE_SNAPTRIM) || state_test(PG_STATE_SNAPTRIM_WAIT)) {
// (on the transition from NotTrimming to Trimming/WaitReservation),
// i.e. some time before setting 'snaptrim'.
dout(10) << __func__ << ": cannot scrub while snap-trimming" << dendl;
- return schedule_result_t::bad_pg_state;
+ return schedule_result_t::target_specific_failure;
}
// analyse the combination of the requested scrub flags, the osd/pool configuration
// (due to configuration or priority issues)
// The reason was already reported by the callee.
dout(10) << __func__ << ": failed to initiate a scrub" << dendl;
- return schedule_result_t::preconditions;
+ return schedule_result_t::target_specific_failure;
}
// try to reserve the local OSD resources. If failing: no harm. We will
// be retried by the OSD later on.
if (!m_scrubber->reserve_local()) {
dout(10) << __func__ << ": failed to reserve locally" << dendl;
- return schedule_result_t::no_local_resources;
+ return schedule_result_t::osd_wide_failure;
}
// can commit to the updated flags now, as nothing will stop the scrub
// eligible targets (based on the known restrictions).
// We try all elements of this list until a (possibly temporary) success.
auto candidates = m_queue.ready_to_scrub(*env_restrictions, scrub_time);
- auto res{schedule_result_t::none_ready};
+ if (candidates.empty()) {
+ dout(20) << "no PGs are ready for scrubbing" << dendl;
+ return;
+ }
+
for (const auto& candidate : candidates) {
dout(20) << fmt::format("initiating scrub on pg[{}]", candidate) << dendl;
// we have a candidate to scrub. But we may fail when trying to initiate that
// scrub. For some failures - we can continue with the next candidate. For
// others - we should stop trying to scrub at this tick.
- res = initiate_a_scrub(
+ auto res = initiate_a_scrub(
candidate, env_restrictions->allow_requested_repair_only);
- switch (res) {
- case schedule_result_t::scrub_initiated:
- // the happy path. We are done
- dout(20) << fmt::format("scrub initiated for pg[{}]", candidate.pgid)
- << dendl;
- break;
-
- case schedule_result_t::already_started:
- case schedule_result_t::preconditions:
- case schedule_result_t::bad_pg_state:
- // continue with the next job
- dout(20) << fmt::format(
- "pg[{}] failed (state/cond/started)", candidate.pgid)
- << dendl;
- break;
-
- case schedule_result_t::no_such_pg:
- // The pg is no longer there
- dout(20) << fmt::format("pg[{}] failed (no PG)", candidate.pgid)
- << dendl;
- // \todo better handling of this case
- break;
-
- case schedule_result_t::no_local_resources:
- // failure to secure local resources. No point in trying the other
- // PGs at this time. Note that this is not the same as replica resources
- // failure!
- dout(20) << "failed (local resources)" << dendl;
- break;
-
- case schedule_result_t::none_ready:
- // can't happen. Just for the compiler.
- dout(5) << fmt::format(
- "failed!! (possible bug. pg[{}])", candidate.pgid)
- << dendl;
- break;
- }
- if (res == schedule_result_t::no_local_resources) {
+ if (res == schedule_result_t::target_specific_failure) {
+ // continue with the next job.
+ // \todo: consider separate handling of "no such PG", as - later on -
+ // we should be removing both related targets.
+ continue;
+ } else if (res == schedule_result_t::osd_wide_failure) {
+ // no point in trying the other candidates at this time
break;
- }
-
- if (res == schedule_result_t::scrub_initiated) {
- // note: in the full implementation: we need to dequeue the target
- // at this time
+ } else {
+ // the happy path. We are done
+ dout(20) << fmt::format("scrub initiated for pg[{}]", candidate.pgid)
+ << dendl;
break;
}
}
// the PG was dequeued in the short timespan between creating the
// candidates list (ready_to_scrub()) and here
dout(5) << fmt::format("pg[{}] not found", pgid) << dendl;
- return Scrub::schedule_result_t::no_such_pg;
+ return Scrub::schedule_result_t::target_specific_failure;
}
// This one is already scrubbing, so go on to the next scrub job
if (locked_pg->pg()->is_scrub_queued_or_active()) {
dout(10) << fmt::format("pg[{}]: scrub already in progress", pgid) << dendl;
- return Scrub::schedule_result_t::already_started;
+ return Scrub::schedule_result_t::target_specific_failure;
}
// Skip other kinds of scrubbing if only explicitly requested repairing is allowed
if (allow_requested_repair_only &&
"requested for that pg",
pgid)
<< dendl;
- return Scrub::schedule_result_t::preconditions;
+ return Scrub::schedule_result_t::target_specific_failure;
}
return locked_pg->pg()->sched_scrub();
}
}
-// clang-format off
-/*
- * Implementation note:
- * Clang (10 & 11) produces here efficient table-based code, comparable to using
- * a direct index into an array of strings.
- * Gcc (11, trunk) is almost as efficient.
- */
-std::string_view ScrubQueue::attempt_res_text(Scrub::schedule_result_t v)
-{
- switch (v) {
- case Scrub::schedule_result_t::scrub_initiated: return "scrubbing"sv;
- case Scrub::schedule_result_t::none_ready: return "no ready job"sv;
- case Scrub::schedule_result_t::no_local_resources: return "local resources shortage"sv;
- case Scrub::schedule_result_t::already_started: return "denied as already started"sv;
- case Scrub::schedule_result_t::no_such_pg: return "pg not found"sv;
- case Scrub::schedule_result_t::bad_pg_state: return "prevented by pg state"sv;
- case Scrub::schedule_result_t::preconditions: return "preconditions not met"sv;
- }
- // g++ (unlike CLANG), requires an extra 'return' here
- return "(unknown)"sv;
-}
-// clang-format on
-
std::vector<ScrubTargetId> ScrubQueue::ready_to_scrub(
OSDRestrictions restrictions, // note: 4B in size! (copy)
utime_t scrub_tick)
using namespace ::std::literals;
-// possible outcome when trying to select a PG and scrub it
+/// possible outcome when trying to select a PG and scrub it
enum class schedule_result_t {
- scrub_initiated, // successfully started a scrub
- none_ready, // no pg to scrub
- no_local_resources, // failure to secure local OSD scrub resource
- already_started, // failed, as already started scrubbing this pg
- no_such_pg, // can't find this pg
- bad_pg_state, // pg state (clean, active, etc.)
- preconditions // time, configuration, etc.
+ scrub_initiated, // successfully started a scrub
+ target_specific_failure, // failed to scrub this specific target
+ osd_wide_failure // failed to scrub any target
};
// the OSD services provided to the scrub scheduler
Scrub::OSDRestrictions restrictions, // 4B! copy
utime_t scrub_tick);
- /**
- * Translate attempt_ values into readable text
- */
- static std::string_view attempt_res_text(Scrub::schedule_result_t v);
-
/**
* remove the pg from set of PGs to be scanned for scrubbing.
* To be used if we are no longer the PG's primary, or if the PG is removed.