From 699dd28ad5cce9149e48c81811a54bb23f04a088 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Wed, 10 Jul 2024 02:06:09 -0500 Subject: [PATCH] osd/scrub: fix job requeue conditions Previous commits handled the following two cases correctly: - requeueing a scrub job while the OSD is still the primary, and - not restoring the scrub job to the queue if the PG is not there; Here we handle the missed scenario: the PG is there (we were able to lock it), but is no longer the primary. Also - a configuration change must not cause a re-queue of a scrub-job for a PG that is in the middle of scrubbing. Signed-off-by: Ronen Friedman --- src/osd/PG.cc | 7 +++++-- src/osd/scrubber/pg_scrubber.cc | 33 +++++++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 6392a184bd327..96fd6435f7217 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1363,7 +1363,7 @@ double PG::next_deepscrub_interval() const void PG::on_scrub_schedule_input_change(Scrub::delay_ready_t delay_ready) { - if (is_active() && is_primary()) { + if (is_active() && is_primary() && !is_scrub_queued_or_active()) { dout(10) << fmt::format( "{}: active/primary. delay_ready={:c}", __func__, (delay_ready == Scrub::delay_ready_t::delay_ready) ? 't' @@ -1372,7 +1372,10 @@ void PG::on_scrub_schedule_input_change(Scrub::delay_ready_t delay_ready) ceph_assert(m_scrubber); m_scrubber->update_scrub_job(delay_ready); } else { - dout(10) << fmt::format("{}: inactive or non-primary", __func__) << dendl; + dout(10) << fmt::format( + "{}: inactive, non-primary - or already scrubbing", + __func__) + << dendl; } } diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 637b4586ab3b4..659050201505e 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -2093,6 +2093,15 @@ void PgScrubber::on_digest_updates() */ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue) { + if (!m_scrub_job->is_registered()) { + dout(10) << fmt::format( + "{}: PG not registered for scrubbing on this OSD. Won't " + "requeue!", + __func__) + << dendl; + return; + } + // assuming we can still depend on the 'scrubbing' flag being set; // Also on Queued&Active. @@ -2113,7 +2122,6 @@ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue) m_scrub_job->merge_and_delay( m_active_target->schedule, issue, m_planned_scrub, ceph_clock_now()); - ceph_assert(m_scrub_job->is_registered()); ceph_assert(!m_scrub_job->target_queued); m_osds->get_scrub_services().enqueue_target(*m_scrub_job); m_scrub_job->target_queued = true; @@ -2122,9 +2130,16 @@ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue) void PgScrubber::requeue_penalized(Scrub::delay_cause_t cause) { + if (!m_scrub_job->is_registered()) { + dout(10) << fmt::format( + "{}: PG not registered for scrubbing on this OSD. Won't " + "requeue!", + __func__) + << dendl; + return; + } /// \todo fix the 5s' to use a cause-specific delay parameter m_scrub_job->delay_on_failure(5s, cause, ceph_clock_now()); - ceph_assert(m_scrub_job->is_registered()); ceph_assert(!m_scrub_job->target_queued); m_osds->get_scrub_services().enqueue_target(*m_scrub_job); m_scrub_job->target_queued = true; @@ -2148,9 +2163,19 @@ Scrub::schedule_result_t PgScrubber::start_scrub_session( m_active_target = std::move(candidate); + if (!is_primary() || !m_pg->is_active()) { + // the PG is not expected to be 'registered' in this state. And we should + // not attempt to queue it. + dout(10) << __func__ << ": cannot scrub (not an active primary)" + << dendl; + return schedule_result_t::target_specific_failure; + } + // for all other failures - we must reinstate our entry in the Scrub Queue - if (!is_primary() || !m_pg->is_active() || !m_pg->is_clean()) { - dout(10) << __func__ << ": cannot scrub (not a clean and active primary)" + if (!m_pg->is_clean()) { + dout(10) << fmt::format( + "{}: cannot scrub (not clean). Registered?{:c}", __func__, + m_scrub_job->is_registered() ? 'Y' : 'n') << dendl; requeue_penalized(Scrub::delay_cause_t::pg_state); return schedule_result_t::target_specific_failure; -- 2.39.5