From c57731d3cefda8c4d381e778e7b1038df3c5f447 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Tue, 6 Apr 2021 18:01:15 +0300 Subject: [PATCH] Revert "osd: Try other PGs when reservation failures occur" This reverts commit 08c3ede0844507074de2b43638423c094976b493. Due to https://tracker.ceph.com/issues/49868 Should be reinstated once that bug is solved. See tracker comments for analysis and suggested fixes. Signed-off-by: Ronen Friedman --- src/osd/OSD.cc | 34 ++-------------------------------- src/osd/PG.cc | 1 - src/osd/PG.h | 4 ---- src/osd/pg_scrubber.h | 7 ------- src/osd/scrub_machine.cc | 2 -- src/osd/scrub_machine_lstnr.h | 4 ---- src/osd/scrubber_common.h | 4 ---- 7 files changed, 2 insertions(+), 54 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index d28062d3a471..d7d038efcc6c 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -7612,13 +7612,6 @@ void OSD::sched_scrub() continue; } - // If this one couldn't reserve, skip for now - if (pg->get_reserve_failed()) { - pg->unlock(); - dout(20) << __func__ << " pg " << scrub_job.pgid << " reserve failed, skipped" << dendl; - continue; - } - // This has already started, so go on to the next scrub job if (pg->is_scrub_active()) { pg->unlock(); @@ -7638,7 +7631,7 @@ void OSD::sched_scrub() if (pg->m_scrubber->is_reserving()) { pg->unlock(); dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl; - goto out; + break; } dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time << (pg->get_must_scrub() ? ", explicitly requested" : @@ -7647,34 +7640,11 @@ void OSD::sched_scrub() if (pg->sched_scrub()) { pg->unlock(); dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl; - goto out; - } - // If this is set now we must have had a local reserve failure, so can't scrub anything right now - if (pg->get_reserve_failed()) { - pg->unlock(); - dout(20) << __func__ << " pg " << scrub_job.pgid << " local reserve failed, nothing to be done now" << dendl; - goto out; + break; } - pg->unlock(); } while (service.next_scrub_stamp(scrub_job, &scrub_job)); - - // Clear reserve_failed from all pending PGs, so we try again - if (service.first_scrub_stamp(&scrub_job)) { - do { - if (scrub_job.sched_time > now) - break; - PGRef pg = _lookup_lock_pg(scrub_job.pgid); - // If we can't lock, it's ok we can get it next time - if (!pg) - continue; - pg->clear_reserve_failed(); - pg->unlock(); - } while (service.next_scrub_stamp(scrub_job, &scrub_job)); - } } - -out: dout(20) << "sched_scrub done" << dendl; } diff --git a/src/osd/PG.cc b/src/osd/PG.cc index bbb1085905f4..104ed3368280 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1360,7 +1360,6 @@ bool PG::sched_scrub() // be retried by the OSD later on. if (!m_scrubber->reserve_local()) { dout(10) << __func__ << ": failed to reserve locally" << dendl; - set_reserve_failed(); return false; } diff --git a/src/osd/PG.h b/src/osd/PG.h index 4b151c7825bf..1a6599dfbb33 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -181,10 +181,6 @@ public: /// scrubbing state for both Primary & replicas bool is_scrub_active() const { return m_scrubber->is_scrub_active(); } - bool get_reserve_failed() const { return m_scrubber->get_reserve_failed(); } - void set_reserve_failed() { m_scrubber->set_reserve_failed(); } - void clear_reserve_failed() { m_scrubber->clear_reserve_failed(); } - public: // -- members -- const coll_t coll; diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h index 69fa6b0fcebd..bbbac38ca756 100644 --- a/src/osd/pg_scrubber.h +++ b/src/osd/pg_scrubber.h @@ -412,10 +412,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { [[nodiscard]] bool is_scrub_active() const final { return m_active; } - [[nodiscard]] bool get_reserve_failed() const final { return m_reserve_failed; } - void set_reserve_failed() final { m_reserve_failed = true; } - void clear_reserve_failed() final { m_reserve_failed = false; } - private: void reset_internal_state(); @@ -540,9 +536,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { bool m_active{false}; - // This PG could not get all the scrub reservations - bool m_reserve_failed{false}; - eversion_t m_subset_last_update{}; std::unique_ptr m_store; diff --git a/src/osd/scrub_machine.cc b/src/osd/scrub_machine.cc index 64d79e723673..2a2ee8732bda 100644 --- a/src/osd/scrub_machine.cc +++ b/src/osd/scrub_machine.cc @@ -91,8 +91,6 @@ sc::result ReservingReplicas::react(const ReservationFailure&) DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl; - // Mark PG so that we will try other PGs, before coming back to this one - scrbr->set_reserve_failed(); // the Scrubber must release all resources and abort the scrubbing scrbr->clear_pgscrub_state(); return transit(); diff --git a/src/osd/scrub_machine_lstnr.h b/src/osd/scrub_machine_lstnr.h index b2139773b4d5..2b9616121547 100644 --- a/src/osd/scrub_machine_lstnr.h +++ b/src/osd/scrub_machine_lstnr.h @@ -114,10 +114,6 @@ struct ScrubMachineListener { virtual void unreserve_replicas() = 0; - [[nodiscard]] virtual bool get_reserve_failed() const = 0; - virtual void set_reserve_failed() = 0; - virtual void clear_reserve_failed() = 0; - /** * the FSM interface into the "are we waiting for maps, either our own or from * replicas" state. diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h index 3f3a618f8249..15a6cdf4dede 100644 --- a/src/osd/scrubber_common.h +++ b/src/osd/scrubber_common.h @@ -150,10 +150,6 @@ struct ScrubPgIF { */ [[nodiscard]] virtual bool is_scrub_active() const = 0; - [[nodiscard]] virtual bool get_reserve_failed() const = 0; - virtual void set_reserve_failed() = 0; - virtual void clear_reserve_failed() = 0; - /// are we waiting for resource reservation grants form our replicas? [[nodiscard]] virtual bool is_reserving() const = 0; -- 2.47.3