From: Ronen Friedman Date: Sun, 10 Sep 2023 19:44:33 +0000 (-0500) Subject: osd/scrub: set_reserving_now() signature modified X-Git-Tag: v19.0.0~438^2~14 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=f3bd9b621c62b29d11a4ea406cd95971992ff60c;p=ceph-ci.git osd/scrub: set_reserving_now() signature modified set_reserving_now() can now return a failure status, indicating a race between two PGs to start scrubbing on the same OSD. The scrubber FSM is modified to handle the failure. Signed-off-by: Ronen Friedman --- diff --git a/src/osd/scrubber/osd_scrub_sched.cc b/src/osd/scrubber/osd_scrub_sched.cc index 9170e369ac4..baec441f262 100644 --- a/src/osd/scrubber/osd_scrub_sched.cc +++ b/src/osd/scrubber/osd_scrub_sched.cc @@ -490,3 +490,22 @@ int ScrubQueue::get_blocked_pgs_count() const { return blocked_scrubs_cnt; } + +// ////////////////////////////////////////////////////////////////////////// // +// ScrubQueue - maintaining the 'some PG is reserving' flag + +bool ScrubQueue::set_reserving_now() +{ + auto was_set = a_pg_is_reserving.exchange(true); + return !was_set; +} + +void ScrubQueue::clear_reserving_now() +{ + a_pg_is_reserving = false; +} + +bool ScrubQueue::is_reserving_now() const +{ + return a_pg_is_reserving; +} diff --git a/src/osd/scrubber/osd_scrub_sched.h b/src/osd/scrubber/osd_scrub_sched.h index 99b9d30a49f..4727f3c7d5b 100644 --- a/src/osd/scrubber/osd_scrub_sched.h +++ b/src/osd/scrubber/osd_scrub_sched.h @@ -277,10 +277,22 @@ class ScrubQueue { /** * No new scrub session will start while a scrub was initiated on a PG, * and that PG is trying to acquire replica resources. + * + * \todo replace the atomic bool with a regular bool protected by a + * common OSD-service lock. Or better still - once PR#53263 is merged, + * remove this flag altogether. */ - void set_reserving_now() { a_pg_is_reserving = true; } - void clear_reserving_now() { a_pg_is_reserving = false; } - bool is_reserving_now() const { return a_pg_is_reserving; } + + /** + * set_reserving_now() + * \returns 'false' if the flag was already set + * (which is a possible result of a race between the check in OsdScrub and + * the initiation of a scrub by some other PG) + */ + bool set_reserving_now(); + void clear_reserving_now(); + bool is_reserving_now() const; + bool can_inc_scrubs() const; bool inc_scrubs_local(); diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 327e9f2b2f8..339fe1e7155 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1687,9 +1687,9 @@ void PgScrubber::on_replica_reservation_timeout() } } -void PgScrubber::set_reserving_now() +bool PgScrubber::set_reserving_now() { - m_osds->get_scrub_services().set_reserving_now(); + return m_osds->get_scrub_services().set_reserving_now(); } void PgScrubber::clear_reserving_now() diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index 42ac2b48807..5810f1fd986 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -547,7 +547,7 @@ class PgScrubber : public ScrubPgIF, void reserve_replicas() final; - void set_reserving_now() final; + bool set_reserving_now() final; void clear_reserving_now() final; [[nodiscard]] bool was_epoch_changed() const final; diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index c3bb89b4dad..0d52d5b76d7 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -120,7 +120,14 @@ ReservingReplicas::ReservingReplicas(my_context ctx) // prevent the OSD from starting another scrub while we are trying to secure // replicas resources - scrbr->set_reserving_now(); + if (!scrbr->set_reserving_now()) { + dout(1) << "ReservingReplicas::ReservingReplicas() some other PG is " + "already reserving replicas resources" + << dendl; + post_event(ReservationFailure{}); + return; + } + m_holding_isreserving_flag = true; scrbr->reserve_replicas(); auto timeout = scrbr->get_cct()->_conf.get_val< @@ -136,7 +143,9 @@ ReservingReplicas::ReservingReplicas(my_context ctx) ReservingReplicas::~ReservingReplicas() { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - scrbr->clear_reserving_now(); + if (m_holding_isreserving_flag) { + scrbr->clear_reserving_now(); + } } sc::result ReservingReplicas::react(const ReservationTimeout&) diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index ffa6f4e6d3b..658abfa494f 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -328,6 +328,9 @@ struct ReservingReplicas : sc::state, ceph::coarse_real_clock::now(); ScrubMachine::timer_event_token_t m_timeout_token; + /// if true - we must 'clear_reserving_now()' upon exit + bool m_holding_isreserving_flag{false}; + sc::result react(const FullReset&); sc::result react(const ReservationTimeout&); diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h index 8f4dd8201c1..cfef666e1b1 100644 --- a/src/osd/scrubber/scrub_machine_lstnr.h +++ b/src/osd/scrubber/scrub_machine_lstnr.h @@ -190,8 +190,11 @@ struct ScrubMachineListener { * and that PG is trying to acquire replica resources. * set_reserving_now()/clear_reserving_now() let's the OSD scrub-queue know * we are busy reserving. + * + * set_reserving_now() returns 'false' if there already is a PG in the + * reserving stage of the scrub session. */ - virtual void set_reserving_now() = 0; + virtual bool set_reserving_now() = 0; virtual void clear_reserving_now() = 0; /**