From: Ronen Friedman Date: Mon, 2 Oct 2023 16:29:51 +0000 (-0500) Subject: osd/scrub: group all scrub session states into a Session state X-Git-Tag: v19.0.0~197^2~8 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=817fd00240cf8ead3f4333154824b2c262dd868b;p=ceph.git osd/scrub: group all scrub session states into a Session state The Session state now includes the ReservingReplicas & Active sub-states. This new state will hold (in future commits) most of the scrub state information that relates to a specific scrub session (and should be cleaned up when that session terminates). Signed-off-by: Ronen Friedman --- diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index 0d52d5b76d77..efb091788206 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -109,25 +109,47 @@ sc::result NotActive::react(const AfterRepairScrub&) return transit(); } +// ----------------------- Session ----------------------------------------- + +Session::Session(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "Session") +{ + dout(10) << "-- state -->> Session" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + + // while we've checked the 'someone is reserving' flag before queueing + // the start-scrub event, it's possible that the flag was set in the meantime. + // Handling this case here requires adding a new sub-state, and the + // complication of reporting a failure to the caller in a new failure + // path. On the other hand - ignoring an ongoing reservation on rare + // occasions will cause no harm. + // We choose ignorance. + std::ignore = scrbr->set_reserving_now(); +} + +Session::~Session() +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + + // note the interaction between clearing the 'queued' flag and two + // other states: the snap-mapper and the scrubber internal state. + // All of these must be cleared in the correct order, and the snap mapper + // (re-triggered by resetting the 'queued' flag) must not resume before + // the scrubber is reset. + scrbr->clear_pgscrub_state(); +} + + // ----------------------- ReservingReplicas --------------------------------- ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx) - , NamedSimply(context().m_scrbr, "ReservingReplicas") + , NamedSimply(context().m_scrbr, "Session/ReservingReplicas") { dout(10) << "-- state -->> ReservingReplicas" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - // prevent the OSD from starting another scrub while we are trying to secure - // replicas resources - if (!scrbr->set_reserving_now()) { - dout(1) << "ReservingReplicas::ReservingReplicas() some other PG is " - "already reserving replicas resources" - << dendl; - post_event(ReservationFailure{}); - return; - } - m_holding_isreserving_flag = true; scrbr->reserve_replicas(); auto timeout = scrbr->get_cct()->_conf.get_val< @@ -143,9 +165,9 @@ ReservingReplicas::ReservingReplicas(my_context ctx) ReservingReplicas::~ReservingReplicas() { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - if (m_holding_isreserving_flag) { - scrbr->clear_reserving_now(); - } + // it's OK to try and clear the flag even if we don't hold it + // (the flag remembers the actual holder) + scrbr->clear_reserving_now(); } sc::result ReservingReplicas::react(const ReservationTimeout&) @@ -166,18 +188,6 @@ sc::result ReservingReplicas::react(const ReservationFailure&) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl; - - // the Scrubber must release all resources and abort the scrubbing - scrbr->clear_pgscrub_state(); - return transit(); -} - -/** - * note: the event poster is handling the scrubber reset - */ -sc::result ReservingReplicas::react(const FullReset&) -{ - dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl; return transit(); } @@ -185,7 +195,7 @@ sc::result ReservingReplicas::react(const FullReset&) ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx) - , NamedSimply(context().m_scrbr, "ActiveScrubbing") + , NamedSimply(context().m_scrbr, "Session/ActiveScrubbing") { dout(10) << "-- state -->> ActiveScrubbing" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases @@ -199,8 +209,6 @@ ActiveScrubbing::~ActiveScrubbing() { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(15) << __func__ << dendl; - scrbr->unreserve_replicas(); - scrbr->clear_queued_or_active(); } /* @@ -212,14 +220,6 @@ sc::result ActiveScrubbing::react(const InternalError&) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << __func__ << dendl; - scrbr->clear_pgscrub_state(); - return transit(); -} - -sc::result ActiveScrubbing::react(const FullReset&) -{ - dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl; - // caller takes care of clearing the scrubber & FSM states return transit(); } @@ -237,9 +237,9 @@ sc::result ActiveScrubbing::react(const FullReset&) */ RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx) - , NamedSimply(context().m_scrbr, "Act/RangeBlocked") + , NamedSimply(context().m_scrbr, "Session/Act/RangeBlocked") { - dout(10) << "-- state -->> Act/RangeBlocked" << dendl; + dout(10) << "-- state -->> Session/Act/RangeBlocked" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases auto grace = scrbr->get_range_blocked_grace(); @@ -287,9 +287,9 @@ sc::result RangeBlocked::react(const RangeBlockedAlarm&) */ PendingTimer::PendingTimer(my_context ctx) : my_base(ctx) - , NamedSimply(context().m_scrbr, "Act/PendingTimer") + , NamedSimply(context().m_scrbr, "Session/Act/PendingTimer") { - dout(10) << "-- state -->> Act/PendingTimer" << dendl; + dout(10) << "-- state -->> Session/Act/PendingTimer" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases auto sleep_time = scrbr->get_scrub_sleep_time(); @@ -328,9 +328,9 @@ sc::result PendingTimer::react(const SleepComplete&) */ NewChunk::NewChunk(my_context ctx) : my_base(ctx) - , NamedSimply(context().m_scrbr, "Act/NewChunk") + , NamedSimply(context().m_scrbr, "Session/Act/NewChunk") { - dout(10) << "-- state -->> Act/NewChunk" << dendl; + dout(10) << "-- state -->> Session/Act/NewChunk" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases scrbr->get_preemptor().adjust_parameters(); @@ -355,9 +355,9 @@ sc::result NewChunk::react(const SelectedChunkFree&) WaitPushes::WaitPushes(my_context ctx) : my_base(ctx) - , NamedSimply(context().m_scrbr, "Act/WaitPushes") + , NamedSimply(context().m_scrbr, "Session/Act/WaitPushes") { - dout(10) << " -- state -->> Act/WaitPushes" << dendl; + dout(10) << " -- state -->> Session/Act/WaitPushes" << dendl; post_event(ActivePushesUpd{}); } @@ -383,9 +383,9 @@ sc::result WaitPushes::react(const ActivePushesUpd&) WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx) - , NamedSimply(context().m_scrbr, "Act/WaitLastUpdate") + , NamedSimply(context().m_scrbr, "Session/Act/WaitLastUpdate") { - dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl; + dout(10) << " -- state -->> Session/Act/WaitLastUpdate" << dendl; post_event(UpdatesApplied{}); } @@ -427,9 +427,9 @@ sc::result WaitLastUpdate::react(const InternalAllUpdates&) BuildMap::BuildMap(my_context ctx) : my_base(ctx) - , NamedSimply(context().m_scrbr, "Act/BuildMap") + , NamedSimply(context().m_scrbr, "Session/Act/BuildMap") { - dout(10) << " -- state -->> Act/BuildMap" << dendl; + dout(10) << " -- state -->> Session/Act/BuildMap" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases // no need to check for an epoch change, as all possible flows that brought @@ -477,9 +477,9 @@ sc::result BuildMap::react(const IntLocalMapDone&) DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx) - , NamedSimply(context().m_scrbr, "Act/DrainReplMaps") + , NamedSimply(context().m_scrbr, "Session/Act/DrainReplMaps") { - dout(10) << "-- state -->> Act/DrainReplMaps" << dendl; + dout(10) << "-- state -->> Session/Act/DrainReplMaps" << dendl; // we may have got all maps already. Send the event that will make us check. post_event(GotReplicas{}); } @@ -504,9 +504,9 @@ sc::result DrainReplMaps::react(const GotReplicas&) WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx) - , NamedSimply(context().m_scrbr, "Act/WaitReplicas") + , NamedSimply(context().m_scrbr, "Session/Act/WaitReplicas") { - dout(10) << "-- state -->> Act/WaitReplicas" << dendl; + dout(10) << "-- state -->> Session/Act/WaitReplicas" << dendl; post_event(GotReplicas{}); } @@ -564,10 +564,10 @@ sc::result WaitReplicas::react(const DigestUpdate&) WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx) - , NamedSimply(context().m_scrbr, "Act/WaitDigestUpdate") + , NamedSimply(context().m_scrbr, "Session/Act/WaitDigestUpdate") { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl; + dout(10) << "-- state -->> Session/Act/WaitDigestUpdate" << dendl; // perform an initial check: maybe we already // have all the updates we need: diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index 658abfa494f1..071a464ce130 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -141,6 +141,7 @@ MEV(ScrubFinished) // struct NotActive; ///< the quiescent state. No active scrubbing. +struct Session; ///< either reserving or actively scrubbing struct ReservingReplicas; ///< securing scrub resources from replicas' OSDs struct ActiveScrubbing; ///< the active state for a Primary. A sub-machine. struct ReplicaIdle; ///< Initial reserved replica state @@ -163,13 +164,17 @@ class ScrubMachine : public sc::state_machine { [[nodiscard]] bool is_reserving() const; [[nodiscard]] bool is_accepting_updates() const; + +// ///////////////// aux declarations & functions //////////////////////// // + + private: /** * scheduled_event_state_t * * Heap allocated, ref-counted state shared between scheduled event callback * and timer_event_token_t. Ensures that callback and timer_event_token_t - * can be safetly destroyed in either order while still allowing for + * can be safely destroyed in either order while still allowing for * cancellation. */ struct scheduled_event_state_t { @@ -183,7 +188,7 @@ private: ~scheduled_event_state_t() { /* For the moment, this assert encodes an assumption that we always * retain the token until the event either fires or is canceled. - * If a user needs/wants to relaxt that requirement, this assert can + * If a user needs/wants to relax that requirement, this assert can * be removed */ assert(!cb_token); } @@ -259,7 +264,7 @@ public: * schedule_timer_event_after * * Schedules event EventT{Args...} to be delivered duration in the future. - * The implementation implicitely drops the event on interval change. The + * The implementation implicitly drops the event on interval change. The * returned timer_event_token_t can be used to cancel the event prior to * its delivery -- it should generally be embedded as a member in the state * intended to handle the event. See the comment on timer_event_token_t @@ -284,6 +289,10 @@ public: } }; + +// ///////////////// the states //////////////////////// // + + /** * The Scrubber's base (quiescent) state. * Scrubbing is triggered by one of the following events: @@ -314,11 +323,34 @@ struct NotActive : sc::state, NamedSimply { sc::result react(const AfterRepairScrub&); }; -struct ReservingReplicas : sc::state, + +/** + * Session + * + * This state encompasses the two main "active" states: ReservingReplicas and + * ActiveScrubbing. + * 'Session' is the owner of all the resources that are allocated for a + * scrub session performed as a Primary. + * + * Exit from this state is either following an interval change, or with + * 'FullReset' (that would cover all other completion/termination paths). + * Note that if terminating the session following an interval change - no + * reservations are released. This is because we know that the replicas are + * also resetting their reservations. + */ +struct Session : sc::state, NamedSimply { + explicit Session(my_context ctx); + ~Session(); + + using reactions = mpl::list>; + /// \todo handle interval change +}; + +struct ReservingReplicas : sc::state, NamedSimply { explicit ReservingReplicas(my_context ctx); ~ReservingReplicas(); - using reactions = mpl::list, + using reactions = mpl::list< // all replicas granted our resources request sc::transition, sc::custom_reaction, @@ -328,11 +360,6 @@ struct ReservingReplicas : sc::state, ceph::coarse_real_clock::now(); ScrubMachine::timer_event_token_t m_timeout_token; - /// if true - we must 'clear_reserving_now()' upon exit - bool m_holding_isreserving_flag{false}; - - sc::result react(const FullReset&); - sc::result react(const ReservationTimeout&); /// at least one replica denied us the scrub resources we've requested @@ -364,15 +391,13 @@ struct WaitReplicas; struct WaitDigestUpdate; struct ActiveScrubbing - : sc::state, NamedSimply { + : sc::state, NamedSimply { explicit ActiveScrubbing(my_context ctx); ~ActiveScrubbing(); - using reactions = mpl::list, - sc::custom_reaction>; + using reactions = mpl::list>; - sc::result react(const FullReset&); sc::result react(const InternalError&); };