From: Ronen Friedman Date: Mon, 25 Mar 2024 14:27:31 +0000 (-0500) Subject: osd/scrub: discard ReplicaIdle sub-states X-Git-Tag: v19.1.0~38^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=64f6c2c1f0b4a1e3aa42ba9116ec2bafb57d018d;p=ceph.git osd/scrub: discard ReplicaIdle sub-states The 'reservation requested' & 'reservation was granted' states are now maintained directly in the ReplicaActive state. 'StartReplica' (or 'do process a chunk') - the last event handled by the sub-states - is now handled by ReplicaIdle. Signed-off-by: Ronen Friedman (cherry picked from commit 03a2f8c1113cb12fd19f678ab0353a30ed331c47) --- diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index e4b41977d77..d6c8cb43028 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -767,6 +767,12 @@ ReplicaActive::~ReplicaActive() clear_remote_reservation(false); } +void ReplicaActive::exit() +{ + dout(20) << "ReplicaActive::exit()" << dendl; +} + + /* * Note: we are expected to be in the initial internal state (Idle) when * receiving any registration request. ReplicaActiveOp, our other internal @@ -943,84 +949,32 @@ ReplicaIdle::ReplicaIdle(my_context ctx) dout(10) << "-- state -->> ReplicaActive/ReplicaIdle" << dendl; } -void ReplicaIdle::reset_ignored(const FullReset&) -{ - dout(10) << "ReplicaIdle::react(const FullReset&): FullReset ignored" - << dendl; -} - - -// ---------------- ReplicaIdle/ReplicaUnreserved --------------------------- -ReplicaUnreserved::ReplicaUnreserved(my_context ctx) - : my_base(ctx) - , NamedSimply( - context().m_scrbr, - "ReplicaActive/ReplicaIdle/ReplicaUnreserved") -{ - dout(10) << "-- state -->> ReplicaActive/ReplicaIdle/ReplicaUnreserved" - << dendl; -} - - -sc::result ReplicaUnreserved::react(const StartReplica& ev) +sc::result ReplicaIdle::react(const StartReplica& ev) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "ReplicaUnreserved::react(const StartReplica&)" << dendl; - post_event(ReplicaPushesUpd{}); - return transit(); -} - - -// ---------------- ReplicaIdle/ReplicaWaitingReservation --------------------------- - -ReplicaWaitingReservation::ReplicaWaitingReservation(my_context ctx) - : my_base(ctx) - , NamedSimply( - context().m_scrbr, - "ReplicaActive/ReplicaIdle/ReplicaWaitingReservation") -{ - dout(10) - << "-- state -->> ReplicaActive/ReplicaIdle/ReplicaWaitingReservation" - << dendl; -} - -sc::result ReplicaWaitingReservation::react(const StartReplica& ev) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "ReplicaWaitingReservation::react(const StartReplica&)" << dendl; - - // this shouldn't happen. We will handle it, but will also log an error. - scrbr->get_clog()->error() << fmt::format( - "osd.{} pg[{}]: new chunk request while still waiting for " - "reservation", - scrbr->get_whoami(), scrbr->get_spgid()); - context().clear_remote_reservation(true); + dout(10) << "ReplicaIdle::react(const StartReplica&)" << dendl; + + // if we are waiting for a reservation grant from the reserver (an + // illegal scenario!), that reservation must be cleared. + if (context().pending_reservation_nonce) { + scrbr->get_clog()->warn() << fmt::format( + "osd.{} pg[{}]: new chunk request while still waiting for " + "reservation", + scrbr->get_whoami(), scrbr->get_spgid()); + context().clear_remote_reservation(true); + } post_event(ReplicaPushesUpd{}); return transit(); } -// ---------------- ReplicaIdle/ReplicaReserved --------------------------- - -ReplicaReserved::ReplicaReserved(my_context ctx) - : my_base(ctx) - , NamedSimply( - context().m_scrbr, - "ReplicaActive/ReplicaIdle/ReplicaReserved") +void ReplicaIdle::reset_ignored(const FullReset&) { - dout(10) << "-- state -->> ReplicaActive/ReplicaIdle/ReplicaReserved" + dout(10) << "ReplicaIdle::react(const FullReset&): FullReset ignored" << dendl; } -sc::result ReplicaReserved::react(const StartReplica& ev) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "ReplicaReserved::react(const StartReplica&)" << dendl; - post_event(ReplicaPushesUpd{}); - return transit(); -} - // ------------- ReplicaActive/ReplicaActiveOp -------------------------- @@ -1134,20 +1088,22 @@ sc::result ReplicaBuildingMap::react(const SchedReplica&) dout(10) << "replica scrub job preempted" << dendl; scrbr->send_preempted_replica(); - return transit(); + return transit(); } // start or check progress of build_replica_map_chunk() - auto ret_init = scrbr->build_replica_map_chunk(); - if (ret_init != -EINPROGRESS) { - dout(10) << "ReplicaBuildingMap::react(const SchedReplica&): back to idle" - << dendl; - return transit(); + if (scrbr->build_replica_map_chunk() == -EINPROGRESS) { + // Must ask the backend for the next stride shortly. + // build_replica_map_chunk() has already requeued us. + dout(20) << "waiting for the backend..." << dendl; + return discard_event(); } - dout(20) << "ReplicaBuildingMap::react(const SchedReplica&): discarded" + // Note: build_replica_map_chunk() aborts the OSD on any backend retval + // which is not -EINPROGRESS or 0 ('done'). + dout(10) << "ReplicaBuildingMap::react(const SchedReplica&): chunk done" << dendl; - return discard_event(); + return transit(); } } // namespace Scrub diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index 081baaf943b..3048b006883 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -268,12 +268,8 @@ struct Session; ///< either reserving or actively scrubbing // the Replica states: struct ReplicaActive; ///< base state for when peered as a replica -/// Inactive replica state. Handles reservation requests +/// Inactive replica state struct ReplicaIdle; -// its sub-states: -struct ReplicaUnreserved; ///< not reserved by a primary -struct ReplicaWaitingReservation; ///< a reservation request was received from -struct ReplicaReserved; ///< we are reserved by our primary // and when handling a single chunk scrub request op: struct ReplicaActiveOp; @@ -815,6 +811,7 @@ struct ReplicaActive : sc::state, NamedSimply { explicit ReplicaActive(my_context ctx); ~ReplicaActive(); + void exit(); /** * cancel a granted or pending reservation @@ -847,6 +844,12 @@ struct ReplicaActive : sc::state, */ sc::result react(const ReserverGranted&); + /** + * a reservation request with this nonce is queued at the scrub_reserver, + * and was not yet granted. + */ + MOSDScrubReserve::reservation_nonce_t pending_reservation_nonce{0}; + private: PG* m_pg; OSDService* m_osds; @@ -878,12 +881,6 @@ struct ReplicaActive : sc::state, reservation_status_t m_reservation_status{reservation_status_t::unreserved}; - /** - * a reservation request with this nonce is queued at the scrub_reserver, - * and was not yet granted. - */ - MOSDScrubReserve::reservation_nonce_t pending_reservation_nonce{0}; - // clang-format off struct RtReservationCB : public Context { PGRef pg; @@ -904,83 +901,18 @@ struct ReplicaActive : sc::state, }; -struct ReplicaIdle : sc::state< - ReplicaIdle, - ReplicaActive, - ReplicaUnreserved>, - NamedSimply { +struct ReplicaIdle : sc::state, NamedSimply { explicit ReplicaIdle(my_context ctx); ~ReplicaIdle() = default; void reset_ignored(const FullReset&); - using reactions = mpl::list>; -}; - -/* - * ReplicaUnreserved - * - * Possible events: - * - a reservation request from a legacy primary (i.e. a primary that does not - * support queued reservations). We either deny or grant, transitioning to - * ReplicaReserved directly. - * - a reservation request from a primary that supports queued reservations. - * We transition to ReplicaWaitingReservation, and wait for the Reserver's - * response. - * - (handled by our parent state) a chunk scrub request. We transition to - * ReplicaActiveOp. - */ -struct ReplicaUnreserved : sc::state, - NamedSimply { - explicit ReplicaUnreserved(my_context ctx); - - using reactions = mpl::list< - sc::custom_reaction>; - - sc::result react(const StartReplica& ev); -}; - -/** - * ReplicaWaitingReservation - * - * Possible events: - * - 'go ahead' from the async reserver. We send a GRANT message to the - * primary & transition to ReplicaReserved. - * - 'cancel' from the primary. We clear our reservation state, and transition - * back to ReplicaUnreserved. - * - a chunk request: shouldn't happen, but we handle it anyway. An error - * is logged (to trigger test failures). - * - on interval change: handled by our parent state. - */ -struct ReplicaWaitingReservation - : sc::state, - NamedSimply { - explicit ReplicaWaitingReservation(my_context ctx); - using reactions = mpl::list< - sc::custom_reaction>; - - sc::result react(const StartReplica& ev); -}; - -/** - * ReplicaReserved - * - * Possible events: - * - 'cancel' from the primary. We clear our reservation state, and transition - * back to ReplicaUnreserved. - * - a chunk scrub request. We transition to ReplicaActiveOp. - * - on interval change: we clear our reservation state, and transition - * back to ReplicaUnreserved. - */ -struct ReplicaReserved : sc::state, NamedSimply { - explicit ReplicaReserved(my_context ctx); - - using reactions = mpl::list< - sc::custom_reaction>; + sc::custom_reaction, + sc::in_state_reaction< + FullReset, + ReplicaIdle, + &ReplicaIdle::reset_ignored>>; - sc::result react(const StartReplica& eq); + sc::result react(const StartReplica& ev); };