From a437c24a236f768fe63f3601b8b08966287d1904 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Wed, 3 Apr 2024 02:52:35 -0500 Subject: [PATCH] osd/scrub: uniform handling of reservation requests we now allow - on the replica side - reservation requests regardless of the ReplicaActive sub-state. I.e. - we will honor such requests even when handling a chunk request (in ReplicaActiveOp). Note that the current Primary code would never send such a request. But a future primary code might. Signed-off-by: Ronen Friedman (cherry picked from commit 005839bdd41835a3235af81c5b9a42f19b663686) --- src/osd/scrubber/scrub_machine.cc | 89 ++++++++++++++----------------- src/osd/scrubber/scrub_machine.h | 21 ++++---- 2 files changed, 52 insertions(+), 58 deletions(-) diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index 4f0cd7b0c8e..66ba0751d0e 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -774,26 +774,48 @@ void ReplicaActive::exit() /* - * Note: we are expected to be in the initial internal state (Idle) when - * receiving any registration request. ReplicaActiveOp, our other internal - * state, has its own handler for this event, and will treat it - * as an abort request. - * + * Note: we are expected to be in the ReplicaIdle sub-state: the current + * scrub code on the primary side would never interlace chunk ops with + * reservation requests. But 'badly timed' requests are not blocked + * on the replica side: while requests arriving while in ReplicaActiveOp + * are at this time probably a bug; but a future Primary scrub code + * would possibly treat 'reservation' & 'scrubbing' as (almost) + * totally orthogonal. + */ +sc::result ReplicaActive::react(const ReplicaReserveReq& ev) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "ReplicaActive::react(const ReplicaReserveReq&)" << dendl; + + if (m_reservation_status != reservation_status_t::unreserved) { + // we are not expected to be in this state when a new request arrives. + // Clear the existing reservation - be it granted or pending. + const auto& m = *(ev.m_op->get_req()); + dout(1) << fmt::format( + "ReplicaActive::react(const ReplicaReserveReq&): unexpected " + "request. Discarding existing " + "reservation (was granted?:{}). Incoming request: {}", + reservation_granted, m) + << dendl; + clear_remote_reservation(true); + } + + handle_reservation_request(ev); + return discard_event(); +} + + +/* * Process: - * - if already reserved: clear existing reservation, then continue * - for async requests: - * - enqueue the request with reserver; - * - move to the ReplicaWaitingReservation state + * - enqueue the request with reserver, noting the nonce; * - no reply is expected by the caller * - for legacy requests: - * - ask the OSD for the "reservation resource" - * - if granted: move to ReplicaReserved and notify the Primary. - * - otherwise: just notify the requesting primary. - * - * implementation note: sc::result objects cannot be copied or moved. Thus, - * we've resorted to returning a code indicating the next action. + * - ask the OSD for the "reservation resource"; + * - send grant/reject to the requesting primary; + * - update 'reservation_granted' */ -sc::result ReplicaActive::react(const ReplicaReserveReq& ev) +void ReplicaActive::handle_reservation_request(const ReplicaReserveReq& ev) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases const auto& m = *(ev.m_op->get_req()); @@ -803,24 +825,11 @@ sc::result ReplicaActive::react(const ReplicaReserveReq& ev) "osd_scrub_disable_reservation_queuing"); const bool async_request = !async_disabled && m.wait_for_resources; dout(10) << fmt::format( - "ReplicaActive::react(const ReplicaReserveReq&): async " - "request?:{} disabled?:{} -> async? {}", - m.wait_for_resources, async_disabled, async_request) + "{}: Message:{}. async request?:{} disabled?:{} -> async? {}", + __func__, m, m.wait_for_resources, async_disabled, + async_request) << dendl; - if (m_reservation_status != reservation_status_t::unreserved) { - // we are not expected to be in this state when a new request arrives. - // Exit-then-reenter ReplicaActive, causing the existing reservation - be - // it granted or pending - to be cleared. This maneuver also aborts - // any on-going chunk request. - dout(1) << fmt::format( - "{}: unexpected request (granted:{} request:{})", __func__, - reservation_granted, m) - << dendl; - post_event(ev); - return transit(); - } - auto& reserver = m_osds->get_scrub_reserver(); if (async_request) { @@ -828,8 +837,7 @@ sc::result ReplicaActive::react(const ReplicaReserveReq& ev) AsyncScrubResData request_details{ pg_id, ev.m_from, ev.m_op->sent_epoch, m.reservation_nonce}; dout(15) << fmt::format( - "ReplicaActive::react(const ReplicaReserveReq& ev): async " - "request: {} details:{}", + "{}: async request: {} details:{}", __func__, ev, request_details) << dendl; @@ -859,7 +867,6 @@ sc::result ReplicaActive::react(const ReplicaReserveReq& ev) m_osds->send_message_osd_cluster( reply, ev.m_op->get_req()->get_connection()); } - return discard_event(); } @@ -1012,20 +1019,6 @@ sc::result ReplicaActiveOp::react(const StartReplica&) } -sc::result ReplicaActiveOp::react(const ReplicaReserveReq& ev) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "ReplicaActiveOp::react(const ReplicaReserveReq&)" << dendl; - scrbr->get_clog()->warn() << fmt::format( - "osd.{} pg[{}]: reservation requested while processing a chunk", - scrbr->get_whoami(), scrbr->get_spgid()); - // exit-n-renter ReplicaActive (aborting the chunk & freeing the reservation - // in the process) - post_event(ev); - return transit(); -} - - sc::result ReplicaActiveOp::react(const ReplicaRelease& ev) { dout(10) << "ReplicaActiveOp::react(const ReplicaRelease&)" << dendl; diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index 31625b55d43..b9f60481674 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -887,6 +887,16 @@ struct ReplicaActive : sc::state, reservation_status_t m_reservation_status{reservation_status_t::unreserved}; + /** + * React to the reservation request. + * Called after any existing pending/granted request was released. + * + * Async requests are sent to the reserver. + * For old-style synchronous requests, the reserver is queried using + * its 'immediate' interface, and the response is sent back to the primary. + */ + void handle_reservation_request(const ReplicaReserveReq& ev); + // clang-format off struct RtReservationCB : public Context { PGRef pg; @@ -929,8 +939,7 @@ struct ReplicaActiveOp using reactions = mpl::list< sc::custom_reaction, - sc::custom_reaction, - sc::custom_reaction>; + sc::custom_reaction>; /** * Handling the unexpected (read - caused by a bug) case of receiving a @@ -950,14 +959,6 @@ struct ReplicaActiveOp * releasing the reservation on the way. */ sc::result react(const ReplicaRelease&); - - /** - * handling unexpected 'reservation requests' while we are in the middle - * of serving a chunk request. - * This is a bug. We log it, abort the operation, and re-enter ReplicaActive - * to handle the reservation request. - */ - sc::result react(const ReplicaReserveReq& ev); }; /* -- 2.39.5