From: Ronen Friedman Date: Thu, 25 Jan 2024 19:05:01 +0000 (-0600) Subject: osd/scrub: directly manage remote reservations in the FSM X-Git-Tag: v19.3.0~106^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=8516c0ebc5a83eb647b482df7b977f41bd894b32;p=ceph.git osd/scrub: directly manage remote reservations in the FSM The FSM now interacts with the scrub_reserver directly. Signed-off-by: Ronen Friedman --- diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 3ef5a2ef567d..9266a54d7858 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -397,6 +397,15 @@ void PgScrubber::send_scrub_is_finished(epoch_t epoch_queued) dout(10) << "scrubber event --<< " << __func__ << dendl; } +void PgScrubber::send_granted_by_reserver(const AsyncScrubResData& req) +{ + dout(10) << "scrubber event -->> granted_by_reserver" << dendl; + if (check_interval(req.request_epoch)) { + m_fsm->process_event(Scrub::ReserverGranted{req}); + } + dout(10) << "scrubber event --<< granted_by_reserver" << dendl; +} + // ----------------- bool PgScrubber::is_reserving() const diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index 9c29d5fdedb9..bcab24cddfa3 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -227,6 +227,8 @@ class PgScrubber : public ScrubPgIF, void send_scrub_is_finished(epoch_t epoch_queued) final; + void send_granted_by_reserver(const AsyncScrubResData& req) final; + /** * we allow some number of preemptions of the scrub, which mean we do * not block. Then we start to block. Once we start blocking, we do diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index fc5238186868..57c0492ec2e4 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -766,12 +766,7 @@ ReplicaActive::ReplicaActive(my_context ctx) ReplicaActive::~ReplicaActive() { - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - if (reserved_by_my_primary) { - dout(10) << "ReplicaActive::~ReplicaActive(): clearing reservation" - << dendl; - clear_reservation_by_remote_primary(false); - } + clear_remote_reservation(false); } /* @@ -800,85 +795,118 @@ ReplicaReactCode ReplicaActive::on_reserve_request( { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases const auto m = ev.m_op->get_req(); - const auto msg_nonce = m->reservation_nonce; dout(10) << fmt::format( - "ReplicaActive::on_reserve_req() from {} request:{} is " - "async?{} (reservation_nonce:{})", - ev.m_from, ev, async_request, msg_nonce) + "ReplicaActive::on_reserve_req() request:{} async_request:{} " + "reservation_nonce:{}", + ev, async_request, m->reservation_nonce) << dendl; - auto& svc = m_osds->get_scrub_services(); // shorthand - - if (reserved_by_my_primary) { - dout(10) << "ReplicaActive::on_reserve_request(): already reserved" - << dendl; - // clear the existing reservation. Clears the flag, too - clear_reservation_by_remote_primary(false); - } - Message* reply{nullptr}; + ceph_assert(!reservation_granted); + ceph_assert(!pending_reservation_nonce); ReplicaReactCode next_action{ReplicaReactCode::discard}; + AsyncScrubResData request_details{ + pg_id, ev.m_from, ev.m_op->sent_epoch, m->reservation_nonce}; + auto& reserver = m_osds->get_scrub_reserver(); if (async_request) { // the request is to be handled asynchronously - svc.enqueue_remote_reservation(pg_id.pgid); + dout(20) << fmt::format( + "{}: async request: {} details:{}", __func__, ev, + request_details) + << dendl; + pending_reservation_nonce = m->reservation_nonce; + const auto reservation_cb = new RtReservationCB(m_pg, request_details); + reserver.request_reservation(pg_id, reservation_cb, 0, nullptr); next_action = ReplicaReactCode::goto_waiting_reservation; } else { // an immediate yes/no is required - const auto granted = svc.inc_scrubs_remote(scrbr->get_spgid().pgid); - if (granted) { - reserved_by_my_primary = true; + Message* reply{nullptr}; + reservation_granted = reserver.request_reservation_or_fail(pg_id); + if (reservation_granted) { dout(10) << fmt::format("{}: reserved? yes", __func__) << dendl; reply = new MOSDScrubReserve( spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch, - MOSDScrubReserve::GRANT, m_pg->pg_whoami, msg_nonce); + MOSDScrubReserve::GRANT, m_pg->pg_whoami, m->reservation_nonce); next_action = ReplicaReactCode::goto_replica_reserved; } else { dout(10) << fmt::format("{}: reserved? no", __func__) << dendl; reply = new MOSDScrubReserve( spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch, - MOSDScrubReserve::REJECT, m_pg->pg_whoami, msg_nonce); + MOSDScrubReserve::REJECT, m_pg->pg_whoami, m->reservation_nonce); // the event is discarded next_action = ReplicaReactCode::discard; } - } - if (reply) { m_osds->send_message_osd_cluster( reply, ev.m_op->get_req()->get_connection()); } + return next_action; } +bool ReplicaActive::granted_by_reserver(const AsyncScrubResData& reservation) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << fmt::format("{}: reservation granted: {}", __func__, reservation) + << dendl; + + /// verify that the granted reservation is the one we were waiting for + if (reservation.nonce != pending_reservation_nonce) { + dout(5) << fmt::format( + "{}: reservation_nonce mismatch: {} != {}", __func__, reservation.nonce, + pending_reservation_nonce) << dendl; + return false; + } + + reservation_granted = true; + pending_reservation_nonce = 0; // no longer pending + + // notify the primary + auto grant_msg = make_message( + spg_t(pg_id.pgid, m_pg->get_primary().shard), reservation.request_epoch, + MOSDScrubReserve::GRANT, m_pg->pg_whoami, pending_reservation_nonce); + m_pg->send_cluster_message( + m_pg->get_primary().osd, grant_msg, reservation.request_epoch, false); + return true; +} + void ReplicaActive::on_release(const ReplicaRelease& ev) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << fmt::format("ReplicaActive::on_release() from {}", ev.m_from) << dendl; - clear_reservation_by_remote_primary(true); + clear_remote_reservation(true); } -void ReplicaActive::clear_reservation_by_remote_primary(bool log_failure) +void ReplicaActive::clear_remote_reservation(bool warn_if_no_reservation) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << fmt::format( - "ReplicaActive::clear_reservation_by_remote_primary(): was " - "reserved? {}", - (reserved_by_my_primary ? "yes" : "no")) + "ReplicaActive::clear_remote_reservation(): " + "pending_reservation_nonce {}, reservation_granted {}", + reservation_granted, pending_reservation_nonce) << dendl; - if (reserved_by_my_primary) { - m_osds->get_scrub_services().dec_scrubs_remote(scrbr->get_spgid().pgid); - reserved_by_my_primary = false; - } else if (log_failure) { - const auto msg = fmt::format( - "ReplicaActive::clear_reservation_by_remote_primary(): " - "not reserved!"); + if (reservation_granted || pending_reservation_nonce) { + m_osds->get_scrub_reserver().cancel_reservation(pg_id); + reservation_granted = false; + pending_reservation_nonce = 0; + } else if (warn_if_no_reservation) { + const auto msg = + "ReplicaActive::clear_remote_reservation(): " + "not reserved!"; dout(5) << msg << dendl; scrbr->get_clog()->warn() << msg; } } +void ReplicaActive::ignore_unhandled_grant(const ReserverGranted&) +{ + dout(10) << "ReplicaActive::react(const ReserverGranted&): ignored" + << dendl; +} + // ---------------- ReplicaActive/ReplicaIdle --------------------------- @@ -913,7 +941,8 @@ sc::result ReplicaUnreserved::react(const ReplicaReserveReq& ev) DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << "ReplicaUnreserved::react(const ReplicaReserveReq&)" << dendl; - switch (context().on_reserve_request(ev, false)) { + bool async_request = true && ev.m_op->get_req()->wait_for_resources /* && a config */; + switch (context().on_reserve_request(ev, async_request)) { case ReplicaReactCode::discard: return discard_event(); case ReplicaReactCode::goto_waiting_reservation: @@ -939,15 +968,11 @@ sc::result ReplicaUnreserved::react(const ReplicaRelease&) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << "ReplicaUnreserved::react(const ReplicaRelease&)" << dendl; - // shouldn't happen. Possible (faulty) sequence: getting an op - // command while in ReplicaWaitingReservation (as we would just - // treat that as a regular op request, but will stop waiting for - // reservation). - // must cancel the queued reservation request + // this is a bug. We should never receive a release request unless we + // are reserved or have a pending reservation. scrbr->get_clog()->error() << fmt::format( "osd.{} pg[{}]: reservation released while not reserved", scrbr->get_whoami(), scrbr->get_spgid()); - context().clear_reservation_by_remote_primary(true); return discard_event(); } @@ -955,15 +980,13 @@ sc::result ReplicaUnreserved::react(const ReserverGranted&) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << "ReplicaUnreserved::react(const ReserverGranted&)" << dendl; - // shouldn't happen. Possible (faulty) sequence: getting an op - // command while in ReplicaWaitingReservation (as we would just - // treat that as a regular op request, but will stop waiting for - // reservation). + // shouldn't happen. Might be a result of a cancelled reservation + // that was still delivered. // must unreserve - scrbr->get_clog()->error() << fmt::format( - "osd.{} pg[{}]: reservation granted while not being waited for", - scrbr->get_whoami(), scrbr->get_spgid()); - context().clear_reservation_by_remote_primary(true); + dout(5) << "ReplicaUnreserved::react(const ReserverGranted&): reservation " + "granted while not being waited for" + << dendl; + context().clear_remote_reservation(false); return discard_event(); } @@ -981,14 +1004,17 @@ ReplicaWaitingReservation::ReplicaWaitingReservation(my_context ctx) << dendl; } -sc::result ReplicaWaitingReservation::react(const ReserverGranted&) +sc::result ReplicaWaitingReservation::react(const ReserverGranted& ev) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "ReplicaWaitingReservation::react(const ReserverGranted&)" + dout(10) << fmt::format( + "ReplicaWaitingReservation::react(const ReserverGranted&): " + "event:{}", + ev) << dendl; - - /// \todo complete the handling of the granted reservation - ceph_abort_msg("not implemented yet"); + if (context().granted_by_reserver(ev.value)) { + return transit(); + } return discard_event(); } @@ -1012,6 +1038,7 @@ sc::result ReplicaWaitingReservation::react(const StartReplica& ev) "osd.{} pg[{}]: new chunk request while still waiting for " "reservation", scrbr->get_whoami(), scrbr->get_spgid()); + context().clear_remote_reservation(true); clear_shallow_history(); post_event(ReplicaPushesUpd{}); return transit(); @@ -1027,7 +1054,7 @@ sc::result ReplicaWaitingReservation::react(const ReplicaReserveReq& ev) "osd.{} pg[{}]: reservation requested while previous is pending", scrbr->get_whoami(), scrbr->get_spgid()); // cancel the existing reservation, and re-request - context().clear_reservation_by_remote_primary(true); + context().clear_remote_reservation(true); post_event(ev); return transit(); } @@ -1060,8 +1087,11 @@ sc::result ReplicaReserved::react(const ReplicaReserveReq& ev) scrbr->get_clog()->error() << fmt::format( "osd.{} pg[{}]: reservation requested while still reserved", scrbr->get_whoami(), scrbr->get_spgid()); + // This is a bug. We should never receive a new request unless the + // previous one was cancelled - either by the primary, or on interval + // change. // cancel the existing reservation, and re-request - context().clear_reservation_by_remote_primary(true); + context().clear_remote_reservation(true); post_event(ev); return transit(); } diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index 6eadb109cd54..254e7861ed95 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -16,13 +16,11 @@ #include #include "common/fmt_common.h" +#include "include/Context.h" #include "common/version.h" #include "messages/MOSDOp.h" #include "messages/MOSDRepScrub.h" #include "messages/MOSDRepScrubMap.h" -#include "messages/MOSDScrubReserve.h" - -#include "include/Context.h" #include "osd/scrubber_common.h" #include "scrub_machine_lstnr.h" @@ -136,7 +134,7 @@ struct value_event_t : sc::event { /// the async-reserver granted our reservation request -OP_EV(ReserverGranted); +VALUE_EVENT(ReserverGranted, AsyncScrubResData); #define MEV(E) \ struct E : sc::event { \ @@ -783,6 +781,41 @@ struct WaitDigestUpdate : sc::state, * * ReplicaWaitUpdates * * ReplicaBuildingMap */ +/* + * AsyncReserver for scrub 'remote' reservations + * ----------------------------------------------- + * + * Unless disabled by 'osd_scrub_disable_reservation_queuing' (*), scrub + * reservation requests are handled by an async reserver: they are queued, + * until the number of concurrent scrubs is below the configured limit. + + * (*) Note: the 'osd_scrub_disable_reservation_queuing' option is a temporary + * debug measure, and will be removed without deprecation in a future release. + * + * On the replica side, all reservations are treated as having the same priority. + * Note that 'high priority' scrubs, e.g. user-initiated scrubs, do not perform + * reservations on replicas at all. + * + * A queued scrub reservation request is cancelled by any of the following events: + * + * - a new interval: in this case, we do not expect to see a cancellation request + * from the primary, and we can simply remove the request from the queue; + * + * - a cancellation request from the primary: probably a result of timing out on + * the reservation process. Here, we can simply remove the request from the queue. + * + * - a new reservation request for the same PG: this is a bug. We had missed the + * previous cancellation request, which could never happen. + * We cancel the previous request, and replace + * it with the new one. We would also issue an error log message. + * + * Primary/Replica with differing versions: + * + * The updated version of MOSDScrubReserve contains a new 'wait_for_resources' + * field. For legacy Primary OSDs, this field is decoded as 'false', and the + * replica responds immediately, with grant/rejection. +*/ + struct ReplicaIdle; @@ -809,22 +842,95 @@ struct ReplicaActive : sc::state< const ReplicaReserveReq&, bool async_request); + /** + * the queued reservation request was granted by the async reserver. + * Notify the Primary. + * Returns 'false' if the reservation is not the last one to be received + * by this replica. + */ + bool granted_by_reserver(const AsyncScrubResData& resevation); + /// handle a 'release' from a primary void on_release(const ReplicaRelease& ev); - /// cancel the reserver request. - /// The 'failure' re 'log_failure' is logged if we are not reserved to - /// begin with. - void clear_reservation_by_remote_primary(bool log_failure); + /** + * cancel a granted or pending reservation + * + * warn_if_no_reservation is set to true if the call is in response to a + * cancellation from the primary. In that event, we *must* find a + * a granted or pending reservation and failing to do so warrants + * a warning to clog as it is a bug. + */ + void clear_remote_reservation(bool warn_if_no_reservation); + + /** + * discard (and log) unhandled 'reservation granted' messages + * from the async reserver. + * As canceled reservations may still be triggered, this is not + * necessarily a bug. + */ + void ignore_unhandled_grant(const ReserverGranted&); - using reactions = mpl::list>; + using reactions = mpl::list< + sc::transition, + sc::in_state_reaction< + ReserverGranted, + ReplicaActive, + &ReplicaActive::ignore_unhandled_grant>>; private: - bool reserved_by_my_primary{false}; - - // shortcuts: PG* m_pg; OSDService* m_osds; + + // --- remote reservation machinery + + /* + * 'reservation_granted' is set to 'true' when we have grant confirmation + * to the primary, and the reservation has not yet been canceled (either + * by the primary or following an interval change). + * + * Note the interaction with 'pending_reservation_nonce': the combination + * of these two variables is used to track the state of the reservation + * with the scrub_reserver. The possible combinations: + * - pending_reservation_nonce == 0 && !reservation_granted -- no reservation + * was granted, and none is pending; + * - pending_reservation_nonce != 0 && !reservation_granted -- we have a + * pending cb in the AsyncReserver for a request with nonce + * 'pending_reservation_nonce' + * - pending_reservation_nonce == 0 && reservation_granted -- we have sent + * a response to the primary granting the reservation + * (invariant: !((pending_reservation_nonce != 0) && reservation_granted) + * + * Note that in the event that the primary is too old to support asynchronous + * reservation, MOSDScrubReserve::wait_for_resources will be set to false by + * the decoder and we bypass the 2'nd case above. + * See ReplicaActive::on_reserve_request(). + */ + bool reservation_granted{false}; + + /** + * a reservation request with this nonce is queued at the scrub_reserver, + * and was not yet granted. + */ + MOSDScrubReserve::reservation_nonce_t pending_reservation_nonce{0}; + + // clang-format off + struct RtReservationCB : public Context { + PGRef pg; + AsyncScrubResData res_data; + + explicit RtReservationCB(PGRef pg, AsyncScrubResData request_details) + : pg{pg} + , res_data{request_details} + {} + + void finish(int) override { + pg->lock(); + pg->m_scrubber->send_granted_by_reserver(res_data); + pg->unlock(); + } + }; + // clang-format on }; @@ -862,9 +968,10 @@ struct ReplicaUnreserved : sc::state, using reactions = mpl::list< sc::custom_reaction, + sc::custom_reaction, + // unexpected (bug-induced) events: sc::custom_reaction, - sc::custom_reaction, - sc::custom_reaction>; + sc::custom_reaction>; sc::result react(const ReplicaReserveReq& ev); sc::result react(const StartReplica& ev); diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h index 067b9754c110..66e61d856cd4 100644 --- a/src/osd/scrubber_common.h +++ b/src/osd/scrubber_common.h @@ -3,10 +3,11 @@ #pragma once #include - #include "common/ceph_time.h" +#include "common/fmt_common.h" #include "common/scrub_types.h" #include "include/types.h" +#include "messages/MOSDScrubReserve.h" #include "os/ObjectStore.h" #include "OpRequest.h" @@ -24,6 +25,32 @@ namespace Scrub { struct ReplicaActive; } +/// reservation-related data sent by the primary to the replicas, +/// and used to match the responses to the requests +struct AsyncScrubResData { + spg_t pgid; + pg_shard_t from; + epoch_t request_epoch; + MOSDScrubReserve::reservation_nonce_t nonce; + AsyncScrubResData( + spg_t pgid, + pg_shard_t from, + epoch_t request_epoch, + MOSDScrubReserve::reservation_nonce_t nonce) + : pgid{pgid} + , from{from} + , request_epoch{request_epoch} + , nonce{nonce} + {} + template + auto fmt_print_ctx(FormatContext& ctx) const + { + return fmt::format_to( + ctx.out(), "pg[{}],f:{},ep:{},n:{}", pgid, from, request_epoch, nonce); + } +}; + + /// Facilitating scrub-related object access to private PG data class ScrubberPasskey { private: @@ -317,6 +344,8 @@ struct ScrubPgIF { virtual void send_scrub_is_finished(epoch_t epoch_queued) = 0; + virtual void send_granted_by_reserver(const AsyncScrubResData& req) = 0; + virtual void on_applied_when_primary(const eversion_t& applied_version) = 0; // --------------------------------------------------