From: Ronen Friedman Date: Wed, 17 Jan 2024 15:36:16 +0000 (-0600) Subject: osd/scrub: check reservation replies for relevance X-Git-Tag: v19.3.0~177^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7559a6c0d11b5c8f2443e208c8188a9201b82a4d;p=ceph.git osd/scrub: check reservation replies for relevance Compare a token (nonce) carried in the reservation reply with the remembered token of the reservation request. If they don't match, the reply is stale and should be ignored (and logged). Fixes: https://tracker.ceph.com/issues/64052 Signed-off-by: Ronen Friedman --- diff --git a/src/messages/MOSDScrubReserve.h b/src/messages/MOSDScrubReserve.h index c7ab985411750..97b6dff4dd006 100644 --- a/src/messages/MOSDScrubReserve.h +++ b/src/messages/MOSDScrubReserve.h @@ -19,9 +19,11 @@ class MOSDScrubReserve : public MOSDFastDispatchOp { private: - static constexpr int HEAD_VERSION = 1; + static constexpr int HEAD_VERSION = 2; static constexpr int COMPAT_VERSION = 1; public: + using reservation_nonce_t = uint32_t; + spg_t pgid; epoch_t map_epoch; enum ReserveMsgOp { @@ -32,6 +34,7 @@ public: }; int32_t type; pg_shard_t from; + reservation_nonce_t reservation_nonce{0}; epoch_t get_map_epoch() const override { return map_epoch; @@ -46,10 +49,11 @@ public: MOSDScrubReserve(spg_t pgid, epoch_t map_epoch, int type, - pg_shard_t from) + pg_shard_t from, + reservation_nonce_t nonce) : MOSDFastDispatchOp{MSG_OSD_SCRUB_RESERVE, HEAD_VERSION, COMPAT_VERSION}, pgid(pgid), map_epoch(map_epoch), - type(type), from(from) {} + type(type), from(from), reservation_nonce{nonce} {} std::string_view get_type_name() const { return "MOSDScrubReserve"; @@ -71,7 +75,8 @@ public: out << "RELEASE "; break; } - out << "e" << map_epoch << ")"; + out << "e" << map_epoch << " from: " << from + << " reservation_nonce: " << reservation_nonce << ")"; return; } @@ -82,6 +87,13 @@ public: decode(map_epoch, p); decode(type, p); decode(from, p); + if (header.version >= 2) { + decode(reservation_nonce, p); + } else { + // a zero nonce (identifying legacy senders) is ignored when + // checking the request for obsolescence + reservation_nonce = 0; + } } void encode_payload(uint64_t features) { @@ -90,6 +102,7 @@ public: encode(map_epoch, payload); encode(type, payload); encode(from, payload); + encode(reservation_nonce, payload); } private: template diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index d0ab07fe1dfae..d9d03fe688964 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -230,7 +230,9 @@ ReservingReplicas::ReservingReplicas(my_context ctx) DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases // initiate the reservation process - session.m_reservations.emplace(*scrbr, *session.m_perf_set); + session.m_reservations.emplace( + *scrbr, context().last_request_sent_nonce, + *session.m_perf_set); if (session.m_reservations->get_last_sent()) { // the 1'st reservation request was sent @@ -263,9 +265,9 @@ sc::result ReservingReplicas::react(const ReplicaGrant& ev) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << "ReservingReplicas::react(const ReplicaGrant&)" << dendl; + const auto& m = ev.m_op->get_req(); - if (context().m_reservations->handle_reserve_grant( - ev.m_op, ev.m_from)) { + if (context().m_reservations->handle_reserve_grant(*m, ev.m_from)) { // we are done with the reservation process return transit(); } @@ -277,13 +279,21 @@ sc::result ReservingReplicas::react(const ReplicaReject& ev) DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases auto& session = context(); dout(10) << "ReservingReplicas::react(const ReplicaReject&)" << dendl; - session.m_reservations->log_failure_and_duration(scrbcnt_resrv_rejected); - - // manipulate the 'next to reserve' iterator to exclude - // the rejecting replica from the set of replicas requiring release - session.m_reservations->verify_rejections_source(ev.m_op, ev.m_from); + const auto m = ev.m_op->get_req(); + + // Verify that the message is from the replica we were expecting a reply from, + // and that the message is not stale. If all is well - this is a real rejection: + // - log required details; + // - manipulate the 'next to reserve' iterator to exclude + // the rejecting replica from the set of replicas requiring release + if (!session.m_reservations->handle_reserve_rejection(*m, ev.m_from)) { + // stale or unexpected + return discard_event(); + } - // set 'reservation failure' as the scrub termination cause (affecting + // The rejection was carrying the correct reservation_nonce. It was + // logged by handle_reserve_rejection(). + // Set 'reservation failure' as the scrub termination cause (affecting // the rescheduling of this PG) scrbr->flag_reservations_failure(); @@ -777,7 +787,13 @@ ReplicaActive::~ReplicaActive() void ReplicaActive::on_reserve_req(const ReplicaReserveReq& ev) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "ReplicaActive::on_reserve_req()" << dendl; + const auto m = ev.m_op->get_req(); + const auto msg_nonce = m->reservation_nonce; + dout(10) + << fmt::format( + "ReplicaActive::on_reserve_req() from {} (reservation_nonce:{})", + ev.m_from, msg_nonce) + << dendl; if (reserved_by_my_primary) { dout(10) << "ReplicaActive::on_reserve_req(): already reserved" << dendl; @@ -797,7 +813,7 @@ void ReplicaActive::on_reserve_req(const ReplicaReserveReq& ev) Message* reply = new MOSDScrubReserve( spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch, ret.op, - m_pg->pg_whoami); + m_pg->pg_whoami, msg_nonce); m_osds->send_message_osd_cluster(reply, ev.m_op->get_req()->get_connection()); } diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index 8fa96da140525..beb4d7a4c0fa7 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -440,6 +440,22 @@ struct PrimaryActive : sc::state, using reactions = mpl::list< // when the interval ends - we may not be a primary anymore sc::transition>; + + /** + * Identifies a specific reservation request. + * The primary is permitted to cancel outstanding reservation requests without + * waiting for the pending response from the replica. Thus, we may, in general, + * see responses from prior reservation attempts that we need to ignore. Each + * reservation request is therefore associated with a nonce incremented within + * an interval with each reservation request. Any response with a non-matching + * nonce must be from a reservation request we canceled. Note that this check + * occurs after validating that the message is from the current interval, so + * reusing nonces between intervals is safe. + * + * 0 is a special value used to indicate that the sender did not include a nonce due + * to not being a sufficiently recent version. + */ + reservation_nonce_t last_request_sent_nonce{1}; }; /** diff --git a/src/osd/scrubber/scrub_reservations.cc b/src/osd/scrubber/scrub_reservations.cc index 3faafe9cb0a1d..284d90290578a 100644 --- a/src/osd/scrubber/scrub_reservations.cc +++ b/src/osd/scrubber/scrub_reservations.cc @@ -6,7 +6,6 @@ #include #include "common/ceph_time.h" -#include "messages/MOSDScrubReserve.h" #include "osd/OSD.h" #include "osd/PG.h" #include "osd/osd_types_fmt.h" @@ -31,11 +30,13 @@ namespace Scrub { ReplicaReservations::ReplicaReservations( ScrubMachineListener& scrbr, + reservation_nonce_t& nonce, PerfCounters& pc) : m_scrubber{scrbr} , m_pg{m_scrubber.get_pg()} , m_pgid{m_scrubber.get_spgid().pgid} , m_osds{m_pg->get_pg_osd(ScrubberPasskey())} + , m_last_request_sent_nonce{nonce} , m_perf_set{pc} { // the acting set is sorted by pg_shard_t. The reservations are to be issued @@ -80,7 +81,7 @@ void ReplicaReservations::release_all() for (const auto& peer : replicas) { auto m = make_message( spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::RELEASE, - m_pg->pg_whoami); + m_pg->pg_whoami, 0); m_pg->send_cluster_message(peer.osd, m, epoch, false); } @@ -125,16 +126,50 @@ ReplicaReservations::~ReplicaReservations() log_failure_and_duration(scrbcnt_resrv_aborted); } -bool ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from) +bool ReplicaReservations::is_reservation_response_relevant( + reservation_nonce_t msg_nonce) const { - // verify that the grant is from the peer we expected. If not? - // for now - abort the OSD. \todo reconsider the reaction. - if (!get_last_sent().has_value() || from != *get_last_sent()) { + return (msg_nonce == 0) || (msg_nonce == m_last_request_sent_nonce); +} + +bool ReplicaReservations::is_msg_source_correct(pg_shard_t from) const +{ + const auto exp_source = get_last_sent(); + return exp_source && from == *exp_source; +} + +bool ReplicaReservations::handle_reserve_grant( + const MOSDScrubReserve& msg, + pg_shard_t from) +{ + if (!is_reservation_response_relevant(msg.reservation_nonce)) { + // this is a stale response to a previous request (e.g. one that + // timed-out). See m_last_request_sent_nonce for details. dout(1) << fmt::format( - "unexpected grant from {} (expected {})", from, - get_last_sent().value_or(pg_shard_t{})) + "stale reservation response from {} with nonce {} vs. " + "expected {} (e:{})", + from, msg.reservation_nonce, m_last_request_sent_nonce, + msg.map_epoch) << dendl; - ceph_assert(from == get_last_sent()); + return false; + } + + // verify that the grant is from the peer we expected. If not? + // for now - abort the OSD. There is no known scenario in which a + // grant message with a correct nonce can arrive from the wrong peer. + // (we would not abort for arriving messages with nonce 0, as those + // are legacy messages, for which the nonce was not verified). + if (!is_msg_source_correct(from)) { + const auto error_text = fmt::format( + "unexpected reservation grant from {} vs. the expected {} (e:{} " + "message nonce:{})", + from, get_last_sent().value_or(pg_shard_t{}), msg.map_epoch, + msg.reservation_nonce); + dout(1) << error_text << dendl; + if (msg.reservation_nonce != 0) { + m_osds->clog->error() << error_text; + ceph_abort_msg(error_text); + } return false; } @@ -143,15 +178,15 @@ bool ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from) // log a warning if the response was slow to arrive if ((m_slow_response_warn_timeout > 0ms) && (elapsed > m_slow_response_warn_timeout)) { - dout(1) << fmt::format( - "slow reservation response from {} ({}ms)", from, - duration_cast(elapsed).count()) - << dendl; + m_osds->clog->warn() << fmt::format( + "slow reservation response from {} ({}ms)", from, + duration_cast(elapsed).count()); // prevent additional warnings m_slow_response_warn_timeout = 0ms; } dout(10) << fmt::format( - "granted by {} ({} of {}) in {}ms", from, + "(e:{} nonce:{}) granted by {} ({} of {}) in {}ms", + msg.map_epoch, msg.reservation_nonce, from, active_requests_cnt(), m_sorted_secondaries.size(), duration_cast(elapsed).count()) << dendl; @@ -170,44 +205,64 @@ bool ReplicaReservations::send_next_reservation_or_complete() // send the next reservation request const auto peer = *m_next_to_request; const auto epoch = m_pg->get_osdmap_epoch(); + m_last_request_sent_nonce++; + auto m = make_message( - spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST, - m_pg->pg_whoami); + spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST, m_pg->pg_whoami, + m_last_request_sent_nonce); m_pg->send_cluster_message(peer.osd, m, epoch, false); m_last_request_sent_at = ScrubClock::now(); dout(10) << fmt::format( - "reserving {} (the {} of {} replicas)", *m_next_to_request, - active_requests_cnt() + 1, m_sorted_secondaries.size()) + "reserving {} (the {} of {} replicas) e:{} nonce:{}", + *m_next_to_request, active_requests_cnt() + 1, + m_sorted_secondaries.size(), epoch, m_last_request_sent_nonce) << dendl; m_next_to_request++; return false; } -void ReplicaReservations::verify_rejections_source( - OpRequestRef op, +bool ReplicaReservations::handle_reserve_rejection( + const MOSDScrubReserve& msg, pg_shard_t from) { // a convenient log message for the reservation process conclusion // (matches the one in send_next_reservation_or_complete()) dout(10) << fmt::format( - "remote reservation failure. Rejected by {} ({})", from, - *op->get_req()) + "remote reservation failure. Rejected by {} ({})", from, msg) << dendl; + if (!is_reservation_response_relevant(msg.reservation_nonce)) { + // this is a stale response to a previous request (e.g. one that + // timed-out). See m_last_request_sent_nonce for details. + dout(10) << fmt::format( + "stale reservation response from {} with reservation_nonce " + "{} vs. expected {} (e:{})", + from, msg.reservation_nonce, m_last_request_sent_nonce, + msg.map_epoch) + << dendl; + return false; + } + + log_failure_and_duration(scrbcnt_resrv_rejected); + + // we should never see a rejection carrying a valid + // reservation nonce - arriving while we have no pending requests + ceph_assert(get_last_sent().has_value() || msg.reservation_nonce == 0); + // verify that the denial is from the peer we expected. If not? + // There is no known scenario in which this can happen, but if it does - // we should treat it as though the *correct* peer has rejected the request, // but remember to release that peer, too. - - ceph_assert(get_last_sent().has_value()); - const auto expected = *get_last_sent(); - if (from != expected) { - dout(1) << fmt::format( - "unexpected rejection from {} (expected {})", from, expected) - << dendl; - } else { - // correct peer, wrong answer... + if (is_msg_source_correct(from)) { m_next_to_request--; // no need to release this one + } else { + m_osds->clog->warn() << fmt::format( + "unexpected reservation denial from {} vs the expected {} (e:{} " + "message reservation_nonce:{})", + from, get_last_sent().value_or(pg_shard_t{}), msg.map_epoch, + msg.reservation_nonce); } + return true; } std::optional ReplicaReservations::get_last_sent() const diff --git a/src/osd/scrubber/scrub_reservations.h b/src/osd/scrubber/scrub_reservations.h index 9d59033bac874..2e13760ffa1f0 100644 --- a/src/osd/scrubber/scrub_reservations.h +++ b/src/osd/scrubber/scrub_reservations.h @@ -8,6 +8,7 @@ #include #include +#include "messages/MOSDScrubReserve.h" #include "osd/scrubber_common.h" #include "osd_scrub_sched.h" @@ -15,6 +16,8 @@ namespace Scrub { +using reservation_nonce_t = MOSDScrubReserve::reservation_nonce_t; + /** * Reserving/freeing scrub resources at the replicas. * @@ -44,6 +47,21 @@ namespace Scrub { * that have been acquired until that moment. * (Why? because we have encountered instances where a reservation request was * lost - either due to a bug or due to a network issue.) + * + * Keeping primary & replica in sync: + * + * Reservation requests may be canceled by the primary independently of the + * replica's response. Depending on timing, a cancellation by the primary might + * or might not be processed by a replica prior to sending a response (either + * rejection or success). Thus, we associate each reservation request with a + * nonce incremented with each reservation during an interval and drop any + * responses that do not match our current nonce. + * This check occurs after rejecting any messages from prior intervals, so + * reusing nonces between intervals is not a problem. Note that epoch would + * not suffice as it is possible for this sequence to occur several times + * without a new map epoch. + * Note - 'release' messages, which are not replied to by the replica, + * do not need or use that field. */ class ReplicaReservations { ScrubMachineListener& m_scrubber; @@ -64,6 +82,14 @@ class ReplicaReservations { /// for logs, and for detecting slow peers ScrubTimePoint m_last_request_sent_at; + /** + * A ref to PrimaryActive::last_request_sent_nonce. + * Identifies a specific request sent, to verify against grant/deny + * responses. + * See PrimaryActive::last_request_sent_nonce for details. + */ + reservation_nonce_t& m_last_request_sent_nonce; + /// the 'slow response' timeout (in milliseconds) - as configured. /// Doubles as a 'do once' flag for the warning. std::chrono::milliseconds m_slow_response_warn_timeout; @@ -77,7 +103,10 @@ class ReplicaReservations { std::optional m_process_started_at; public: - ReplicaReservations(ScrubMachineListener& scrubber, PerfCounters& pc); + ReplicaReservations( + ScrubMachineListener& scrubber, + reservation_nonce_t& nonce, + PerfCounters& pc); ~ReplicaReservations(); @@ -90,18 +119,23 @@ class ReplicaReservations { * \returns true if there are no more replicas to send reservation requests * (i.e., the scrubber should proceed to the next phase), false otherwise. */ - bool handle_reserve_grant(OpRequestRef op, pg_shard_t from); + bool handle_reserve_grant(const MOSDScrubReserve& msg, pg_shard_t from); /** + * React to an incoming reservation rejection. + * * Verify that the sender of the received rejection is the replica we - * were expecting a reply from. - * If this is so - just mark the fact that the specific peer need not - * be released. + * were expecting a reply from, and that the message isn't stale (see + * m_last_request_sent_nonce for details). + * If a valid rejection: log it, and mark the fact that the specific peer + * need not be released. * * Note - the actual handling of scrub session termination and of * releasing the reserved replicas is done by the caller (the FSM). + * + * Returns true if the rejection is valid, false otherwise. */ - void verify_rejections_source(OpRequestRef op, pg_shard_t from); + bool handle_reserve_rejection(const MOSDScrubReserve& msg, pg_shard_t from); /** * Notifies implementation that it is no longer responsible for releasing @@ -140,6 +174,20 @@ class ReplicaReservations { */ bool send_next_reservation_or_complete(); + /** + * is this is a reply to our last request? + * Checks response once against m_last_request_sent_nonce. See + * m_last_request_sent_nonce for details. + */ + bool is_reservation_response_relevant(reservation_nonce_t msg_nonce) const; + + /** + * is this reply coming from the expected replica? + * Now that we check the nonce before checking the sender - this + * check should never fail. + */ + bool is_msg_source_correct(pg_shard_t from) const; + // --- perf counters helpers /**