From: Ronen Friedman <rfriedma@redhat.com>
Date: Wed, 17 Jan 2024 15:36:16 +0000 (-0600)
Subject: osd/scrub: check reservation replies for relevance
X-Git-Tag: v19.3.0~177^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7559a6c0d11b5c8f2443e208c8188a9201b82a4d;p=ceph.git

osd/scrub: check reservation replies for relevance

Compare a token (nonce) carried in the reservation reply with the remembered
token of the reservation request.  If they don't match, the reply is
stale and should be ignored (and logged).

Fixes: https://tracker.ceph.com/issues/64052

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
---

diff --git a/src/messages/MOSDScrubReserve.h b/src/messages/MOSDScrubReserve.h
index c7ab985411750..97b6dff4dd006 100644
--- a/src/messages/MOSDScrubReserve.h
+++ b/src/messages/MOSDScrubReserve.h
@@ -19,9 +19,11 @@
 
 class MOSDScrubReserve : public MOSDFastDispatchOp {
 private:
-  static constexpr int HEAD_VERSION = 1;
+  static constexpr int HEAD_VERSION = 2;
   static constexpr int COMPAT_VERSION = 1;
 public:
+  using reservation_nonce_t = uint32_t;
+
   spg_t pgid;
   epoch_t map_epoch;
   enum ReserveMsgOp {
@@ -32,6 +34,7 @@ public:
   };
   int32_t type;
   pg_shard_t from;
+  reservation_nonce_t reservation_nonce{0};
 
   epoch_t get_map_epoch() const override {
     return map_epoch;
@@ -46,10 +49,11 @@ public:
   MOSDScrubReserve(spg_t pgid,
 		   epoch_t map_epoch,
 		   int type,
-		   pg_shard_t from)
+		   pg_shard_t from,
+		   reservation_nonce_t nonce)
     : MOSDFastDispatchOp{MSG_OSD_SCRUB_RESERVE, HEAD_VERSION, COMPAT_VERSION},
       pgid(pgid), map_epoch(map_epoch),
-      type(type), from(from) {}
+      type(type), from(from), reservation_nonce{nonce} {}
 
   std::string_view get_type_name() const {
     return "MOSDScrubReserve";
@@ -71,7 +75,8 @@ public:
       out << "RELEASE ";
       break;
     }
-    out << "e" << map_epoch << ")";
+    out << "e" << map_epoch << " from: " << from
+	<< " reservation_nonce: " << reservation_nonce << ")";
     return;
   }
 
@@ -82,6 +87,13 @@ public:
     decode(map_epoch, p);
     decode(type, p);
     decode(from, p);
+    if (header.version >= 2) {
+      decode(reservation_nonce, p);
+    } else {
+      // a zero nonce (identifying legacy senders) is ignored when
+      // checking the request for obsolescence
+      reservation_nonce = 0;
+    }
   }
 
   void encode_payload(uint64_t features) {
@@ -90,6 +102,7 @@ public:
     encode(map_epoch, payload);
     encode(type, payload);
     encode(from, payload);
+    encode(reservation_nonce, payload);
   }
 private:
   template<class T, typename... Args>
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc
index d0ab07fe1dfae..d9d03fe688964 100644
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -230,7 +230,9 @@ ReservingReplicas::ReservingReplicas(my_context ctx)
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
 
   // initiate the reservation process
-  session.m_reservations.emplace(*scrbr, *session.m_perf_set);
+  session.m_reservations.emplace(
+      *scrbr, context<PrimaryActive>().last_request_sent_nonce,
+      *session.m_perf_set);
 
   if (session.m_reservations->get_last_sent()) {
     // the 1'st reservation request was sent
@@ -263,9 +265,9 @@ sc::result ReservingReplicas::react(const ReplicaGrant& ev)
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
   dout(10) << "ReservingReplicas::react(const ReplicaGrant&)" << dendl;
+  const auto& m = ev.m_op->get_req<MOSDScrubReserve>();
 
-  if (context<Session>().m_reservations->handle_reserve_grant(
-	  ev.m_op, ev.m_from)) {
+  if (context<Session>().m_reservations->handle_reserve_grant(*m, ev.m_from)) {
     // we are done with the reservation process
     return transit<ActiveScrubbing>();
   }
@@ -277,13 +279,21 @@ sc::result ReservingReplicas::react(const ReplicaReject& ev)
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
   auto& session = context<Session>();
   dout(10) << "ReservingReplicas::react(const ReplicaReject&)" << dendl;
-  session.m_reservations->log_failure_and_duration(scrbcnt_resrv_rejected);
-
-  // manipulate the 'next to reserve' iterator to exclude
-  // the rejecting replica from the set of replicas requiring release
-  session.m_reservations->verify_rejections_source(ev.m_op, ev.m_from);
+  const auto m = ev.m_op->get_req<MOSDScrubReserve>();
+
+  // Verify that the message is from the replica we were expecting a reply from,
+  // and that the message is not stale. If all is well - this is a real rejection:
+  // - log required details;
+  // - manipulate the 'next to reserve' iterator to exclude
+  //   the rejecting replica from the set of replicas requiring release
+  if (!session.m_reservations->handle_reserve_rejection(*m, ev.m_from)) {
+    // stale or unexpected
+    return discard_event();
+  }
 
-  // set 'reservation failure' as the scrub termination cause (affecting
+  // The rejection was carrying the correct reservation_nonce. It was
+  // logged by handle_reserve_rejection().
+  // Set 'reservation failure' as the scrub termination cause (affecting
   // the rescheduling of this PG)
   scrbr->flag_reservations_failure();
 
@@ -777,7 +787,13 @@ ReplicaActive::~ReplicaActive()
 void ReplicaActive::on_reserve_req(const ReplicaReserveReq& ev)
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "ReplicaActive::on_reserve_req()" << dendl;
+  const auto m = ev.m_op->get_req<MOSDScrubReserve>();
+  const auto msg_nonce = m->reservation_nonce;
+  dout(10)
+      << fmt::format(
+	     "ReplicaActive::on_reserve_req() from {} (reservation_nonce:{})",
+	     ev.m_from, msg_nonce)
+      << dendl;
 
   if (reserved_by_my_primary) {
     dout(10) << "ReplicaActive::on_reserve_req(): already reserved" << dendl;
@@ -797,7 +813,7 @@ void ReplicaActive::on_reserve_req(const ReplicaReserveReq& ev)
 
   Message* reply = new MOSDScrubReserve(
       spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch, ret.op,
-      m_pg->pg_whoami);
+      m_pg->pg_whoami, msg_nonce);
   m_osds->send_message_osd_cluster(reply, ev.m_op->get_req()->get_connection());
 }
 
diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h
index 8fa96da140525..beb4d7a4c0fa7 100644
--- a/src/osd/scrubber/scrub_machine.h
+++ b/src/osd/scrubber/scrub_machine.h
@@ -440,6 +440,22 @@ struct PrimaryActive : sc::state<PrimaryActive, ScrubMachine, PrimaryIdle>,
   using reactions = mpl::list<
       // when the interval ends - we may not be a primary anymore
       sc::transition<IntervalChanged, NotActive>>;
+
+ /**
+  * Identifies a specific reservation request.
+  * The primary is permitted to cancel outstanding reservation requests without
+  * waiting for the pending response from the replica.  Thus, we may, in general,
+  * see responses from prior reservation attempts that we need to ignore.  Each
+  * reservation request is therefore associated with a nonce incremented within
+  * an interval with each reservation request.  Any response with a non-matching
+  * nonce must be from a reservation request we canceled.  Note that this check
+  * occurs after validating that the message is from the current interval, so
+  * reusing nonces between intervals is safe.
+  *
+  * 0 is a special value used to indicate that the sender did not include a nonce due
+  * to not being a sufficiently recent version.
+  */
+  reservation_nonce_t last_request_sent_nonce{1};
 };
 
 /**
diff --git a/src/osd/scrubber/scrub_reservations.cc b/src/osd/scrubber/scrub_reservations.cc
index 3faafe9cb0a1d..284d90290578a 100644
--- a/src/osd/scrubber/scrub_reservations.cc
+++ b/src/osd/scrubber/scrub_reservations.cc
@@ -6,7 +6,6 @@
 #include <span>
 
 #include "common/ceph_time.h"
-#include "messages/MOSDScrubReserve.h"
 #include "osd/OSD.h"
 #include "osd/PG.h"
 #include "osd/osd_types_fmt.h"
@@ -31,11 +30,13 @@ namespace Scrub {
 
 ReplicaReservations::ReplicaReservations(
     ScrubMachineListener& scrbr,
+    reservation_nonce_t& nonce,
     PerfCounters& pc)
     : m_scrubber{scrbr}
     , m_pg{m_scrubber.get_pg()}
     , m_pgid{m_scrubber.get_spgid().pgid}
     , m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
+    , m_last_request_sent_nonce{nonce}
     , m_perf_set{pc}
 {
   // the acting set is sorted by pg_shard_t. The reservations are to be issued
@@ -80,7 +81,7 @@ void ReplicaReservations::release_all()
   for (const auto& peer : replicas) {
     auto m = make_message<MOSDScrubReserve>(
 	spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::RELEASE,
-	m_pg->pg_whoami);
+	m_pg->pg_whoami, 0);
     m_pg->send_cluster_message(peer.osd, m, epoch, false);
   }
 
@@ -125,16 +126,50 @@ ReplicaReservations::~ReplicaReservations()
   log_failure_and_duration(scrbcnt_resrv_aborted);
 }
 
-bool ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
+bool ReplicaReservations::is_reservation_response_relevant(
+    reservation_nonce_t msg_nonce) const
 {
-  // verify that the grant is from the peer we expected. If not?
-  // for now - abort the OSD. \todo reconsider the reaction.
-  if (!get_last_sent().has_value() || from != *get_last_sent()) {
+  return (msg_nonce == 0) || (msg_nonce == m_last_request_sent_nonce);
+}
+
+bool ReplicaReservations::is_msg_source_correct(pg_shard_t from) const
+{
+  const auto exp_source = get_last_sent();
+  return exp_source && from == *exp_source;
+}
+
+bool ReplicaReservations::handle_reserve_grant(
+    const MOSDScrubReserve& msg,
+    pg_shard_t from)
+{
+  if (!is_reservation_response_relevant(msg.reservation_nonce)) {
+    // this is a stale response to a previous request (e.g. one that
+    // timed-out). See m_last_request_sent_nonce for details.
     dout(1) << fmt::format(
-		   "unexpected grant from {} (expected {})", from,
-		   get_last_sent().value_or(pg_shard_t{}))
+		   "stale reservation response from {} with nonce {} vs. "
+		   "expected {} (e:{})",
+		   from, msg.reservation_nonce, m_last_request_sent_nonce,
+		   msg.map_epoch)
 	    << dendl;
-    ceph_assert(from == get_last_sent());
+    return false;
+  }
+
+  // verify that the grant is from the peer we expected. If not?
+  // for now - abort the OSD. There is no known scenario in which a
+  // grant message with a correct nonce can arrive from the wrong peer.
+  // (we would not abort for arriving messages with nonce 0, as those
+  // are legacy messages, for which the nonce was not verified).
+  if (!is_msg_source_correct(from)) {
+    const auto error_text = fmt::format(
+	"unexpected reservation grant from {} vs. the expected {} (e:{} "
+	"message nonce:{})",
+	from, get_last_sent().value_or(pg_shard_t{}), msg.map_epoch,
+	msg.reservation_nonce);
+    dout(1) << error_text << dendl;
+    if (msg.reservation_nonce != 0) {
+      m_osds->clog->error() << error_text;
+      ceph_abort_msg(error_text);
+    }
     return false;
   }
 
@@ -143,15 +178,15 @@ bool ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
   // log a warning if the response was slow to arrive
   if ((m_slow_response_warn_timeout > 0ms) &&
       (elapsed > m_slow_response_warn_timeout)) {
-    dout(1) << fmt::format(
-		   "slow reservation response from {} ({}ms)", from,
-		   duration_cast<milliseconds>(elapsed).count())
-	    << dendl;
+    m_osds->clog->warn() << fmt::format(
+	"slow reservation response from {} ({}ms)", from,
+	duration_cast<milliseconds>(elapsed).count());
     // prevent additional warnings
     m_slow_response_warn_timeout = 0ms;
   }
   dout(10) << fmt::format(
-		  "granted by {} ({} of {}) in {}ms", from,
+		  "(e:{} nonce:{}) granted by {} ({} of {}) in {}ms",
+		  msg.map_epoch, msg.reservation_nonce, from,
 		  active_requests_cnt(), m_sorted_secondaries.size(),
 		  duration_cast<milliseconds>(elapsed).count())
 	   << dendl;
@@ -170,44 +205,64 @@ bool ReplicaReservations::send_next_reservation_or_complete()
   // send the next reservation request
   const auto peer = *m_next_to_request;
   const auto epoch = m_pg->get_osdmap_epoch();
+  m_last_request_sent_nonce++;
+
   auto m = make_message<MOSDScrubReserve>(
-      spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST,
-      m_pg->pg_whoami);
+      spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST, m_pg->pg_whoami,
+      m_last_request_sent_nonce);
   m_pg->send_cluster_message(peer.osd, m, epoch, false);
   m_last_request_sent_at = ScrubClock::now();
   dout(10) << fmt::format(
-		  "reserving {} (the {} of {} replicas)", *m_next_to_request,
-		  active_requests_cnt() + 1, m_sorted_secondaries.size())
+		  "reserving {} (the {} of {} replicas) e:{} nonce:{}",
+		  *m_next_to_request, active_requests_cnt() + 1,
+		  m_sorted_secondaries.size(), epoch, m_last_request_sent_nonce)
 	   << dendl;
   m_next_to_request++;
   return false;
 }
 
-void ReplicaReservations::verify_rejections_source(
-    OpRequestRef op,
+bool ReplicaReservations::handle_reserve_rejection(
+    const MOSDScrubReserve& msg,
     pg_shard_t from)
 {
   // a convenient log message for the reservation process conclusion
   // (matches the one in send_next_reservation_or_complete())
   dout(10) << fmt::format(
-		  "remote reservation failure. Rejected by {} ({})", from,
-		  *op->get_req())
+		  "remote reservation failure. Rejected by {} ({})", from, msg)
 	   << dendl;
 
+  if (!is_reservation_response_relevant(msg.reservation_nonce)) {
+    // this is a stale response to a previous request (e.g. one that
+    // timed-out). See m_last_request_sent_nonce for details.
+    dout(10) << fmt::format(
+		    "stale reservation response from {} with reservation_nonce "
+		    "{} vs. expected {} (e:{})",
+		    from, msg.reservation_nonce, m_last_request_sent_nonce,
+		    msg.map_epoch)
+	     << dendl;
+    return false;
+  }
+
+  log_failure_and_duration(scrbcnt_resrv_rejected);
+
+  // we should never see a rejection carrying a valid
+  // reservation nonce - arriving while we have no pending requests
+  ceph_assert(get_last_sent().has_value() || msg.reservation_nonce == 0);
+
   // verify that the denial is from the peer we expected. If not?
+  // There is no known scenario in which this can happen, but if it does -
   // we should treat it as though the *correct* peer has rejected the request,
   // but remember to release that peer, too.
-
-  ceph_assert(get_last_sent().has_value());
-  const auto expected = *get_last_sent();
-  if (from != expected) {
-    dout(1) << fmt::format(
-		   "unexpected rejection from {} (expected {})", from, expected)
-	    << dendl;
-  } else {
-    // correct peer, wrong answer...
+  if (is_msg_source_correct(from)) {
     m_next_to_request--;  // no need to release this one
+  } else {
+    m_osds->clog->warn() << fmt::format(
+	"unexpected reservation denial from {} vs the expected {} (e:{} "
+	"message reservation_nonce:{})",
+	from, get_last_sent().value_or(pg_shard_t{}), msg.map_epoch,
+	msg.reservation_nonce);
   }
+  return true;
 }
 
 std::optional<pg_shard_t> ReplicaReservations::get_last_sent() const
diff --git a/src/osd/scrubber/scrub_reservations.h b/src/osd/scrubber/scrub_reservations.h
index 9d59033bac874..2e13760ffa1f0 100644
--- a/src/osd/scrubber/scrub_reservations.h
+++ b/src/osd/scrubber/scrub_reservations.h
@@ -8,6 +8,7 @@
 #include <string_view>
 #include <vector>
 
+#include "messages/MOSDScrubReserve.h"
 #include "osd/scrubber_common.h"
 
 #include "osd_scrub_sched.h"
@@ -15,6 +16,8 @@
 
 namespace Scrub {
 
+using reservation_nonce_t = MOSDScrubReserve::reservation_nonce_t;
+
 /**
  * Reserving/freeing scrub resources at the replicas.
  *
@@ -44,6 +47,21 @@ namespace Scrub {
  *  that have been acquired until that moment.
  *  (Why? because we have encountered instances where a reservation request was
  *  lost - either due to a bug or due to a network issue.)
+ *
+ * Keeping primary & replica in sync:
+ *
+ * Reservation requests may be canceled by the primary independently of the
+ * replica's response. Depending on timing, a cancellation by the primary might
+ * or might not be processed by a replica prior to sending a response (either
+ * rejection or success).  Thus, we associate each reservation request with a
+ * nonce incremented with each reservation during an interval and drop any
+ * responses that do not match our current nonce.
+ * This check occurs after rejecting any messages from prior intervals, so
+ * reusing nonces between intervals is not a problem.  Note that epoch would
+ * not suffice as it is possible for this sequence to occur several times
+ * without a new map epoch.
+ * Note - 'release' messages, which are not replied to by the replica,
+ * do not need or use that field.
  */
 class ReplicaReservations {
   ScrubMachineListener& m_scrubber;
@@ -64,6 +82,14 @@ class ReplicaReservations {
   /// for logs, and for detecting slow peers
   ScrubTimePoint m_last_request_sent_at;
 
+  /**
+   * A ref to PrimaryActive::last_request_sent_nonce.
+   * Identifies a specific request sent, to verify against grant/deny
+   * responses.
+   * See PrimaryActive::last_request_sent_nonce for details.
+   */
+  reservation_nonce_t& m_last_request_sent_nonce;
+
   /// the 'slow response' timeout (in milliseconds) - as configured.
   /// Doubles as a 'do once' flag for the warning.
   std::chrono::milliseconds m_slow_response_warn_timeout;
@@ -77,7 +103,10 @@ class ReplicaReservations {
   std::optional<ScrubTimePoint> m_process_started_at;
 
  public:
-  ReplicaReservations(ScrubMachineListener& scrubber, PerfCounters& pc);
+  ReplicaReservations(
+      ScrubMachineListener& scrubber,
+      reservation_nonce_t& nonce,
+      PerfCounters& pc);
 
   ~ReplicaReservations();
 
@@ -90,18 +119,23 @@ class ReplicaReservations {
    * \returns true if there are no more replicas to send reservation requests
    * (i.e., the scrubber should proceed to the next phase), false otherwise.
    */
-  bool handle_reserve_grant(OpRequestRef op, pg_shard_t from);
+  bool handle_reserve_grant(const MOSDScrubReserve& msg, pg_shard_t from);
 
   /**
+   * React to an incoming reservation rejection.
+   *
    * Verify that the sender of the received rejection is the replica we
-   * were expecting a reply from.
-   * If this is so - just mark the fact that the specific peer need not
-   * be released.
+   * were expecting a reply from, and that the message isn't stale (see
+   * m_last_request_sent_nonce for details).
+   * If a valid rejection: log it, and mark the fact that the specific peer
+   * need not be released.
    *
    * Note - the actual handling of scrub session termination and of
    * releasing the reserved replicas is done by the caller (the FSM).
+   *
+   * Returns true if the rejection is valid, false otherwise.
    */
-  void verify_rejections_source(OpRequestRef op, pg_shard_t from);
+  bool handle_reserve_rejection(const MOSDScrubReserve& msg, pg_shard_t from);
 
   /**
    * Notifies implementation that it is no longer responsible for releasing
@@ -140,6 +174,20 @@ class ReplicaReservations {
    */
   bool send_next_reservation_or_complete();
 
+  /**
+   * is this is a reply to our last request?
+   * Checks response once against m_last_request_sent_nonce. See
+   * m_last_request_sent_nonce for details.
+   */
+  bool is_reservation_response_relevant(reservation_nonce_t msg_nonce) const;
+
+  /**
+   * is this reply coming from the expected replica?
+   * Now that we check the nonce before checking the sender - this
+   * check should never fail.
+   */
+  bool is_msg_source_correct(pg_shard_t from) const;
+
   // ---   perf counters helpers
 
   /**