osd/scrub: check reservation replies for relevance

author Ronen Friedman <rfriedma@redhat.com>

Wed, 17 Jan 2024 15:36:16 +0000 (09:36 -0600)

committer Ronen Friedman <rfriedma@redhat.com>

Tue, 23 Jan 2024 09:33:03 +0000 (03:33 -0600)
author Ronen Friedman <rfriedma@redhat.com>
Wed, 17 Jan 2024 15:36:16 +0000 (09:36 -0600)
committer Ronen Friedman <rfriedma@redhat.com>
Tue, 23 Jan 2024 09:33:03 +0000 (03:33 -0600)
diff --git a/src/messages/MOSDScrubReserve.h b/src/messages/MOSDScrubReserve.h

index c7ab985411750bc3320f4eca462aecd03530965c..97b6dff4dd006c80555b2dc34b54f6f9e8d0c1bb 100644 (file)
--- a/src/messages/MOSDScrubReserve.h
+++ b/src/messages/MOSDScrubReserve.h
@@ -19,9 +19,11 @@
  
  class MOSDScrubReserve : public MOSDFastDispatchOp {
  private:
-  static constexpr int HEAD_VERSION = 1;
+  static constexpr int HEAD_VERSION = 2;
    static constexpr int COMPAT_VERSION = 1;
  public:
+  using reservation_nonce_t = uint32_t;
+
    spg_t pgid;
    epoch_t map_epoch;
    enum ReserveMsgOp {
@@ -32,6 +34,7 @@ public:
    };
    int32_t type;
    pg_shard_t from;
+  reservation_nonce_t reservation_nonce{0};
  
    epoch_t get_map_epoch() const override {
      return map_epoch;
@@ -46,10 +49,11 @@ public:
    MOSDScrubReserve(spg_t pgid,
                    epoch_t map_epoch,
                    int type,
-                  pg_shard_t from)
+                  pg_shard_t from,
+                  reservation_nonce_t nonce)
      : MOSDFastDispatchOp{MSG_OSD_SCRUB_RESERVE, HEAD_VERSION, COMPAT_VERSION},
        pgid(pgid), map_epoch(map_epoch),
-      type(type), from(from) {}
+      type(type), from(from), reservation_nonce{nonce} {}
  
    std::string_view get_type_name() const {
      return "MOSDScrubReserve";
@@ -71,7 +75,8 @@ public:
        out << "RELEASE ";
        break;
      }
-    out << "e" << map_epoch << ")";
+    out << "e" << map_epoch << " from: " << from
+       << " reservation_nonce: " << reservation_nonce << ")";
      return;
    }
  
@@ -82,6 +87,13 @@ public:
      decode(map_epoch, p);
      decode(type, p);
      decode(from, p);
+    if (header.version >= 2) {
+      decode(reservation_nonce, p);
+    } else {
+      // a zero nonce (identifying legacy senders) is ignored when
+      // checking the request for obsolescence
+      reservation_nonce = 0;
+    }
    }
  
    void encode_payload(uint64_t features) {
@@ -90,6 +102,7 @@ public:
      encode(map_epoch, payload);
      encode(type, payload);
      encode(from, payload);
+    encode(reservation_nonce, payload);
    }
  private:
    template<class T, typename... Args>
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc

index d0ab07fe1dfae5087f06965be3e18d5527ece0ec..d9d03fe688964135085e491758d0e0e272f3205d 100644 (file)
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -230,7 +230,9 @@ ReservingReplicas::ReservingReplicas(my_context ctx)
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
  
    // initiate the reservation process
-  session.m_reservations.emplace(*scrbr, *session.m_perf_set);
+  session.m_reservations.emplace(
+      *scrbr, context<PrimaryActive>().last_request_sent_nonce,
+      *session.m_perf_set);
  
    if (session.m_reservations->get_last_sent()) {
      // the 1'st reservation request was sent
@@ -263,9 +265,9 @@ sc::result ReservingReplicas::react(const ReplicaGrant& ev)
  {
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
    dout(10) << "ReservingReplicas::react(const ReplicaGrant&)" << dendl;
+  const auto& m = ev.m_op->get_req<MOSDScrubReserve>();
  
-  if (context<Session>().m_reservations->handle_reserve_grant(
-         ev.m_op, ev.m_from)) {
+  if (context<Session>().m_reservations->handle_reserve_grant(*m, ev.m_from)) {
      // we are done with the reservation process
      return transit<ActiveScrubbing>();
    }
@@ -277,13 +279,21 @@ sc::result ReservingReplicas::react(const ReplicaReject& ev)
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
    auto& session = context<Session>();
    dout(10) << "ReservingReplicas::react(const ReplicaReject&)" << dendl;
-  session.m_reservations->log_failure_and_duration(scrbcnt_resrv_rejected);
-
-  // manipulate the 'next to reserve' iterator to exclude
-  // the rejecting replica from the set of replicas requiring release
-  session.m_reservations->verify_rejections_source(ev.m_op, ev.m_from);
+  const auto m = ev.m_op->get_req<MOSDScrubReserve>();
+
+  // Verify that the message is from the replica we were expecting a reply from,
+  // and that the message is not stale. If all is well - this is a real rejection:
+  // - log required details;
+  // - manipulate the 'next to reserve' iterator to exclude
+  //   the rejecting replica from the set of replicas requiring release
+  if (!session.m_reservations->handle_reserve_rejection(*m, ev.m_from)) {
+    // stale or unexpected
+    return discard_event();
+  }
  
-  // set 'reservation failure' as the scrub termination cause (affecting
+  // The rejection was carrying the correct reservation_nonce. It was
+  // logged by handle_reserve_rejection().
+  // Set 'reservation failure' as the scrub termination cause (affecting
    // the rescheduling of this PG)
    scrbr->flag_reservations_failure();
  
@@ -777,7 +787,13 @@ ReplicaActive::~ReplicaActive()
  void ReplicaActive::on_reserve_req(const ReplicaReserveReq& ev)
  {
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "ReplicaActive::on_reserve_req()" << dendl;
+  const auto m = ev.m_op->get_req<MOSDScrubReserve>();
+  const auto msg_nonce = m->reservation_nonce;
+  dout(10)
+      << fmt::format(
+            "ReplicaActive::on_reserve_req() from {} (reservation_nonce:{})",
+            ev.m_from, msg_nonce)
+      << dendl;
  
    if (reserved_by_my_primary) {
      dout(10) << "ReplicaActive::on_reserve_req(): already reserved" << dendl;
@@ -797,7 +813,7 @@ void ReplicaActive::on_reserve_req(const ReplicaReserveReq& ev)
  
    Message* reply = new MOSDScrubReserve(
        spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch, ret.op,
-      m_pg->pg_whoami);
+      m_pg->pg_whoami, msg_nonce);
    m_osds->send_message_osd_cluster(reply, ev.m_op->get_req()->get_connection());
  }
  
diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h

index 8fa96da1405251bca805419789e6c3ea3ade2dbd..beb4d7a4c0fa7f09e6ec127655d1a08140307216 100644 (file)
--- a/src/osd/scrubber/scrub_machine.h
+++ b/src/osd/scrubber/scrub_machine.h
@@ -440,6 +440,22 @@ struct PrimaryActive : sc::state<PrimaryActive, ScrubMachine, PrimaryIdle>,
    using reactions = mpl::list<
        // when the interval ends - we may not be a primary anymore
        sc::transition<IntervalChanged, NotActive>>;
+
+ /**
+  * Identifies a specific reservation request.
+  * The primary is permitted to cancel outstanding reservation requests without
+  * waiting for the pending response from the replica.  Thus, we may, in general,
+  * see responses from prior reservation attempts that we need to ignore.  Each
+  * reservation request is therefore associated with a nonce incremented within
+  * an interval with each reservation request.  Any response with a non-matching
+  * nonce must be from a reservation request we canceled.  Note that this check
+  * occurs after validating that the message is from the current interval, so
+  * reusing nonces between intervals is safe.
+  *
+  * 0 is a special value used to indicate that the sender did not include a nonce due
+  * to not being a sufficiently recent version.
+  */
+  reservation_nonce_t last_request_sent_nonce{1};
  };
  
  /**
diff --git a/src/osd/scrubber/scrub_reservations.cc b/src/osd/scrubber/scrub_reservations.cc

index 3faafe9cb0a1dfcba5085c8460349d2a26302e6b..284d90290578a2e2b8a8297cb3dc8af983fbc4b9 100644 (file)
--- a/src/osd/scrubber/scrub_reservations.cc
+++ b/src/osd/scrubber/scrub_reservations.cc
@@ -6,7 +6,6 @@
  #include <span>
  
  #include "common/ceph_time.h"
-#include "messages/MOSDScrubReserve.h"
  #include "osd/OSD.h"
  #include "osd/PG.h"
  #include "osd/osd_types_fmt.h"
@@ -31,11 +30,13 @@ namespace Scrub {
  
  ReplicaReservations::ReplicaReservations(
      ScrubMachineListener& scrbr,
+    reservation_nonce_t& nonce,
      PerfCounters& pc)
      : m_scrubber{scrbr}
      , m_pg{m_scrubber.get_pg()}
      , m_pgid{m_scrubber.get_spgid().pgid}
      , m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
+    , m_last_request_sent_nonce{nonce}
      , m_perf_set{pc}
  {
    // the acting set is sorted by pg_shard_t. The reservations are to be issued
@@ -80,7 +81,7 @@ void ReplicaReservations::release_all()
    for (const auto& peer : replicas) {
      auto m = make_message<MOSDScrubReserve>(
         spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::RELEASE,
-       m_pg->pg_whoami);
+       m_pg->pg_whoami, 0);
      m_pg->send_cluster_message(peer.osd, m, epoch, false);
    }
  
@@ -125,16 +126,50 @@ ReplicaReservations::~ReplicaReservations()
    log_failure_and_duration(scrbcnt_resrv_aborted);
  }
  
-bool ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
+bool ReplicaReservations::is_reservation_response_relevant(
+    reservation_nonce_t msg_nonce) const
  {
-  // verify that the grant is from the peer we expected. If not?
-  // for now - abort the OSD. \todo reconsider the reaction.
-  if (!get_last_sent().has_value() || from != *get_last_sent()) {
+  return (msg_nonce == 0) || (msg_nonce == m_last_request_sent_nonce);
+}
+
+bool ReplicaReservations::is_msg_source_correct(pg_shard_t from) const
+{
+  const auto exp_source = get_last_sent();
+  return exp_source && from == *exp_source;
+}
+
+bool ReplicaReservations::handle_reserve_grant(
+    const MOSDScrubReserve& msg,
+    pg_shard_t from)
+{
+  if (!is_reservation_response_relevant(msg.reservation_nonce)) {
+    // this is a stale response to a previous request (e.g. one that
+    // timed-out). See m_last_request_sent_nonce for details.
      dout(1) << fmt::format(
-                  "unexpected grant from {} (expected {})", from,
-                  get_last_sent().value_or(pg_shard_t{}))
+                  "stale reservation response from {} with nonce {} vs. "
+                  "expected {} (e:{})",
+                  from, msg.reservation_nonce, m_last_request_sent_nonce,
+                  msg.map_epoch)
             << dendl;
-    ceph_assert(from == get_last_sent());
+    return false;
+  }
+
+  // verify that the grant is from the peer we expected. If not?
+  // for now - abort the OSD. There is no known scenario in which a
+  // grant message with a correct nonce can arrive from the wrong peer.
+  // (we would not abort for arriving messages with nonce 0, as those
+  // are legacy messages, for which the nonce was not verified).
+  if (!is_msg_source_correct(from)) {
+    const auto error_text = fmt::format(
+       "unexpected reservation grant from {} vs. the expected {} (e:{} "
+       "message nonce:{})",
+       from, get_last_sent().value_or(pg_shard_t{}), msg.map_epoch,
+       msg.reservation_nonce);
+    dout(1) << error_text << dendl;
+    if (msg.reservation_nonce != 0) {
+      m_osds->clog->error() << error_text;
+      ceph_abort_msg(error_text);
+    }
      return false;
    }
  
@@ -143,15 +178,15 @@ bool ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
    // log a warning if the response was slow to arrive
    if ((m_slow_response_warn_timeout > 0ms) &&
        (elapsed > m_slow_response_warn_timeout)) {
-    dout(1) << fmt::format(
-                  "slow reservation response from {} ({}ms)", from,
-                  duration_cast<milliseconds>(elapsed).count())
-           << dendl;
+    m_osds->clog->warn() << fmt::format(
+       "slow reservation response from {} ({}ms)", from,
+       duration_cast<milliseconds>(elapsed).count());
      // prevent additional warnings
      m_slow_response_warn_timeout = 0ms;
    }
    dout(10) << fmt::format(
-                 "granted by {} ({} of {}) in {}ms", from,
+                 "(e:{} nonce:{}) granted by {} ({} of {}) in {}ms",
+                 msg.map_epoch, msg.reservation_nonce, from,
                   active_requests_cnt(), m_sorted_secondaries.size(),
                   duration_cast<milliseconds>(elapsed).count())
            << dendl;
@@ -170,44 +205,64 @@ bool ReplicaReservations::send_next_reservation_or_complete()
    // send the next reservation request
    const auto peer = *m_next_to_request;
    const auto epoch = m_pg->get_osdmap_epoch();
+  m_last_request_sent_nonce++;
+
    auto m = make_message<MOSDScrubReserve>(
-      spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST,
-      m_pg->pg_whoami);
+      spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST, m_pg->pg_whoami,
+      m_last_request_sent_nonce);
    m_pg->send_cluster_message(peer.osd, m, epoch, false);
    m_last_request_sent_at = ScrubClock::now();
    dout(10) << fmt::format(
-                 "reserving {} (the {} of {} replicas)", *m_next_to_request,
-                 active_requests_cnt() + 1, m_sorted_secondaries.size())
+                 "reserving {} (the {} of {} replicas) e:{} nonce:{}",
+                 *m_next_to_request, active_requests_cnt() + 1,
+                 m_sorted_secondaries.size(), epoch, m_last_request_sent_nonce)
            << dendl;
    m_next_to_request++;
    return false;
  }
  
-void ReplicaReservations::verify_rejections_source(
-    OpRequestRef op,
+bool ReplicaReservations::handle_reserve_rejection(
+    const MOSDScrubReserve& msg,
      pg_shard_t from)
  {
    // a convenient log message for the reservation process conclusion
    // (matches the one in send_next_reservation_or_complete())
    dout(10) << fmt::format(
-                 "remote reservation failure. Rejected by {} ({})", from,
-                 *op->get_req())
+                 "remote reservation failure. Rejected by {} ({})", from, msg)
            << dendl;
  
+  if (!is_reservation_response_relevant(msg.reservation_nonce)) {
+    // this is a stale response to a previous request (e.g. one that
+    // timed-out). See m_last_request_sent_nonce for details.
+    dout(10) << fmt::format(
+                   "stale reservation response from {} with reservation_nonce "
+                   "{} vs. expected {} (e:{})",
+                   from, msg.reservation_nonce, m_last_request_sent_nonce,
+                   msg.map_epoch)
+            << dendl;
+    return false;
+  }
+
+  log_failure_and_duration(scrbcnt_resrv_rejected);
+
+  // we should never see a rejection carrying a valid
+  // reservation nonce - arriving while we have no pending requests
+  ceph_assert(get_last_sent().has_value() || msg.reservation_nonce == 0);
+
    // verify that the denial is from the peer we expected. If not?
+  // There is no known scenario in which this can happen, but if it does -
    // we should treat it as though the *correct* peer has rejected the request,
    // but remember to release that peer, too.
-
-  ceph_assert(get_last_sent().has_value());
-  const auto expected = *get_last_sent();
-  if (from != expected) {
-    dout(1) << fmt::format(
-                  "unexpected rejection from {} (expected {})", from, expected)
-           << dendl;
-  } else {
-    // correct peer, wrong answer...
+  if (is_msg_source_correct(from)) {
      m_next_to_request--;  // no need to release this one
+  } else {
+    m_osds->clog->warn() << fmt::format(
+       "unexpected reservation denial from {} vs the expected {} (e:{} "
+       "message reservation_nonce:{})",
+       from, get_last_sent().value_or(pg_shard_t{}), msg.map_epoch,
+       msg.reservation_nonce);
    }
+  return true;
  }
  
  std::optional<pg_shard_t> ReplicaReservations::get_last_sent() const
diff --git a/src/osd/scrubber/scrub_reservations.h b/src/osd/scrubber/scrub_reservations.h

index 9d59033bac874610924ac8865e6d32ebc95a9d6f..2e13760ffa1f049b03138a6ee5e28c94202042f2 100644 (file)
--- a/src/osd/scrubber/scrub_reservations.h
+++ b/src/osd/scrubber/scrub_reservations.h
@@ -8,6 +8,7 @@
  #include <string_view>
  #include <vector>
  
+#include "messages/MOSDScrubReserve.h"
  #include "osd/scrubber_common.h"
  
  #include "osd_scrub_sched.h"
@@ -15,6 +16,8 @@
  
  namespace Scrub {
  
+using reservation_nonce_t = MOSDScrubReserve::reservation_nonce_t;
+
  /**
   * Reserving/freeing scrub resources at the replicas.
   *
@@ -44,6 +47,21 @@ namespace Scrub {
   *  that have been acquired until that moment.
   *  (Why? because we have encountered instances where a reservation request was
   *  lost - either due to a bug or due to a network issue.)
+ *
+ * Keeping primary & replica in sync:
+ *
+ * Reservation requests may be canceled by the primary independently of the
+ * replica's response. Depending on timing, a cancellation by the primary might
+ * or might not be processed by a replica prior to sending a response (either
+ * rejection or success).  Thus, we associate each reservation request with a
+ * nonce incremented with each reservation during an interval and drop any
+ * responses that do not match our current nonce.
+ * This check occurs after rejecting any messages from prior intervals, so
+ * reusing nonces between intervals is not a problem.  Note that epoch would
+ * not suffice as it is possible for this sequence to occur several times
+ * without a new map epoch.
+ * Note - 'release' messages, which are not replied to by the replica,
+ * do not need or use that field.
   */
  class ReplicaReservations {
    ScrubMachineListener& m_scrubber;
@@ -64,6 +82,14 @@ class ReplicaReservations {
    /// for logs, and for detecting slow peers
    ScrubTimePoint m_last_request_sent_at;
  
+  /**
+   * A ref to PrimaryActive::last_request_sent_nonce.
+   * Identifies a specific request sent, to verify against grant/deny
+   * responses.
+   * See PrimaryActive::last_request_sent_nonce for details.
+   */
+  reservation_nonce_t& m_last_request_sent_nonce;
+
    /// the 'slow response' timeout (in milliseconds) - as configured.
    /// Doubles as a 'do once' flag for the warning.
    std::chrono::milliseconds m_slow_response_warn_timeout;
@@ -77,7 +103,10 @@ class ReplicaReservations {
    std::optional<ScrubTimePoint> m_process_started_at;
  
   public:
-  ReplicaReservations(ScrubMachineListener& scrubber, PerfCounters& pc);
+  ReplicaReservations(
+      ScrubMachineListener& scrubber,
+      reservation_nonce_t& nonce,
+      PerfCounters& pc);
  
    ~ReplicaReservations();
  
@@ -90,18 +119,23 @@ class ReplicaReservations {
     * \returns true if there are no more replicas to send reservation requests
     * (i.e., the scrubber should proceed to the next phase), false otherwise.
     */
-  bool handle_reserve_grant(OpRequestRef op, pg_shard_t from);
+  bool handle_reserve_grant(const MOSDScrubReserve& msg, pg_shard_t from);
  
    /**
+   * React to an incoming reservation rejection.
+   *
     * Verify that the sender of the received rejection is the replica we
-   * were expecting a reply from.
-   * If this is so - just mark the fact that the specific peer need not
-   * be released.
+   * were expecting a reply from, and that the message isn't stale (see
+   * m_last_request_sent_nonce for details).
+   * If a valid rejection: log it, and mark the fact that the specific peer
+   * need not be released.
     *
     * Note - the actual handling of scrub session termination and of
     * releasing the reserved replicas is done by the caller (the FSM).
+   *
+   * Returns true if the rejection is valid, false otherwise.
     */
-  void verify_rejections_source(OpRequestRef op, pg_shard_t from);
+  bool handle_reserve_rejection(const MOSDScrubReserve& msg, pg_shard_t from);
  
    /**
     * Notifies implementation that it is no longer responsible for releasing
@@ -140,6 +174,20 @@ class ReplicaReservations {
     */
    bool send_next_reservation_or_complete();
  
+  /**
+   * is this is a reply to our last request?
+   * Checks response once against m_last_request_sent_nonce. See
+   * m_last_request_sent_nonce for details.
+   */
+  bool is_reservation_response_relevant(reservation_nonce_t msg_nonce) const;
+
+  /**
+   * is this reply coming from the expected replica?
+   * Now that we check the nonce before checking the sender - this
+   * check should never fail.
+   */
+  bool is_msg_source_correct(pg_shard_t from) const;
+
    // ---   perf counters helpers
  
    /**
author	Ronen Friedman <rfriedma@redhat.com>
	Wed, 17 Jan 2024 15:36:16 +0000 (09:36 -0600)
committer	Ronen Friedman <rfriedma@redhat.com>
	Tue, 23 Jan 2024 09:33:03 +0000 (03:33 -0600)
src/messages/MOSDScrubReserve.h		patch \| blob \| history
src/osd/scrubber/scrub_machine.cc		patch \| blob \| history
src/osd/scrubber/scrub_machine.h		patch \| blob \| history
src/osd/scrubber/scrub_reservations.cc		patch \| blob \| history
src/osd/scrubber/scrub_reservations.h		patch \| blob \| history