]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/scrub: directly manage remote reservations in the FSM
authorRonen Friedman <rfriedma@redhat.com>
Thu, 25 Jan 2024 19:05:01 +0000 (13:05 -0600)
committerRonen Friedman <rfriedma@redhat.com>
Wed, 31 Jan 2024 07:29:19 +0000 (01:29 -0600)
The FSM now interacts with the scrub_reserver directly.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
src/osd/scrubber/pg_scrubber.cc
src/osd/scrubber/pg_scrubber.h
src/osd/scrubber/scrub_machine.cc
src/osd/scrubber/scrub_machine.h
src/osd/scrubber_common.h

index 3ef5a2ef567dc25be9ef16c125a3490c52655607..9266a54d78585e4c7f2faab7019ab77d3fcb48b3 100644 (file)
@@ -397,6 +397,15 @@ void PgScrubber::send_scrub_is_finished(epoch_t epoch_queued)
   dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
 
+void PgScrubber::send_granted_by_reserver(const AsyncScrubResData& req)
+{
+  dout(10) << "scrubber event -->> granted_by_reserver" << dendl;
+  if (check_interval(req.request_epoch)) {
+    m_fsm->process_event(Scrub::ReserverGranted{req});
+  }
+  dout(10) << "scrubber event --<< granted_by_reserver" << dendl;
+}
+
 // -----------------
 
 bool PgScrubber::is_reserving() const
index 9c29d5fdedb9a1a9fa4fa73715f8247cea5ea1c7..bcab24cddfa3396ff28e71f3b8ca370a7eadf780 100644 (file)
@@ -227,6 +227,8 @@ class PgScrubber : public ScrubPgIF,
 
   void send_scrub_is_finished(epoch_t epoch_queued) final;
 
+  void send_granted_by_reserver(const AsyncScrubResData& req) final;
+
   /**
    *  we allow some number of preemptions of the scrub, which mean we do
    *  not block.  Then we start to block.  Once we start blocking, we do
index fc5238186868e85dd02433cb150a7cd08db669c8..57c0492ec2e42fee10ec10f6aebba8eaed3a91b1 100644 (file)
@@ -766,12 +766,7 @@ ReplicaActive::ReplicaActive(my_context ctx)
 
 ReplicaActive::~ReplicaActive()
 {
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  if (reserved_by_my_primary) {
-    dout(10) << "ReplicaActive::~ReplicaActive(): clearing reservation"
-            << dendl;
-    clear_reservation_by_remote_primary(false);
-  }
+  clear_remote_reservation(false);
 }
 
 /*
@@ -800,85 +795,118 @@ ReplicaReactCode ReplicaActive::on_reserve_request(
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
   const auto m = ev.m_op->get_req<MOSDScrubReserve>();
-  const auto msg_nonce = m->reservation_nonce;
   dout(10) << fmt::format(
-                 "ReplicaActive::on_reserve_req() from {} request:{} is "
-                 "async?{} (reservation_nonce:{})",
-                 ev.m_from, ev, async_request, msg_nonce)
+                 "ReplicaActive::on_reserve_req() request:{} async_request:{} "
+                 "reservation_nonce:{}",
+                 ev, async_request, m->reservation_nonce)
           << dendl;
-  auto& svc = m_osds->get_scrub_services();  // shorthand
-
-  if (reserved_by_my_primary) {
-    dout(10) << "ReplicaActive::on_reserve_request(): already reserved"
-            << dendl;
-    // clear the existing reservation. Clears the flag, too
-    clear_reservation_by_remote_primary(false);
-  }
 
-  Message* reply{nullptr};
+  ceph_assert(!reservation_granted);
+  ceph_assert(!pending_reservation_nonce);
   ReplicaReactCode next_action{ReplicaReactCode::discard};
+  AsyncScrubResData request_details{
+      pg_id, ev.m_from, ev.m_op->sent_epoch, m->reservation_nonce};
+  auto& reserver = m_osds->get_scrub_reserver();
 
   if (async_request) {
     // the request is to be handled asynchronously
-    svc.enqueue_remote_reservation(pg_id.pgid);
+    dout(20) << fmt::format(
+                   "{}: async request: {} details:{}", __func__, ev,
+                   request_details)
+            << dendl;
+    pending_reservation_nonce = m->reservation_nonce;
+    const auto reservation_cb = new RtReservationCB(m_pg, request_details);
+    reserver.request_reservation(pg_id, reservation_cb, 0, nullptr);
     next_action = ReplicaReactCode::goto_waiting_reservation;
 
   } else {
     // an immediate yes/no is required
-    const auto granted = svc.inc_scrubs_remote(scrbr->get_spgid().pgid);
-    if (granted) {
-      reserved_by_my_primary = true;
+    Message* reply{nullptr};
+    reservation_granted = reserver.request_reservation_or_fail(pg_id);
+    if (reservation_granted) {
       dout(10) << fmt::format("{}: reserved? yes", __func__) << dendl;
       reply = new MOSDScrubReserve(
          spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch,
-         MOSDScrubReserve::GRANT, m_pg->pg_whoami, msg_nonce);
+         MOSDScrubReserve::GRANT, m_pg->pg_whoami, m->reservation_nonce);
       next_action = ReplicaReactCode::goto_replica_reserved;
 
     } else {
       dout(10) << fmt::format("{}: reserved? no", __func__) << dendl;
       reply = new MOSDScrubReserve(
          spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch,
-         MOSDScrubReserve::REJECT, m_pg->pg_whoami, msg_nonce);
+         MOSDScrubReserve::REJECT, m_pg->pg_whoami, m->reservation_nonce);
       // the event is discarded
       next_action = ReplicaReactCode::discard;
     }
-  }
 
-  if (reply) {
     m_osds->send_message_osd_cluster(
        reply, ev.m_op->get_req()->get_connection());
   }
+
   return next_action;
 }
 
+bool ReplicaActive::granted_by_reserver(const AsyncScrubResData& reservation)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << fmt::format("{}: reservation granted: {}", __func__, reservation)
+          << dendl;
+
+  /// verify that the granted reservation is the one we were waiting for
+  if (reservation.nonce != pending_reservation_nonce) {
+    dout(5) << fmt::format(
+       "{}: reservation_nonce mismatch: {} != {}", __func__, reservation.nonce,
+       pending_reservation_nonce) << dendl;
+    return false;
+  }
+
+  reservation_granted = true;
+  pending_reservation_nonce = 0;  // no longer pending
+
+  // notify the primary
+  auto grant_msg = make_message<MOSDScrubReserve>(
+      spg_t(pg_id.pgid, m_pg->get_primary().shard), reservation.request_epoch,
+      MOSDScrubReserve::GRANT, m_pg->pg_whoami, pending_reservation_nonce);
+  m_pg->send_cluster_message(
+      m_pg->get_primary().osd, grant_msg, reservation.request_epoch, false);
+  return true;
+}
+
 void ReplicaActive::on_release(const ReplicaRelease& ev)
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
   dout(10) << fmt::format("ReplicaActive::on_release() from {}", ev.m_from)
           << dendl;
-  clear_reservation_by_remote_primary(true);
+  clear_remote_reservation(true);
 }
 
-void ReplicaActive::clear_reservation_by_remote_primary(bool log_failure)
+void ReplicaActive::clear_remote_reservation(bool warn_if_no_reservation)
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
   dout(10) << fmt::format(
-                 "ReplicaActive::clear_reservation_by_remote_primary(): was "
-                 "reserved? {}",
-                 (reserved_by_my_primary ? "yes" : "no"))
+                 "ReplicaActive::clear_remote_reservation(): "
+                 "pending_reservation_nonce {}, reservation_granted {}",
+                 reservation_granted, pending_reservation_nonce)
           << dendl;
-  if (reserved_by_my_primary) {
-    m_osds->get_scrub_services().dec_scrubs_remote(scrbr->get_spgid().pgid);
-    reserved_by_my_primary = false;
-  } else if (log_failure) {
-    const auto msg = fmt::format(
-       "ReplicaActive::clear_reservation_by_remote_primary(): "
-       "not reserved!");
+  if (reservation_granted || pending_reservation_nonce) {
+    m_osds->get_scrub_reserver().cancel_reservation(pg_id);
+    reservation_granted = false;
+    pending_reservation_nonce = 0;
+  } else if (warn_if_no_reservation) {
+    const auto msg =
+       "ReplicaActive::clear_remote_reservation(): "
+       "not reserved!";
     dout(5) << msg << dendl;
     scrbr->get_clog()->warn() << msg;
   }
 }
 
+void ReplicaActive::ignore_unhandled_grant(const ReserverGranted&)
+{
+  dout(10) << "ReplicaActive::react(const ReserverGranted&): ignored"
+          << dendl;
+}
+
 
 // ---------------- ReplicaActive/ReplicaIdle ---------------------------
 
@@ -913,7 +941,8 @@ sc::result ReplicaUnreserved::react(const ReplicaReserveReq& ev)
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
   dout(10) << "ReplicaUnreserved::react(const ReplicaReserveReq&)" << dendl;
 
-  switch (context<ReplicaActive>().on_reserve_request(ev, false)) {
+  bool async_request = true && ev.m_op->get_req<MOSDScrubReserve>()->wait_for_resources /* && a config */;
+  switch (context<ReplicaActive>().on_reserve_request(ev, async_request)) {
     case ReplicaReactCode::discard:
       return discard_event();
     case ReplicaReactCode::goto_waiting_reservation:
@@ -939,15 +968,11 @@ sc::result ReplicaUnreserved::react(const ReplicaRelease&)
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
   dout(10) << "ReplicaUnreserved::react(const ReplicaRelease&)" << dendl;
-  // shouldn't happen. Possible (faulty) sequence: getting an op
-  // command while in ReplicaWaitingReservation (as we would just
-  // treat that as a regular op request, but will stop waiting for
-  // reservation).
-  // must cancel the queued reservation request
+  // this is a bug. We should never receive a release request unless we
+  // are reserved or have a pending reservation.
   scrbr->get_clog()->error() << fmt::format(
       "osd.{} pg[{}]: reservation released while not reserved",
       scrbr->get_whoami(), scrbr->get_spgid());
-  context<ReplicaActive>().clear_reservation_by_remote_primary(true);
   return discard_event();
 }
 
@@ -955,15 +980,13 @@ sc::result ReplicaUnreserved::react(const ReserverGranted&)
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
   dout(10) << "ReplicaUnreserved::react(const ReserverGranted&)" << dendl;
-  // shouldn't happen. Possible (faulty) sequence: getting an op
-  // command while in ReplicaWaitingReservation (as we would just
-  // treat that as a regular op request, but will stop waiting for
-  // reservation).
+  // shouldn't happen. Might be a result of a cancelled reservation
+  // that was still delivered.
   // must unreserve
-  scrbr->get_clog()->error() << fmt::format(
-      "osd.{} pg[{}]: reservation granted while not being waited for",
-      scrbr->get_whoami(), scrbr->get_spgid());
-  context<ReplicaActive>().clear_reservation_by_remote_primary(true);
+  dout(5) << "ReplicaUnreserved::react(const ReserverGranted&): reservation "
+            "granted while not being waited for"
+         << dendl;
+  context<ReplicaActive>().clear_remote_reservation(false);
   return discard_event();
 }
 
@@ -981,14 +1004,17 @@ ReplicaWaitingReservation::ReplicaWaitingReservation(my_context ctx)
       << dendl;
 }
 
-sc::result ReplicaWaitingReservation::react(const ReserverGranted&)
+sc::result ReplicaWaitingReservation::react(const ReserverGranted& ev)
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "ReplicaWaitingReservation::react(const ReserverGranted&)"
+  dout(10) << fmt::format(
+                 "ReplicaWaitingReservation::react(const ReserverGranted&): "
+                 "event:{}",
+                 ev)
           << dendl;
-
-  /// \todo complete the handling of the granted reservation
-  ceph_abort_msg("not implemented yet");
+  if (context<ReplicaActive>().granted_by_reserver(ev.value)) {
+    return transit<ReplicaReserved>();
+  }
   return discard_event();
 }
 
@@ -1012,6 +1038,7 @@ sc::result ReplicaWaitingReservation::react(const StartReplica& ev)
       "osd.{} pg[{}]: new chunk request while still waiting for "
       "reservation",
       scrbr->get_whoami(), scrbr->get_spgid());
+  context<ReplicaActive>().clear_remote_reservation(true);
   clear_shallow_history<ReplicaIdle, 0>();
   post_event(ReplicaPushesUpd{});
   return transit<ReplicaActiveOp>();
@@ -1027,7 +1054,7 @@ sc::result ReplicaWaitingReservation::react(const ReplicaReserveReq& ev)
       "osd.{} pg[{}]: reservation requested while previous is pending",
       scrbr->get_whoami(), scrbr->get_spgid());
   // cancel the existing reservation, and re-request
-  context<ReplicaActive>().clear_reservation_by_remote_primary(true);
+  context<ReplicaActive>().clear_remote_reservation(true);
   post_event(ev);
   return transit<ReplicaUnreserved>();
 }
@@ -1060,8 +1087,11 @@ sc::result ReplicaReserved::react(const ReplicaReserveReq& ev)
   scrbr->get_clog()->error() << fmt::format(
       "osd.{} pg[{}]: reservation requested while still reserved",
       scrbr->get_whoami(), scrbr->get_spgid());
+  // This is a bug. We should never receive a new request unless the
+  // previous one was cancelled - either by the primary, or on interval
+  // change.
   // cancel the existing reservation, and re-request
-  context<ReplicaActive>().clear_reservation_by_remote_primary(true);
+  context<ReplicaActive>().clear_remote_reservation(true);
   post_event(ev);
   return transit<ReplicaUnreserved>();
 }
index 6eadb109cd54253b85e4a964b5f80d194ccd40ef..254e7861ed9568eae79a086f2a863ece7662d30b 100644 (file)
 #include <boost/statechart/transition.hpp>
 
 #include "common/fmt_common.h"
+#include "include/Context.h"
 #include "common/version.h"
 #include "messages/MOSDOp.h"
 #include "messages/MOSDRepScrub.h"
 #include "messages/MOSDRepScrubMap.h"
-#include "messages/MOSDScrubReserve.h"
-
-#include "include/Context.h"
 #include "osd/scrubber_common.h"
 
 #include "scrub_machine_lstnr.h"
@@ -136,7 +134,7 @@ struct value_event_t : sc::event<T> {
 
 
 /// the async-reserver granted our reservation request
-OP_EV(ReserverGranted);
+VALUE_EVENT(ReserverGranted, AsyncScrubResData);
 
 #define MEV(E)                                          \
   struct E : sc::event<E> {                             \
@@ -783,6 +781,41 @@ struct WaitDigestUpdate : sc::state<WaitDigestUpdate, ActiveScrubbing>,
  *      * ReplicaWaitUpdates
  *      * ReplicaBuildingMap
  */
+/*
+ * AsyncReserver for scrub 'remote' reservations
+ * -----------------------------------------------
+ *
+ * Unless disabled by 'osd_scrub_disable_reservation_queuing' (*), scrub
+ * reservation requests are handled by an async reserver: they are queued,
+ * until the number of concurrent scrubs is below the configured limit.
+
+ * (*) Note: the 'osd_scrub_disable_reservation_queuing' option is a temporary
+ * debug measure, and will be removed without deprecation in a future release.
+ *
+ * On the replica side, all reservations are treated as having the same priority.
+ * Note that 'high priority' scrubs, e.g. user-initiated scrubs, do not perform
+ * reservations on replicas at all.
+ *
+ * A queued scrub reservation request is cancelled by any of the following events:
+ *
+ * - a new interval: in this case, we do not expect to see a cancellation request
+ *   from the primary, and we can simply remove the request from the queue;
+ *
+ * - a cancellation request from the primary: probably a result of timing out on
+ *   the reservation process. Here, we can simply remove the request from the queue.
+ *
+ * - a new reservation request for the same PG: this is a bug. We had missed the
+ *   previous cancellation request, which could never happen.
+ *   We cancel the previous request, and replace
+ *   it with the new one. We would also issue an error log message.
+ *
+ * Primary/Replica with differing versions:
+ *
+ * The updated version of MOSDScrubReserve contains a new 'wait_for_resources'
+ * field. For legacy Primary OSDs, this field is decoded as 'false', and the
+ * replica responds immediately, with grant/rejection.
+*/
+
 
 struct ReplicaIdle;
 
@@ -809,22 +842,95 @@ struct ReplicaActive : sc::state<
       const ReplicaReserveReq&,
       bool async_request);
 
+  /**
+   * the queued reservation request was granted by the async reserver.
+   * Notify the Primary.
+   * Returns 'false' if the reservation is not the last one to be received
+   * by this replica.
+   */
+  bool granted_by_reserver(const AsyncScrubResData& resevation);
+
   /// handle a 'release' from a primary
   void on_release(const ReplicaRelease& ev);
 
-  /// cancel the reserver request.
-  /// The 'failure' re 'log_failure' is logged if we are not reserved to
-  /// begin with.
-  void clear_reservation_by_remote_primary(bool log_failure);
+  /**
+   * cancel a granted or pending reservation
+   *
+   * warn_if_no_reservation is set to true if the call is in response to a
+   * cancellation from the primary.  In that event, we *must* find a
+   * a granted or pending reservation and failing to do so warrants
+   * a warning to clog as it is a bug.
+   */
+  void clear_remote_reservation(bool warn_if_no_reservation);
+
+  /**
+   * discard (and log) unhandled 'reservation granted' messages
+   * from the async reserver.
+   * As canceled reservations may still be triggered, this is not
+   * necessarily a bug.
+   */
+  void ignore_unhandled_grant(const ReserverGranted&);
 
-  using reactions = mpl::list<sc::transition<IntervalChanged, NotActive>>;
+  using reactions = mpl::list<
+      sc::transition<IntervalChanged, NotActive>,
+      sc::in_state_reaction<
+         ReserverGranted,
+         ReplicaActive,
+         &ReplicaActive::ignore_unhandled_grant>>;
 
  private:
-  bool reserved_by_my_primary{false};
-
-  // shortcuts:
   PG* m_pg;
   OSDService* m_osds;
+
+  // --- remote reservation machinery
+
+  /*
+   * 'reservation_granted' is set to 'true' when we have grant confirmation
+   *  to the primary, and the reservation has not yet been canceled (either
+   *  by the primary or following an interval change).
+   *
+   * Note the interaction with 'pending_reservation_nonce': the combination
+   * of these two variables is used to track the state of the reservation
+   * with the scrub_reserver. The possible combinations:
+   * - pending_reservation_nonce == 0 && !reservation_granted -- no reservation
+   *    was granted, and none is pending;
+   * - pending_reservation_nonce != 0 && !reservation_granted -- we have a
+   *   pending cb in the AsyncReserver for a request with nonce
+   *   'pending_reservation_nonce'
+   * - pending_reservation_nonce == 0 && reservation_granted -- we have sent
+   *   a response to the primary granting the reservation
+   * (invariant: !((pending_reservation_nonce != 0) && reservation_granted)
+   *
+   * Note that in the event that the primary is too old to support asynchronous
+   * reservation, MOSDScrubReserve::wait_for_resources will be set to false by
+   * the decoder and we bypass the 2'nd case above.
+   * See ReplicaActive::on_reserve_request().
+   */
+  bool reservation_granted{false};
+
+  /**
+   * a reservation request with this nonce is queued at the scrub_reserver,
+   * and was not yet granted.
+   */
+  MOSDScrubReserve::reservation_nonce_t pending_reservation_nonce{0};
+
+  // clang-format off
+  struct RtReservationCB : public Context {
+    PGRef pg;
+    AsyncScrubResData res_data;
+
+    explicit RtReservationCB(PGRef pg, AsyncScrubResData request_details)
+       : pg{pg}
+       , res_data{request_details}
+    {}
+
+    void finish(int) override {
+      pg->lock();
+      pg->m_scrubber->send_granted_by_reserver(res_data);
+      pg->unlock();
+    }
+  };
+  // clang-format on
 };
 
 
@@ -862,9 +968,10 @@ struct ReplicaUnreserved : sc::state<ReplicaUnreserved, ReplicaIdle>,
 
   using reactions = mpl::list<
       sc::custom_reaction<ReplicaReserveReq>,
+      sc::custom_reaction<StartReplica>,
+      // unexpected (bug-induced) events:
       sc::custom_reaction<ReplicaRelease>,
-      sc::custom_reaction<ReserverGranted>,
-      sc::custom_reaction<StartReplica>>;
+      sc::custom_reaction<ReserverGranted>>;
 
   sc::result react(const ReplicaReserveReq& ev);
   sc::result react(const StartReplica& ev);
index 067b9754c1108ff47e94cf0c93714b4a1869c6d1..66e61d856cd4f62d958811eabb9de5e3f0630a9d 100644 (file)
@@ -3,10 +3,11 @@
 #pragma once
 
 #include <fmt/ranges.h>
-
 #include "common/ceph_time.h"
+#include "common/fmt_common.h"
 #include "common/scrub_types.h"
 #include "include/types.h"
+#include "messages/MOSDScrubReserve.h"
 #include "os/ObjectStore.h"
 
 #include "OpRequest.h"
@@ -24,6 +25,32 @@ namespace Scrub {
   struct ReplicaActive;
 }
 
+/// reservation-related data sent by the primary to the replicas,
+/// and used to match the responses to the requests
+struct AsyncScrubResData {
+  spg_t pgid;
+  pg_shard_t from;
+  epoch_t request_epoch;
+  MOSDScrubReserve::reservation_nonce_t nonce;
+  AsyncScrubResData(
+      spg_t pgid,
+      pg_shard_t from,
+      epoch_t request_epoch,
+      MOSDScrubReserve::reservation_nonce_t nonce)
+      : pgid{pgid}
+      , from{from}
+      , request_epoch{request_epoch}
+      , nonce{nonce}
+  {}
+  template <typename FormatContext>
+  auto fmt_print_ctx(FormatContext& ctx) const
+  {
+    return fmt::format_to(
+       ctx.out(), "pg[{}],f:{},ep:{},n:{}", pgid, from, request_epoch, nonce);
+  }
+};
+
+
 /// Facilitating scrub-related object access to private PG data
 class ScrubberPasskey {
 private:
@@ -317,6 +344,8 @@ struct ScrubPgIF {
 
   virtual void send_scrub_is_finished(epoch_t epoch_queued) = 0;
 
+  virtual void send_granted_by_reserver(const AsyncScrubResData& req) = 0;
+
   virtual void on_applied_when_primary(const eversion_t& applied_version) = 0;
 
   // --------------------------------------------------