]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/scrub: uniform handling of reservation requests 56459/head
authorRonen Friedman <rfriedma@redhat.com>
Wed, 3 Apr 2024 07:52:35 +0000 (02:52 -0500)
committerRonen Friedman <rfriedma@redhat.com>
Wed, 3 Apr 2024 08:42:33 +0000 (03:42 -0500)
we now allow - on the replica side - reservation requests
regardless of the ReplicaActive sub-state. I.e. - we will
honor such requests even when handling a chunk request
(in ReplicaActiveOp).

Note that the current Primary code would never send such
a request. But a future primary code might.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
src/osd/scrubber/scrub_machine.cc
src/osd/scrubber/scrub_machine.h

index 4f0cd7b0c8eb755f6f4080f79223f8bae866876a..66ba0751d0e910dcf5fb35d2c751ec70fd050c6f 100644 (file)
@@ -774,26 +774,48 @@ void ReplicaActive::exit()
 
 
 /*
- * Note: we are expected to be in the initial internal state (Idle) when
- * receiving any registration request. ReplicaActiveOp, our other internal
- * state, has its own handler for this event, and will treat it
- * as an abort request.
- *
+ * Note: we are expected to be in the ReplicaIdle sub-state: the current
+ * scrub code on the primary side would never interlace chunk ops with
+ * reservation requests. But 'badly timed' requests are not blocked
+ * on the replica side: while requests arriving while in ReplicaActiveOp
+ * are at this time probably a bug; but a future Primary scrub code
+ * would possibly treat 'reservation' & 'scrubbing' as (almost)
+ * totally orthogonal.
+ */
+sc::result ReplicaActive::react(const ReplicaReserveReq& ev)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "ReplicaActive::react(const ReplicaReserveReq&)" << dendl;
+
+  if (m_reservation_status != reservation_status_t::unreserved) {
+    // we are not expected to be in this state when a new request arrives.
+    // Clear the existing reservation - be it granted or pending.
+    const auto& m = *(ev.m_op->get_req<MOSDScrubReserve>());
+    dout(1) << fmt::format(
+                  "ReplicaActive::react(const ReplicaReserveReq&): unexpected "
+                  "request. Discarding existing "
+                  "reservation (was granted?:{}). Incoming request: {}",
+                  reservation_granted, m)
+           << dendl;
+    clear_remote_reservation(true);
+  }
+
+  handle_reservation_request(ev);
+  return discard_event();
+}
+
+
+/*
  * Process:
- * - if already reserved: clear existing reservation, then continue
  * - for async requests:
- *   - enqueue the request with reserver;
- *   - move to the ReplicaWaitingReservation state
+ *   - enqueue the request with reserver, noting the nonce;
  *   - no reply is expected by the caller
  * - for legacy requests:
- *   - ask the OSD for the "reservation resource"
- *   - if granted: move to ReplicaReserved and notify the Primary.
- *   - otherwise: just notify the requesting primary.
- *
- * implementation note: sc::result objects cannot be copied or moved. Thus,
- * we've resorted to returning a code indicating the next action.
+ *   - ask the OSD for the "reservation resource";
+ *   - send grant/reject to the requesting primary;
+ *   - update 'reservation_granted'
  */
-sc::result ReplicaActive::react(const ReplicaReserveReq& ev)
+void ReplicaActive::handle_reservation_request(const ReplicaReserveReq& ev)
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
   const auto& m = *(ev.m_op->get_req<MOSDScrubReserve>());
@@ -803,24 +825,11 @@ sc::result ReplicaActive::react(const ReplicaReserveReq& ev)
       "osd_scrub_disable_reservation_queuing");
   const bool async_request = !async_disabled && m.wait_for_resources;
   dout(10) << fmt::format(
-                 "ReplicaActive::react(const ReplicaReserveReq&): async "
-                 "request?:{} disabled?:{} -> async? {}",
-                 m.wait_for_resources, async_disabled, async_request)
+                 "{}: Message:{}. async request?:{} disabled?:{} -> async? {}",
+                 __func__, m, m.wait_for_resources, async_disabled,
+                 async_request)
           << dendl;
 
-  if (m_reservation_status != reservation_status_t::unreserved) {
-    // we are not expected to be in this state when a new request arrives.
-    // Exit-then-reenter ReplicaActive, causing the existing reservation - be
-    // it granted or pending - to be cleared. This maneuver also aborts
-    // any on-going chunk request.
-    dout(1) << fmt::format(
-                  "{}: unexpected request (granted:{} request:{})", __func__,
-                  reservation_granted, m)
-           << dendl;
-    post_event(ev);
-    return transit<ReplicaActive>();
-  }
-
   auto& reserver = m_osds->get_scrub_reserver();
 
   if (async_request) {
@@ -828,8 +837,7 @@ sc::result ReplicaActive::react(const ReplicaReserveReq& ev)
     AsyncScrubResData request_details{
        pg_id, ev.m_from, ev.m_op->sent_epoch, m.reservation_nonce};
     dout(15) << fmt::format(
-                   "ReplicaActive::react(const ReplicaReserveReq& ev): async "
-                   "request: {} details:{}",
+                   "{}: async request: {} details:{}", __func__,
                    ev, request_details)
             << dendl;
 
@@ -859,7 +867,6 @@ sc::result ReplicaActive::react(const ReplicaReserveReq& ev)
     m_osds->send_message_osd_cluster(
        reply, ev.m_op->get_req()->get_connection());
   }
-  return discard_event();
 }
 
 
@@ -1012,20 +1019,6 @@ sc::result ReplicaActiveOp::react(const StartReplica&)
 }
 
 
-sc::result ReplicaActiveOp::react(const ReplicaReserveReq& ev)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "ReplicaActiveOp::react(const ReplicaReserveReq&)" << dendl;
-  scrbr->get_clog()->warn() << fmt::format(
-      "osd.{} pg[{}]: reservation requested while processing a chunk",
-      scrbr->get_whoami(), scrbr->get_spgid());
-  // exit-n-renter ReplicaActive (aborting the chunk & freeing the reservation
-  // in the process)
-  post_event(ev);
-  return transit<ReplicaActive>();
-}
-
-
 sc::result ReplicaActiveOp::react(const ReplicaRelease& ev)
 {
   dout(10) << "ReplicaActiveOp::react(const ReplicaRelease&)" << dendl;
index 31625b55d43a68ca84e93ff2e34e3e6c4a07ae74..b9f60481674da57647fd99008b6bbbb4dde02bd4 100644 (file)
@@ -887,6 +887,16 @@ struct ReplicaActive : sc::state<ReplicaActive, ScrubMachine, ReplicaIdle>,
 
   reservation_status_t m_reservation_status{reservation_status_t::unreserved};
 
+  /**
+   * React to the reservation request.
+   * Called after any existing pending/granted request was released.
+   *
+   * Async requests are sent to the reserver.
+   * For old-style synchronous requests, the reserver is queried using
+   * its 'immediate' interface, and the response is sent back to the primary.
+   */
+  void handle_reservation_request(const ReplicaReserveReq& ev);
+
   // clang-format off
   struct RtReservationCB : public Context {
     PGRef pg;
@@ -929,8 +939,7 @@ struct ReplicaActiveOp
 
   using reactions = mpl::list<
       sc::custom_reaction<StartReplica>,
-      sc::custom_reaction<ReplicaRelease>,
-      sc::custom_reaction<ReplicaReserveReq>>;
+      sc::custom_reaction<ReplicaRelease>>;
 
   /**
    * Handling the unexpected (read - caused by a bug) case of receiving a
@@ -950,14 +959,6 @@ struct ReplicaActiveOp
    * releasing the reservation on the way.
    */
   sc::result react(const ReplicaRelease&);
-
-  /**
-   * handling unexpected 'reservation requests' while we are in the middle
-   * of serving a chunk request.
-   * This is a bug. We log it, abort the operation, and re-enter ReplicaActive
-   * to handle the reservation request.
-   */
-  sc::result react(const ReplicaReserveReq& ev);
 };
 
 /*