From e65ded632888bd07daf5189b210934a872026116 Mon Sep 17 00:00:00 2001
From: Ronen Friedman
Date: Mon, 13 Nov 2023 01:09:45 -0600
Subject: [PATCH] osd/scrub: decouple being reserved from handling scrub requests

For a replica, following this change:
* 'ReplicaActive' captures the state of the scrubber when acting as
  a replica, from peering to interval change;
* 'being reserved' is just a flag maintained by ReplicaActive, and is
  no longer a prerequisite for handling scrub requests;
* each scrub request is now associated with its own 'token' value.

This change also brings a minor simplification:
* the 'should we wait for pushes?' decision is now made in the code
  executed on the transition from ReplicaIdle into ReplicaActiveOp.
  'StartReplicaNoWait' can therefore be discarded.

Signed-off-by: Ronen Friedman
---
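Reviewer notes (free-form text here, between the separator and the diffstat,
is ignored by 'git am'). First, a condensed model of the replica-side flow
after this change: the reservation is a mere flag, and a single StartReplica
path decides internally whether to wait for recovery pushes. Only the state
and event names mirror the patch; the model class and main() below are
illustrative only and are not ceph code:

// A condensed model of the replica-side behaviour introduced by this patch.
// Names such as Idle/WaitUpdates/BuildingMap and on_start_replica() mirror
// the patch; everything else is a stand-in for illustration.
#include <cassert>
#include <iostream>

enum class RState { Idle, WaitUpdates, BuildingMap };

struct ReplicaFlowModel {
  bool reserved_by_my_primary = false;  // now merely a flag, never a gate
  RState state = RState::Idle;
  int pending_pushes = 0;  // stand-in for pending_active_pushes()

  // MOSDScrubReserve::REQUEST: grant or reject, but scrubbing is unaffected
  bool on_reserve_req(bool osd_grants) {
    if (reserved_by_my_primary) {
      on_release();  // the primary may have restarted silently: reset first
    }
    reserved_by_my_primary = osd_grants;
    return osd_grants;  // reply GRANT / REJECT to the primary
  }

  // MOSDScrubReserve::RELEASE
  void on_release() { reserved_by_my_primary = false; }

  // StartReplica: the single entry point into the active-op states; the
  // 'wait for pushes?' decision is internal (StartReplicaNoWait is gone)
  void on_start_replica() {
    assert(state == RState::Idle);
    state = RState::WaitUpdates;
    on_pushes_update();  // models check_for_updates() posting ReplicaPushesUpd
  }

  void on_pushes_update() {
    if (state == RState::WaitUpdates && pending_pushes == 0) {
      state = RState::BuildingMap;  // done waiting; go build the chunk map
    }
  }

  void on_map_sent() { state = RState::Idle; }  // ~ReplicaActiveOp
};

int main() {
  ReplicaFlowModel r;
  r.on_reserve_req(true);   // reserved by our primary...
  r.on_release();           // ...then released,
  r.on_start_replica();     // yet a map request is still serviced:
  assert(!r.reserved_by_my_primary && r.state == RState::BuildingMap);
  std::cout << "map request handled while not reserved\n";
  r.on_map_sent();
  return 0;
}

The assert in main() captures the point of the patch: a released (or
never-granted) reservation no longer prevents the replica from serving a map
request; only an interval change tears the replica state down.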
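Second, the new ReplicaIdle reaction list (scrub_machine.h, below) relies on
Boost.Statechart's 'transition with action' form -
sc::transition<Event, Destination, TransitionContext, &TransitionContext::action> -
to run ReplicaActive::check_for_updates() while moving into
ReplicaWaitUpdates; that action posts the ReplicaPushesUpd event that
ReplicaWaitUpdates then handles, which is what makes StartReplicaNoWait
unnecessary. A minimal stand-alone demonstration of the idiom (assumes
Boost.Statechart is available; all names are local to this sketch):

// Demonstrates a statechart transition that invokes an action on an outer
// state while moving between two of its inner states.
#include <boost/statechart/event.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/transition.hpp>
#include <iostream>

namespace sc = boost::statechart;

struct StartOp : sc::event<StartOp> {};

struct Outer;
struct Machine : sc::state_machine<Machine, Outer> {};

struct Idle;
struct Outer : sc::simple_state<Outer, Machine, Idle> {
  // runs *during* the transition, in the context of the still-active outer
  // state (cf. ReplicaActive::check_for_updates())
  void check_for_updates(const StartOp&) {
    std::cout << "transition action: queue a pushes-update event\n";
  }
};

struct Busy;
struct Idle : sc::simple_state<Idle, Outer> {
  using reactions =
      sc::transition<StartOp, Busy, Outer, &Outer::check_for_updates>;
};

struct Busy : sc::simple_state<Busy, Outer> {
  Busy() { std::cout << "entered Busy\n"; }  // cf. ReplicaWaitUpdates
};

int main() {
  Machine m;
  m.initiate();
  m.process_event(StartOp{});  // prints the action line, then enters Busy
}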
 src/messages/MOSDScrubReserve.h        |   2 +-
 src/osd/PG.cc                          |   5 +
 src/osd/PG.h                           |   2 +
 src/osd/scrubber/pg_scrubber.cc        | 116 ++++-------------
 src/osd/scrubber/pg_scrubber.h         |  30 ++---
 src/osd/scrubber/scrub_machine.cc      | 164 +++++++++++++++++++------
 src/osd/scrubber/scrub_machine.h       | 132 ++++++++++++++------
 src/osd/scrubber/scrub_machine_lstnr.h |   6 -
 src/osd/scrubber_common.h              |   5 +
 9 files changed, 272 insertions(+), 190 deletions(-)

diff --git a/src/messages/MOSDScrubReserve.h b/src/messages/MOSDScrubReserve.h
index f1f76b3e6fe..c7ab9854117 100644
--- a/src/messages/MOSDScrubReserve.h
+++ b/src/messages/MOSDScrubReserve.h
@@ -24,7 +24,7 @@ private:
 public:
   spg_t pgid;
   epoch_t map_epoch;
-  enum {
+  enum ReserveMsgOp {
     REQUEST = 0,
     GRANT = 1,
     RELEASE = 2,
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index d2f97a129a2..ddef326e2a8 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1823,6 +1823,11 @@ void PG::on_activate(interval_set<snapid_t> snaps)
   m_scrubber->on_pg_activate(m_planned_scrub);
 }
 
+void PG::on_replica_activate()
+{
+  m_scrubber->on_replica_activate();
+}
+
 void PG::on_active_exit()
 {
   backfill_reserving = false;
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 2e82e74ab01..e0f070960b4 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -624,6 +624,8 @@ public:
 
   void on_activate(interval_set<snapid_t> snaps) override;
 
+  void on_replica_activate() override;
+
   void on_activate_committed() override;
 
   void on_active_actmap() override;
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 70d314f0d2f..a88a09aeb07 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -85,6 +85,13 @@ ostream& operator<<(ostream& out, const requested_scrub_t& sf)
   return out;
 }
 
+void PgScrubber::on_replica_activate()
+{
+  dout(10) << __func__ << dendl;
+  m_fsm->process_event(ReplicaActivate{});
+}
+
+
 /*
  * if the incoming message is from a previous interval, it must mean
  * PrimaryLogPG::on_change() was called when that interval ended. We can safely
@@ -197,7 +204,6 @@ bool PgScrubber::should_abort() const
  *
  * Some of the considerations above are also relevant to the replica-side
  * initiation
- * ('StartReplica' & 'StartReplicaNoWait').
  */
 
 void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
@@ -216,11 +222,6 @@ void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
   }
 }
 
-void PgScrubber::dec_scrubs_remote()
-{
-  m_osds->get_scrub_services().dec_scrubs_remote(m_pg_id.pgid);
-}
-
 void PgScrubber::advance_token()
 {
   m_current_token++;
@@ -274,13 +275,7 @@ void PgScrubber::send_start_replica(epoch_t epoch_queued,
   }
 
   if (check_interval(epoch_queued) && is_token_current(token)) {
-    // save us some time by not waiting for updates if there are none
-    // to wait for. Affects the transition from NotActive into either
-    // ReplicaWaitUpdates or ActiveReplica.
-    if (pending_active_pushes())
-      m_fsm->process_event(StartReplica{});
-    else
-      m_fsm->process_event(StartReplicaNoWait{});
+    m_fsm->process_event(StartReplica{});
   }
   dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
@@ -452,6 +447,11 @@ unsigned int PgScrubber::scrub_requeue_priority(
  * Responsible for resetting any scrub state and releasing any resources.
  * Any inflight events will be ignored via check_interval/should_drop_message
  * or canceled.
+ * Specifically:
+ * - if Primary and in an active session - the IntervalChanged handler takes
+ *   care of discarding the remote reservations, and transitioning out of
+ *   Session. That resets both the scrubber and the FSM.
+ * - if we are a reserved replica - we need to free ourselves.
  */
 void PgScrubber::on_new_interval()
 {
@@ -461,13 +461,7 @@ void PgScrubber::on_new_interval()
 	   is_scrub_active(),
 	   is_queued_or_active())
 	<< dendl;
 
-  // If in active session - the IntervalChanged handler takes care of
-  // discarding the remote reservations, and transitioning out of Session.
-  // That resets both the scrubber and the FSM.
   m_fsm->process_event(IntervalChanged{});
-
-  // The 'FullReset' is only relevant if we are not an active Primary
-  m_fsm->process_event(FullReset{});
   rm_from_osd_scrubbing();
 }
@@ -1139,13 +1133,7 @@ void PgScrubber::on_init()
   m_pg->publish_stats_to_osd();
 }
 
-/*
- * Note: as on_replica_init() is likely to be called twice (entering
- * both ReplicaWaitUpdates & ActiveReplica), its operations should be
- * idempotent.
- * Now that it includes some state-changing operations, we need to check
- * m_active against double-activation.
- */
+
 void PgScrubber::on_replica_init()
 {
   dout(10) << __func__ << " called with 'active' "
@@ -1159,6 +1147,7 @@ void PgScrubber::on_replica_init()
   }
 }
 
+
 int PgScrubber::build_primary_map_chunk()
 {
   epoch_t map_building_since = m_pg->get_osdmap_epoch();
@@ -1217,23 +1206,21 @@ int PgScrubber::build_replica_map_chunk()
 
       // the local map has been created. Send it to the primary.
       // Note: once the message reaches the Primary, it may ask us for another
-      // chunk - and we better be done with the current scrub. Thus - the
-      // preparation of the reply message is separate, and we clear the scrub
-      // state before actually sending it.
+      // chunk - and we better be done with the current scrub. The clearing of
+      // state must be complete before we relinquish the PG lock.
 
-      auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption);
-      replica_handling_done();
-      dout(15) << __func__ << " chunk map sent " << dendl;
-      send_replica_map(reply);
-    } break;
+      send_replica_map(prep_replica_map_msg(PreemptionNoted::no_preemption));
+      dout(15) << fmt::format("{}: chunk map sent", __func__) << dendl;
+    }
+    break;
 
     default:
      // negative retval: build_scrub_map_chunk() signalled an error
      // Pre-Pacific code ignored this option, treating it as a success.
      // \todo Add an error flag in the returning message.
+     // \todo: must either abort, send a reply, or return some error message
      dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: "
 	     << ret << dendl;
-     replica_handling_done();
      // only in debug mode for now:
      assert(false && "backend error");
      break;
@@ -1520,6 +1507,7 @@ void PgScrubber::replica_scrub_op(OpRequestRef op)
 
   replica_scrubmap_pos.reset();	// needed? RRR
 
   set_queued_or_active();
+  advance_token();
   m_osds->queue_for_rep_scrub(m_pg,
 			      m_replica_request_priority,
 			      m_flags.priority,
@@ -1675,7 +1663,7 @@ void PgScrubber::handle_scrub_reserve_msgs(OpRequestRef op)
   auto m = op->get_req<MOSDScrubReserve>();
   switch (m->type) {
     case MOSDScrubReserve::REQUEST:
-      handle_scrub_reserve_request(op);
+      m_fsm->process_event(ReplicaReserveReq{op, m->from});
       break;
     case MOSDScrubReserve::GRANT:
       m_fsm->process_event(ReplicaGrant{op, m->from});
@@ -1684,65 +1672,12 @@ void PgScrubber::handle_scrub_reserve_msgs(OpRequestRef op)
       m_fsm->process_event(ReplicaReject{op, m->from});
       break;
     case MOSDScrubReserve::RELEASE:
-      handle_scrub_reserve_release(op);
+      m_fsm->process_event(ReplicaRelease{op, m->from});
       break;
   }
 }
 
-void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
-{
-  auto request_ep = op->sent_epoch;
-  dout(20) << fmt::format("{}: request_ep:{} recovery:{}",
-			  __func__,
-			  request_ep,
-			  m_osds->is_recovery_active())
-	   << dendl;
-
-  // The primary may unilaterally restart the scrub process without notifying
-  // replicas. Unconditionally clear any existing state prior to handling
-  // the new reservation.
-  m_fsm->process_event(FullReset{});
-
-  bool granted{false};
-  if (m_pg->cct->_conf->osd_scrub_during_recovery ||
-      !m_osds->is_recovery_active()) {
-
-    granted = m_osds->get_scrub_services().inc_scrubs_remote(m_pg_id.pgid);
-    if (granted) {
-      m_fsm->process_event(ReplicaGrantReservation{});
-    } else {
-      dout(20) << __func__ << ": failed to reserve remotely" << dendl;
-    }
-  } else {
-    dout(10) << __func__ << ": recovery is active; not granting" << dendl;
-  }
-
-  dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
-
-  Message* reply = new MOSDScrubReserve(
-    spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard),
-    request_ep,
-    granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
-    m_pg_whoami);
-
-  m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
-}
-
-void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
-{
-  dout(10) << __func__ << " " << *op->get_req() << dendl;
-  if (should_drop_message(op)) {
-    // we might have turned into a Primary in the meantime. The interval
-    // change should have been noticed already, and caused us to reset.
-    return;
-  }
-
-  // this specific scrub session has terminated. All incoming events carrying
-  // the old tag will be discarded.
-  m_fsm->process_event(FullReset{});
-}
-
 bool PgScrubber::set_reserving_now()
 {
   return m_osds->get_scrub_services().set_reserving_now(m_pg_id,
 							ceph_clock_now());
@@ -2211,6 +2146,7 @@ void PgScrubber::handle_query_state(ceph::Formatter* f)
 
 PgScrubber::~PgScrubber()
 {
+  m_fsm->process_event(IntervalChanged{});
   if (m_scrub_job) {
     // make sure the OSD won't try to scrub this one just now
     rm_from_osd_scrubbing();
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
index 9946f9ce5ee..2553e49b263 100644
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -258,14 +258,6 @@ class PgScrubber : public ScrubPgIF,
    */
   void handle_scrub_reserve_msgs(OpRequestRef op) final;
 
-  /**
-   * we are a replica being asked by the Primary to reserve OSD resources for
-   * scrubbing
-   */
-  void handle_scrub_reserve_request(OpRequestRef op);
-
-  void handle_scrub_reserve_release(OpRequestRef op);
-
   // managing scrub op registration
 
   void update_scrub_job(const requested_scrub_t& request_flags) final;
@@ -334,6 +326,8 @@ class PgScrubber : public ScrubPgIF,
 
   void on_new_interval() final;
 
+  void on_replica_activate() final;
+
   void scrub_clear_state() final;
 
   bool is_queued_or_active() const final;
@@ -476,13 +470,9 @@ class PgScrubber : public ScrubPgIF,
   [[nodiscard]] bool was_epoch_changed() const final;
 
   void set_queued_or_active() final;
-  /// Clears `m_queued_or_active` and restarts snaptrimming
+  /// Clears `m_queued_or_active` and restarts snap-trimming
   void clear_queued_or_active() final;
 
-  void dec_scrubs_remote() final;
-
-  void advance_token() final;
-
   void mark_local_map_ready() final;
 
   [[nodiscard]] bool are_all_maps_available() const final;
@@ -567,6 +557,9 @@ class PgScrubber : public ScrubPgIF,
 
   void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
 
+  /// Modify the token identifying the current replica scrub operation
+  void advance_token();
+
   /**
    * mark down some parameters of the initiated scrub:
    * - the epoch when started;
@@ -675,11 +668,12 @@ class PgScrubber : public ScrubPgIF,
   epoch_t m_epoch_start{0};  ///< the actual epoch when scrubbing started
 
   /**
-   * (replica) a tag identifying a specific scrub "session". Incremented
-   * whenever the Primary releases the replica scrub resources. When the scrub
-   * session is terminated (even if the interval remains unchanged, as might
-   * happen following an asok no-scrub command), stale scrub-resched messages
-   * triggered by the backend will be discarded.
+   * (replica) a tag identifying a specific replica operation, i.e. the
+   * creation of the replica scrub map for a single chunk.
+   * Incremented immediately before sending a response to the primary,
+   * so that the next request will be identified as a new operation. Also
+   * changed on reservation release.
+   * Used to identify stale scrub-re-sched messages triggered by the backend.
   */
  Scrub::act_token_t m_current_token{1};
 
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc
index 99286acaa12..cb10d87236b 100644
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -654,66 +654,163 @@ ScrubMachine::~ScrubMachine() = default;
 
 // -------- for replicas -----------------------------------------------------
 
-// ----------------------- ReservedReplica --------------------------------
+// ----------------------- ReplicaActive --------------------------------
 
-ReservedReplica::ReservedReplica(my_context ctx)
+ReplicaActive::ReplicaActive(my_context ctx)
     : my_base(ctx)
-    , NamedSimply(context().m_scrbr, "ReservedReplica")
+    , NamedSimply(context().m_scrbr, "ReplicaActive")
 {
-  dout(10) << "-- state -->> ReservedReplica" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "-- state -->> ReplicaActive" << dendl;
+  m_pg = scrbr->get_pg();
+  m_osds = m_pg->get_pg_osd(ScrubberPasskey());
+}
+
+ReplicaActive::~ReplicaActive()
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  if (reserved_by_my_primary) {
+    dout(10) << "ReplicaActive::~ReplicaActive(): clearing reservation"
+	     << dendl;
+    clear_reservation_by_remote_primary();
+  }
+}
+
+
+/*
+ * Note: we are expected to be in the initial internal state (Idle) when
+ * receiving any reservation request. Our other internal states, the
+ * active ones, have their own handler for this event, and will treat it
+ * as an abort request.
+ *
+ * Process:
+ * - if already reserved: clear the existing reservation, then continue
+ * - ask the OSD for the "reservation resource"
+ * - if granted: mark it internally and notify the Primary.
+ * - otherwise: just notify the requesting primary.
+ */
+void ReplicaActive::on_reserve_req(const ReplicaReserveReq& ev)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "ReplicaActive::on_reserve_req()" << dendl;
+
+  if (reserved_by_my_primary) {
+    dout(10) << "ReplicaActive::on_reserve_req(): already reserved" << dendl;
+    // clear the existing reservation
+    clear_reservation_by_remote_primary();  // clears the flag, too
+  }
+
+  // ask the OSD for the reservation
+  const auto ret = get_remote_reservation();
+  if (ret.granted) {
+    reserved_by_my_primary = true;
+    dout(10) << fmt::format("{}: reserved? yes", __func__) << dendl;
+  } else {
+    dout(10) << fmt::format("{}: reserved? no ({})", __func__, ret.error_msg)
+	     << dendl;
+  }
+
+  Message* reply = new MOSDScrubReserve(
+      spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch,
+      ret.op, m_pg->pg_whoami);
+  m_osds->send_message_osd_cluster(reply, ev.m_op->get_req()->get_connection());
+}
+
+
+void ReplicaActive::on_release(const ReplicaRelease& ev)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  if (!reserved_by_my_primary) {
+    dout(5) << fmt::format(
+		   "ReplicaActive::on_release() from {}: not reserved!",
+		   ev.m_from)
+	    << dendl;
+    return;
+  }
+  dout(10) << fmt::format("ReplicaActive::on_release() from {}", ev.m_from)
+	   << dendl;
+  clear_reservation_by_remote_primary();
+}
+
+
+ReplicaActive::ReservationAttemptRes ReplicaActive::get_remote_reservation()
+{
+  using ReservationAttemptRes = ReplicaActive::ReservationAttemptRes;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  if (!scrbr->get_pg_cct()->_conf.get_val<bool>("osd_scrub_during_recovery") &&
+      m_osds->is_recovery_active()) {
+    return ReservationAttemptRes{
+	MOSDScrubReserve::REJECT, "recovery is active", false};
+  }
+
+  if (m_osds->get_scrub_services().inc_scrubs_remote(scrbr->get_spgid().pgid)) {
+    return ReservationAttemptRes{MOSDScrubReserve::GRANT, "", true};
+  } else {
+    return ReservationAttemptRes{
+	MOSDScrubReserve::REJECT, "failed to reserve remotely", false};
+  }
 }
 
-ReservedReplica::~ReservedReplica()
+
+void ReplicaActive::clear_reservation_by_remote_primary()
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  scrbr->dec_scrubs_remote();
-  scrbr->advance_token();
+  dout(10) << "ReplicaActive::clear_reservation_by_remote_primary()" << dendl;
+  m_osds->get_scrub_services().dec_scrubs_remote(scrbr->get_spgid().pgid);
+  reserved_by_my_primary = false;
 }
 
-// ----------------------- ReplicaIdle --------------------------------
+
+void ReplicaActive::check_for_updates(const StartReplica& ev)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "ReplicaActive::check_for_updates()" << dendl;
+  post_event(ReplicaPushesUpd{});
+}
+
+// ---------------- ReplicaActive/ReplicaIdle ---------------------------
 
 ReplicaIdle::ReplicaIdle(my_context ctx)
     : my_base(ctx)
-    , NamedSimply(
-	  context().m_scrbr,
-	  "ReservedReplica/ReplicaIdle")
+    , NamedSimply(context().m_scrbr, "ReplicaActive/ReplicaIdle")
 {
-  dout(10) << "-- state -->> ReservedReplica/ReplicaIdle" << dendl;
+  dout(10) << "-- state -->> ReplicaActive/ReplicaIdle" << dendl;
 }
-ReplicaIdle::~ReplicaIdle() = default;
 
-// ----------------------- ReplicaActiveOp --------------------------------
+// ------------- ReplicaActive/ReplicaActiveOp --------------------------
 
 ReplicaActiveOp::ReplicaActiveOp(my_context ctx)
     : my_base(ctx)
-    , NamedSimply(
-	  context().m_scrbr,
-	  "ReservedReplica/ReplicaActiveOp")
+    , NamedSimply(context().m_scrbr, "ReplicaActiveOp")
 {
-  dout(10) << "-- state -->> ReservedReplica/ReplicaActiveOp" << dendl;
+  dout(10) << "-- state -->> ReplicaActive/ReplicaActiveOp" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  scrbr->on_replica_init();
+}
+
+
+ReplicaActiveOp::~ReplicaActiveOp()
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << __func__ << dendl;
+  scrbr->replica_handling_done();
 }
 
-/**
- * \note: here is too late to call replica_handling_done(). See the
- * comment in build_replica_map_chunk()
- */
-ReplicaActiveOp::~ReplicaActiveOp() = default;
 
-// ----------------------- ReplicaWaitUpdates --------------------------------
+// ------------- ReplicaActive/ReplicaWaitUpdates ------------------------
 
 ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx)
     : my_base(ctx)
     , NamedSimply(
 	  context().m_scrbr,
-	  "ReservedReplica/ReplicaActiveOp/ReplicaWaitUpdates")
+	  "ReplicaActive/ReplicaActiveOp/ReplicaWaitUpdates")
 {
-  dout(10) << "-- state -->> ReservedReplica/ReplicaActiveOp/ReplicaWaitUpdates"
+  dout(10) << "-- state -->> ReplicaActive/ReplicaActiveOp/ReplicaWaitUpdates"
 	   << dendl;
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  scrbr->on_replica_init();
 }
 
+
 /*
  * Triggered externally, by the entity that had an update re pushes
 */
@@ -724,7 +821,6 @@ sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&)
 	   << scrbr->pending_active_pushes() << dendl;
 
   if (scrbr->pending_active_pushes() == 0) {
-    // done waiting
     return transit<ReplicaBuildingMap>();
   }
 
@@ -732,22 +828,21 @@ sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&)
   return discard_event();
 }
 
+
 // ----------------------- ReplicaBuildingMap -----------------------------------
 
 ReplicaBuildingMap::ReplicaBuildingMap(my_context ctx)
     : my_base(ctx)
     , NamedSimply(
 	  context().m_scrbr,
-	  "ReservedReplica/ReplicaActiveOp/ReplicaBuildingMap")
+	  "ReplicaActive/ReplicaActiveOp/ReplicaBuildingMap")
 {
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "-- state -->> ReservedReplica/ReplicaActiveOp/ReplicaBuildingMap"
+  dout(10) << "-- state -->> ReplicaActive/ReplicaActiveOp/ReplicaBuildingMap"
 	   << dendl;
-  // and as we might have skipped ReplicaWaitUpdates:
-  scrbr->on_replica_init();
   post_event(SchedReplica{});
 }
 
+
 sc::result ReplicaBuildingMap::react(const SchedReplica&)
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
@@ -758,7 +853,6 @@ sc::result ReplicaBuildingMap::react(const SchedReplica&)
     dout(10) << "replica scrub job preempted" << dendl;
 
     scrbr->send_preempted_replica();
-    scrbr->replica_handling_done();
     return transit<ReplicaIdle>();
   }
diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h
index fcca0d2cc40..6dbc8b565e5 100644
--- a/src/osd/scrubber/scrub_machine.h
+++ b/src/osd/scrubber/scrub_machine.h
@@ -164,15 +164,12 @@ MEV(IntLocalMapDone)
 /// scrub_snapshot_metadata()
 MEV(DigestUpdate)
 
-/// event emitted when the replica grants a reservation to the primary
-MEV(ReplicaGrantReservation)
+/// we are a replica for this PG
+MEV(ReplicaActivate)
 
 /// initiating replica scrub
 MEV(StartReplica)
 
-/// 'start replica' when there are no pending updates
-MEV(StartReplicaNoWait)
-
 MEV(SchedReplica)
 
 /// Update to active_pushes. 'active_pushes' represents recovery
@@ -211,8 +208,11 @@ struct NotActive;	    ///< the quiescent state. No active scrubbing.
 struct Session;		    ///< either reserving or actively scrubbing
 struct ReservingReplicas;   ///< securing scrub resources from replicas' OSDs
 struct ActiveScrubbing;	    ///< the active state for a Primary. A sub-machine.
-struct ReplicaIdle;	    ///< Initial reserved replica state
-struct ReplicaBuildingMap;  ///< an active state for a replica.
+// the active states for a replica:
+struct ReplicaActive;	    ///< the quiescent state for a replica
+struct ReplicaActiveOp;
+struct ReplicaWaitUpdates;
+struct ReplicaBuildingMap;
 
 class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
@@ -370,8 +370,8 @@ public:
 *
 * - a special end-of-recovery Primary scrub event ('AfterRepairScrub').
 *
- * - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by
- *   an incoming MOSDRepScrub message.
+ * - (if already in ReplicaActive): an incoming MOSDRepScrub triggers
+ *   'StartReplica'.
 *
 * note (20.8.21): originally, AfterRepairScrub was triggering a scrub without
 * waiting for replica resources to be acquired. But once replicas started
@@ -381,11 +381,13 @@ public:
 struct NotActive : sc::state<NotActive, ScrubMachine>, NamedSimply {
   explicit NotActive(my_context ctx);
 
-  using reactions =
-      mpl::list<sc::custom_reaction<StartScrub>,
-		// a scrubbing that was initiated at recovery completion:
-		sc::custom_reaction<AfterRepairScrub>,
-		sc::transition<ReplicaGrantReservation, ReservedReplica>>;
+  using reactions = mpl::list<
+      sc::custom_reaction<StartScrub>,
+      // a scrubbing that was initiated at recovery completion:
+      sc::custom_reaction<AfterRepairScrub>,
+      // peering done, and we are a replica
+      sc::transition<ReplicaActivate, ReplicaActive>>;
+
   sc::result react(const StartScrub&);
   sc::result react(const AfterRepairScrub&);
 };
@@ -611,45 +613,95 @@ struct WaitDigestUpdate : sc::state<WaitDigestUpdate, ActiveScrubbing>,
 
 // ----------------------------- the "replica active" states
 
-/**
- * ReservedReplica
+/*
+ * The replica states:
 *
- * Parent state for replica states, Controls lifecycle for
- * PgScrubber::m_reservations.
+ * ReplicaActive - starts after being peered as a replica. Ends on interval
+ *   change.
+ *   - maintains the "I am reserved by a primary" state;
+ *   - handles reservation requests
+ *
+ *   - ReplicaIdle - ready for a new scrub request
+ *     * initial state of ReplicaActive
+ *
+ *   - ReplicaActiveOp - handling a single map request op
+ *     * ReplicaWaitUpdates
+ *     * ReplicaBuildingMap
 */
-struct ReservedReplica : sc::state<ReservedReplica, ScrubMachine, ReplicaIdle>,
+
+struct ReplicaIdle;
+
+struct ReplicaActive : sc::state<ReplicaActive, ScrubMachine, ReplicaIdle>,
 			 NamedSimply {
-  explicit ReservedReplica(my_context ctx);
-  ~ReservedReplica();
+  explicit ReplicaActive(my_context ctx);
+  ~ReplicaActive();
 
-  using reactions = mpl::list<sc::transition<FullReset, NotActive>>;
-};
+  /// handle a reservation request from a primary
+  void on_reserve_req(const ReplicaReserveReq&);
 
-struct ReplicaWaitUpdates;
+  /// handle a 'release' from a primary
+  void on_release(const ReplicaRelease&);
 
-/**
- * ReplicaIdle
- *
- * Replica is waiting for a map request.
- */
-struct ReplicaIdle : sc::state<ReplicaIdle, ReservedReplica>,
-		     NamedSimply {
-  explicit ReplicaIdle(my_context ctx);
-  ~ReplicaIdle();
+  void check_for_updates(const StartReplica&);
 
   using reactions = mpl::list<
-      sc::transition<StartReplica, ReplicaWaitUpdates>,
-      sc::transition<StartReplicaNoWait, ReplicaBuildingMap>>;
+      // a reservation request from the primary
+      sc::in_state_reaction<
+	  ReplicaReserveReq,
+	  ReplicaActive,
+	  &ReplicaActive::on_reserve_req>,
+      // an explicit release request from the primary
+      sc::in_state_reaction<
+	  ReplicaRelease,
+	  ReplicaActive,
+	  &ReplicaActive::on_release>,
+      // when the interval ends - we may not be a replica anymore
+      sc::transition<IntervalChanged, NotActive>>;
+
+ private:
+  bool reserved_by_my_primary{false};
+
+  // shortcuts:
+  PG* m_pg;
+  OSDService* m_osds;
+
+  /// a convenience internal result structure
+  struct ReservationAttemptRes {
+    MOSDScrubReserve::ReserveMsgOp op;	// GRANT or REJECT
+    std::string_view error_msg;
+    bool granted;
+  };
+
+  /// request a scrub resource from our local OSD
+  /// (after performing some checks)
+  ReservationAttemptRes get_remote_reservation();
+
+  void clear_reservation_by_remote_primary();
 };
 
+
+struct ReplicaIdle : sc::state<ReplicaIdle, ReplicaActive>, NamedSimply {
+  explicit ReplicaIdle(my_context ctx);
+  ~ReplicaIdle() = default;
+
+  // note the execution of check_for_updates() when transitioning to
+  // ReplicaActiveOp/ReplicaWaitUpdates. That would trigger a ReplicaPushesUpd
+  // event, which will be handled by ReplicaWaitUpdates.
+  using reactions = mpl::list<sc::transition<
+      StartReplica,
+      ReplicaWaitUpdates,
+      ReplicaActive,
+      &ReplicaActive::check_for_updates>>;
+};
+
+
 /**
  * ReplicaActiveOp
 *
- * Lifetime matches handling for a single map request op
+ * Lifetime matches handling for a single map request op.
 */
 struct ReplicaActiveOp
-    : sc::state<ReplicaActiveOp, ReservedReplica, ReplicaWaitUpdates>,
-      NamedSimply {
+    : sc::state<ReplicaActiveOp, ReplicaActive, ReplicaWaitUpdates>,
+      NamedSimply {
   explicit ReplicaActiveOp(my_context ctx);
   ~ReplicaActiveOp();
 };
@@ -670,8 +722,8 @@ struct ReplicaWaitUpdates : sc::state<ReplicaWaitUpdates, ReplicaActiveOp>,
 };
 
-struct ReplicaBuildingMap : sc::state<ReplicaBuildingMap, ReplicaActiveOp>
-			  , NamedSimply {
+struct ReplicaBuildingMap : sc::state<ReplicaBuildingMap, ReplicaActiveOp>,
+			    NamedSimply {
   explicit ReplicaBuildingMap(my_context ctx);
 
   using reactions = mpl::list<sc::custom_reaction<SchedReplica>>;
diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h
index 4206c789f91..890a70a8a12 100644
--- a/src/osd/scrubber/scrub_machine_lstnr.h
+++ b/src/osd/scrubber/scrub_machine_lstnr.h
@@ -196,12 +196,6 @@ struct ScrubMachineListener {
   virtual void set_queued_or_active() = 0;
   virtual void clear_queued_or_active() = 0;
 
-  /// Release remote scrub reservation
-  virtual void dec_scrubs_remote() = 0;
-
-  /// Advance replica token
-  virtual void advance_token() = 0;
-
   /**
    * Our scrubbing is blocked, waiting for an excessive length of time for
    * our target chunk to be unlocked. We will set the corresponding flags,
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h
index 16810bba15c..d24bb79b801 100644
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -18,12 +18,14 @@ struct PGPool;
 
 namespace Scrub {
   class ReplicaReservations;
+  struct ReplicaActive;
 }
 
 /// Facilitating scrub-related object access to private PG data
 class ScrubberPasskey {
  private:
   friend class Scrub::ReplicaReservations;
+  friend struct Scrub::ReplicaActive;
   friend class PrimaryLogScrub;
   friend class PgScrubber;
   friend class ScrubBackend;
@@ -310,6 +312,9 @@ struct ScrubPgIF {
   /// the OSD scrub queue
   virtual void on_new_interval() = 0;
 
+  /// we are peered as a replica
+  virtual void on_replica_activate() = 0;
+
   virtual void scrub_clear_state() = 0;
 
   virtual void handle_query_state(ceph::Formatter* f) = 0;
-- 
2.39.5