git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd/scrub: decouple being reserved from handling scrub requests
author Ronen Friedman <rfriedma@redhat.com>
Mon, 13 Nov 2023 07:09:45 +0000 (01:09 -0600)
committer Ronen Friedman <rfriedma@redhat.com>
Thu, 30 Nov 2023 11:40:27 +0000 (05:40 -0600)
For a replica, following this change:

* 'ReplicaActive' captures the state of the scrubber when
  acting as a replica, from peering to the next interval change
  (see the state-hierarchy sketch below);

* 'being reserved' is just a flag maintained by ReplicaActive, and
  is no longer a prerequisite for handling scrub requests;

* each scrub request is now associated with its own 'token' value.

and the following minor simplification:

* the 'should we wait for pushes' decision is now part of the
  code executed on the transition from ReplicaIdle into ReplicaActiveOp.
  StartReplicaNoWait can now be discarded.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
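
For orientation, a condensed outline of the replica-side state hierarchy
that results from this change (illustrative only; the authoritative
definitions are in src/osd/scrubber/scrub_machine.h in the diff below):

  NotActive
    --ReplicaActivate--> ReplicaActive          (peered as a replica)
      - handles ReplicaReserveReq / ReplicaRelease in-state, maintaining
        the 'reserved_by_my_primary' flag
      - IntervalChanged --> back to NotActive
      ReplicaIdle                               (initial substate)
        --StartReplica--> ReplicaActiveOp
          ReplicaWaitUpdates                    (initial substate)
            --(no pending pushes)--> ReplicaBuildingMap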
src/messages/MOSDScrubReserve.h
src/osd/PG.cc
src/osd/PG.h
src/osd/scrubber/pg_scrubber.cc
src/osd/scrubber/pg_scrubber.h
src/osd/scrubber/scrub_machine.cc
src/osd/scrubber/scrub_machine.h
src/osd/scrubber/scrub_machine_lstnr.h
src/osd/scrubber_common.h

index f1f76b3e6fe37d14a2cc3928180ea08031b4893c..c7ab985411750bc3320f4eca462aecd03530965c 100644 (file)
@@ -24,7 +24,7 @@ private:
 public:
   spg_t pgid;
   epoch_t map_epoch;
-  enum {
+  enum ReserveMsgOp {
     REQUEST = 0,
     GRANT = 1,
     RELEASE = 2,
index d2f97a129a241336ea91d2d3f35fa450607b7c62..ddef326e2a8acd5d88169702a3a147de3182cb76 100644 (file)
@@ -1823,6 +1823,11 @@ void PG::on_activate(interval_set<snapid_t> snaps)
   m_scrubber->on_pg_activate(m_planned_scrub);
 }
 
+void PG::on_replica_activate()
+{
+  m_scrubber->on_replica_activate();
+}
+
 void PG::on_active_exit()
 {
   backfill_reserving = false;
index 2e82e74ab0127a466bc3f83c64e11063ada7cfec..e0f070960b4eb4dd0526cabf36fbeec60b0ee5e9 100644 (file)
@@ -624,6 +624,8 @@ public:
 
   void on_activate(interval_set<snapid_t> snaps) override;
 
+  void on_replica_activate() override;
+
   void on_activate_committed() override;
 
   void on_active_actmap() override;
index 70d314f0d2f6e83442ceaa16d4444aa1a045d521..a88a09aeb07bb9043157e066d72e32c6d69fcaf9 100644 (file)
@@ -85,6 +85,13 @@ ostream& operator<<(ostream& out, const requested_scrub_t& sf)
   return out;
 }
 
+void PgScrubber::on_replica_activate()
+{
+  dout(10) << __func__ << dendl;
+  m_fsm->process_event(ReplicaActivate{});
+}
+
+
 /*
  * if the incoming message is from a previous interval, it must mean
  * PrimaryLogPG::on_change() was called when that interval ended. We can safely
@@ -197,7 +204,6 @@ bool PgScrubber::should_abort() const
  *
  * Some of the considerations above are also relevant to the replica-side
  * initiation
- * ('StartReplica' & 'StartReplicaNoWait').
  */
 
 void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
@@ -216,11 +222,6 @@ void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
   }
 }
 
-void PgScrubber::dec_scrubs_remote()
-{
-  m_osds->get_scrub_services().dec_scrubs_remote(m_pg_id.pgid);
-}
-
 void PgScrubber::advance_token()
 {
   m_current_token++;
@@ -274,13 +275,7 @@ void PgScrubber::send_start_replica(epoch_t epoch_queued,
   }
 
   if (check_interval(epoch_queued) && is_token_current(token)) {
-    // save us some time by not waiting for updates if there are none
-    // to wait for. Affects the transition from NotActive into either
-    // ReplicaWaitUpdates or ActiveReplica.
-    if (pending_active_pushes())
-      m_fsm->process_event(StartReplica{});
-    else
-      m_fsm->process_event(StartReplicaNoWait{});
+    m_fsm->process_event(StartReplica{});
   }
   dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
@@ -452,6 +447,11 @@ unsigned int PgScrubber::scrub_requeue_priority(
  * Responsible for resetting any scrub state and releasing any resources.
  * Any inflight events will be ignored via check_interval/should_drop_message
  * or canceled.
+ * Specifically:
+ * - if Primary and in an active session - the IntervalChanged handler takes
+ *   care of discarding the remote reservations, and transitioning out of
+ *   Session. That resets both the scrubber and the FSM.
+ * - if we are a reserved replica - we must release the local reservation.
  */
 void PgScrubber::on_new_interval()
 {
@@ -461,13 +461,7 @@ void PgScrubber::on_new_interval()
                  is_scrub_active(), is_queued_or_active())
           << dendl;
 
-  // If in active session - the IntervalChanged handler takes care of
-  // discarding the remote reservations, and transitioning out of Session.
-  // That resets both the scrubber and the FSM.
   m_fsm->process_event(IntervalChanged{});
-
-  // The 'FullReset' is only relevant if we are not an active Primary
-  m_fsm->process_event(FullReset{});
   rm_from_osd_scrubbing();
 }
 
@@ -1139,13 +1133,7 @@ void PgScrubber::on_init()
   m_pg->publish_stats_to_osd();
 }
 
-/*
- * Note: as on_replica_init() is likely to be called twice (entering
- * both ReplicaWaitUpdates & ActiveReplica), its operations should be
- * idempotent.
- * Now that it includes some state-changing operations, we need to check
- * m_active against double-activation.
- */
+
 void PgScrubber::on_replica_init()
 {
   dout(10) << __func__ << " called with 'active' "
@@ -1159,6 +1147,7 @@ void PgScrubber::on_replica_init()
   }
 }
 
+
 int PgScrubber::build_primary_map_chunk()
 {
   epoch_t map_building_since = m_pg->get_osdmap_epoch();
@@ -1217,23 +1206,21 @@ int PgScrubber::build_replica_map_chunk()
 
       // the local map has been created. Send it to the primary.
       // Note: once the message reaches the Primary, it may ask us for another
-      // chunk - and we better be done with the current scrub. Thus - the
-      // preparation of the reply message is separate, and we clear the scrub
-      // state before actually sending it.
+      // chunk - and we better be done with the current scrub. The clearing of
+      // state must be complete before we relinquish the PG lock.
 
-      auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption);
-      replica_handling_done();
-      dout(15) << __func__ << " chunk map sent " << dendl;
-      send_replica_map(reply);
-    } break;
+      send_replica_map(prep_replica_map_msg(PreemptionNoted::no_preemption));
+      dout(15) << fmt::format("{}: chunk map sent", __func__) << dendl;
+    }
+    break;
 
     default:
       // negative retval: build_scrub_map_chunk() signalled an error
       // Pre-Pacific code ignored this option, treating it as a success.
       // \todo Add an error flag in the returning message.
+      // \todo: must either abort, send a reply, or return some error message
       dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: "
              << ret << dendl;
-      replica_handling_done();
       // only in debug mode for now:
       assert(false && "backend error");
       break;
@@ -1520,6 +1507,7 @@ void PgScrubber::replica_scrub_op(OpRequestRef op)
   replica_scrubmap_pos.reset();         // needed? RRR
 
   set_queued_or_active();
+  advance_token();
   m_osds->queue_for_rep_scrub(m_pg,
                              m_replica_request_priority,
                              m_flags.priority,
@@ -1675,7 +1663,7 @@ void PgScrubber::handle_scrub_reserve_msgs(OpRequestRef op)
   auto m = op->get_req<MOSDScrubReserve>();
   switch (m->type) {
     case MOSDScrubReserve::REQUEST:
-      handle_scrub_reserve_request(op);
+      m_fsm->process_event(ReplicaReserveReq{op, m->from});
       break;
     case MOSDScrubReserve::GRANT:
       m_fsm->process_event(ReplicaGrant{op, m->from});
@@ -1684,65 +1672,12 @@ void PgScrubber::handle_scrub_reserve_msgs(OpRequestRef op)
       m_fsm->process_event(ReplicaReject{op, m->from});
       break;
     case MOSDScrubReserve::RELEASE:
-      handle_scrub_reserve_release(op);
+      m_fsm->process_event(ReplicaRelease{op, m->from});
       break;
   }
 }
 
 
-void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
-{
-  auto request_ep = op->sent_epoch;
-  dout(20) << fmt::format("{}: request_ep:{} recovery:{}",
-                         __func__,
-                         request_ep,
-                         m_osds->is_recovery_active())
-          << dendl;
-
-  // The primary may unilaterally restart the scrub process without notifying
-  // replicas. Unconditionally clear any existing state prior to handling
-  // the new reservation.
-  m_fsm->process_event(FullReset{});
-
-  bool granted{false};
-  if (m_pg->cct->_conf->osd_scrub_during_recovery ||
-      !m_osds->is_recovery_active()) {
-
-    granted = m_osds->get_scrub_services().inc_scrubs_remote(m_pg_id.pgid);
-    if (granted) {
-      m_fsm->process_event(ReplicaGrantReservation{});
-    } else {
-      dout(20) << __func__ << ": failed to reserve remotely" << dendl;
-    }
-  } else {
-    dout(10) << __func__ << ": recovery is active; not granting" << dendl;
-  }
-
-  dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
-
-  Message* reply = new MOSDScrubReserve(
-    spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard),
-    request_ep,
-    granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
-    m_pg_whoami);
-
-  m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
-}
-
-void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
-{
-  dout(10) << __func__ << " " << *op->get_req() << dendl;
-  if (should_drop_message(op)) {
-    // we might have turned into a Primary in the meantime. The interval
-    // change should have been noticed already, and caused us to reset.
-    return;
-  }
-
-  // this specific scrub session has terminated. All incoming events carrying
-  // the old tag will be discarded.
-  m_fsm->process_event(FullReset{});
-}
-
 bool PgScrubber::set_reserving_now() {
   return m_osds->get_scrub_services().set_reserving_now(m_pg_id,
                                                         ceph_clock_now());
@@ -2211,6 +2146,7 @@ void PgScrubber::handle_query_state(ceph::Formatter* f)
 
 PgScrubber::~PgScrubber()
 {
+  m_fsm->process_event(IntervalChanged{});
   if (m_scrub_job) {
     // make sure the OSD won't try to scrub this one just now
     rm_from_osd_scrubbing();
index 9946f9ce5ee96e6004c34fedb3aa1224333f7959..2553e49b263ab5e82b6b4b5a165a306740e4e93f 100644 (file)
@@ -258,14 +258,6 @@ class PgScrubber : public ScrubPgIF,
    */
   void handle_scrub_reserve_msgs(OpRequestRef op) final;
 
-  /**
-   *  we are a replica being asked by the Primary to reserve OSD resources for
-   *  scrubbing
-   */
-  void handle_scrub_reserve_request(OpRequestRef op);
-
-  void handle_scrub_reserve_release(OpRequestRef op);
-
   // managing scrub op registration
 
   void update_scrub_job(const requested_scrub_t& request_flags) final;
@@ -334,6 +326,8 @@ class PgScrubber : public ScrubPgIF,
 
   void on_new_interval() final;
 
+  void on_replica_activate() final;
+
   void scrub_clear_state() final;
 
   bool is_queued_or_active() const final;
@@ -476,13 +470,9 @@ class PgScrubber : public ScrubPgIF,
   [[nodiscard]] bool was_epoch_changed() const final;
 
   void set_queued_or_active() final;
-  /// Clears `m_queued_or_active` and restarts snaptrimming
+  /// Clears `m_queued_or_active` and restarts snap-trimming
   void clear_queued_or_active() final;
 
-  void dec_scrubs_remote() final;
-
-  void advance_token() final;
-
   void mark_local_map_ready() final;
 
   [[nodiscard]] bool are_all_maps_available() const final;
@@ -567,6 +557,9 @@ class PgScrubber : public ScrubPgIF,
 
   void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
 
+  /// Modify the token identifying the current replica scrub operation
+  void advance_token();
+
   /**
    *  mark down some parameters of the initiated scrub:
    *  - the epoch when started;
@@ -675,11 +668,12 @@ class PgScrubber : public ScrubPgIF,
   epoch_t m_epoch_start{0};  ///< the actual epoch when scrubbing started
 
   /**
-   * (replica) a tag identifying a specific scrub "session". Incremented
-   * whenever the Primary releases the replica scrub resources. When the scrub
-   * session is terminated (even if the interval remains unchanged, as might
-   * happen following an asok no-scrub command), stale scrub-resched messages
-   *  triggered by the backend will be discarded.
+   * (replica) a tag identifying a specific replica operation, i.e. the
+   * creation of the replica scrub map for a single chunk.
+   * Incremented immediately before sending a response to the primary, so
+   * that any subsequent request is recognized as a new operation. Also
+   * changed on reservation release.
+   * Used to identify stale scrub-resched messages triggered by the backend.
    */
   Scrub::act_token_t m_current_token{1};
 
index 99286acaa12b175ac8aa0aeaaf9941b898ac0038..cb10d87236b8ebf20461bfe76a1b4f3aabcc37bb 100644 (file)
@@ -654,66 +654,163 @@ ScrubMachine::~ScrubMachine() = default;
 
 // -------- for replicas -----------------------------------------------------
 
-// ----------------------- ReservedReplica --------------------------------
+// ----------------------- ReplicaActive --------------------------------
 
-ReservedReplica::ReservedReplica(my_context ctx)
+ReplicaActive::ReplicaActive(my_context ctx)
     : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "ReservedReplica")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "ReplicaActive")
 {
-  dout(10) << "-- state -->> ReservedReplica" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "-- state -->> ReplicaActive" << dendl;
+  m_pg = scrbr->get_pg();
+  m_osds = m_pg->get_pg_osd(ScrubberPasskey());
+}
+
+ReplicaActive::~ReplicaActive()
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  if (reserved_by_my_primary) {
+    dout(10) << "ReplicaActive::~ReplicaActive(): clearing reservation"
+            << dendl;
+    clear_reservation_by_remote_primary();
+  }
+}
+
+
+/*
+ * Note: we are expected to be in the initial internal state (Idle) when
+ * receiving any reservation request. Our other internal states, the
+ * active ones, have their own handler for this event, and will treat it
+ * as an abort request.
+ *
+ * Process:
+ * - if already reserved: clear existing reservation, then continue
+ * - ask the OSD for the "reservation resource"
+ * - if granted: mark it internally and notify the Primary.
+ * - otherwise: just notify the requesting primary.
+ */
+void ReplicaActive::on_reserve_req(const ReplicaReserveReq& ev)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "ReplicaActive::on_reserve_req()" << dendl;
+
+  if (reserved_by_my_primary) {
+    dout(10) << "ReplicaActive::on_reserve_req(): already reserved" << dendl;
+    // clear the existing reservation
+    clear_reservation_by_remote_primary();  // clears the flag, too
+  }
+
+  // ask the OSD for the reservation
+  const auto ret = get_remote_reservation();
+  if (ret.granted) {
+    reserved_by_my_primary = true;
+    dout(10) << fmt::format("{}: reserved? yes", __func__) << dendl;
+  } else {
+    dout(10) << fmt::format("{}: reserved? no ({})", __func__, ret.error_msg)
+            << dendl;
+  }
+
+  Message* reply = new MOSDScrubReserve(
+      spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch, ret.op,
+      m_pg->pg_whoami);
+  m_osds->send_message_osd_cluster(reply, ev.m_op->get_req()->get_connection());
+}
+
+
+void ReplicaActive::on_release(const ReplicaRelease& ev)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  if (!reserved_by_my_primary) {
+    dout(5) << fmt::format(
+                  "ReplicaActive::on_release() from {}: not reserved!",
+                  ev.m_from)
+           << dendl;
+    return;
+  }
+  dout(10) << fmt::format("ReplicaActive::on_release() from {}", ev.m_from)
+          << dendl;
+  clear_reservation_by_remote_primary();
+}
+
+
+ReplicaActive::ReservationAttemptRes ReplicaActive::get_remote_reservation()
+{
+  using ReservationAttemptRes = ReplicaActive::ReservationAttemptRes;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  if (!scrbr->get_pg_cct()->_conf.get_val<bool>("osd_scrub_during_recovery") &&
+      m_osds->is_recovery_active()) {
+    return ReservationAttemptRes{
+       MOSDScrubReserve::REJECT, "recovery is active", false};
+  }
+
+  if (m_osds->get_scrub_services().inc_scrubs_remote(scrbr->get_spgid().pgid)) {
+    return ReservationAttemptRes{MOSDScrubReserve::GRANT, "", true};
+  } else {
+    return ReservationAttemptRes{
+       MOSDScrubReserve::REJECT, "failed to reserve remotely", false};
+  }
 }
 
-ReservedReplica::~ReservedReplica()
+
+void ReplicaActive::clear_reservation_by_remote_primary()
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  scrbr->dec_scrubs_remote();
-  scrbr->advance_token();
+  dout(10) << "ReplicaActive::clear_reservation_by_remote_primary()" << dendl;
+  m_osds->get_scrub_services().dec_scrubs_remote(scrbr->get_spgid().pgid);
+  reserved_by_my_primary = false;
 }
 
-// ----------------------- ReplicaIdle --------------------------------
+
+void ReplicaActive::check_for_updates(const StartReplica& ev)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "ReplicaActive::check_for_updates()" << dendl;
+  post_event(ReplicaPushesUpd{});
+}
+
+// ---------------- ReplicaActive/ReplicaIdle ---------------------------
 
 ReplicaIdle::ReplicaIdle(my_context ctx)
     : my_base(ctx)
-    , NamedSimply(
-         context<ScrubMachine>().m_scrbr,
-         "ReservedReplica/ReplicaIdle")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "ReplicaActive/ReplicaIdle")
 {
-  dout(10) << "-- state -->> ReservedReplica/ReplicaIdle" << dendl;
+  dout(10) << "-- state -->> ReplicaActive/ReplicaIdle" << dendl;
 }
 
-ReplicaIdle::~ReplicaIdle() = default;
 
-// ----------------------- ReplicaActiveOp --------------------------------
+// ------------- ReplicaActive/ReplicaActiveOp --------------------------
 
 ReplicaActiveOp::ReplicaActiveOp(my_context ctx)
     : my_base(ctx)
-    , NamedSimply(
-         context<ScrubMachine>().m_scrbr,
-         "ReservedReplica/ReplicaActiveOp")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "ReplicaActiveOp")
 {
-  dout(10) << "-- state -->> ReservedReplica/ReplicaActiveOp" << dendl;
+  dout(10) << "-- state -->> ReplicaActive/ReplicaActiveOp" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  scrbr->on_replica_init();
+}
+
+
+ReplicaActiveOp::~ReplicaActiveOp()
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << __func__ << dendl;
+  scrbr->replica_handling_done();
 }
 
-/**
- * \note: here is too late to call replica_handling_done(). See the
- * comment in build_replica_map_chunk()
- */
-ReplicaActiveOp::~ReplicaActiveOp() = default;
 
-// ----------------------- ReplicaWaitUpdates --------------------------------
+// ------------- ReplicaActive/ReplicaWaitUpdates ------------------------
 
 ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx)
     : my_base(ctx)
     , NamedSimply(
          context<ScrubMachine>().m_scrbr,
-         "ReservedReplica/ReplicaActiveOp/ReplicaWaitUpdates")
+         "ReplicaActive/ReplicaActiveOp/ReplicaWaitUpdates")
 {
-  dout(10) << "-- state -->> ReservedReplica/ReplicaActiveOp/ReplicaWaitUpdates"
+  dout(10) << "-- state -->> ReplicaActive/ReplicaActiveOp/ReplicaWaitUpdates"
           << dendl;
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  scrbr->on_replica_init();
 }
 
+
 /*
  * Triggered externally, by the entity that had an update re pushes
  */
@@ -724,7 +821,6 @@ sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&)
           << scrbr->pending_active_pushes() << dendl;
 
   if (scrbr->pending_active_pushes() == 0) {
-
     // done waiting
     return transit<ReplicaBuildingMap>();
   }
@@ -732,22 +828,21 @@ sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&)
   return discard_event();
 }
 
+
 // ----------------------- ReplicaBuildingMap -----------------------------------
 
 ReplicaBuildingMap::ReplicaBuildingMap(my_context ctx)
     : my_base(ctx)
     , NamedSimply(
          context<ScrubMachine>().m_scrbr,
-         "ReservedReplica/ReplicaActiveOp/ReplicaBuildingMap")
+         "ReplicaActive/ReplicaActiveOp/ReplicaBuildingMap")
 {
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "-- state -->> ReservedReplica/ReplicaActiveOp/ReplicaBuildingMap"
+  dout(10) << "-- state -->> ReplicaActive/ReplicaActiveOp/ReplicaBuildingMap"
           << dendl;
-  // and as we might have skipped ReplicaWaitUpdates:
-  scrbr->on_replica_init();
   post_event(SchedReplica{});
 }
 
+
 sc::result ReplicaBuildingMap::react(const SchedReplica&)
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
@@ -758,7 +853,6 @@ sc::result ReplicaBuildingMap::react(const SchedReplica&)
     dout(10) << "replica scrub job preempted" << dendl;
 
     scrbr->send_preempted_replica();
-    scrbr->replica_handling_done();
     return transit<ReplicaIdle>();
   }
 
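
The grant/reject decision implemented by get_remote_reservation() above
reduces to two checks. A standalone sketch of that decision (simplified;
the real code reads the osd_scrub_during_recovery config option and asks
the OSD's remote-scrub counters via inc_scrubs_remote()):

#include <string_view>

// Simplified model of ReplicaActive::get_remote_reservation().
struct ReservationAttemptRes {
  bool granted;
  std::string_view error_msg;  // empty when granted
};

ReservationAttemptRes try_remote_reservation(bool scrub_during_recovery,
                                             bool recovery_active,
                                             bool osd_slot_available) {
  // reject while recovery is active, unless explicitly allowed by config
  if (!scrub_during_recovery && recovery_active) {
    return {false, "recovery is active"};
  }
  // ask the OSD for a remote-scrub slot
  if (!osd_slot_available) {
    return {false, "failed to reserve remotely"};
  }
  return {true, {}};
}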
index fcca0d2cc40d84bcfe39f75fc516d9e20842c746..6dbc8b565e585508211c21338e4408cfd5201cf1 100644 (file)
@@ -164,15 +164,12 @@ MEV(IntLocalMapDone)
 /// scrub_snapshot_metadata()
 MEV(DigestUpdate)
 
-/// event emitted when the replica grants a reservation to the primary
-MEV(ReplicaGrantReservation)
+/// we are a replica for this PG
+MEV(ReplicaActivate)
 
 /// initiating replica scrub
 MEV(StartReplica)
 
-/// 'start replica' when there are no pending updates
-MEV(StartReplicaNoWait)
-
 MEV(SchedReplica)
 
 /// Update to active_pushes. 'active_pushes' represents recovery
@@ -211,8 +208,11 @@ struct NotActive;      ///< the quiescent state. No active scrubbing.
 struct Session;            ///< either reserving or actively scrubbing
 struct ReservingReplicas;   ///< securing scrub resources from replicas' OSDs
 struct ActiveScrubbing;            ///< the active state for a Primary. A sub-machine.
-struct ReplicaIdle;         ///< Initial reserved replica state
-struct ReplicaBuildingMap;         ///< an active state for a replica.
+// the active states for a replica:
+struct ReplicaActive;    ///< the quiescent state for a replica
+struct ReplicaActiveOp;
+struct ReplicaWaitUpdates;
+struct ReplicaBuildingMap;
 
 
 class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
@@ -370,8 +370,8 @@ public:
  *
  *  - a special end-of-recovery Primary scrub event ('AfterRepairScrub').
  *
- *  - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by
- *    an incoming MOSDRepScrub message.
+ *  - (if already in ReplicaActive): an incoming MOSDRepScrub triggers
+ *    'StartReplica'.
  *
  *  note (20.8.21): originally, AfterRepairScrub was triggering a scrub without
  *  waiting for replica resources to be acquired. But once replicas started
@@ -381,11 +381,13 @@ public:
 struct NotActive : sc::state<NotActive, ScrubMachine>, NamedSimply {
   explicit NotActive(my_context ctx);
 
-  using reactions =
-    mpl::list<sc::custom_reaction<StartScrub>,
-             // a scrubbing that was initiated at recovery completion:
-             sc::custom_reaction<AfterRepairScrub>,
-             sc::transition<ReplicaGrantReservation, ReplicaIdle>>;
+  using reactions = mpl::list<
+      sc::custom_reaction<StartScrub>,
+      // a scrubbing that was initiated at recovery completion:
+      sc::custom_reaction<AfterRepairScrub>,
+      // peering done, and we are a replica
+      sc::transition<ReplicaActivate, ReplicaActive>>;
+
   sc::result react(const StartScrub&);
   sc::result react(const AfterRepairScrub&);
 };
@@ -611,45 +613,95 @@ struct WaitDigestUpdate : sc::state<WaitDigestUpdate, ActiveScrubbing>,
 
 // ----------------------------- the "replica active" states
 
-/**
- * ReservedReplica
+/*
+ *  The replica states:
  *
- * Parent state for replica states,  Controls lifecycle for
- * PgScrubber::m_reservations.
+ *  ReplicaActive - starts once peered as a replica; ends on interval change.
+ *   - maintains the "I am reserved by a primary" state;
+ *   - handles reservation requests from the primary
+ *
+ *     - ReplicaIdle - ready for a new scrub request
+ *          * initial state of ReplicaActive
+ *
+ *     - ReplicaActiveOp - handling a single map request op
+ *          * ReplicaWaitUpdates
+ *          * ReplicaBuildingMap
  */
-struct ReservedReplica : sc::state<ReservedReplica, ScrubMachine, ReplicaIdle>,
+
+struct ReplicaIdle;
+
+struct ReplicaActive : sc::state<ReplicaActive, ScrubMachine, ReplicaIdle>,
                         NamedSimply {
-  explicit ReservedReplica(my_context ctx);
-  ~ReservedReplica();
+  explicit ReplicaActive(my_context ctx);
+  ~ReplicaActive();
 
-  using reactions = mpl::list<sc::transition<FullReset, NotActive>>;
-};
+  /// handle a reservation request from a primary
+  void on_reserve_req(const ReplicaReserveReq&);
 
-struct ReplicaWaitUpdates;
+  /// handle a 'release' from a primary
+  void on_release(const ReplicaRelease&);
 
-/**
- * ReplicaIdle
- *
- * Replica is waiting for a map request.
- */
-struct ReplicaIdle : sc::state<ReplicaIdle, ReservedReplica>,
-                    NamedSimply {
-  explicit ReplicaIdle(my_context ctx);
-  ~ReplicaIdle();
+  void check_for_updates(const StartReplica&);
 
   using reactions = mpl::list<
-    sc::transition<StartReplica, ReplicaWaitUpdates>,
-    sc::transition<StartReplicaNoWait, ReplicaBuildingMap>>;
+      // a reservation request from the primary
+      sc::in_state_reaction<
+         ReplicaReserveReq,
+         ReplicaActive,
+         &ReplicaActive::on_reserve_req>,
+      // an explicit release request from the primary
+      sc::in_state_reaction<
+         ReplicaRelease,
+         ReplicaActive,
+         &ReplicaActive::on_release>,
+      // when the interval ends - we may not be a replica anymore
+      sc::transition<IntervalChanged, NotActive>>;
+
+ private:
+  bool reserved_by_my_primary{false};
+
+  // shortcuts:
+  PG* m_pg;
+  OSDService* m_osds;
+
+  /// a convenience internal result structure
+  struct ReservationAttemptRes {
+    MOSDScrubReserve::ReserveMsgOp op; // GRANT or REJECT
+    std::string_view error_msg;
+    bool granted;
+  };
+
+  /// request a scrub resource from our local OSD
+  /// (after performing some checks)
+  ReservationAttemptRes get_remote_reservation();
+
+  void clear_reservation_by_remote_primary();
 };
 
+
+struct ReplicaIdle : sc::state<ReplicaIdle, ReplicaActive>, NamedSimply {
+  explicit ReplicaIdle(my_context ctx);
+  ~ReplicaIdle() = default;
+
+  // note the execution of check_for_updates() when transitioning to
+  // ReplicaActiveOp/ReplicaWaitUpdates. It posts a ReplicaPushesUpd
+  // event, which is then handled by ReplicaWaitUpdates.
+  using reactions = mpl::list<sc::transition<
+      StartReplica,
+      ReplicaWaitUpdates,
+      ReplicaActive,
+      &ReplicaActive::check_for_updates>>;
+};
+
+
 /**
  * ReplicaActiveOp
  *
- * Lifetime matches handling for a single map request op
+ * Lifetime matches handling for a single map request op.
  */
 struct ReplicaActiveOp
-  : sc::state<ReplicaActiveOp, ReservedReplica, ReplicaWaitUpdates>,
-    NamedSimply {
+    : sc::state<ReplicaActiveOp, ReplicaActive, ReplicaWaitUpdates>,
+      NamedSimply {
   explicit ReplicaActiveOp(my_context ctx);
   ~ReplicaActiveOp();
 };
@@ -670,8 +722,8 @@ struct ReplicaWaitUpdates : sc::state<ReplicaWaitUpdates, ReplicaActiveOp>,
 };
 
 
-struct ReplicaBuildingMap : sc::state<ReplicaBuildingMap, ReplicaActiveOp>
-                         , NamedSimply {
+struct ReplicaBuildingMap : sc::state<ReplicaBuildingMap, ReplicaActiveOp>,
+                           NamedSimply {
   explicit ReplicaBuildingMap(my_context ctx);
   using reactions = mpl::list<sc::custom_reaction<SchedReplica>>;
 
index 4206c789f91a02078cd9f39f2987ec1f69ccadf5..890a70a8a129af9d43d4759897831fb093dd3697 100644 (file)
@@ -196,12 +196,6 @@ struct ScrubMachineListener {
   virtual void set_queued_or_active() = 0;
   virtual void clear_queued_or_active() = 0;
 
-  /// Release remote scrub reservation
-  virtual void dec_scrubs_remote() = 0;
-
-  /// Advance replica token
-  virtual void advance_token() = 0;
-
   /**
    * Our scrubbing is blocked, waiting for an excessive length of time for
    * our target chunk to be unlocked. We will set the corresponding flags,
index 16810bba15c651b720edd7ae78a103fb1dbfdf90..d24bb79b801e367f15ac461887ceee9fcc82e49e 100644 (file)
@@ -18,12 +18,14 @@ struct PGPool;
 
 namespace Scrub {
   class ReplicaReservations;
+  struct ReplicaActive;
 }
 
 /// Facilitating scrub-related object access to private PG data
 class ScrubberPasskey {
 private:
   friend class Scrub::ReplicaReservations;
+  friend struct Scrub::ReplicaActive;
   friend class PrimaryLogScrub;
   friend class PgScrubber;
   friend class ScrubBackend;
@@ -310,6 +312,9 @@ struct ScrubPgIF {
   /// the OSD scrub queue
   virtual void on_new_interval() = 0;
 
+  /// we are peered as a replica
+  virtual void on_replica_activate() = 0;
+
   virtual void scrub_clear_state() = 0;
 
   virtual void handle_query_state(ceph::Formatter* f) = 0;
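
The ScrubberPasskey hunk above extends Ceph's use of the "passkey" idiom:
only classes befriended by the key type can construct a key, and therefore
only they can call methods that require one. A minimal sketch of the idiom
(toy names, not the actual Ceph declarations):

// Toy demonstration of the passkey idiom (illustrative names only).
class Passkey {
  Passkey() = default;            // constructible by friends only
  Passkey(const Passkey&) = default;
  friend struct TrustedCaller;    // stands in for Scrub::ReplicaActive
};

class Guarded {
 public:
  // callable only by code that can produce a Passkey instance
  int sensitive(Passkey) const { return secret; }

 private:
  int secret{42};
};

struct TrustedCaller {
  int peek(const Guarded& g) const {
    return g.sensitive(Passkey{});  // OK: TrustedCaller is a friend
  }
};

// Elsewhere, g.sensitive(Passkey{}) fails to compile, since Passkey's
// constructor is inaccessible to non-friends.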