From: Ronen Friedman <rfriedma@redhat.com>
Date: Thu, 21 Dec 2023 17:29:09 +0000 (-0600)
Subject: osd/scrub: add a "clean primary" base state
X-Git-Tag: v19.1.0~556^2~2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=bde52ebef19bc0d3fd3382926df6e202fbb79a53;p=ceph.git

osd/scrub: add a "clean primary" base state

... to the scrubber state machine.

Similar to ReplicaActive, this state is entered after the peering
is concluded and the PG is set to be Primary, active & clean.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
---

diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 8420fd432a3ff..3138b8c32f9cc 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1861,6 +1861,16 @@ void PG::on_active_exit()
   agent_stop();
 }
 
+Context* PG::on_clean()
+{
+  if (is_active()) {
+    kick_snap_trim();
+  }
+  m_scrubber->on_primary_active_clean();
+  requeue_ops(waiting_for_clean_to_primary_repair);
+  return finish_recovery();
+}
+
 void PG::on_active_advmap(const OSDMapRef &osdmap)
 {
   const auto& new_removed_snaps = osdmap->get_new_removed_snaps();
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 9272000eb34d8..2a823f675969a 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -608,13 +608,7 @@ public:
 
   void on_active_exit() override;
 
-  Context *on_clean() override {
-    if (is_active()) {
-      kick_snap_trim();
-    }
-    requeue_ops(waiting_for_clean_to_primary_repair);
-    return finish_recovery();
-  }
+  Context *on_clean() override;
 
   void on_activate(interval_set<snapid_t> snaps) override;
 
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 253962f0fd492..dd279f720add5 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -152,7 +152,7 @@ bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify)
 
   // if we were not aware of the abort before - kill the scrub.
   if (epoch_to_verify >= m_last_aborted) {
-    scrub_clear_state();
+    m_fsm->process_event(FullReset{});
     m_last_aborted = std::max(epoch_to_verify, m_epoch_start);
   }
   return false;
@@ -407,7 +407,7 @@ void PgScrubber::reset_epoch(epoch_t epoch_queued)
 {
   dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB)
 	   << dendl;
-  m_fsm->assert_not_active();
+  m_fsm->assert_not_in_session();
 
   m_epoch_start = epoch_queued;
   ceph_assert(m_is_deep == state_test(PG_STATE_DEEP_SCRUB));
@@ -462,6 +462,11 @@ void PgScrubber::on_new_interval()
 	   << dendl;
 
   m_fsm->process_event(IntervalChanged{});
+  // the following asserts were added due to a past bug, where PG flags were
+  // left set in some scenarios.
+  ceph_assert(!is_queued_or_active());
+  ceph_assert(!state_test(PG_STATE_SCRUBBING));
+  ceph_assert(!state_test(PG_STATE_DEEP_SCRUB));
   rm_from_osd_scrubbing();
 }
 
@@ -517,6 +522,14 @@ void PgScrubber::on_pg_activate(const requested_scrub_t& request_flags)
 	   << dendl;
 }
 
+void PgScrubber::on_primary_active_clean()
+{
+  dout(10) << fmt::format(
+		  "{}: reg state: {}", __func__, m_scrub_job->state_desc())
+	   << dendl;
+  m_fsm->process_event(PrimaryActivate{});
+}
+
 /*
  * A note re the call to publish_stats_to_osd() below:
  * - we are called from either request_rescrubbing() or scrub_requested().
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
index c6be51b2cae63..af667f732143c 100644
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -314,6 +314,8 @@ class PgScrubber : public ScrubPgIF,
 
   void on_new_interval() final;
 
+  void on_primary_active_clean() final;
+
   void on_replica_activate() final;
 
   void scrub_clear_state() final;
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc
index e0da61dde0f85..1928eed7d0cd4 100644
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -47,9 +47,9 @@ void on_event_discard(std::string_view nm)
   dout(20) << " event: --^^^^---- " << nm << dendl;
 }
 
-void ScrubMachine::assert_not_active() const
+void ScrubMachine::assert_not_in_session() const
 {
-  ceph_assert(state_cast<const NotActive*>());
+  ceph_assert(!state_cast<const Session*>());
 }
 
 bool ScrubMachine::is_reserving() const
@@ -114,27 +114,64 @@ NotActive::NotActive(my_context ctx)
   scrbr->clear_queued_or_active();
 }
 
-sc::result NotActive::react(const StartScrub&)
+
+// ----------------------- PrimaryActive --------------------------------
+
+PrimaryActive::PrimaryActive(my_context ctx)
+    : my_base(ctx)
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "PrimaryActive")
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "-- state -->> PrimaryActive" << dendl;
+}
+
+PrimaryActive::~PrimaryActive()
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  // we may have set some PG state flags without reaching Session.
+  // And we may be holding a 'local resource'.
+  scrbr->clear_pgscrub_state();
+  scrbr->rm_from_osd_scrubbing();
+}
+
+
+// ---------------- PrimaryActive/PrimaryIdle ---------------------------
+
+PrimaryIdle::PrimaryIdle(my_context ctx)
+    : my_base(ctx)
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "PrimaryActive/PrimaryIdle")
+{
+  dout(10) << "-- state -->> PrimaryActive/PrimaryIdle" << dendl;
+}
+
+sc::result PrimaryIdle::react(const StartScrub&)
 {
-  dout(10) << "NotActive::react(const StartScrub&)" << dendl;
+  dout(10) << "PrimaryIdle::react(const StartScrub&)" << dendl;
   DECLARE_LOCALS;
   return transit<ReservingReplicas>();
 }
 
-sc::result NotActive::react(const AfterRepairScrub&)
+sc::result PrimaryIdle::react(const AfterRepairScrub&)
 {
-  dout(10) << "NotActive::react(const AfterRepairScrub&)" << dendl;
+  dout(10) << "PrimaryIdle::react(const AfterRepairScrub&)" << dendl;
   DECLARE_LOCALS;
   return transit<ReservingReplicas>();
 }
 
+void PrimaryIdle::clear_state(const FullReset&) {
+  dout(10) << "PrimaryIdle::react(const FullReset&): clearing state flags"
+           << dendl;
+  DECLARE_LOCALS;
+  scrbr->clear_pgscrub_state();
+}
+
 // ----------------------- Session -----------------------------------------
 
 Session::Session(my_context ctx)
     : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "PrimaryActive/Session")
 {
-  dout(10) << "-- state -->> Session" << dendl;
+  dout(10) << "-- state -->> PrimaryActive/Session" << dendl;
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
 
   // while we've checked the 'someone is reserving' flag before queueing
@@ -242,7 +279,7 @@ sc::result ReservingReplicas::react(const ReplicaReject& ev)
   scrbr->flag_reservations_failure();
 
   // 'Session' state dtor stops the scrubber
-  return transit<NotActive>();
+  return transit<PrimaryIdle>();
 }
 
 sc::result ReservingReplicas::react(const ReservationTimeout&)
@@ -261,7 +298,7 @@ sc::result ReservingReplicas::react(const ReservationTimeout&)
   // cause the scrubber to stop the scrub session, marking 'reservation
   // failure' as the cause (affecting future scheduling)
   scrbr->flag_reservations_failure();
-  return transit<NotActive>();
+  return transit<PrimaryIdle>();
 }
 
 // ----------------------- ActiveScrubbing -----------------------------------
@@ -678,7 +715,7 @@ sc::result WaitDigestUpdate::react(const ScrubFinished&)
   session.m_session_started_at = ScrubTimePoint{};
 
   scrbr->scrub_finish();
-  return transit<NotActive>();
+  return transit<PrimaryIdle>();
 }
 
 ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub)
@@ -813,6 +850,11 @@ ReplicaIdle::ReplicaIdle(my_context ctx)
   dout(10) << "-- state -->> ReplicaActive/ReplicaIdle" << dendl;
 }
 
+void ReplicaIdle::reset_ignored(const FullReset&)
+{
+  dout(10) << "ReplicaIdle::react(const FullReset&): FullReset ignored"
+	   << dendl;
+}
 
 // ------------- ReplicaActive/ReplicaActiveOp --------------------------
 
diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h
index df9a4c45e95f8..fbd8f8b3a3596 100644
--- a/src/osd/scrubber/scrub_machine.h
+++ b/src/osd/scrubber/scrub_machine.h
@@ -162,6 +162,9 @@ MEV(IntLocalMapDone)
 /// scrub_snapshot_metadata()
 MEV(DigestUpdate)
 
+/// peered as Primary - and clean
+MEV(PrimaryActivate)
+
 /// we are a replica for this PG
 MEV(ReplicaActivate)
 
@@ -176,7 +179,9 @@ MEV(ReplicaPushesUpd)
 
 /**
  * IntervalChanged
+ * The only path from PrimaryActive or ReplicaActive down to NotActive.
  *
+ * Note re reserved replicas:
  * This event notifies the ScrubMachine that it is no longer responsible for
  * releasing replica state.  It will generally be submitted upon a PG interval
  * change.
@@ -189,7 +194,12 @@ MEV(ReplicaPushesUpd)
  */
 MEV(IntervalChanged)
 
-/// guarantee that the FSM is in the quiescent state (i.e. NotActive)
+/**
+ * stops the scrubbing session, and resets the scrubber.
+ * For a replica - aborts the handling of the current request.
+ * In both cases - a transition to the peering mode quiescent state (i.e.
+ * PrimaryIdle or ReplicaIdle).
+ */
 MEV(FullReset)
 
 /// finished handling this chunk. Go get the next one
@@ -203,9 +213,15 @@ MEV(ScrubFinished)
 //
 
 struct NotActive;	    ///< the quiescent state. No active scrubbing.
-struct Session;            ///< either reserving or actively scrubbing
 struct ReservingReplicas;   ///< securing scrub resources from replicas' OSDs
 struct ActiveScrubbing;	    ///< the active state for a Primary. A sub-machine.
+
+// the states for a Primary:
+// note: PrimaryActive <==> in the OSD scrub queue
+struct PrimaryActive;	   ///< base state for a Primary
+struct PrimaryIdle;	   ///< ready for a new scrub request
+struct Session;            ///< either reserving or actively scrubbing
+
 // the active states for a replica:
 struct ReplicaActive;    ///< the quiescent state for a replica
 struct ReplicaActiveOp;
@@ -225,7 +241,7 @@ class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
   ScrubMachineListener* m_scrbr;
   std::ostream& gen_prefix(std::ostream& out) const;
 
-  void assert_not_active() const;
+  void assert_not_in_session() const;
   [[nodiscard]] bool is_reserving() const;
   [[nodiscard]] bool is_accepting_updates() const;
 
@@ -359,40 +375,94 @@ public:
 
 // ///////////////// the states //////////////////////// //
 
+/*
+ * When not scrubbing, the FSM is in one of three states:
+ *
+ * <> PrimaryActive - we are a Primary and active. The PG
+ * is queued for some future scrubs in the OSD's scrub queue.
+ *
+ * <> ReplicaActive - we are a replica. In this state, we are
+ * expecting either a replica reservation request from the Primary, or a
+ * scrubbing request for a specific chunk.
+ *
+ * <> NotActive - the quiescent state. No active scrubbing.
+ * We are neither an active Primary nor a replica.
+ */
+struct NotActive : sc::state<NotActive, ScrubMachine>, NamedSimply {
+  explicit NotActive(my_context ctx);
+
+  using reactions = mpl::list<
+      // peering done, and we are a replica
+      sc::transition<ReplicaActivate, ReplicaActive>,
+      // peering done, and we are a Primary
+      sc::transition<PrimaryActivate, PrimaryActive>>;
+};
+
+// ----------------------- when Primary --------------------------------------
+// ---------------------------------------------------------------------------
+
+
+/*
+ *  The primary states:
+ *
+ *  PrimaryActive - starts when peering ends with us as a primary,
+ *     and we are active and clean.
+ *   - when in this state - we (our scrub targets) are queued in the
+ *     OSD's scrub queue.
+ *
+ *  Sub-states:
+ *     - PrimaryIdle - ready for a new scrub request
+ *          * initial state of PrimaryActive
+ *
+ *     - Session - handling a single scrub session
+ */
+
+struct PrimaryIdle;
 
 /**
- *  The Scrubber's base (quiescent) state.
- *  Scrubbing is triggered by one of the following events:
+ *  PrimaryActive
+ *
+ *  The basic state for an active Primary. Ready to accept a new scrub request.
+ *  State managed here: being in the OSD's scrub queue (unless when scrubbing).
  *
+ *  Scrubbing is triggered by one of the following events:
  *  - (standard scenario for a Primary): 'StartScrub'. Initiates the OSDs
  *    resources reservation process. Will be issued by PG::scrub(), following a
  *    queued "PGScrub" op.
- *
  *  - a special end-of-recovery Primary scrub event ('AfterRepairScrub').
- *
- *  - (if already in ReplicaActive): an incoming MOSDRepScrub triggers
- *    'StartReplica'.
- *
- *  note (20.8.21): originally, AfterRepairScrub was triggering a scrub without
- *  waiting for replica resources to be acquired. But once replicas started
- *  using the resource-request to identify and tag the scrub session, this
- *  bypass cannot be supported anymore.
  */
-struct NotActive : sc::state<NotActive, ScrubMachine>, NamedSimply {
-  explicit NotActive(my_context ctx);
+struct PrimaryActive : sc::state<PrimaryActive, ScrubMachine, PrimaryIdle>,
+			 NamedSimply {
+  explicit PrimaryActive(my_context ctx);
+  ~PrimaryActive();
+
+  using reactions = mpl::list<
+      // when the interval ends - we may not be a primary anymore
+      sc::transition<IntervalChanged, NotActive>>;
+};
+
+/**
+ * \ATTN: set_op_parameters() is called while we are still in this state (waiting
+ * for a queued OSD message to trigger the transition into Session). Thus,
+ * even in this 'idle' state - there is some state we must take care to reset.
+ * Specifically - the PG state flags we were playing with in set_op_parameters().
+ */
+struct PrimaryIdle : sc::state<PrimaryIdle, PrimaryActive>, NamedSimply {
+  explicit PrimaryIdle(my_context ctx);
+  ~PrimaryIdle() = default;
+  void clear_state(const FullReset&);
 
   using reactions = mpl::list<
       sc::custom_reaction<StartScrub>,
       // a scrubbing that was initiated at recovery completion:
       sc::custom_reaction<AfterRepairScrub>,
-      // peering done, and we are a replica
-      sc::transition<ReplicaActivate, ReplicaActive>>;
+      // undoing set_op_params(), if aborted before starting the scrub:
+      sc::in_state_reaction<FullReset, PrimaryIdle, &PrimaryIdle::clear_state>>;
 
   sc::result react(const StartScrub&);
   sc::result react(const AfterRepairScrub&);
 };
 
-
 /**
  *  Session
  *
@@ -407,12 +477,12 @@ struct NotActive : sc::state<NotActive, ScrubMachine>, NamedSimply {
  *  reservations are released. This is because we know that the replicas are
  *  also resetting their reservations.
  */
-struct Session : sc::state<Session, ScrubMachine, ReservingReplicas>,
+struct Session : sc::state<Session, PrimaryActive, ReservingReplicas>,
                  NamedSimply {
   explicit Session(my_context ctx);
   ~Session();
 
-  using reactions = mpl::list<sc::transition<FullReset, NotActive>,
+  using reactions = mpl::list<sc::transition<FullReset, PrimaryIdle>,
                               sc::custom_reaction<IntervalChanged>>;
 
   sc::result react(const IntervalChanged&);
@@ -612,7 +682,9 @@ struct WaitDigestUpdate : sc::state<WaitDigestUpdate, ActiveScrubbing>,
   sc::result react(const ScrubFinished&);
 };
 
-// ----------------------------- the "replica active" states
+
+// ---------------------------------------------------------------------------
+// ----------------------------- the "replica active" states -----------------
 
 /*
  *  The replica states:
@@ -683,15 +755,21 @@ struct ReplicaActive : sc::state<ReplicaActive, ScrubMachine, ReplicaIdle>,
 struct ReplicaIdle : sc::state<ReplicaIdle, ReplicaActive>, NamedSimply {
   explicit ReplicaIdle(my_context ctx);
   ~ReplicaIdle() = default;
+  void reset_ignored(const FullReset&);
 
   // note the execution of check_for_updates() when transitioning to
   // ReplicaActiveOp/ReplicaWaitUpdates. That would trigger a ReplicaPushesUpd
   // event, which will be handled by ReplicaWaitUpdates.
-  using reactions = mpl::list<sc::transition<
-      StartReplica,
-      ReplicaWaitUpdates,
-      ReplicaActive,
-      &ReplicaActive::check_for_updates>>;
+  using reactions = mpl::list<
+      sc::transition<
+	  StartReplica,
+	  ReplicaWaitUpdates,
+	  ReplicaActive,
+	  &ReplicaActive::check_for_updates>,
+      sc::in_state_reaction<
+	  FullReset,
+	  ReplicaIdle,
+	  &ReplicaIdle::reset_ignored>>;
 };
 
 
@@ -706,7 +784,9 @@ struct ReplicaActiveOp
   explicit ReplicaActiveOp(my_context ctx);
   ~ReplicaActiveOp();
 
-  using reactions = mpl::list<sc::custom_reaction<StartReplica>>;
+  using reactions = mpl::list<
+      sc::custom_reaction<StartReplica>,
+      sc::transition<FullReset, ReplicaIdle>>;
 
   /**
    * Handling the unexpected (read - caused by a bug) case of receiving a
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h
index 30d305036f444..fbbef578ae693 100644
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -319,6 +319,10 @@ struct ScrubPgIF {
   /// the OSD scrub queue
   virtual void on_new_interval() = 0;
 
+  /// we are peered as primary, and the PG is active and clean
+  /// Scrubber's internal FSM should be ActivePrimary
+  virtual void on_primary_active_clean() = 0;
+
   /// we are peered as a replica
   virtual void on_replica_activate() = 0;