osd/scrub: group all scrub session states into a Session state

author Ronen Friedman <rfriedma@redhat.com>

Mon, 2 Oct 2023 16:29:51 +0000 (11:29 -0500)

committer Ronen Friedman <rfriedma@redhat.com>

Sat, 14 Oct 2023 18:49:01 +0000 (21:49 +0300)
author Ronen Friedman <rfriedma@redhat.com>
Mon, 2 Oct 2023 16:29:51 +0000 (11:29 -0500)
committer Ronen Friedman <rfriedma@redhat.com>
Sat, 14 Oct 2023 18:49:01 +0000 (21:49 +0300)
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc

index 0d52d5b76d77b61d8727bff70eec91316e90099c..efb0917882067afdc2b5dd50eb4b898de5aa72cc 100644 (file)
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -109,25 +109,47 @@ sc::result NotActive::react(const AfterRepairScrub&)
    return transit<ReservingReplicas>();
  }
  
+// ----------------------- Session -----------------------------------------
+
+Session::Session(my_context ctx)
+    : my_base(ctx)
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session")
+{
+  dout(10) << "-- state -->> Session" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  // while we've checked the 'someone is reserving' flag before queueing
+  // the start-scrub event, it's possible that the flag was set in the meantime.
+  // Handling this case here requires adding a new sub-state, and the
+  // complication of reporting a failure to the caller in a new failure
+  // path. On the other hand - ignoring an ongoing reservation on rare
+  // occasions will cause no harm.
+  // We choose ignorance.
+  std::ignore = scrbr->set_reserving_now();
+}
+
+Session::~Session()
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  // note the interaction between clearing the 'queued' flag and two
+  // other states: the snap-mapper and the scrubber internal state.
+  // All of these must be cleared in the correct order, and the snap mapper
+  // (re-triggered by resetting the 'queued' flag) must not resume before
+  // the scrubber is reset.
+  scrbr->clear_pgscrub_state();
+}
+
+
  // ----------------------- ReservingReplicas ---------------------------------
  
  ReservingReplicas::ReservingReplicas(my_context ctx)
      : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "ReservingReplicas")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session/ReservingReplicas")
  {
    dout(10) << "-- state -->> ReservingReplicas" << dendl;
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
  
-  // prevent the OSD from starting another scrub while we are trying to secure
-  // replicas resources
-  if (!scrbr->set_reserving_now()) {
-    dout(1) << "ReservingReplicas::ReservingReplicas() some other PG is "
-               "already reserving replicas resources"
-            << dendl;
-    post_event(ReservationFailure{});
-    return;
-  }
-  m_holding_isreserving_flag = true;
    scrbr->reserve_replicas();
  
    auto timeout = scrbr->get_cct()->_conf.get_val<
@@ -143,9 +165,9 @@ ReservingReplicas::ReservingReplicas(my_context ctx)
  ReservingReplicas::~ReservingReplicas()
  {
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  if (m_holding_isreserving_flag) {
-    scrbr->clear_reserving_now();
-  }
+  // it's OK to try and clear the flag even if we don't hold it
+  // (the flag remembers the actual holder)
+  scrbr->clear_reserving_now();
  }
  
  sc::result ReservingReplicas::react(const ReservationTimeout&)
@@ -166,18 +188,6 @@ sc::result ReservingReplicas::react(const ReservationFailure&)
  {
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
    dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl;
-
-  // the Scrubber must release all resources and abort the scrubbing
-  scrbr->clear_pgscrub_state();
-  return transit<NotActive>();
-}
-
-/**
- * note: the event poster is handling the scrubber reset
- */
-sc::result ReservingReplicas::react(const FullReset&)
-{
-  dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl;
    return transit<NotActive>();
  }
  
@@ -185,7 +195,7 @@ sc::result ReservingReplicas::react(const FullReset&)
  
  ActiveScrubbing::ActiveScrubbing(my_context ctx)
      : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "ActiveScrubbing")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session/ActiveScrubbing")
  {
    dout(10) << "-- state -->> ActiveScrubbing" << dendl;
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
@@ -199,8 +209,6 @@ ActiveScrubbing::~ActiveScrubbing()
  {
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
    dout(15) << __func__ << dendl;
-  scrbr->unreserve_replicas();
-  scrbr->clear_queued_or_active();
  }
  
  /*
@@ -212,14 +220,6 @@ sc::result ActiveScrubbing::react(const InternalError&)
  {
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
    dout(10) << __func__ << dendl;
-  scrbr->clear_pgscrub_state();
-  return transit<NotActive>();
-}
-
-sc::result ActiveScrubbing::react(const FullReset&)
-{
-  dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl;
-  // caller takes care of clearing the scrubber & FSM states
    return transit<NotActive>();
  }
  
@@ -237,9 +237,9 @@ sc::result ActiveScrubbing::react(const FullReset&)
   */
  RangeBlocked::RangeBlocked(my_context ctx)
      : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "Act/RangeBlocked")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session/Act/RangeBlocked")
  {
-  dout(10) << "-- state -->> Act/RangeBlocked" << dendl;
+  dout(10) << "-- state -->> Session/Act/RangeBlocked" << dendl;
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
  
    auto grace = scrbr->get_range_blocked_grace();
@@ -287,9 +287,9 @@ sc::result RangeBlocked::react(const RangeBlockedAlarm&)
   */
  PendingTimer::PendingTimer(my_context ctx)
      : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "Act/PendingTimer")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session/Act/PendingTimer")
  {
-  dout(10) << "-- state -->> Act/PendingTimer" << dendl;
+  dout(10) << "-- state -->> Session/Act/PendingTimer" << dendl;
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
  
    auto sleep_time = scrbr->get_scrub_sleep_time();
@@ -328,9 +328,9 @@ sc::result PendingTimer::react(const SleepComplete&)
   */
  NewChunk::NewChunk(my_context ctx)
      : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "Act/NewChunk")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session/Act/NewChunk")
  {
-  dout(10) << "-- state -->> Act/NewChunk" << dendl;
+  dout(10) << "-- state -->> Session/Act/NewChunk" << dendl;
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
  
    scrbr->get_preemptor().adjust_parameters();
@@ -355,9 +355,9 @@ sc::result NewChunk::react(const SelectedChunkFree&)
  
  WaitPushes::WaitPushes(my_context ctx)
      : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "Act/WaitPushes")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session/Act/WaitPushes")
  {
-  dout(10) << " -- state -->> Act/WaitPushes" << dendl;
+  dout(10) << " -- state -->> Session/Act/WaitPushes" << dendl;
    post_event(ActivePushesUpd{});
  }
  
@@ -383,9 +383,9 @@ sc::result WaitPushes::react(const ActivePushesUpd&)
  
  WaitLastUpdate::WaitLastUpdate(my_context ctx)
      : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "Act/WaitLastUpdate")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session/Act/WaitLastUpdate")
  {
-  dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl;
+  dout(10) << " -- state -->> Session/Act/WaitLastUpdate" << dendl;
    post_event(UpdatesApplied{});
  }
  
@@ -427,9 +427,9 @@ sc::result WaitLastUpdate::react(const InternalAllUpdates&)
  
  BuildMap::BuildMap(my_context ctx)
      : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "Act/BuildMap")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session/Act/BuildMap")
  {
-  dout(10) << " -- state -->> Act/BuildMap" << dendl;
+  dout(10) << " -- state -->> Session/Act/BuildMap" << dendl;
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
  
    // no need to check for an epoch change, as all possible flows that brought
@@ -477,9 +477,9 @@ sc::result BuildMap::react(const IntLocalMapDone&)
  
  DrainReplMaps::DrainReplMaps(my_context ctx)
      : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "Act/DrainReplMaps")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session/Act/DrainReplMaps")
  {
-  dout(10) << "-- state -->> Act/DrainReplMaps" << dendl;
+  dout(10) << "-- state -->> Session/Act/DrainReplMaps" << dendl;
    // we may have got all maps already. Send the event that will make us check.
    post_event(GotReplicas{});
  }
@@ -504,9 +504,9 @@ sc::result DrainReplMaps::react(const GotReplicas&)
  
  WaitReplicas::WaitReplicas(my_context ctx)
      : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "Act/WaitReplicas")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session/Act/WaitReplicas")
  {
-  dout(10) << "-- state -->> Act/WaitReplicas" << dendl;
+  dout(10) << "-- state -->> Session/Act/WaitReplicas" << dendl;
    post_event(GotReplicas{});
  }
  
@@ -564,10 +564,10 @@ sc::result WaitReplicas::react(const DigestUpdate&)
  
  WaitDigestUpdate::WaitDigestUpdate(my_context ctx)
      : my_base(ctx)
-    , NamedSimply(context<ScrubMachine>().m_scrbr, "Act/WaitDigestUpdate")
+    , NamedSimply(context<ScrubMachine>().m_scrbr, "Session/Act/WaitDigestUpdate")
  {
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl;
+  dout(10) << "-- state -->> Session/Act/WaitDigestUpdate" << dendl;
  
    // perform an initial check: maybe we already
    // have all the updates we need:
diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h

index 658abfa494f1b07c92ca8990ad107ba12d5f245f..071a464ce130e18b1b3053e4bda1b70414038e7b 100644 (file)
--- a/src/osd/scrubber/scrub_machine.h
+++ b/src/osd/scrubber/scrub_machine.h
@@ -141,6 +141,7 @@ MEV(ScrubFinished)
  //
  
  struct NotActive;          ///< the quiescent state. No active scrubbing.
+struct Session;            ///< either reserving or actively scrubbing
  struct ReservingReplicas;   ///< securing scrub resources from replicas' OSDs
  struct ActiveScrubbing;            ///< the active state for a Primary. A sub-machine.
  struct ReplicaIdle;         ///< Initial reserved replica state
@@ -163,13 +164,17 @@ class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
    [[nodiscard]] bool is_reserving() const;
    [[nodiscard]] bool is_accepting_updates() const;
  
+
+// ///////////////// aux declarations & functions //////////////////////// //
+
+
  private:
    /**
     * scheduled_event_state_t
     *
     * Heap allocated, ref-counted state shared between scheduled event callback
     * and timer_event_token_t.  Ensures that callback and timer_event_token_t
-   * can be safetly destroyed in either order while still allowing for
+   * can be safely destroyed in either order while still allowing for
     * cancellation.
     */
    struct scheduled_event_state_t {
@@ -183,7 +188,7 @@ private:
      ~scheduled_event_state_t() {
        /* For the moment, this assert encodes an assumption that we always
         * retain the token until the event either fires or is canceled.
-       * If a user needs/wants to relaxt that requirement, this assert can
+       * If a user needs/wants to relax that requirement, this assert can
         * be removed */
        assert(!cb_token);
      }
@@ -259,7 +264,7 @@ public:
     * schedule_timer_event_after
     *
     * Schedules event EventT{Args...} to be delivered duration in the future.
-   * The implementation implicitely drops the event on interval change.  The
+   * The implementation implicitly drops the event on interval change.  The
     * returned timer_event_token_t can be used to cancel the event prior to
     * its delivery -- it should generally be embedded as a member in the state
     * intended to handle the event.  See the comment on timer_event_token_t
@@ -284,6 +289,10 @@ public:
    }
  };
  
+
+// ///////////////// the states //////////////////////// //
+
+
  /**
   *  The Scrubber's base (quiescent) state.
   *  Scrubbing is triggered by one of the following events:
@@ -314,11 +323,34 @@ struct NotActive : sc::state<NotActive, ScrubMachine>, NamedSimply {
    sc::result react(const AfterRepairScrub&);
  };
  
-struct ReservingReplicas : sc::state<ReservingReplicas, ScrubMachine>,
+
+/**
+ *  Session
+ *
+ *  This state encompasses the two main "active" states: ReservingReplicas and
+ *  ActiveScrubbing.
+ *  'Session' is the owner of all the resources that are allocated for a
+ *  scrub session performed as a Primary.
+ *
+ *  Exit from this state is either following an interval change, or with
+ *  'FullReset' (that would cover all other completion/termination paths).
+ *  Note that if terminating the session following an interval change - no
+ *  reservations are released. This is because we know that the replicas are
+ *  also resetting their reservations.
+ */
+struct Session : sc::state<Session, ScrubMachine, ReservingReplicas>, NamedSimply {
+  explicit Session(my_context ctx);
+  ~Session();
+
+  using reactions = mpl::list<sc::transition<FullReset, NotActive>>;
+  /// \todo handle interval change
+};
+
+struct ReservingReplicas : sc::state<ReservingReplicas, Session>,
                            NamedSimply {
    explicit ReservingReplicas(my_context ctx);
    ~ReservingReplicas();
-  using reactions = mpl::list<sc::custom_reaction<FullReset>,
+  using reactions = mpl::list<
                               // all replicas granted our resources request
                               sc::transition<RemotesReserved, ActiveScrubbing>,
                               sc::custom_reaction<ReservationTimeout>,
@@ -328,11 +360,6 @@ struct ReservingReplicas : sc::state<ReservingReplicas, ScrubMachine>,
      ceph::coarse_real_clock::now();
    ScrubMachine::timer_event_token_t m_timeout_token;
  
-  /// if true - we must 'clear_reserving_now()' upon exit
-  bool m_holding_isreserving_flag{false};
-
-  sc::result react(const FullReset&);
-
    sc::result react(const ReservationTimeout&);
  
    /// at least one replica denied us the scrub resources we've requested
@@ -364,15 +391,13 @@ struct WaitReplicas;
  struct WaitDigestUpdate;
  
  struct ActiveScrubbing
-    : sc::state<ActiveScrubbing, ScrubMachine, PendingTimer>, NamedSimply {
+    : sc::state<ActiveScrubbing, Session, PendingTimer>, NamedSimply {
  
    explicit ActiveScrubbing(my_context ctx);
    ~ActiveScrubbing();
  
-  using reactions = mpl::list<sc::custom_reaction<InternalError>,
-                             sc::custom_reaction<FullReset>>;
+  using reactions = mpl::list<sc::custom_reaction<InternalError>>;
  
-  sc::result react(const FullReset&);
    sc::result react(const InternalError&);
  };
author	Ronen Friedman <rfriedma@redhat.com>
	Mon, 2 Oct 2023 16:29:51 +0000 (11:29 -0500)
committer	Ronen Friedman <rfriedma@redhat.com>
	Sat, 14 Oct 2023 18:49:01 +0000 (21:49 +0300)
src/osd/scrubber/scrub_machine.cc		patch \| blob \| history
src/osd/scrubber/scrub_machine.h		patch \| blob \| history