osd/scrub: handle reservation completion within the Scrubber FSM
author     Ronen Friedman <rfriedma@redhat.com>
           Sat, 14 Oct 2023 12:36:06 +0000 (07:36 -0500)
committer  Ronen Friedman <rfriedma@redhat.com>
           Sat, 14 Oct 2023 18:49:01 +0000 (21:49 +0300)
with special handling for the 0-replica case.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.h
src/osd/scheduler/OpSchedulerItem.cc
src/osd/scheduler/OpSchedulerItem.h
src/osd/scrubber/pg_scrubber.cc
src/osd/scrubber/pg_scrubber.h
src/osd/scrubber/scrub_machine.cc
src/osd/scrubber/scrub_reservations.cc
src/osd/scrubber/scrub_reservations.h
src/osd/scrubber_common.h

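For orientation, a condensed sketch of the new ReservingReplicas entry logic (paraphrased from the scrub_machine.cc hunk further below; logging and the surrounding state-machine boilerplate are omitted):

    // sketch only - see the scrub_machine.cc diff below for the actual change
    ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx)
    {
      // constructing ReplicaReservations sends the first reservation request,
      // if there is any replica to reserve
      context<Session>().m_reservations.emplace(*scrbr);

      if (context<Session>().m_reservations->get_last_sent()) {
        // a request is in flight: guard it with the reservation timeout
        auto timeout = scrbr->get_pg_cct()->_conf.get_val<milliseconds>(
            "osd_scrub_reservation_timeout");
        if (timeout.count() > 0) {
          m_timeout_token =
              machine.schedule_timer_event_after<ReservationTimeout>(timeout);
        }
      } else {
        // the 0-replica case: nothing to reserve. A state constructor cannot
        // transit directly, so the completion event is posted instead.
        post_event(RemotesReserved{});
      }
    }

With completion handled inside the FSM, the detour through the OSD queue (PGScrubResourcesOK, queue_for_scrub_granted() and send_remotes_reserved()) is no longer needed, and the diffs below remove it.
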
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 88a0bc037564bc2b0ef07d581fd1d26aa1c17de3..fa938c08278132e18e2105343ab7da25702c3cfc 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1799,12 +1799,6 @@ void OSDService::queue_for_rep_scrub_resched(PG* pg,
                                           act_token);
 }
 
-void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
-{
-  // Resulting scrub event: 'RemotesReserved'
-  queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
-}
-
 void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
 {
   // Resulting scrub event: 'InternalSchedScrub'
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index d86443351cf28084ff27e2a6b2088a0e114e65f8..38f9a6ca8475a41a563242a967bb92d6cdb9e81e 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -504,9 +504,6 @@ public:
 
   void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority);
 
-  /// queue the message (-> event) that all replicas have reserved scrub resources for us
-  void queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority);
-
   /// Signals either (a) the end of a sleep period, or (b) a recheck of the availability
   /// of the primary map being created by the backend.
   void queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority);
diff --git a/src/osd/PG.h b/src/osd/PG.h
index fe335b85e0003e5073dbf2003ecb2e8b00c567b9..70c1d12b2105ced2bc7ea016e228f075a222740a 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -449,11 +449,6 @@ public:
                        "SchedReplica");
   }
 
-  void scrub_send_resources_granted(epoch_t queued, ThreadPool::TPHandle& handle)
-  {
-    forward_scrub_event(&ScrubPgIF::send_remotes_reserved, queued, "RemotesReserved");
-  }
-
   void scrub_send_scrub_resched(epoch_t queued, ThreadPool::TPHandle& handle)
   {
     forward_scrub_event(&ScrubPgIF::send_scrub_resched, queued, "InternalSchedScrub");
diff --git a/src/osd/scheduler/OpSchedulerItem.cc b/src/osd/scheduler/OpSchedulerItem.cc
index 0641aafdc1c94310caa2c627c92cffa185a8ea08..750fc2a4f58ef650d93f6967f82b526e6e223895 100644
--- a/src/osd/scheduler/OpSchedulerItem.cc
+++ b/src/osd/scheduler/OpSchedulerItem.cc
@@ -77,15 +77,6 @@ void PGScrubResched::run(OSD* osd,
   pg->unlock();
 }
 
-void PGScrubResourcesOK::run(OSD* osd,
-                            OSDShard* sdata,
-                            PGRef& pg,
-                            ThreadPool::TPHandle& handle)
-{
-  pg->scrub_send_resources_granted(epoch_queued, handle);
-  pg->unlock();
-}
-
 void PGScrubPushesUpdate::run(OSD* osd,
                              OSDShard* sdata,
                              PGRef& pg,
diff --git a/src/osd/scheduler/OpSchedulerItem.h b/src/osd/scheduler/OpSchedulerItem.h
index 2803169a9bf973eccc75ee3c3c6a8b2c7fd408d2..7fb7125a14169b1bf64f71f7a98d7dda7824b8d4 100644
--- a/src/osd/scheduler/OpSchedulerItem.h
+++ b/src/osd/scheduler/OpSchedulerItem.h
@@ -373,17 +373,6 @@ class PGScrubResched : public PGScrubItem {
   void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
 };
 
-/**
- *  all replicas have granted our scrub resources request
- */
-class PGScrubResourcesOK : public PGScrubItem {
- public:
-  PGScrubResourcesOK(spg_t pg, epoch_t epoch_queued)
-      : PGScrubItem{pg, epoch_queued, "PGScrubResourcesOK"}
-  {}
-  void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
-};
-
 /**
  *  called when a repair process completes, to initiate scrubbing. No local/remote
  *  resources are allocated.
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 98290a06ac4591e1683c9c81f2f1a7784f07aae2..b090ec113ee3de81bd36af7d6fe9b46f19b387c4 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -359,17 +359,6 @@ void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued)
   dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
 
-void PgScrubber::send_remotes_reserved(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
-          << dendl;
-  // note: scrub is not active yet
-  if (check_interval(epoch_queued)) {
-    m_fsm->process_event(RemotesReserved{});
-  }
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
 void PgScrubber::send_chunk_free(epoch_t epoch_queued)
 {
   dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
index 0c8fa8c34fb790cafbfd93ea9689448ac7530d67..97bf7da8f22a173fd9c1591cd9083ecf0c36f667 100644
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -445,8 +445,6 @@ class PgScrubber : public ScrubPgIF,
 
   void send_preempted_replica() final;
 
-  void send_remotes_reserved(epoch_t epoch_queued) final;
-
   /**
    *  does the PG have newer updates than what we (the scrubber) know?
    */
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc
index cc257a47f0e314e254a026493c92b5ae7859d4da..40b43b6e07702fedd75d04d5c39ab3daf88d3eea 100644
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -163,13 +163,22 @@ ReservingReplicas::ReservingReplicas(my_context ctx)
   // initiate the reservation process
   context<Session>().m_reservations.emplace(*scrbr);
 
-  auto timeout = scrbr->get_pg_cct()->_conf.get_val<milliseconds>(
-      "osd_scrub_reservation_timeout");
-  if (timeout.count() > 0) {
-    // Start a timer to handle case where the replicas take a long time to
-    // ack the reservation.  See ReservationTimeout handler below.
-    m_timeout_token = machine.schedule_timer_event_after<ReservationTimeout>(
-      timeout);
+  if (context<Session>().m_reservations->get_last_sent()) {
+    // the first reservation request was sent
+
+    auto timeout = scrbr->get_pg_cct()->_conf.get_val<milliseconds>(
+       "osd_scrub_reservation_timeout");
+    if (timeout.count() > 0) {
+      // Start a timer to handle case where the replicas take a long time to
+      // ack the reservation.  See ReservationTimeout handler below.
+      m_timeout_token =
+         machine.schedule_timer_event_after<ReservationTimeout>(timeout);
+    }
+  } else {
+    // no replicas to reserve
+    dout(10) << "no replicas to reserve" << dendl;
+    // can't transit directly from here
+    post_event(RemotesReserved{});
   }
 }
 
@@ -186,7 +195,11 @@ sc::result ReservingReplicas::react(const ReplicaGrant& ev)
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
   dout(10) << "ReservingReplicas::react(const ReplicaGrant&)" << dendl;
 
-  context<Session>().m_reservations->handle_reserve_grant(ev.m_op, ev.m_from);
+  if (context<Session>().m_reservations->handle_reserve_grant(
+         ev.m_op, ev.m_from)) {
+    // we are done with the reservation process
+    return transit<ActiveScrubbing>();
+  }
   return discard_event();
 }
 
diff --git a/src/osd/scrubber/scrub_reservations.cc b/src/osd/scrubber/scrub_reservations.cc
index 011ace2a28363b4d9d3015f5b19000995b6f6cd2..4d187a2a644f1b0b2e7bccdaf7903a6e704d97f2 100644
--- a/src/osd/scrubber/scrub_reservations.cc
+++ b/src/osd/scrubber/scrub_reservations.cc
@@ -84,7 +84,7 @@ ReplicaReservations::~ReplicaReservations()
   release_all();
 }
 
-void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
+bool ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
 {
   // verify that the grant is from the peer we expected. If not?
   // for now - abort the OSD. \todo reconsider the reaction.
@@ -94,7 +94,7 @@ void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
                   get_last_sent().value_or(pg_shard_t{}))
            << dendl;
     ceph_assert(from == get_last_sent());
-    return;
+    return false;
   }
 
   auto elapsed = clock::now() - m_last_request_sent_at;
@@ -115,31 +115,31 @@ void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
                  active_requests_cnt(), m_sorted_secondaries.size(),
                  duration_cast<milliseconds>(elapsed).count())
           << dendl;
-  send_next_reservation_or_complete();
+  return send_next_reservation_or_complete();
 }
 
-void ReplicaReservations::send_next_reservation_or_complete()
+bool ReplicaReservations::send_next_reservation_or_complete()
 {
   if (m_next_to_request == m_sorted_secondaries.cend()) {
     // granted by all replicas
     dout(10) << "remote reservation complete" << dendl;
-    m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
-
-  } else {
-    // send the next reservation request
-    const auto peer = *m_next_to_request;
-    const auto epoch = m_pg->get_osdmap_epoch();
-    auto m = make_message<MOSDScrubReserve>(
-       spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST,
-       m_pg->pg_whoami);
-    m_pg->send_cluster_message(peer.osd, m, epoch, false);
-    m_last_request_sent_at = clock::now();
-    dout(10) << fmt::format(
-                   "reserving {} (the {} of {} replicas)", *m_next_to_request,
-                   active_requests_cnt()+1, m_sorted_secondaries.size())
-            << dendl;
-    m_next_to_request++;
+    return true;  // done
   }
+
+  // send the next reservation request
+  const auto peer = *m_next_to_request;
+  const auto epoch = m_pg->get_osdmap_epoch();
+  auto m = make_message<MOSDScrubReserve>(
+      spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST,
+      m_pg->pg_whoami);
+  m_pg->send_cluster_message(peer.osd, m, epoch, false);
+  m_last_request_sent_at = clock::now();
+  dout(10) << fmt::format(
+                 "reserving {} (the {} of {} replicas)", *m_next_to_request,
+                 active_requests_cnt() + 1, m_sorted_secondaries.size())
+          << dendl;
+  m_next_to_request++;
+  return false;
 }
 
 void ReplicaReservations::verify_rejections_source(
diff --git a/src/osd/scrubber/scrub_reservations.h b/src/osd/scrubber/scrub_reservations.h
index 634e7e580027fed687b2df853663b7c961445f17..a603c70735631e8729688980066a16c7fb98297d 100644
--- a/src/osd/scrubber/scrub_reservations.h
+++ b/src/osd/scrubber/scrub_reservations.h
@@ -79,8 +79,11 @@ class ReplicaReservations {
    * the replica we are expecting a reply from) is noted, and triggers
    * one of two: either sending a reservation request to the next replica,
    * or notifying the scrubber that we have reserved them all.
+   *
+   * \returns true if there are no more replicas to send reservation requests
+   * to (i.e., the scrubber should proceed to the next phase), false otherwise.
    */
-  void handle_reserve_grant(OpRequestRef op, pg_shard_t from);
+  bool handle_reserve_grant(OpRequestRef op, pg_shard_t from);
 
   /**
    * Verify that the sender of the received rejection is the replica we
@@ -105,6 +108,9 @@ class ReplicaReservations {
    */
   void discard_remote_reservations();
 
+  /// the only replica we are expecting a reply from
+  std::optional<pg_shard_t> get_last_sent() const;
+
   // note: 'public', as accessed via the 'standard' dout_prefix() macro
   std::ostream& gen_prefix(std::ostream& out, std::string fn) const;
 
@@ -112,17 +118,14 @@ class ReplicaReservations {
   /// send 'release' messages to all replicas we have managed to reserve
   void release_all();
 
-  /// the only replica we are expecting a reply from
-  std::optional<pg_shard_t> get_last_sent() const;
-
   /// The number of requests that have been sent (and not rejected) so far.
   size_t active_requests_cnt() const;
 
   /**
-   * Either send a reservation request to the next replica, or notify the
-   * scrubber that we have reserved all the replicas.
+   * Send a reservation request to the next replica.
+   * - if there are no more replicas to send requests to, return true
    */
-  void send_next_reservation_or_complete();
+  bool send_next_reservation_or_complete();
 };
 
 } // namespace Scrub
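
Taken together with the scrub_machine.cc change above, the boolean now returned by handle_reserve_grant() is what drives the state transition. A minimal sketch of the consuming side (condensed from the ReservingReplicas::react(const ReplicaGrant&) hunk above):

    // sketch only - condensed from the scrub_machine.cc diff in this commit
    sc::result ReservingReplicas::react(const ReplicaGrant& ev)
    {
      if (context<Session>().m_reservations->handle_reserve_grant(
              ev.m_op, ev.m_from)) {
        // the last replica has granted its reservation
        return transit<ActiveScrubbing>();
      }
      // either the grant was not the one expected, or a request to the next
      // replica was just sent - stay in ReservingReplicas and keep waiting
      return discard_event();
    }
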
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h
index 61be0bd5a62c4121693edb841f59a81c6a6891ac..745ea2388b67b43170c1d1e4bb091e20c9f396b8 100644
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -356,12 +356,6 @@ struct ScrubPgIF {
    */
   virtual void clear_pgscrub_state() = 0;
 
-  /**
-   *  triggers the 'RemotesReserved' (all replicas granted scrub resources)
-   *  state-machine event
-   */
-  virtual void send_remotes_reserved(epoch_t epoch_queued) = 0;
-
   virtual void cleanup_store(ObjectStore::Transaction* t) = 0;
 
   virtual bool get_store_errors(const scrub_ls_arg_t& arg,