From: Ronen Friedman <rfriedma@redhat.com>
Date: Sat, 14 Oct 2023 12:36:06 +0000 (-0500)
Subject: osd/scrub: handle reservation completion within the Scrubber FSM
X-Git-Tag: v19.0.0~197^2~3
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=13b84c856e797c872eab3231a9e352ea3ad99564;p=ceph.git

osd/scrub: handle reservation completion within the Scrubber FSM

with special handling for the 0-replica case.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
---

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 88a0bc037564..fa938c082781 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1799,12 +1799,6 @@ void OSDService::queue_for_rep_scrub_resched(PG* pg,
 					   act_token);
 }
 
-void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
-{
-  // Resulting scrub event: 'RemotesReserved'
-  queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
-}
-
 void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
 {
   // Resulting scrub event: 'InternalSchedScrub'
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index d86443351cf2..38f9a6ca8475 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -504,9 +504,6 @@ public:
 
   void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority);
 
-  /// queue the message (-> event) that all replicas have reserved scrub resources for us
-  void queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority);
-
   /// Signals either (a) the end of a sleep period, or (b) a recheck of the availability
   /// of the primary map being created by the backend.
   void queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority);
diff --git a/src/osd/PG.h b/src/osd/PG.h
index fe335b85e000..70c1d12b2105 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -449,11 +449,6 @@ public:
 			"SchedReplica");
   }
 
-  void scrub_send_resources_granted(epoch_t queued, ThreadPool::TPHandle& handle)
-  {
-    forward_scrub_event(&ScrubPgIF::send_remotes_reserved, queued, "RemotesReserved");
-  }
-
   void scrub_send_scrub_resched(epoch_t queued, ThreadPool::TPHandle& handle)
   {
     forward_scrub_event(&ScrubPgIF::send_scrub_resched, queued, "InternalSchedScrub");
diff --git a/src/osd/scheduler/OpSchedulerItem.cc b/src/osd/scheduler/OpSchedulerItem.cc
index 0641aafdc1c9..750fc2a4f58e 100644
--- a/src/osd/scheduler/OpSchedulerItem.cc
+++ b/src/osd/scheduler/OpSchedulerItem.cc
@@ -77,15 +77,6 @@ void PGScrubResched::run(OSD* osd,
   pg->unlock();
 }
 
-void PGScrubResourcesOK::run(OSD* osd,
-			     OSDShard* sdata,
-			     PGRef& pg,
-			     ThreadPool::TPHandle& handle)
-{
-  pg->scrub_send_resources_granted(epoch_queued, handle);
-  pg->unlock();
-}
-
 void PGScrubPushesUpdate::run(OSD* osd,
 			      OSDShard* sdata,
 			      PGRef& pg,
diff --git a/src/osd/scheduler/OpSchedulerItem.h b/src/osd/scheduler/OpSchedulerItem.h
index 2803169a9bf9..7fb7125a1416 100644
--- a/src/osd/scheduler/OpSchedulerItem.h
+++ b/src/osd/scheduler/OpSchedulerItem.h
@@ -373,17 +373,6 @@ class PGScrubResched : public PGScrubItem {
   void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
 };
 
-/**
- *  all replicas have granted our scrub resources request
- */
-class PGScrubResourcesOK : public PGScrubItem {
- public:
-  PGScrubResourcesOK(spg_t pg, epoch_t epoch_queued)
-      : PGScrubItem{pg, epoch_queued, "PGScrubResourcesOK"}
-  {}
-  void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
-};
-
 /**
  *  called when a repair process completes, to initiate scrubbing. No local/remote
  *  resources are allocated.
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 98290a06ac45..b090ec113ee3 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -359,17 +359,6 @@ void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued)
   dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
 
-void PgScrubber::send_remotes_reserved(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
-	   << dendl;
-  // note: scrub is not active yet
-  if (check_interval(epoch_queued)) {
-    m_fsm->process_event(RemotesReserved{});
-  }
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
 void PgScrubber::send_chunk_free(epoch_t epoch_queued)
 {
   dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
index 0c8fa8c34fb7..97bf7da8f22a 100644
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -445,8 +445,6 @@ class PgScrubber : public ScrubPgIF,
 
   void send_preempted_replica() final;
 
-  void send_remotes_reserved(epoch_t epoch_queued) final;
-
   /**
    *  does the PG have newer updates than what we (the scrubber) know?
    */
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc
index cc257a47f0e3..40b43b6e0770 100644
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -163,13 +163,22 @@ ReservingReplicas::ReservingReplicas(my_context ctx)
   // initiate the reservation process
   context<Session>().m_reservations.emplace(*scrbr);
 
-  auto timeout = scrbr->get_pg_cct()->_conf.get_val<milliseconds>(
-      "osd_scrub_reservation_timeout");
-  if (timeout.count() > 0) {
-    // Start a timer to handle case where the replicas take a long time to
-    // ack the reservation.  See ReservationTimeout handler below.
-    m_timeout_token = machine.schedule_timer_event_after<ReservationTimeout>(
-      timeout);
+  if (context<Session>().m_reservations->get_last_sent()) {
+    // the 1'st reservation request was sent
+
+    auto timeout = scrbr->get_pg_cct()->_conf.get_val<milliseconds>(
+	"osd_scrub_reservation_timeout");
+    if (timeout.count() > 0) {
+      // Start a timer to handle case where the replicas take a long time to
+      // ack the reservation.  See ReservationTimeout handler below.
+      m_timeout_token =
+	  machine.schedule_timer_event_after<ReservationTimeout>(timeout);
+    }
+  } else {
+    // no replicas to reserve
+    dout(10) << "no replicas to reserve" << dendl;
+    // can't transit directly from here
+    post_event(RemotesReserved{});
   }
 }
 
@@ -186,7 +195,11 @@ sc::result ReservingReplicas::react(const ReplicaGrant& ev)
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
   dout(10) << "ReservingReplicas::react(const ReplicaGrant&)" << dendl;
 
-  context<Session>().m_reservations->handle_reserve_grant(ev.m_op, ev.m_from);
+  if (context<Session>().m_reservations->handle_reserve_grant(
+	  ev.m_op, ev.m_from)) {
+    // we are done with the reservation process
+    return transit<ActiveScrubbing>();
+  }
   return discard_event();
 }
 
diff --git a/src/osd/scrubber/scrub_reservations.cc b/src/osd/scrubber/scrub_reservations.cc
index 011ace2a2836..4d187a2a644f 100644
--- a/src/osd/scrubber/scrub_reservations.cc
+++ b/src/osd/scrubber/scrub_reservations.cc
@@ -84,7 +84,7 @@ ReplicaReservations::~ReplicaReservations()
   release_all();
 }
 
-void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
+bool ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
 {
   // verify that the grant is from the peer we expected. If not?
   // for now - abort the OSD. \todo reconsider the reaction.
@@ -94,7 +94,7 @@ void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
 		   get_last_sent().value_or(pg_shard_t{}))
 	    << dendl;
     ceph_assert(from == get_last_sent());
-    return;
+    return false;
   }
 
   auto elapsed = clock::now() - m_last_request_sent_at;
@@ -115,31 +115,31 @@ void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
 		  active_requests_cnt(), m_sorted_secondaries.size(),
 		  duration_cast<milliseconds>(elapsed).count())
 	   << dendl;
-  send_next_reservation_or_complete();
+  return send_next_reservation_or_complete();
 }
 
-void ReplicaReservations::send_next_reservation_or_complete()
+bool ReplicaReservations::send_next_reservation_or_complete()
 {
   if (m_next_to_request == m_sorted_secondaries.cend()) {
     // granted by all replicas
     dout(10) << "remote reservation complete" << dendl;
-    m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
-
-  } else {
-    // send the next reservation request
-    const auto peer = *m_next_to_request;
-    const auto epoch = m_pg->get_osdmap_epoch();
-    auto m = make_message<MOSDScrubReserve>(
-	spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST,
-	m_pg->pg_whoami);
-    m_pg->send_cluster_message(peer.osd, m, epoch, false);
-    m_last_request_sent_at = clock::now();
-    dout(10) << fmt::format(
-		    "reserving {} (the {} of {} replicas)", *m_next_to_request,
-		    active_requests_cnt()+1, m_sorted_secondaries.size())
-	     << dendl;
-    m_next_to_request++;
+    return true;  // done
   }
+
+  // send the next reservation request
+  const auto peer = *m_next_to_request;
+  const auto epoch = m_pg->get_osdmap_epoch();
+  auto m = make_message<MOSDScrubReserve>(
+      spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST,
+      m_pg->pg_whoami);
+  m_pg->send_cluster_message(peer.osd, m, epoch, false);
+  m_last_request_sent_at = clock::now();
+  dout(10) << fmt::format(
+		  "reserving {} (the {} of {} replicas)", *m_next_to_request,
+		  active_requests_cnt() + 1, m_sorted_secondaries.size())
+	   << dendl;
+  m_next_to_request++;
+  return false;
 }
 
 void ReplicaReservations::verify_rejections_source(
diff --git a/src/osd/scrubber/scrub_reservations.h b/src/osd/scrubber/scrub_reservations.h
index 634e7e580027..a603c7073563 100644
--- a/src/osd/scrubber/scrub_reservations.h
+++ b/src/osd/scrubber/scrub_reservations.h
@@ -79,8 +79,11 @@ class ReplicaReservations {
    * the replica we are expecting a reply from) is noted, and triggers
    * one of two: either sending a reservation request to the next replica,
    * or notifying the scrubber that we have reserved them all.
+   *
+   * \returns true if there are no more replicas to send reservation requests
+   * (i.e., the scrubber should proceed to the next phase), false otherwise.
    */
-  void handle_reserve_grant(OpRequestRef op, pg_shard_t from);
+  bool handle_reserve_grant(OpRequestRef op, pg_shard_t from);
 
   /**
    * Verify that the sender of the received rejection is the replica we
@@ -105,6 +108,9 @@ class ReplicaReservations {
    */
   void discard_remote_reservations();
 
+  /// the only replica we are expecting a reply from
+  std::optional<pg_shard_t> get_last_sent() const;
+
   // note: 'public', as accessed via the 'standard' dout_prefix() macro
   std::ostream& gen_prefix(std::ostream& out, std::string fn) const;
 
@@ -112,17 +118,14 @@ class ReplicaReservations {
   /// send 'release' messages to all replicas we have managed to reserve
   void release_all();
 
-  /// the only replica we are expecting a reply from
-  std::optional<pg_shard_t> get_last_sent() const;
-
   /// The number of requests that have been sent (and not rejected) so far.
   size_t active_requests_cnt() const;
 
   /**
-   * Either send a reservation request to the next replica, or notify the
-   * scrubber that we have reserved all the replicas.
+   * Send a reservation request to the next replica.
+   * - if there are no more replicas to send requests to, return true
    */
-  void send_next_reservation_or_complete();
+  bool send_next_reservation_or_complete();
 };
 
 } // namespace Scrub
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h
index 61be0bd5a62c..745ea2388b67 100644
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -356,12 +356,6 @@ struct ScrubPgIF {
    */
   virtual void clear_pgscrub_state() = 0;
 
-  /**
-   *  triggers the 'RemotesReserved' (all replicas granted scrub resources)
-   *  state-machine event
-   */
-  virtual void send_remotes_reserved(epoch_t epoch_queued) = 0;
-
   virtual void cleanup_store(ObjectStore::Transaction* t) = 0;
 
   virtual bool get_store_errors(const scrub_ls_arg_t& arg,