From 983b6f08820c2e56cc8dcf765dd01183cc42190f Mon Sep 17 00:00:00 2001
From: Ronen Friedman <rfriedma@redhat.com>
Date: Sun, 8 Aug 2021 14:14:55 +0000
Subject: [PATCH] osd/scrub: only telling the scrubber of 'updates' events if
 these events are waited for

Only the Primary, and only when there is an active scrubbing going on and the
scrubber state machine is in WaitUpdates states, should be notified of 'updates'.
The extra messages we queued and processed earlier, apart from creating redundant
log lines and wasting CPU time, were increasing the chance of a race in the handling
of stale scrubs.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
(cherry picked from commit 0aed4a5dd63b51288f1e01ca73d9e3632a1238be)
---
 src/messages/MOSDRepScrub.h |  2 +-
 src/osd/PrimaryLogPG.cc     |  6 ++++--
 src/osd/pg_scrubber.cc      | 17 ++++++++++++++---
 src/osd/pg_scrubber.h       |  7 +++++++
 src/osd/scrub_machine.cc    | 17 +++++++++++++++++
 src/osd/scrub_machine.h     |  9 ++-------
 src/osd/scrubber_common.h   | 16 ++++++++++++++++
 7 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/src/messages/MOSDRepScrub.h b/src/messages/MOSDRepScrub.h
index 5b9eaae18545a..ededcdca8c949 100644
--- a/src/messages/MOSDRepScrub.h
+++ b/src/messages/MOSDRepScrub.h
@@ -29,7 +29,7 @@ public:
 
   spg_t pgid;             // PG to scrub
   eversion_t scrub_from; // only scrub log entries after scrub_from
-  eversion_t scrub_to;   // last_update_applied when message sent
+  eversion_t scrub_to;   // last_update_applied when message sent (not used)
   epoch_t map_epoch = 0, min_epoch = 0;
   bool chunky;           // true for chunky scrubs
   hobject_t start;       // lower bound of scrub, inclusive
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index ff97b21e0e4cb..6494dddf09914 100644
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -10962,8 +10962,10 @@ void PrimaryLogPG::op_applied(const eversion_t &applied_version)
   ceph_assert(applied_version <= info.last_update);
   recovery_state.local_write_applied(applied_version);
 
-  if (is_primary() && m_scrubber->should_requeue_blocked_ops(recovery_state.get_last_update_applied())) {
-    osd->queue_scrub_applied_update(this, is_scrub_blocking_ops());
+  if (is_primary() && m_scrubber) {
+    // if there's a scrub operation waiting for the selected chunk to be fully updated -
+    // allow it to continue
+    m_scrubber->on_applied_when_primary(recovery_state.get_last_update_applied());
   }
 }
 
diff --git a/src/osd/pg_scrubber.cc b/src/osd/pg_scrubber.cc
index 089a026b08e50..ad8c2c6d774a2 100644
--- a/src/osd/pg_scrubber.cc
+++ b/src/osd/pg_scrubber.cc
@@ -484,6 +484,17 @@ void PgScrubber::set_subset_last_update(eversion_t e)
   dout(15) << __func__ << " last-update: " << e << dendl;
 }
 
+void PgScrubber::on_applied_when_primary(const eversion_t& applied_version)
+{
+  // we are only interested in updates if we are the Primary, and in state
+  // WaitLastUpdate
+  if (m_fsm->is_accepting_updates() && (applied_version >= m_subset_last_update)) {
+    m_osds->queue_scrub_applied_update(m_pg, m_pg->is_scrub_blocking_ops());
+    dout(15) << __func__ << " update: " << applied_version
+	     << " vs. required: " << m_subset_last_update << dendl;
+  }
+}
+
 /*
  * setting:
  * - m_subset_last_update
@@ -991,7 +1002,7 @@ int PgScrubber::build_scrub_map_chunk(
   // scan objects
   while (!pos.done()) {
     int r = m_pg->get_pgbackend()->be_scan_list(map, pos);
-    dout(10) << __func__ << " be r " << r << dendl;
+    dout(30) << __func__ << " BE returned " << r << dendl;
     if (r == -EINPROGRESS) {
       dout(20) << __func__ << " in progress" << dendl;
       return r;
@@ -1086,7 +1097,6 @@ void PgScrubber::replica_scrub_op(OpRequestRef op)
   // are we still processing a previous scrub-map request without noticing that the
   // interval changed? won't see it here, but rather at the reservation stage.
 
-
   if (msg->map_epoch < m_pg->info.history.same_interval_since) {
     dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch
 	     << " < " << m_pg->info.history.same_interval_since << dendl;
@@ -1336,12 +1346,13 @@ void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
    *  otherwise the interval would have changed.
    *  Ostensibly we can discard & redo the reservation. But then we
    *  will be temporarily releasing the OSD resource - and might not be able to grab it
-   *  again. Thus we simple treat this as a successful new request.
+   *  again. Thus, we simply treat this as a successful new request.
    */
 
   if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) {
     // we are holding a stale reservation from a past epoch
     m_remote_osd_resource.reset();
+    dout(10) << __func__ << " stale reservation request" << dendl;
   }
 
   if (request_ep < m_pg->get_same_interval_since()) {
diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h
index 176a00a23e9f9..56b27d7848d49 100644
--- a/src/osd/pg_scrubber.h
+++ b/src/osd/pg_scrubber.h
@@ -212,6 +212,13 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
   void send_sched_replica(epoch_t epoch_queued) final;
 
   void send_replica_pushes_upd(epoch_t epoch_queued) final;
+  /**
+   *  The PG has updated its 'applied version'. It might be that we are waiting for this
+   *  information: after selecting a range of objects to scrub, we've marked the latest
+   *  version of these objects in m_subset_last_update. We will not start the map building
+   *  before we know that the PG has reached this version.
+   */
+  void on_applied_when_primary(const eversion_t& applied_version) final;
 
   /**
    *  we allow some number of preemptions of the scrub, which mean we do
diff --git a/src/osd/scrub_machine.cc b/src/osd/scrub_machine.cc
index 2a2ee8732bda9..e8ae1bc561760 100644
--- a/src/osd/scrub_machine.cc
+++ b/src/osd/scrub_machine.cc
@@ -60,6 +60,14 @@ bool ScrubMachine::is_reserving() const
   return state_cast<const ReservingReplicas*>();
 }
 
+bool ScrubMachine::is_accepting_updates() const
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  ceph_assert(scrbr->is_primary());
+
+  return state_cast<const WaitLastUpdate*>();
+}
+
 // for the rest of the code in this file - we know what PG we are dealing with:
 #undef dout_prefix
 #define dout_prefix _prefix(_dout, this->context<ScrubMachine>().m_pg)
@@ -236,6 +244,15 @@ WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx)
   post_event(UpdatesApplied{});
 }
 
+/**
+ *  Note:
+ *  Updates are locally readable immediately. Thus, on the replicas we do need
+ *  to wait for the update notifications before scrubbing. For the Primary it's
+ *  a bit different: on EC (and only there) rmw operations have an additional
+ *  read roundtrip. That means that on the Primary we need to wait for
+ *  last_update_applied (the replica side, even on EC, is still safe
+ *  since the actual transaction will already be readable by commit time.
+ */
 void WaitLastUpdate::on_new_updates(const UpdatesApplied&)
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
diff --git a/src/osd/scrub_machine.h b/src/osd/scrub_machine.h
index 95035389e9ab6..7386163d8601a 100644
--- a/src/osd/scrub_machine.h
+++ b/src/osd/scrub_machine.h
@@ -64,7 +64,7 @@ MEV(SelectedChunkFree)
 MEV(ChunkIsBusy)
 MEV(ActivePushesUpd)	 ///< Update to active_pushes. 'active_pushes' represents recovery
 			 ///< that is in-flight to the local ObjectStore
-MEV(UpdatesApplied)	 // external
+MEV(UpdatesApplied)	 ///< (Primary only) all updates are committed
 MEV(InternalAllUpdates)	 ///< the internal counterpart of UpdatesApplied
 MEV(GotReplicas)	 ///< got a map from a replica
 
@@ -76,8 +76,6 @@ MEV(IntLocalMapDone)
 
 MEV(DigestUpdate)  ///< external. called upon success of a MODIFY op. See
 		   ///< scrub_snapshot_metadata()
-MEV(AllChunksDone)
-
 MEV(StartReplica)	 ///< initiating replica scrub. replica_scrub_op() -> OSD Q ->
 			 ///< replica_scrub()
 MEV(StartReplicaNoWait)	 ///< 'start replica' when there are no pending updates
@@ -112,6 +110,7 @@ class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
   void my_states() const;
   void assert_not_active() const;
   [[nodiscard]] bool is_reserving() const;
+  [[nodiscard]] bool is_accepting_updates() const;
 };
 
 /**
@@ -170,13 +169,9 @@ struct ActiveScrubbing : sc::state<ActiveScrubbing, ScrubMachine, PendingTimer>
   ~ActiveScrubbing();
 
   using reactions = mpl::list<
-    // done scrubbing
-    sc::transition<AllChunksDone, NotActive>,
-
     sc::custom_reaction<InternalError>,
     sc::custom_reaction<FullReset>>;
 
-  sc::result react(const AllChunksDone&);
   sc::result react(const FullReset&);
   sc::result react(const InternalError&);
 };
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h
index 15a6cdf4dede4..ab6f84e88f54c 100644
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -138,6 +138,22 @@ struct ScrubPgIF {
 
   virtual void send_sched_replica(epoch_t epoch_queued) = 0;
 
+//   virtual void send_full_reset(epoch_t epoch_queued) = 0;
+// 
+//   virtual void send_chunk_free(epoch_t epoch_queued) = 0;
+// 
+//   virtual void send_chunk_busy(epoch_t epoch_queued) = 0;
+// 
+//   virtual void send_local_map_done(epoch_t epoch_queued) = 0;
+// 
+//   virtual void send_get_next_chunk(epoch_t epoch_queued) = 0;
+// 
+//   virtual void send_scrub_is_finished(epoch_t epoch_queued) = 0;
+// 
+//   virtual void send_maps_compared(epoch_t epoch_queued) = 0;
+
+  virtual void on_applied_when_primary(const eversion_t &applied_version) = 0;
+
   // --------------------------------------------------
 
   [[nodiscard]] virtual bool are_callbacks_pending()
-- 
2.39.5