From e0ed0122526791547a317c6ca19ed081a92dfe69 Mon Sep 17 00:00:00 2001
From: David Zafman <dzafman@redhat.com>
Date: Thu, 14 Jan 2021 20:39:32 -0800
Subject: [PATCH] osd: Try other PGs when reservation failures occur

Fixes: https://tracker.ceph.com/issues/48843

Signed-off-by: David Zafman <dzafman@redhat.com>
(cherry picked from commit 08c3ede0844507074de2b43638423c094976b493)
---
 PendingReleaseNotes           |  3 +++
 src/osd/OSD.cc                | 34 ++++++++++++++++++++++++++++++++--
 src/osd/PG.cc                 |  1 +
 src/osd/PG.h                  |  4 ++++
 src/osd/pg_scrubber.h         |  7 +++++++
 src/osd/scrub_machine.cc      |  2 ++
 src/osd/scrub_machine_lstnr.h |  4 ++++
 src/osd/scrubber_common.h     |  4 ++++
 8 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index fc75ba0d20c3d..bd0fbb3eab406 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -67,6 +67,9 @@
   to property reflect the upper cache utilization before space is reclaimed. The default ``immutable_object_cache_watermark``
   now is ``0.9``. If the capacity reaches 90% the daemon will delete cold cache.
 
+* Scubs are more aggressive in trying to find more simultaneous possible PGs within osd_max_scrubs limitation.
+  It is possible that increasing osd_scrub_sleep may be necessary to maintain client responsiveness.
+
 >=15.0.0
 --------
 
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index c53ab0a08564c..1f699ecc9aad5 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -7611,6 +7611,13 @@ void OSD::sched_scrub()
 	continue;
       }
 
+      // If this one couldn't reserve, skip for now
+      if (pg->get_reserve_failed()) {
+	pg->unlock();
+	dout(20) << __func__ << " pg  " << scrub_job.pgid << " reserve failed, skipped" << dendl;
+        continue;
+      }
+
       // This has already started, so go on to the next scrub job
       if (pg->is_scrub_active()) {
 	pg->unlock();
@@ -7630,7 +7637,7 @@ void OSD::sched_scrub()
       if (pg->m_scrubber->is_reserving()) {
 	pg->unlock();
 	dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl;
-	break;
+	goto out;
       }
       dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time
 	       << (pg->get_must_scrub() ? ", explicitly requested" :
@@ -7639,11 +7646,34 @@ void OSD::sched_scrub()
       if (pg->sched_scrub()) {
 	pg->unlock();
         dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl;
-	break;
+	goto out;
       }
+      // If this is set now we must have had a local reserve failure, so can't scrub anything right now
+      if (pg->get_reserve_failed()) {
+	pg->unlock();
+	dout(20) << __func__ << " pg  " << scrub_job.pgid << " local reserve failed, nothing to be done now" << dendl;
+        goto out;
+      }
+
       pg->unlock();
     } while (service.next_scrub_stamp(scrub_job, &scrub_job));
+
+    // Clear reserve_failed from all pending PGs, so we try again
+    if (service.first_scrub_stamp(&scrub_job)) {
+      do {
+        if (scrub_job.sched_time > now)
+	  break;
+        PGRef pg = _lookup_lock_pg(scrub_job.pgid);
+	// If we can't lock, it's ok we can get it next time
+        if (!pg)
+	  continue;
+        pg->clear_reserve_failed();
+        pg->unlock();
+      } while (service.next_scrub_stamp(scrub_job, &scrub_job));
+    }
   }
+
+out:
   dout(20) << "sched_scrub done" << dendl;
 }
 
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index df8c4f3451d4a..7aff77ca99e4b 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1365,6 +1365,7 @@ bool PG::sched_scrub()
   // be retried by the OSD later on.
   if (!m_scrubber->reserve_local()) {
     dout(10) << __func__ << ": failed to reserve locally" << dendl;
+    set_reserve_failed();
     return false;
   }
 
diff --git a/src/osd/PG.h b/src/osd/PG.h
index cad031c10861f..85bbfe858497c 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -186,6 +186,10 @@ public:
   /// scrubbing state for both Primary & replicas
   bool is_scrub_active() const { return m_scrubber->is_scrub_active(); }
 
+  bool get_reserve_failed() const { return m_scrubber->get_reserve_failed(); }
+  void set_reserve_failed() { m_scrubber->set_reserve_failed(); }
+  void clear_reserve_failed() { m_scrubber->clear_reserve_failed(); }
+
 public:
   // -- members --
   const coll_t coll;
diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h
index bbbac38ca7565..69fa6b0fcebdb 100644
--- a/src/osd/pg_scrubber.h
+++ b/src/osd/pg_scrubber.h
@@ -412,6 +412,10 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
 
   [[nodiscard]] bool is_scrub_active() const final { return m_active; }
 
+  [[nodiscard]] bool get_reserve_failed() const final { return m_reserve_failed; }
+  void set_reserve_failed() final { m_reserve_failed = true; }
+  void clear_reserve_failed() final { m_reserve_failed = false; }
+
  private:
   void reset_internal_state();
 
@@ -536,6 +540,9 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
 
   bool m_active{false};
 
+  // This PG could not get all the scrub reservations
+  bool m_reserve_failed{false};
+
   eversion_t m_subset_last_update{};
 
   std::unique_ptr<Scrub::Store> m_store;
diff --git a/src/osd/scrub_machine.cc b/src/osd/scrub_machine.cc
index 2a2ee8732bda9..64d79e7236732 100644
--- a/src/osd/scrub_machine.cc
+++ b/src/osd/scrub_machine.cc
@@ -91,6 +91,8 @@ sc::result ReservingReplicas::react(const ReservationFailure&)
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
   dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl;
 
+  // Mark PG so that we will try other PGs, before coming back to this one
+  scrbr->set_reserve_failed();
   // the Scrubber must release all resources and abort the scrubbing
   scrbr->clear_pgscrub_state();
   return transit<NotActive>();
diff --git a/src/osd/scrub_machine_lstnr.h b/src/osd/scrub_machine_lstnr.h
index 2b96161215474..b2139773b4d59 100644
--- a/src/osd/scrub_machine_lstnr.h
+++ b/src/osd/scrub_machine_lstnr.h
@@ -114,6 +114,10 @@ struct ScrubMachineListener {
 
   virtual void unreserve_replicas() = 0;
 
+  [[nodiscard]] virtual bool get_reserve_failed() const = 0;
+  virtual void set_reserve_failed() = 0;
+  virtual void clear_reserve_failed() = 0;
+
   /**
    * the FSM interface into the "are we waiting for maps, either our own or from
    * replicas" state.
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h
index 15a6cdf4dede4..3f3a618f8249b 100644
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -150,6 +150,10 @@ struct ScrubPgIF {
    */
   [[nodiscard]] virtual bool is_scrub_active() const = 0;
 
+  [[nodiscard]] virtual bool get_reserve_failed() const = 0;
+  virtual void set_reserve_failed() = 0;
+  virtual void clear_reserve_failed() = 0;
+
   /// are we waiting for resource reservation grants form our replicas?
   [[nodiscard]] virtual bool is_reserving() const = 0;
 
-- 
2.39.5