From 411ea10084033ad69b44f0a04c9e18ab19d43639 Mon Sep 17 00:00:00 2001
From: Ronen Friedman
Date: Mon, 24 Jan 2022 13:19:01 +0000
Subject: [PATCH] osd/scrub: report replicas slow to respond to scrub requests

Implemented timeouts:

1: Slow-Secondary Warning:
Once at least half of the replicas have accepted the reservation, we start
reporting any secondary that takes too long (more than
osd_scrub_slow_reservation_response milliseconds after the previous response
was received) to respond to the reservation request.
(Why? because we have encountered real-life situations where a specific OSD
was systematically very slow to respond (e.g. 5 seconds in one case) to the
reservation requests, slowing the scrub process to a crawl.)

2: Reservation Process Timeout:
We now limit the total time the primary waits for the replicas to respond to
the reservation request. If we do not get all the responses (either Grant or
Reject) within osd_scrub_reservation_timeout milliseconds, we give up and
release all the reservations we have acquired so far.
(Why? because we have encountered instances where a reservation request was
lost - either due to a bug or due to a network issue.)

Signed-off-by: Ronen Friedman
---
 src/common/options/osd.yaml.in  |  26 +++++++
 src/osd/scrubber/pg_scrubber.cc | 122 +++++++++++++++++++++++++++-----
 src/osd/scrubber/pg_scrubber.h  |  55 +++++++++++++-
 3 files changed, 185 insertions(+), 18 deletions(-)

diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in
index 273eea909bcf3..ff2c4ac74c866 100644
--- a/src/common/options/osd.yaml.in
+++ b/src/common/options/osd.yaml.in
@@ -484,6 +484,32 @@ options:
     stats (inc. scrub/block duration) every this many seconds.
   default: 120
   with_legacy: false
+# when replicas are slow to respond to scrub resource reservations
+# Note: disable by using a very large value
+- name: osd_scrub_slow_reservation_response
+  type: millisecs
+  level: advanced
+  desc: Duration before issuing a cluster-log warning
+  long_desc: Waiting too long for a replica to respond (after at least half of the
+    replicas have responded).
+  default: 2200
+  min: 500
+  see_also:
+  - osd_scrub_reservation_timeout
+  with_legacy: false
+# when a replica does not respond to scrub resource request
+# Note: disable by using a very large value
+- name: osd_scrub_reservation_timeout
+  type: millisecs
+  level: advanced
+  desc: Duration before aborting the scrub session
+  long_desc: Waiting too long for some replicas to respond to
+    scrub reservation requests.
+  default: 5000
+  min: 2000
+  see_also:
+  - osd_scrub_slow_reservation_response
+  with_legacy: false
 # where rados plugins are stored
 - name: osd_class_dir
   type: str
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 77f1e16c9fde3..79dfc60c007fe 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -2255,7 +2255,8 @@ void PgScrubber::set_scrub_duration()
 void PgScrubber::reserve_replicas()
 {
   dout(10) << __func__ << dendl;
-  m_reservations.emplace(m_pg, m_pg_whoami, m_scrub_job);
+  m_reservations.emplace(
+    m_pg, m_pg_whoami, m_scrub_job, m_pg->get_cct()->_conf);
 }

 void PgScrubber::cleanup_on_finish()
@@ -2534,39 +2535,43 @@ void ReplicaReservations::release_replica(pg_shard_t peer, epoch_t epoch)
   m_osds->send_message_osd_cluster(peer.osd, m, epoch);
 }

-ReplicaReservations::ReplicaReservations(PG* pg,
-                                         pg_shard_t whoami,
-                                         ScrubQueue::ScrubJobRef scrubjob)
+ReplicaReservations::ReplicaReservations(
+  PG* pg,
+  pg_shard_t whoami,
+  ScrubQueue::ScrubJobRef scrubjob,
+  const ConfigProxy& conf)
     : m_pg{pg}
     , m_acting_set{pg->get_actingset()}
     , m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
     , m_pending{static_cast<int>(m_acting_set.size()) - 1}
     , m_pg_info{m_pg->get_pg_info(ScrubberPasskey())}
     , m_scrub_job{scrubjob}
+    , m_conf{conf}
 {
   epoch_t epoch = m_pg->get_osdmap_epoch();
+  m_timeout = conf.get_val<std::chrono::milliseconds>(
+    "osd_scrub_slow_reservation_response");
+  m_log_msg_prefix = fmt::format(
+    "osd.{} ep: {} scrubber::ReplicaReservations pg[{}]: ", m_osds->whoami,
+    epoch, pg->pg_id);

-  {
-    std::stringstream prefix;
-    prefix << "osd." << m_osds->whoami << " ep: " << epoch
-           << " scrubber::ReplicaReservations pg[" << pg->pg_id << "]: ";
-    m_log_msg_prefix = prefix.str();
-  }
-
-  // handle the special case of no replicas
   if (m_pending <= 0) {
+    // A special case of no replicas.
     // just signal the scrub state-machine to continue
     send_all_done();

   } else {
+    // start a timer to handle the case of no replies
+    m_no_reply = make_unique<no_reply_t>(
+      m_osds, m_conf, *this, m_log_msg_prefix);
+
     // send the reservation requests
     for (auto p : m_acting_set) {
       if (p == whoami)
         continue;
-      auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, p.shard),
-                                    epoch,
-                                    MOSDScrubReserve::REQUEST,
-                                    m_pg->pg_whoami);
+      auto m = new MOSDScrubReserve(
+        spg_t(m_pg_info.pgid.pgid, p.shard), epoch, MOSDScrubReserve::REQUEST,
+        m_pg->pg_whoami);
       m_osds->send_message_osd_cluster(p.osd, m, epoch);
       m_waited_for_peers.push_back(p);
       dout(10) << __func__ << ": reserve " << p.osd << dendl;
@@ -2576,11 +2581,15 @@ ReplicaReservations::ReplicaReservations(PG* pg,

 void ReplicaReservations::send_all_done()
 {
+  // stop any pending timeout timer
+  m_no_reply.reset();
   m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
 }

 void ReplicaReservations::send_reject()
 {
+  // stop any pending timeout timer
+  m_no_reply.reset();
   m_scrub_job->resources_failure = true;
   m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
 }
@@ -2589,17 +2598,39 @@ void ReplicaReservations::discard_all()
 {
   dout(10) << __func__ << ": " << m_reserved_peers << dendl;

+  m_no_reply.reset();
   m_had_rejections = true;  // preventing late-coming responses from triggering
                             // events

   m_reserved_peers.clear();
   m_waited_for_peers.clear();
 }

+/*
+ * The following holds when update_latecomers() is called:
+ * - we are still waiting for replies from some of the replicas;
+ * - we might have already set a timer. If so, we should restart it;
+ * - we might have received responses from 50% of the replicas.
+ */
+std::optional<ReplicaReservations::tpoint_t>
+ReplicaReservations::update_latecomers(tpoint_t now_is)
+{
+  if (m_reserved_peers.size() > m_waited_for_peers.size()) {
+    // at least half of the replicas have already responded. Time to flag
+    // latecomers.
+    return now_is + m_timeout;
+  } else {
+    return std::nullopt;
+  }
+}
+
 ReplicaReservations::~ReplicaReservations()
 {
   m_had_rejections = true;  // preventing late-coming responses from triggering
                             // events

+  // stop any pending timeout timer
+  m_no_reply.reset();
+
   // send un-reserve messages to all reserved replicas. We do not wait for
   // answer (there wouldn't be one). Other incoming messages will be discarded
   // on the way, by our owner.
@@ -2655,6 +2686,21 @@ void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
   dout(10) << __func__ << ": osd." << from << " scrub reserve = success"
            << dendl;
   m_reserved_peers.push_back(from);
+
+  // was this response late?
+  auto now_is = clock::now();
+  if (m_timeout_point && (now_is > *m_timeout_point)) {
+    m_osds->clog->warn() << fmt::format(
+      "osd.{} scrubber pg[{}]: late reservation from osd.{}",
+      m_osds->whoami,
+      m_pg->pg_id,
+      from);
+    m_timeout_point.reset();
+  } else {
+    // possibly set a timer to warn about late-coming reservations
+    m_timeout_point = update_latecomers(now_is);
+  }
+
   if (--m_pending == 0) {
     send_all_done();
   }
@@ -2698,11 +2744,55 @@ void ReplicaReservations::handle_reserve_reject(OpRequestRef op,
   }
 }

+void ReplicaReservations::handle_no_reply_timeout()
+{
+  dout(1) << fmt::format(
+               "{}: timeout! no reply from {}", __func__, m_waited_for_peers)
+          << dendl;
+
+  m_had_rejections = true;  // preventing any additional notifications
+  send_reject();
+}
+
 std::ostream& ReplicaReservations::gen_prefix(std::ostream& out) const
 {
   return out << m_log_msg_prefix;
 }

+ReplicaReservations::no_reply_t::no_reply_t(
+  OSDService* osds,
+  const ConfigProxy& conf,
+  ReplicaReservations& parent,
+  std::string_view log_prfx)
+    : m_osds{osds}
+    , m_conf{conf}
+    , m_parent{parent}
+    , m_log_prfx{log_prfx}
+{
+  using namespace std::chrono;
+  auto now_is = clock::now();
+  auto timeout =
+    conf.get_val<milliseconds>("osd_scrub_reservation_timeout");
+
+  m_abort_callback = new LambdaContext([this, now_is]([[maybe_unused]] int r) {
+    // behave as if a REJECT was received
+    m_osds->clog->warn() << fmt::format(
+      "{} timeout on replica reservations (since {})", m_log_prfx, now_is);
+    m_parent.handle_no_reply_timeout();
+  });
+
+  std::lock_guard l(m_osds->sleep_lock);
+  m_osds->sleep_timer.add_event_after(timeout, m_abort_callback);
+}
+
+ReplicaReservations::no_reply_t::~no_reply_t()
+{
+  std::lock_guard l(m_osds->sleep_lock);
+  if (m_abort_callback) {
+    m_osds->sleep_timer.cancel_event(m_abort_callback);
+  }
+}
+
 // ///////////////////// LocalReservation //////////////////////////////////

 // note: no dout()s in LocalReservation functions. Client logs interactions.
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
index aa1c341edf49d..77659377a4e32 100644
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -95,6 +95,24 @@ struct BuildMap;
  * event. All previous requests, whether already granted or not, are explicitly
  * released.
  *
+ * Timeouts:
+ *
+ * Slow-Secondary Warning:
+ * Once at least half of the replicas have accepted the reservation, we start
+ * reporting any secondary that takes too long (more than
+ * osd_scrub_slow_reservation_response milliseconds after the previous
+ * response was received) to respond to the reservation request.
+ * (Why? because we have encountered real-life situations where a specific OSD
+ * was systematically very slow (e.g. 5 seconds) to respond to the reservation
+ * requests, slowing the scrub process to a crawl.)
+ *
+ * Reservation Timeout:
+ * We limit the total time we wait for the replicas to respond to the
+ * reservation request. If we don't get all the responses (either Grant or
+ * Reject) within osd_scrub_reservation_timeout milliseconds, we give up and
+ * release all the reservations we have acquired so far.
+ * (Why? because we have encountered instances where a reservation request was
+ * lost - either due to a bug or due to a network issue.)
+ *
  * A note re performance: I've measured a few container alternatives for
  * m_reserved_peers, with its specific usage pattern. Std::set is extremely
  * slow, as expected. flat_set is only slightly better. Surprisingly -
@@ -102,6 +120,25 @@ struct BuildMap;
  * std::vector: no need to pre-reserve.
  */
 class ReplicaReservations {
+  using clock = std::chrono::system_clock;
+  using tpoint_t = std::chrono::time_point<clock>;
+
+  /// a no-reply timeout handler
+  struct no_reply_t {
+    explicit no_reply_t(
+      OSDService* osds,
+      const ConfigProxy& conf,
+      ReplicaReservations& parent,
+      std::string_view log_prfx);
+
+    ~no_reply_t();
+    OSDService* m_osds;
+    const ConfigProxy& m_conf;
+    ReplicaReservations& m_parent;
+    std::string m_log_prfx;
+    Context* m_abort_callback{nullptr};
+  };
+
   PG* m_pg;
   std::set<pg_shard_t> m_acting_set;
   OSDService* m_osds;
@@ -111,6 +148,14 @@ class ReplicaReservations {
   int m_pending{-1};
   const pg_info_t& m_pg_info;
   ScrubQueue::ScrubJobRef m_scrub_job; ///< a ref to this PG's scrub job
+  const ConfigProxy& m_conf;
+
+  // detecting slow peers (see 'slow-secondary' above)
+  std::chrono::milliseconds m_timeout;
+  std::optional<tpoint_t> m_timeout_point;
+
+  // detecting & handling a "no show" of a replica
+  std::unique_ptr<no_reply_t> m_no_reply;

   void release_replica(pg_shard_t peer, epoch_t epoch);

@@ -119,6 +164,8 @@ class ReplicaReservations {
   /// notify the scrubber that we have failed to reserve replicas' resources
   void send_reject();

+  std::optional<tpoint_t> update_latecomers(tpoint_t now_is);
+
 public:
   std::string m_log_msg_prefix;

@@ -132,8 +179,9 @@ class ReplicaReservations {
   void discard_all();

   ReplicaReservations(PG* pg,
-                      pg_shard_t whoami,
-                      ScrubQueue::ScrubJobRef scrubjob);
+                      pg_shard_t whoami,
+                      ScrubQueue::ScrubJobRef scrubjob,
+                      const ConfigProxy& conf);

   ~ReplicaReservations();

@@ -141,6 +189,9 @@ class ReplicaReservations {

   void handle_reserve_reject(OpRequestRef op, pg_shard_t from);

+  // if timing out on receiving replies from our replicas:
+  void handle_no_reply_timeout();
+
   std::ostream& gen_prefix(std::ostream& out) const;
 };
-- 
2.39.5
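
For readers who want to experiment with the latecomer-detection policy outside
of Ceph, here is a minimal standalone C++ sketch of the logic implemented by
update_latecomers() and handle_reserve_grant(). LatecomerTracker, its peer
counts, and the main() scenario are invented for illustration and are not Ceph
code; the real implementation tracks m_reserved_peers/m_waited_for_peers and
emits a cluster-log warning instead of returning a flag.

// Standalone illustration (hypothetical, not Ceph code) of latecomer
// detection: the warning deadline is armed only once more peers have
// granted than are still pending, and a grant arriving past the armed
// deadline is flagged as late.
#include <chrono>
#include <iostream>
#include <optional>

using Clock = std::chrono::system_clock;
using tpoint_t = std::chrono::time_point<Clock>;

struct LatecomerTracker {
  int granted = 0;       // replicas that have already granted
  int still_waited = 3;  // replicas we are still waiting for
  std::chrono::milliseconds grace{2200};  // cf. osd_scrub_slow_reservation_response
  std::optional<tpoint_t> deadline;

  // called for every Grant; returns true if the response counts as "late"
  bool on_grant(tpoint_t now) {
    ++granted;
    --still_waited;
    if (deadline && now > *deadline) {
      deadline.reset();
      return true;  // the real code emits a cluster-log warning here
    }
    // (re)arm the deadline only once at least half have responded
    if (granted > still_waited) {
      deadline = now + grace;
    } else {
      deadline.reset();
    }
    return false;
  }
};

int main() {
  LatecomerTracker t;
  auto t0 = Clock::now();
  t.on_grant(t0);  // 1 granted vs 2 pending: no deadline armed yet
  t.on_grant(t0 + std::chrono::milliseconds(100));  // majority: deadline armed
  // a reply arriving 5 seconds later exceeds the 2200ms grace period:
  bool late = t.on_grant(t0 + std::chrono::seconds(5));
  std::cout << "third grant late? " << std::boolalpha << late << '\n';  // true
}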
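
The reservation-process timeout relies on an RAII pattern: no_reply_t arms a
timer event on construction and cancels it on destruction, so resetting the
unique_ptr (as send_all_done(), send_reject() and discard_all() do) disarms
the watchdog. Below is a self-contained sketch of that ownership pattern
using a plain thread and condition variable; NoReplyWatchdog and the 50/100 ms
timeouts are invented for illustration, while the real code uses OSDService's
sleep_timer with a LambdaContext callback.

// Standalone sketch (hypothetical, not Ceph code): a watchdog armed on
// construction fires once after the timeout unless the object is destroyed
// first (destruction == all replies arrived in time).
#include <chrono>
#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <thread>

class NoReplyWatchdog {
  std::mutex m;
  std::condition_variable cv;
  bool cancelled = false;
  std::thread worker;

 public:
  NoReplyWatchdog(std::chrono::milliseconds timeout,
                  std::function<void()> on_timeout)
      : worker([this, timeout, cb = std::move(on_timeout)] {
          std::unique_lock l(m);
          // fire only if not cancelled within 'timeout'
          if (!cv.wait_for(l, timeout, [this] { return cancelled; }))
            cb();
        }) {}

  ~NoReplyWatchdog() {  // cancel the pending timeout, as no_reply_t's dtor does
    {
      std::lock_guard l(m);
      cancelled = true;
    }
    cv.notify_one();
    worker.join();
  }
};

int main() {
  using namespace std::chrono_literals;
  {
    NoReplyWatchdog wd{50ms, [] {
      std::cout << "timeout: acting as if REJECT was received\n";
    }};
    std::this_thread::sleep_for(100ms);  // no replies arrive -> watchdog fires
  }
  {
    NoReplyWatchdog wd{100ms, [] { std::cout << "should not fire\n"; }};
    // all replies arrived quickly; destroying the watchdog cancels the timer
  }
}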