osd/scrub: report replicas slow to repond to scrub requests

author Ronen Friedman <rfriedma@redhat.com>

Mon, 24 Jan 2022 13:19:01 +0000 (13:19 +0000)

committer Ronen Friedman <rfriedma@redhat.com>

Sun, 11 Dec 2022 15:27:42 +0000 (17:27 +0200)
author Ronen Friedman <rfriedma@redhat.com>
Mon, 24 Jan 2022 13:19:01 +0000 (13:19 +0000)
committer Ronen Friedman <rfriedma@redhat.com>
Sun, 11 Dec 2022 15:27:42 +0000 (17:27 +0200)
diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in

index 273eea909bcf3409cba428070f2726ae84a6ccf1..ff2c4ac74c866da0c586980138bd1e92479e96f4 100644 (file)
--- a/src/common/options/osd.yaml.in
+++ b/src/common/options/osd.yaml.in
@@ -484,6 +484,32 @@ options:
      stats (inc. scrub/block duration) every this many seconds.
    default: 120
    with_legacy: false
+# when replicas are slow to respond to scrub resource reservations
+# Note: disable by using a very large value
+- name: osd_scrub_slow_reservation_response
+  type: millisecs
+  level: advanced
+  desc: Duration before issuing a cluster-log warning
+  long_desc: Waiting too long for a replica to respond (after at least half of the
+    replicas have responded).
+  default: 2200
+  min: 500
+  see_also:
+  - osd_scrub_reservation_timeout
+  with_legacy: false
+# when a replica does not respond to scrub resource request
+# Note: disable by using a very large value
+- name: osd_scrub_reservation_timeout
+  type: millisecs
+  level: advanced
+  desc: Duration before aborting the scrub session
+  long_desc: Waiting too long for some replicas to respond to
+    scrub reservation requests.
+  default: 5000
+  min: 2000
+  see_also:
+  - osd_scrub_slow_reservation_response
+  with_legacy: false
  # where rados plugins are stored
  - name: osd_class_dir
    type: str
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc

index 77f1e16c9fde334d8cdb520c46520a7eacb3bf11..79dfc60c007fe1a5ac29a44cdd59cdf183499fa6 100644 (file)
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -2255,7 +2255,8 @@ void PgScrubber::set_scrub_duration()
  void PgScrubber::reserve_replicas()
  {
    dout(10) << __func__ << dendl;
-  m_reservations.emplace(m_pg, m_pg_whoami, m_scrub_job);
+  m_reservations.emplace(
+    m_pg, m_pg_whoami, m_scrub_job, m_pg->get_cct()->_conf);
  }
  
  void PgScrubber::cleanup_on_finish()
@@ -2534,39 +2535,43 @@ void ReplicaReservations::release_replica(pg_shard_t peer, epoch_t epoch)
    m_osds->send_message_osd_cluster(peer.osd, m, epoch);
  }
  
-ReplicaReservations::ReplicaReservations(PG* pg,
-                                        pg_shard_t whoami,
-                                        ScrubQueue::ScrubJobRef scrubjob)
+ReplicaReservations::ReplicaReservations(
+  PG* pg,
+  pg_shard_t whoami,
+  ScrubQueue::ScrubJobRef scrubjob,
+  const ConfigProxy& conf)
      : m_pg{pg}
      , m_acting_set{pg->get_actingset()}
      , m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
      , m_pending{static_cast<int>(m_acting_set.size()) - 1}
      , m_pg_info{m_pg->get_pg_info(ScrubberPasskey())}
      , m_scrub_job{scrubjob}
+    , m_conf{conf}
  {
    epoch_t epoch = m_pg->get_osdmap_epoch();
+  m_timeout = conf.get_val<std::chrono::milliseconds>(
+    "osd_scrub_slow_reservation_response");
+  m_log_msg_prefix = fmt::format(
+    "osd.{} ep: {} scrubber::ReplicaReservations pg[{}]: ", m_osds->whoami,
+    epoch, pg->pg_id);
  
-  {
-    std::stringstream prefix;
-    prefix << "osd." << m_osds->whoami << " ep: " << epoch
-          << " scrubber::ReplicaReservations pg[" << pg->pg_id << "]: ";
-    m_log_msg_prefix = prefix.str();
-  }
-
-  // handle the special case of no replicas
    if (m_pending <= 0) {
+    // A special case of no replicas.
      // just signal the scrub state-machine to continue
      send_all_done();
  
    } else {
+    // start a timer to handle the case of no replies
+    m_no_reply = make_unique<ReplicaReservations::no_reply_t>(
+      m_osds, m_conf, *this, m_log_msg_prefix);
  
+    // send the reservation requests
      for (auto p : m_acting_set) {
        if (p == whoami)
         continue;
-      auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, p.shard),
-                                   epoch,
-                                   MOSDScrubReserve::REQUEST,
-                                   m_pg->pg_whoami);
+      auto m = new MOSDScrubReserve(
+       spg_t(m_pg_info.pgid.pgid, p.shard), epoch, MOSDScrubReserve::REQUEST,
+       m_pg->pg_whoami);
        m_osds->send_message_osd_cluster(p.osd, m, epoch);
        m_waited_for_peers.push_back(p);
        dout(10) << __func__ << ": reserve " << p.osd << dendl;
@@ -2576,11 +2581,15 @@ ReplicaReservations::ReplicaReservations(PG* pg,
  
  void ReplicaReservations::send_all_done()
  {
+  // stop any pending timeout timer
+  m_no_reply.reset();
    m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
  }
  
  void ReplicaReservations::send_reject()
  {
+  // stop any pending timeout timer
+  m_no_reply.reset();
    m_scrub_job->resources_failure = true;
    m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
  }
@@ -2589,17 +2598,39 @@ void ReplicaReservations::discard_all()
  {
    dout(10) << __func__ << ": " << m_reserved_peers << dendl;
  
+  m_no_reply.reset();
    m_had_rejections = true;  // preventing late-coming responses from triggering
                             // events
    m_reserved_peers.clear();
    m_waited_for_peers.clear();
  }
  
+/*
+ * The following holds when update_latecomers() is called:
+ * - we are still waiting for replies from some of the replicas;
+ * - we might have already set a timer. If so, we should restart it.
+ * - we might have received responses from 50% of the replicas.
+ */
+std::optional<ReplicaReservations::tpoint_t>
+ReplicaReservations::update_latecomers(tpoint_t now_is)
+{
+  if (m_reserved_peers.size() > m_waited_for_peers.size()) {
+    // at least half of the replicas have already responded. Time we flag
+    // latecomers.
+    return now_is + m_timeout;
+  } else {
+    return std::nullopt;
+  }
+}
+
  ReplicaReservations::~ReplicaReservations()
  {
    m_had_rejections = true;  // preventing late-coming responses from triggering
                             // events
  
+  // stop any pending timeout timer
+  m_no_reply.reset();
+
    // send un-reserve messages to all reserved replicas. We do not wait for
    // answer (there wouldn't be one). Other incoming messages will be discarded
    // on the way, by our owner.
@@ -2655,6 +2686,21 @@ void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
      dout(10) << __func__ << ": osd." << from << " scrub reserve = success"
              << dendl;
      m_reserved_peers.push_back(from);
+
+    // was this response late?
+    auto now_is = clock::now();
+    if (m_timeout_point && (now_is > *m_timeout_point)) {
+      m_osds->clog->warn() << fmt::format(
+       "osd.{} scrubber pg[{}]: late reservation from osd.{}",
+       m_osds->whoami,
+       m_pg->pg_id,
+       from);
+      m_timeout_point.reset();
+    } else {
+      // possibly set a timer to warn about late-coming reservations
+      m_timeout_point = update_latecomers(now_is);
+    }
+
      if (--m_pending == 0) {
        send_all_done();
      }
@@ -2698,11 +2744,55 @@ void ReplicaReservations::handle_reserve_reject(OpRequestRef op,
    }
  }
  
+void ReplicaReservations::handle_no_reply_timeout()
+{
+  dout(1) << fmt::format(
+              "{}: timeout! no reply from {}", __func__, m_waited_for_peers)
+         << dendl;
+
+  m_had_rejections = true;  // preventing any additional notifications
+  send_reject();
+}
+
  std::ostream& ReplicaReservations::gen_prefix(std::ostream& out) const
  {
    return out << m_log_msg_prefix;
  }
  
+ReplicaReservations::no_reply_t::no_reply_t(
+  OSDService* osds,
+  const ConfigProxy& conf,
+  ReplicaReservations& parent,
+  std::string_view log_prfx)
+    : m_osds{osds}
+    , m_conf{conf}
+    , m_parent{parent}
+    , m_log_prfx{log_prfx}
+{
+  using namespace std::chrono;
+  auto now_is = clock::now();
+  auto timeout =
+    conf.get_val<std::chrono::milliseconds>("osd_scrub_reservation_timeout");
+
+  m_abort_callback = new LambdaContext([this, now_is]([[maybe_unused]] int r) {
+    // behave as if a REJECT was received
+    m_osds->clog->warn() << fmt::format(
+      "{} timeout on replica reservations (since {})", m_log_prfx, now_is);
+    m_parent.handle_no_reply_timeout();
+  });
+
+  std::lock_guard l(m_osds->sleep_lock);
+  m_osds->sleep_timer.add_event_after(timeout, m_abort_callback);
+}
+
+ReplicaReservations::no_reply_t::~no_reply_t()
+{
+  std::lock_guard l(m_osds->sleep_lock);
+  if (m_abort_callback) {
+    m_osds->sleep_timer.cancel_event(m_abort_callback);
+  }
+}
+
  // ///////////////////// LocalReservation //////////////////////////////////
  
  // note: no dout()s in LocalReservation functions. Client logs interactions.
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h

index aa1c341edf49d7d7694fe4fe74748f2eafc4bad0..77659377a4e32a84cd1172cecd5169ab9d9cccce 100644 (file)
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -95,6 +95,24 @@ struct BuildMap;
   * event. All previous requests, whether already granted or not, are explicitly
   * released.
   *
+ * Timeouts:
+ *
+ *  Slow-Secondary Warning:
+ *  Once at least half of the replicas have accepted the reservation, we start
+ *  reporting any secondary that takes too long (more than <conf> milliseconds
+ *  after the previous response received) to respond to the reservation request.
+ *  (Why? because we have encountered real-life situations where a specific OSD
+ *  was systematically very slow (e.g. 5 seconds) to respond to the reservation
+ *  requests, slowing the scrub process to a crawl).
+ *
+ *  Reservation Timeout:
+ *  We limit the total time we wait for the replicas to respond to the
+ *  reservation request. If we don't get all the responses (either Grant or
+ *  Reject) within <conf> milliseconds, we give up and release all the
+ *  reservations we have acquired so far.
+ *  (Why? because we have encountered instances where a reservation request was
+ *  lost - either due to a bug or due to a network issue.)
+ *
   * A note re performance: I've measured a few container alternatives for
   * m_reserved_peers, with its specific usage pattern. Std::set is extremely
   * slow, as expected. flat_set is only slightly better. Surprisingly -
@@ -102,6 +120,25 @@ struct BuildMap;
   * std::vector: no need to pre-reserve.
   */
  class ReplicaReservations {
+  using clock = std::chrono::system_clock;
+  using tpoint_t = std::chrono::time_point<clock>;
+
+  /// a no-reply timeout handler
+  struct no_reply_t {
+    explicit no_reply_t(
+      OSDService* osds,
+      const ConfigProxy& conf,
+      ReplicaReservations& parent,
+      std::string_view log_prfx);
+
+    ~no_reply_t();
+    OSDService* m_osds;
+    const ConfigProxy& m_conf;
+    ReplicaReservations& m_parent;
+    std::string m_log_prfx;
+    Context* m_abort_callback{nullptr};
+  };
+
    PG* m_pg;
    std::set<pg_shard_t> m_acting_set;
    OSDService* m_osds;
@@ -111,6 +148,14 @@ class ReplicaReservations {
    int m_pending{-1};
    const pg_info_t& m_pg_info;
    ScrubQueue::ScrubJobRef m_scrub_job; ///< a ref to this PG's scrub job
+  const ConfigProxy& m_conf;
+
+  // detecting slow peers (see 'slow-secondary' above)
+  std::chrono::milliseconds m_timeout;
+  std::optional<tpoint_t> m_timeout_point;
+
+  // detecting & handling a "no show" of a replica
+  std::unique_ptr<no_reply_t> m_no_reply;
  
    void release_replica(pg_shard_t peer, epoch_t epoch);
  
@@ -119,6 +164,8 @@ class ReplicaReservations {
    /// notify the scrubber that we have failed to reserve replicas' resources
    void send_reject();
  
+  std::optional<tpoint_t> update_latecomers(tpoint_t now_is);
+
   public:
    std::string m_log_msg_prefix;
  
@@ -132,8 +179,9 @@ class ReplicaReservations {
    void discard_all();
  
    ReplicaReservations(PG* pg,
-                     pg_shard_t whoami,
-                     ScrubQueue::ScrubJobRef scrubjob);
+                      pg_shard_t whoami,
+                      ScrubQueue::ScrubJobRef scrubjob,
+                      const ConfigProxy& conf); 
  
    ~ReplicaReservations();
  
@@ -141,6 +189,9 @@ class ReplicaReservations {
  
    void handle_reserve_reject(OpRequestRef op, pg_shard_t from);
  
+  // if timing out on receiving replies from our replicas:
+  void handle_no_reply_timeout();
+
    std::ostream& gen_prefix(std::ostream& out) const;
  };
author	Ronen Friedman <rfriedma@redhat.com>
	Mon, 24 Jan 2022 13:19:01 +0000 (13:19 +0000)
committer	Ronen Friedman <rfriedma@redhat.com>
	Sun, 11 Dec 2022 15:27:42 +0000 (17:27 +0200)
src/common/options/osd.yaml.in		patch \| blob \| history
src/osd/scrubber/pg_scrubber.cc		patch \| blob \| history
src/osd/scrubber/pg_scrubber.h		patch \| blob \| history