From 411ea10084033ad69b44f0a04c9e18ab19d43639 Mon Sep 17 00:00:00 2001
From: Ronen Friedman
Date: Mon, 24 Jan 2022 13:19:01 +0000
Subject: [PATCH] osd/scrub: report replicas slow to respond to scrub requests

Implemented timeouts:

1: Slow-Secondary Warning:
Once at least half of the replicas have accepted the reservation, we start
reporting any secondary that takes too long (more than
osd_scrub_slow_reservation_response milliseconds after the previous response
was received) to respond to the reservation request.
(Why? because we have encountered real-life situations where a specific OSD
was systematically very slow to respond (e.g. 5 seconds in one case) to the
reservation requests, slowing the scrub process to a crawl.)

2: Reservation Process Timeout:
We now limit the total time the primary waits for the replicas to respond to
the reservation request. If we do not get all the responses (either Grant or
Reject) within osd_scrub_reservation_timeout milliseconds, we give up and
release all the reservations we have acquired so far.
(Why? because we have encountered instances where a reservation request was
lost - either due to a bug or due to a network issue.)

Signed-off-by: Ronen Friedman
---
 src/common/options/osd.yaml.in  |  26 +++++++
 src/osd/scrubber/pg_scrubber.cc | 122 +++++++++++++++++++++++++++-----
 src/osd/scrubber/pg_scrubber.h  |  55 +++++++++++++-
 3 files changed, 185 insertions(+), 18 deletions(-)

diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in
index 273eea909bcf3..ff2c4ac74c866 100644
--- a/src/common/options/osd.yaml.in
+++ b/src/common/options/osd.yaml.in
@@ -484,6 +484,32 @@ options:
     stats (inc. scrub/block duration) every this many seconds.
   default: 120
   with_legacy: false
+# when replicas are slow to respond to scrub resource reservations
+# Note: disable by using a very large value
+- name: osd_scrub_slow_reservation_response
+  type: millisecs
+  level: advanced
+  desc: Duration before issuing a cluster-log warning
+  long_desc: Waiting too long for a replica to respond (after at least half of the
+    replicas have responded).
+  default: 2200
+  min: 500
+  see_also:
+  - osd_scrub_reservation_timeout
+  with_legacy: false
+# when a replica does not respond to scrub resource request
+# Note: disable by using a very large value
+- name: osd_scrub_reservation_timeout
+  type: millisecs
+  level: advanced
+  desc: Duration before aborting the scrub session
+  long_desc: Waiting too long for some replicas to respond to
+    scrub reservation requests.
+  default: 5000
+  min: 2000
+  see_also:
+  - osd_scrub_slow_reservation_response
+  with_legacy: false
 # where rados plugins are stored
 - name: osd_class_dir
   type: str
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 77f1e16c9fde3..79dfc60c007fe 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -2255,7 +2255,8 @@ void PgScrubber::set_scrub_duration()
 void PgScrubber::reserve_replicas()
 {
   dout(10) << __func__ << dendl;
-  m_reservations.emplace(m_pg, m_pg_whoami, m_scrub_job);
+  m_reservations.emplace(
+    m_pg, m_pg_whoami, m_scrub_job, m_pg->get_cct()->_conf);
 }

 void PgScrubber::cleanup_on_finish()
@@ -2534,39 +2535,43 @@ void ReplicaReservations::release_replica(pg_shard_t peer, epoch_t epoch)
   m_osds->send_message_osd_cluster(peer.osd, m, epoch);
 }

-ReplicaReservations::ReplicaReservations(PG* pg,
-                                         pg_shard_t whoami,
-                                         ScrubQueue::ScrubJobRef scrubjob)
+ReplicaReservations::ReplicaReservations(
+  PG* pg,
+  pg_shard_t whoami,
+  ScrubQueue::ScrubJobRef scrubjob,
+  const ConfigProxy& conf)
     : m_pg{pg}
     , m_acting_set{pg->get_actingset()}
     , m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
     , m_pending{static_cast<int>(m_acting_set.size()) - 1}
     , m_pg_info{m_pg->get_pg_info(ScrubberPasskey())}
     , m_scrub_job{scrubjob}
+    , m_conf{conf}
 {
   epoch_t epoch = m_pg->get_osdmap_epoch();
+  m_timeout = conf.get_val<std::chrono::milliseconds>(
+    "osd_scrub_slow_reservation_response");
+  m_log_msg_prefix = fmt::format(
+    "osd.{} ep: {} scrubber::ReplicaReservations pg[{}]: ", m_osds->whoami,
+    epoch, pg->pg_id);

-  {
-    std::stringstream prefix;
-    prefix << "osd." << m_osds->whoami << " ep: " << epoch
-           << " scrubber::ReplicaReservations pg[" << pg->pg_id << "]: ";
-    m_log_msg_prefix = prefix.str();
-  }
-
-  // handle the special case of no replicas
   if (m_pending <= 0) {
+    // A special case of no replicas.
     // just signal the scrub state-machine to continue
     send_all_done();

   } else {
+    // start a timer to handle the case of no replies
+    m_no_reply = make_unique<no_reply_t>(
+      m_osds, m_conf, *this, m_log_msg_prefix);
+
     // send the reservation requests
     for (auto p : m_acting_set) {
       if (p == whoami)
         continue;
-      auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, p.shard),
-                                    epoch,
-                                    MOSDScrubReserve::REQUEST,
-                                    m_pg->pg_whoami);
+      auto m = new MOSDScrubReserve(
+        spg_t(m_pg_info.pgid.pgid, p.shard), epoch, MOSDScrubReserve::REQUEST,
+        m_pg->pg_whoami);
       m_osds->send_message_osd_cluster(p.osd, m, epoch);
       m_waited_for_peers.push_back(p);
       dout(10) << __func__ << ": reserve " << p.osd << dendl;
@@ -2576,11 +2581,15 @@ ReplicaReservations::ReplicaReservations(PG* pg,

 void ReplicaReservations::send_all_done()
 {
+  // stop any pending timeout timer
+  m_no_reply.reset();
   m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
 }

 void ReplicaReservations::send_reject()
 {
+  // stop any pending timeout timer
+  m_no_reply.reset();
   m_scrub_job->resources_failure = true;
   m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
 }
@@ -2589,17 +2598,39 @@ void ReplicaReservations::discard_all()
 {
   dout(10) << __func__ << ": " << m_reserved_peers << dendl;

+  m_no_reply.reset();
   m_had_rejections = true;  // preventing late-coming responses from triggering
                             // events

   m_reserved_peers.clear();
   m_waited_for_peers.clear();
 }

+/*
+ * The following holds when update_latecomers() is called:
+ * - we are still waiting for replies from some of the replicas;
+ * - we might have already set a timer. If so, we should restart it;
+ * - we might have received responses from 50% of the replicas.
+ */
+std::optional<ReplicaReservations::tpoint_t>
+ReplicaReservations::update_latecomers(tpoint_t now_is)
+{
+  if (m_reserved_peers.size() > m_waited_for_peers.size()) {
+    // at least half of the replicas have already responded. Time to flag
+    // latecomers.
+    return now_is + m_timeout;
+  } else {
+    return std::nullopt;
+  }
+}
+
 ReplicaReservations::~ReplicaReservations()
 {
   m_had_rejections = true;  // preventing late-coming responses from triggering
                             // events

+  // stop any pending timeout timer
+  m_no_reply.reset();
+
   // send un-reserve messages to all reserved replicas. We do not wait for
   // answer (there wouldn't be one). Other incoming messages will be discarded
   // on the way, by our owner.
@@ -2655,6 +2686,21 @@ void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
   dout(10) << __func__ << ": osd." << from << " scrub reserve = success"
            << dendl;
   m_reserved_peers.push_back(from);
+
+  // was this response late?
+  auto now_is = clock::now();
+  if (m_timeout_point && (now_is > *m_timeout_point)) {
+    m_osds->clog->warn() << fmt::format(
+      "osd.{} scrubber pg[{}]: late reservation from osd.{}",
+      m_osds->whoami,
+      m_pg->pg_id,
+      from);
+    m_timeout_point.reset();
+  } else {
+    // possibly set a timer to warn about late-coming reservations
+    m_timeout_point = update_latecomers(now_is);
+  }
+
   if (--m_pending == 0) {
     send_all_done();
   }
@@ -2698,11 +2744,55 @@ void ReplicaReservations::handle_reserve_reject(OpRequestRef op,
   }
 }

+void ReplicaReservations::handle_no_reply_timeout()
+{
+  dout(1) << fmt::format(
+               "{}: timeout! no reply from {}", __func__, m_waited_for_peers)
+          << dendl;
+
+  m_had_rejections = true;  // preventing any additional notifications
+  send_reject();
+}
+
 std::ostream& ReplicaReservations::gen_prefix(std::ostream& out) const
 {
   return out << m_log_msg_prefix;
 }

+ReplicaReservations::no_reply_t::no_reply_t(
+  OSDService* osds,
+  const ConfigProxy& conf,
+  ReplicaReservations& parent,
+  std::string_view log_prfx)
+    : m_osds{osds}
+    , m_conf{conf}
+    , m_parent{parent}
+    , m_log_prfx{log_prfx}
+{
+  using namespace std::chrono;
+  auto now_is = clock::now();
+  auto timeout =
+    conf.get_val<milliseconds>("osd_scrub_reservation_timeout");
+
+  m_abort_callback = new LambdaContext([this, now_is]([[maybe_unused]] int r) {
+    // behave as if a REJECT was received
+    m_osds->clog->warn() << fmt::format(
+      "{} timeout on replica reservations (since {})", m_log_prfx, now_is);
+    m_parent.handle_no_reply_timeout();
+  });
+
+  std::lock_guard l(m_osds->sleep_lock);
+  m_osds->sleep_timer.add_event_after(timeout, m_abort_callback);
+}
+
+ReplicaReservations::no_reply_t::~no_reply_t()
+{
+  std::lock_guard l(m_osds->sleep_lock);
+  if (m_abort_callback) {
+    m_osds->sleep_timer.cancel_event(m_abort_callback);
+  }
+}
+
 // ///////////////////// LocalReservation //////////////////////////////////

 // note: no dout()s in LocalReservation functions. Client logs interactions.
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
index aa1c341edf49d..77659377a4e32 100644
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -95,6 +95,24 @@ struct BuildMap;
  * event. All previous requests, whether already granted or not, are explicitly
  * released.
  *
+ * Timeouts:
+ *
+ * Slow-Secondary Warning:
+ * Once at least half of the replicas have accepted the reservation, we start
+ * reporting any secondary that takes too long (more than
+ * osd_scrub_slow_reservation_response milliseconds after the previous
+ * response was received) to respond to the reservation request.
+ * (Why? because we have encountered real-life situations where a specific OSD
+ * was systematically very slow (e.g. 5 seconds) to respond to the reservation
+ * requests, slowing the scrub process to a crawl.)
+ *
+ * Reservation Timeout:
+ * We limit the total time we wait for the replicas to respond to the
+ * reservation request. If we don't get all the responses (either Grant or
+ * Reject) within osd_scrub_reservation_timeout milliseconds, we give up and
+ * release all the reservations we have acquired so far.
+ * (Why? because we have encountered instances where a reservation request was
+ * lost - either due to a bug or due to a network issue.)
+ *
  * A note re performance: I've measured a few container alternatives for
  * m_reserved_peers, with its specific usage pattern. Std::set is extremely
  * slow, as expected. flat_set is only slightly better. Surprisingly -
@@ -102,6 +120,25 @@ struct BuildMap;
  * std::vector: no need to pre-reserve.
  */
 class ReplicaReservations {
+  using clock = std::chrono::system_clock;
+  using tpoint_t = std::chrono::time_point<clock>;
+
+  /// a no-reply timeout handler
+  struct no_reply_t {
+    explicit no_reply_t(
+      OSDService* osds,
+      const ConfigProxy& conf,
+      ReplicaReservations& parent,
+      std::string_view log_prfx);
+
+    ~no_reply_t();
+    OSDService* m_osds;
+    const ConfigProxy& m_conf;
+    ReplicaReservations& m_parent;
+    std::string m_log_prfx;
+    Context* m_abort_callback{nullptr};
+  };
+
   PG* m_pg;
   std::set<pg_shard_t> m_acting_set;
   OSDService* m_osds;
@@ -111,6 +148,14 @@ class ReplicaReservations {
   int m_pending{-1};
   const pg_info_t& m_pg_info;
   ScrubQueue::ScrubJobRef m_scrub_job; ///< a ref to this PG's scrub job
+  const ConfigProxy& m_conf;
+
+  // detecting slow peers (see 'slow-secondary' above)
+  std::chrono::milliseconds m_timeout;
+  std::optional<tpoint_t> m_timeout_point;
+
+  // detecting & handling a "no show" of a replica
+  std::unique_ptr<no_reply_t> m_no_reply;

   void release_replica(pg_shard_t peer, epoch_t epoch);

@@ -119,6 +164,8 @@ class ReplicaReservations {
   /// notify the scrubber that we have failed to reserve replicas' resources
   void send_reject();

+  std::optional<tpoint_t> update_latecomers(tpoint_t now_is);
+
 public:
   std::string m_log_msg_prefix;

@@ -132,8 +179,9 @@ class ReplicaReservations {
   void discard_all();

   ReplicaReservations(PG* pg,
-                      pg_shard_t whoami,
-                      ScrubQueue::ScrubJobRef scrubjob);
+                      pg_shard_t whoami,
+                      ScrubQueue::ScrubJobRef scrubjob,
+                      const ConfigProxy& conf);

   ~ReplicaReservations();

@@ -141,6 +189,9 @@ class ReplicaReservations {

   void handle_reserve_reject(OpRequestRef op, pg_shard_t from);

+  // if timing out on receiving replies from our replicas:
+  void handle_no_reply_timeout();
+
   std::ostream& gen_prefix(std::ostream& out) const;
 };
-- 
2.39.5
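
For readers who want to experiment with the latecomer-detection policy outside
of Ceph, here is a minimal standalone C++ sketch of the logic implemented by
update_latecomers() and handle_reserve_grant(). LatecomerTracker, its peer
counts, and the main() scenario are invented for illustration and are not Ceph
code; the real implementation tracks m_reserved_peers/m_waited_for_peers and
emits a cluster-log warning instead of returning a flag.

// Standalone illustration (hypothetical, not Ceph code) of latecomer
// detection: the warning deadline is armed only once more peers have
// granted than are still pending, and a grant arriving past the armed
// deadline is flagged as late.
#include <chrono>
#include <iostream>
#include <optional>

using Clock = std::chrono::system_clock;
using tpoint_t = std::chrono::time_point<Clock>;

struct LatecomerTracker {
  int granted = 0;       // replicas that have already granted
  int still_waited = 3;  // replicas we are still waiting for
  std::chrono::milliseconds grace{2200};  // cf. osd_scrub_slow_reservation_response
  std::optional<tpoint_t> deadline;

  // called for every Grant; returns true if the response counts as "late"
  bool on_grant(tpoint_t now) {
    ++granted;
    --still_waited;
    if (deadline && now > *deadline) {
      deadline.reset();
      return true;  // the real code emits a cluster-log warning here
    }
    // (re)arm the deadline only once at least half have responded
    if (granted > still_waited) {
      deadline = now + grace;
    } else {
      deadline.reset();
    }
    return false;
  }
};

int main() {
  LatecomerTracker t;
  auto t0 = Clock::now();
  t.on_grant(t0);  // 1 granted vs 2 pending: no deadline armed yet
  t.on_grant(t0 + std::chrono::milliseconds(100));  // majority: deadline armed
  // a reply arriving 5 seconds later exceeds the 2200ms grace period:
  bool late = t.on_grant(t0 + std::chrono::seconds(5));
  std::cout << "third grant late? " << std::boolalpha << late << '\n';  // true
}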
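
The reservation-process timeout relies on an RAII pattern: no_reply_t arms a
timer event on construction and cancels it on destruction, so resetting the
unique_ptr (as send_all_done(), send_reject() and discard_all() do) disarms
the watchdog. Below is a self-contained sketch of that ownership pattern
using a plain thread and condition variable; NoReplyWatchdog and the 50/100 ms
timeouts are invented for illustration, while the real code uses OSDService's
sleep_timer with a LambdaContext callback.

// Standalone sketch (hypothetical, not Ceph code): a watchdog armed on
// construction fires once after the timeout unless the object is destroyed
// first (destruction == all replies arrived in time).
#include <chrono>
#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <thread>

class NoReplyWatchdog {
  std::mutex m;
  std::condition_variable cv;
  bool cancelled = false;
  std::thread worker;

 public:
  NoReplyWatchdog(std::chrono::milliseconds timeout,
                  std::function<void()> on_timeout)
      : worker([this, timeout, cb = std::move(on_timeout)] {
          std::unique_lock l(m);
          // fire only if not cancelled within 'timeout'
          if (!cv.wait_for(l, timeout, [this] { return cancelled; }))
            cb();
        }) {}

  ~NoReplyWatchdog() {  // cancel the pending timeout, as no_reply_t's dtor does
    {
      std::lock_guard l(m);
      cancelled = true;
    }
    cv.notify_one();
    worker.join();
  }
};

int main() {
  using namespace std::chrono_literals;
  {
    NoReplyWatchdog wd{50ms, [] {
      std::cout << "timeout: acting as if REJECT was received\n";
    }};
    std::this_thread::sleep_for(100ms);  // no replies arrive -> watchdog fires
  }
  {
    NoReplyWatchdog wd{100ms, [] { std::cout << "should not fire\n"; }};
    // all replies arrived quickly; destroying the watchdog cancels the timer
  }
}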