From 5b35551d9b336c7ca0398a40b5f80c87dfdba6f9 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 16 Feb 2023 20:44:58 -0800 Subject: [PATCH] osd/scrubber: use schedule_timer_event_after for reservation timeout Previously, ReplicaReservations::no_reply_t was responsible for managing the timer callback and cancellation. Move the timeout management to the ReservingReplicas state using schedule_timer_event_after. Fixes: https://tracker.ceph.com/issues/58461 Signed-off-by: Samuel Just (cherry picked from commit 525478da3698cfe8a154c642dedd0570f1d52b37) --- src/osd/scrubber/pg_scrubber.cc | 51 ++++---------------------- src/osd/scrubber/pg_scrubber.h | 21 +---------- src/osd/scrubber/scrub_machine.cc | 28 ++++++++++++++ src/osd/scrubber/scrub_machine.h | 11 +++++- src/osd/scrubber/scrub_machine_lstnr.h | 2 + 5 files changed, 50 insertions(+), 63 deletions(-) diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 868003ebdf97..2cca552107c9 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1836,6 +1836,13 @@ void PgScrubber::unreserve_replicas() m_reservations.reset(); } +void PgScrubber::on_replica_reservation_timeout() +{ + if (m_reservations) { + m_reservations->handle_no_reply_timeout(); + } +} + void PgScrubber::set_reserving_now() { m_osds->get_scrub_services().set_reserving_now(); @@ -2645,10 +2652,6 @@ ReplicaReservations::ReplicaReservations( send_all_done(); } else { - // start a timer to handle the case of no replies - m_no_reply = make_unique( - m_osds, m_conf, *this, m_log_msg_prefix); - // send the reservation requests for (auto p : m_acting_set) { if (p == whoami) @@ -2666,14 +2669,12 @@ ReplicaReservations::ReplicaReservations( void ReplicaReservations::send_all_done() { // stop any pending timeout timer - m_no_reply.reset(); m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority); } void ReplicaReservations::send_reject() { // stop any pending timeout timer - m_no_reply.reset(); m_scrub_job->resources_failure = true; m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority); } @@ -2682,7 +2683,6 @@ void ReplicaReservations::discard_all() { dout(10) << __func__ << ": " << m_reserved_peers << dendl; - m_no_reply.reset(); m_had_rejections = true; // preventing late-coming responses from triggering // events m_reserved_peers.clear(); @@ -2712,9 +2712,6 @@ ReplicaReservations::~ReplicaReservations() m_had_rejections = true; // preventing late-coming responses from triggering // events - // stop any pending timeout timer - m_no_reply.reset(); - // send un-reserve messages to all reserved replicas. We do not wait for // answer (there wouldn't be one). Other incoming messages will be discarded // on the way, by our owner. @@ -2834,6 +2831,7 @@ void ReplicaReservations::handle_no_reply_timeout() "{}: timeout! no reply from {}", __func__, m_waited_for_peers) << dendl; + // treat reply timeout as if a REJECT was received m_had_rejections = true; // preventing any additional notifications send_reject(); } @@ -2843,39 +2841,6 @@ std::ostream& ReplicaReservations::gen_prefix(std::ostream& out) const return out << m_log_msg_prefix; } -ReplicaReservations::no_reply_t::no_reply_t( - OSDService* osds, - const ConfigProxy& conf, - ReplicaReservations& parent, - std::string_view log_prfx) - : m_osds{osds} - , m_conf{conf} - , m_parent{parent} - , m_log_prfx{log_prfx} -{ - using namespace std::chrono; - auto now_is = clock::now(); - auto timeout = - conf.get_val("osd_scrub_reservation_timeout"); - - m_abort_callback = new LambdaContext([this, now_is]([[maybe_unused]] int r) { - // behave as if a REJECT was received - m_osds->clog->warn() << fmt::format( - "{} timeout on replica reservations (since {})", m_log_prfx, now_is); - m_parent.handle_no_reply_timeout(); - }); - - std::lock_guard l(m_osds->sleep_lock); - m_osds->sleep_timer.add_event_after(timeout, m_abort_callback); -} - -ReplicaReservations::no_reply_t::~no_reply_t() -{ - std::lock_guard l(m_osds->sleep_lock); - if (m_abort_callback) { - m_osds->sleep_timer.cancel_event(m_abort_callback); - } -} // ///////////////////// LocalReservation ////////////////////////////////// diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index 0818b57e0cee..ce9ac9c96f06 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -123,22 +123,6 @@ class ReplicaReservations { using clock = std::chrono::system_clock; using tpoint_t = std::chrono::time_point; - /// a no-reply timeout handler - struct no_reply_t { - explicit no_reply_t( - OSDService* osds, - const ConfigProxy& conf, - ReplicaReservations& parent, - std::string_view log_prfx); - - ~no_reply_t(); - OSDService* m_osds; - const ConfigProxy& m_conf; - ReplicaReservations& m_parent; - std::string m_log_prfx; - Context* m_abort_callback{nullptr}; - }; - PG* m_pg; std::set m_acting_set; OSDService* m_osds; @@ -154,9 +138,6 @@ class ReplicaReservations { std::chrono::milliseconds m_timeout; std::optional m_timeout_point; - // detecting & handling a "no show" of a replica - std::unique_ptr m_no_reply; - void release_replica(pg_shard_t peer, epoch_t epoch); void send_all_done(); ///< all reservations are granted @@ -398,6 +379,8 @@ class PgScrubber : public ScrubPgIF, void discard_replica_reservations() final; void clear_scrub_reservations() final; // PG::clear... fwds to here void unreserve_replicas() final; + void on_replica_reservation_timeout() final; + // managing scrub op registration diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index fff9b0fca91c..56b060779992 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -122,6 +122,15 @@ ReservingReplicas::ReservingReplicas(my_context ctx) // replicas resources scrbr->set_reserving_now(); scrbr->reserve_replicas(); + + auto timeout = scrbr->get_cct()->_conf.get_val< + std::chrono::milliseconds>("osd_scrub_reservation_timeout"); + if (timeout.count() > 0) { + // Start a timer to handle case where the replicas take a long time to + // ack the reservation. See ReservationTimeout handler below. + m_timeout_token = machine.schedule_timer_event_after( + timeout); + } } ReservingReplicas::~ReservingReplicas() @@ -130,6 +139,25 @@ ReservingReplicas::~ReservingReplicas() scrbr->clear_reserving_now(); } +sc::result ReservingReplicas::react(const ReservationTimeout&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "ReservingReplicas::react(const ReservationTimeout&)" << dendl; + + dout(10) + << "PgScrubber: " << scrbr->get_spgid() + << " timeout on reserving replicas (since " << entered_at + << ")" << dendl; + scrbr->get_clog()->warn() + << "osd." << scrbr->get_whoami() + << " PgScrubber: " << scrbr->get_spgid() + << " timeout on reserving replicsa (since " << entered_at + << ")"; + + scrbr->on_replica_reservation_timeout(); + return discard_event(); +} + sc::result ReservingReplicas::react(const ReservationFailure&) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index 01710836a598..1d999c2cabec 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -65,6 +65,9 @@ MEV(RemotesReserved) /// a reservation request has failed MEV(ReservationFailure) +/// reservations have timed out +MEV(ReservationTimeout) + /// initiate a new scrubbing session (relevant if we are a Primary) MEV(StartScrub) @@ -310,16 +313,22 @@ struct NotActive : sc::state, NamedSimply { struct ReservingReplicas : sc::state, NamedSimply { - explicit ReservingReplicas(my_context ctx); ~ReservingReplicas(); using reactions = mpl::list, // all replicas granted our resources request sc::transition, + sc::custom_reaction, sc::custom_reaction>; + ceph::coarse_real_clock::time_point entered_at = + ceph::coarse_real_clock::now(); + ScrubMachine::timer_event_token_t m_timeout_token; + sc::result react(const FullReset&); + sc::result react(const ReservationTimeout&); + /// at least one replica denied us the scrub resources we've requested sc::result react(const ReservationFailure&); }; diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h index fb31ba2e2f5e..43154f7b61e2 100644 --- a/src/osd/scrubber/scrub_machine_lstnr.h +++ b/src/osd/scrubber/scrub_machine_lstnr.h @@ -179,6 +179,8 @@ struct ScrubMachineListener { virtual void unreserve_replicas() = 0; + virtual void on_replica_reservation_timeout() = 0; + virtual void set_scrub_begin_time() = 0; virtual void set_scrub_duration() = 0; -- 2.47.3