From: Ronen Friedman Date: Fri, 13 Oct 2023 16:07:56 +0000 (-0500) Subject: osd/scrub: extract ReplicaReservations into separate files X-Git-Tag: v19.0.0~197^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3bb2e14eb8ab5426199f36b93f54c85d40ced70e;p=ceph.git osd/scrub: extract ReplicaReservations into separate files As a preliminary step before ReplicaReservations ownership is moved to the scrubber's FSM. No code changes in this commit (apart from required 'include's). Signed-off-by: Ronen Friedman --- diff --git a/src/osd/CMakeLists.txt b/src/osd/CMakeLists.txt index 7d19424b404a..aad0fdb15cd1 100644 --- a/src/osd/CMakeLists.txt +++ b/src/osd/CMakeLists.txt @@ -27,6 +27,7 @@ set(osd_srcs scrubber/PrimaryLogScrub.cc scrubber/scrub_job.cc scrubber/scrub_machine.cc + scrubber/scrub_reservations.cc scrubber/scrub_resources.cc scrubber/ScrubStore.cc scrubber/scrub_backend.cc diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 8ee964c964b4..609eb0a72e6a 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -2441,200 +2441,10 @@ void PgScrubber::preemption_data_t::reset() } -// ///////////////////// ReplicaReservations ////////////////////////////////// - -#undef dout_prefix -#define dout_prefix _prefix_fn(_dout, this, __func__) -template -static std::ostream& _prefix_fn(std::ostream* _dout, T* t, std::string fn = "") -{ - return t->gen_prefix(*_dout, fn); -} +// ///////////////////// LocalReservation ////////////////////////////////// namespace Scrub { -ReplicaReservations::ReplicaReservations(ScrubMachineListener& scrbr) - - : m_scrubber{scrbr} - , m_pg{m_scrubber.get_pg()} - , m_pgid{m_scrubber.get_spgid().pgid} - , m_osds{m_pg->get_pg_osd(ScrubberPasskey())} -{ - // the acting set is sorted by pg_shard_t. The reservations are to be issued - // in this order, so that the OSDs will receive the requests in a consistent - // order. This is done to reduce the chance of having two PGs that share some - // of their acting-set OSDs, consistently interfering with each other's - // reservation process. - auto acting = m_pg->get_actingset(); - m_sorted_secondaries.reserve(acting.size()); - std::copy_if( - acting.cbegin(), acting.cend(), std::back_inserter(m_sorted_secondaries), - [whoami = m_pg->pg_whoami](const pg_shard_t& shard) { - return shard != whoami; - }); - - m_next_to_request = m_sorted_secondaries.cbegin(); - // send out the 1'st request (unless we have no replicas) - send_next_reservation_or_complete(); -} - -void ReplicaReservations::release_all() -{ - std::span replicas{ - m_sorted_secondaries.cbegin(), m_next_to_request}; - dout(10) << fmt::format("releasing {}", replicas) << dendl; - epoch_t epoch = m_pg->get_osdmap_epoch(); - - // send 'release' messages to all replicas we have managed to reserve - for (const auto& peer : replicas) { - auto m = make_message( - spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::RELEASE, - m_pg->pg_whoami); - m_pg->send_cluster_message(peer.osd, m, epoch, false); - } - - m_sorted_secondaries.clear(); - m_next_to_request = m_sorted_secondaries.cbegin(); -} - -void ReplicaReservations::discard_remote_reservations() -{ - dout(10) << "reset w/o issuing messages" << dendl; - m_sorted_secondaries.clear(); - m_next_to_request = m_sorted_secondaries.cbegin(); -} - -ReplicaReservations::~ReplicaReservations() -{ - release_all(); -} - -/** - * @ATTN we would not reach here if the ReplicaReservation object managed by - * the scrubber was reset. - */ -void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from) -{ - // verify that the grant is from the peer we expected. If not? - // for now - abort the OSD. \todo reconsider the reaction. - if (!get_last_sent().has_value() || from != *get_last_sent()) { - dout(1) << fmt::format( - "unexpected grant from {} (expected {})", from, - get_last_sent().value_or(pg_shard_t{})) - << dendl; - ceph_assert(from == get_last_sent()); - return; - } - - auto elapsed = clock::now() - m_last_request_sent_at; - - // log a warning if the response was slow to arrive - auto warn_timeout = m_scrubber.get_pg_cct()->_conf.get_val( - "osd_scrub_slow_reservation_response"); - if (!m_slow_response_warned && (elapsed > warn_timeout)) { - dout(1) << fmt::format( - "slow reservation response from {} ({}ms)", from, - duration_cast(elapsed).count()) - << dendl; - // prevent additional warnings - m_slow_response_warned = true; - } - dout(10) << fmt::format( - "granted by {} ({} of {}) in {}ms", from, - active_requests_cnt(), m_sorted_secondaries.size(), - duration_cast(elapsed).count()) - << dendl; - send_next_reservation_or_complete(); -} - -void ReplicaReservations::send_next_reservation_or_complete() -{ - if (m_next_to_request == m_sorted_secondaries.cend()) { - // granted by all replicas - dout(10) << "remote reservation complete" << dendl; - m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority); - - } else { - // send the next reservation request - const auto peer = *m_next_to_request; - const auto epoch = m_pg->get_osdmap_epoch(); - auto m = make_message( - spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST, - m_pg->pg_whoami); - m_pg->send_cluster_message(peer.osd, m, epoch, false); - m_last_request_sent_at = clock::now(); - dout(10) << fmt::format( - "reserving {} (the {} of {} replicas)", *m_next_to_request, - active_requests_cnt()+1, m_sorted_secondaries.size()) - << dendl; - m_next_to_request++; - } -} - -// temporary comment: the part of handle_reserve_reject() that will not -// be delegated to the FSM in the following commits -void ReplicaReservations::verify_rejections_source( - OpRequestRef op, - pg_shard_t from) -{ - // a convenient log message for the reservation process conclusion - // (matches the one in send_next_reservation_or_complete()) - dout(10) << fmt::format( - "remote reservation failure. Rejected by {} ({})", from, - *op->get_req()) - << dendl; - - // verify that the denial is from the peer we expected. If not? - // we should treat it as though the *correct* peer has rejected the request, - // but remember to release that peer, too. - - ceph_assert(get_last_sent().has_value()); - const auto expected = *get_last_sent(); - if (from != expected) { - dout(1) << fmt::format( - "unexpected rejection from {} (expected {})", from, expected) - << dendl; - } else { - // correct peer, wrong answer... - m_next_to_request--; // no need to release this one - } -} - -// to be delegated to the FSM in the following commits -void ReplicaReservations::handle_reserve_reject( - OpRequestRef op, - pg_shard_t from) -{ - verify_rejections_source(op, from); - release_all(); - m_scrubber.flag_reservations_failure(); - m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority); -} - -std::optional ReplicaReservations::get_last_sent() const -{ - if (m_next_to_request == m_sorted_secondaries.cbegin()) { - return std::nullopt; - } - return *(m_next_to_request - 1); -} - -size_t ReplicaReservations::active_requests_cnt() const -{ - return m_next_to_request - m_sorted_secondaries.cbegin(); -} - -std::ostream& ReplicaReservations::gen_prefix( - std::ostream& out, - std::string fn) const -{ - return m_pg->gen_prefix(out) - << fmt::format("scrubber::ReplicaReservations:{}: ", fn); -} - - -// ///////////////////// LocalReservation ////////////////////////////////// - // note: no dout()s in LocalReservation functions. Client logs interactions. LocalReservation::LocalReservation(OSDService* osds) : m_osds{osds} { diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index 1370fc953c0c..fc7b735d219e 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -82,122 +82,12 @@ Main Scrubber interfaces: #include "osd_scrub_sched.h" #include "scrub_backend.h" #include "scrub_machine_lstnr.h" +#include "scrub_reservations.h" namespace Scrub { class ScrubMachine; struct BuildMap; -/** - * Reserving/freeing scrub resources at the replicas. - * - * When constructed - sends reservation requests to the acting_set OSDs, one - * by one. - * Once a replica's OSD replies with a 'grant'ed reservation, we send a - * reservation request to the next replica. - * A rejection triggers a "couldn't acquire the replicas' scrub resources" - * event. All granted reservations are released. - * - * Reserved replicas should be released at the end of the scrub session. The - * one exception is if the scrub terminates upon an interval change. In that - * scenario - the replicas discard their reservations on their own accord - * when noticing the change in interval, and there is no need (and no - * guaranteed way) to send them the release message. - * - * Timeouts: - * - * Slow-Secondary Warning: - * Warn if a replica takes more than milliseconds to reply to a - * reservation request. Only one warning is issued per session. - * - * Reservation Timeout: - * We limit the total time we wait for the replicas to respond to the - * reservation request. If the reservation back-and-forth does not complete - * within milliseconds, we give up and release all the reservations - * that have been acquired until that moment. - * (Why? because we have encountered instances where a reservation request was - * lost - either due to a bug or due to a network issue.) - */ -class ReplicaReservations { - using clock = ceph::coarse_real_clock; - - ScrubMachineListener& m_scrubber; - PG* m_pg; - - /// shorthand for m_scrubber.get_spgid().pgid - const pg_t m_pgid; - - /// for dout && when queueing messages to the FSM - OSDService* m_osds; - - /// the acting set (not including myself), sorted by pg_shard_t - std::vector m_sorted_secondaries; - - /// the next replica to which we will send a reservation request - std::vector::const_iterator m_next_to_request; - - /// for logs, and for detecting slow peers - clock::time_point m_last_request_sent_at; - - /// used to prevent multiple "slow response" warnings - bool m_slow_response_warned{false}; - - public: - ReplicaReservations(ScrubMachineListener& scrubber); - - ~ReplicaReservations(); - - /** - * The OK received from the replica (after verifying that it is indeed - * the replica we are expecting a reply from) is noted, and triggers - * one of two: either sending a reservation request to the next replica, - * or notifying the scrubber that we have reserved them all. - */ - void handle_reserve_grant(OpRequestRef op, pg_shard_t from); - - /** - * Verify that the sender of the received rejection is the replica we - * were expecting a reply from. - * If this is so - just mark the fact that the specific peer need not - * be released. - * - * Note - the actual handling of scrub session termination and of - * releasing the reserved replicas is done by the caller (the FSM). - */ - void verify_rejections_source(OpRequestRef op, pg_shard_t from); - - void handle_reserve_reject(OpRequestRef op, pg_shard_t from); - - /** - * Notifies implementation that it is no longer responsible for releasing - * tracked remote reservations. - * - * The intended usage is upon interval change. In general, replicas are - * responsible for releasing their own resources upon interval change without - * coordination from the primary. - * - * Sends no messages. - */ - void discard_remote_reservations(); - - // note: 'public', as accessed via the 'standard' dout_prefix() macro - std::ostream& gen_prefix(std::ostream& out, std::string fn) const; - - private: - /// send 'release' messages to all replicas we have managed to reserve - void release_all(); - - /// the only replica we are expecting a reply from - std::optional get_last_sent() const; - - /// The number of requests that have been sent (and not rejected) so far. - size_t active_requests_cnt() const; - - /** - * Either send a reservation request to the next replica, or notify the - * scrubber that we have reserved all the replicas. - */ - void send_next_reservation_or_complete(); -}; /** * wraps the local OSD scrub resource reservation in an RAII wrapper diff --git a/src/osd/scrubber/scrub_reservations.cc b/src/osd/scrubber/scrub_reservations.cc new file mode 100644 index 000000000000..bd5f61c9b9ec --- /dev/null +++ b/src/osd/scrubber/scrub_reservations.cc @@ -0,0 +1,206 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "./scrub_reservations.h" + +#include + +#include "common/ceph_time.h" +#include "messages/MOSDScrubReserve.h" +#include "osd/OSD.h" +#include "osd/PG.h" +#include "osd/osd_types_fmt.h" + +#include "pg_scrubber.h" + +using namespace Scrub; +using namespace std::chrono; +using namespace std::chrono_literals; + +#define dout_context (m_osds->cct) +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix_fn(_dout, this, __func__) +template +static std::ostream& _prefix_fn(std::ostream* _dout, T* t, std::string fn = "") +{ + return t->gen_prefix(*_dout, fn); +} + +namespace Scrub { + +ReplicaReservations::ReplicaReservations(ScrubMachineListener& scrbr) + : m_scrubber{scrbr} + , m_pg{m_scrubber.get_pg()} + , m_pgid{m_scrubber.get_spgid().pgid} + , m_osds{m_pg->get_pg_osd(ScrubberPasskey())} +{ + // the acting set is sorted by pg_shard_t. The reservations are to be issued + // in this order, so that the OSDs will receive the requests in a consistent + // order. This is done to reduce the chance of having two PGs that share some + // of their acting-set OSDs, consistently interfering with each other's + // reservation process. + auto acting = m_pg->get_actingset(); + m_sorted_secondaries.reserve(acting.size()); + std::copy_if( + acting.cbegin(), acting.cend(), std::back_inserter(m_sorted_secondaries), + [whoami = m_pg->pg_whoami](const pg_shard_t& shard) { + return shard != whoami; + }); + + m_next_to_request = m_sorted_secondaries.cbegin(); + // send out the 1'st request (unless we have no replicas) + send_next_reservation_or_complete(); +} + +void ReplicaReservations::release_all() +{ + std::span replicas{ + m_sorted_secondaries.cbegin(), m_next_to_request}; + dout(10) << fmt::format("releasing {}", replicas) << dendl; + epoch_t epoch = m_pg->get_osdmap_epoch(); + + // send 'release' messages to all replicas we have managed to reserve + for (const auto& peer : replicas) { + auto m = make_message( + spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::RELEASE, + m_pg->pg_whoami); + m_pg->send_cluster_message(peer.osd, m, epoch, false); + } + + m_sorted_secondaries.clear(); + m_next_to_request = m_sorted_secondaries.cbegin(); +} + +void ReplicaReservations::discard_remote_reservations() +{ + dout(10) << "reset w/o issuing messages" << dendl; + m_sorted_secondaries.clear(); + m_next_to_request = m_sorted_secondaries.cbegin(); +} + +ReplicaReservations::~ReplicaReservations() +{ + release_all(); +} + +void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from) +{ + // verify that the grant is from the peer we expected. If not? + // for now - abort the OSD. \todo reconsider the reaction. + if (!get_last_sent().has_value() || from != *get_last_sent()) { + dout(1) << fmt::format( + "unexpected grant from {} (expected {})", from, + get_last_sent().value_or(pg_shard_t{})) + << dendl; + ceph_assert(from == get_last_sent()); + return; + } + + auto elapsed = clock::now() - m_last_request_sent_at; + + // log a warning if the response was slow to arrive + auto warn_timeout = m_scrubber.get_pg_cct()->_conf.get_val( + "osd_scrub_slow_reservation_response"); + if (!m_slow_response_warned && (elapsed > warn_timeout)) { + dout(1) << fmt::format( + "slow reservation response from {} ({}ms)", from, + duration_cast(elapsed).count()) + << dendl; + // prevent additional warnings + m_slow_response_warned = true; + } + dout(10) << fmt::format( + "granted by {} ({} of {}) in {}ms", from, + active_requests_cnt(), m_sorted_secondaries.size(), + duration_cast(elapsed).count()) + << dendl; + send_next_reservation_or_complete(); +} + +void ReplicaReservations::send_next_reservation_or_complete() +{ + if (m_next_to_request == m_sorted_secondaries.cend()) { + // granted by all replicas + dout(10) << "remote reservation complete" << dendl; + m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority); + + } else { + // send the next reservation request + const auto peer = *m_next_to_request; + const auto epoch = m_pg->get_osdmap_epoch(); + auto m = make_message( + spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST, + m_pg->pg_whoami); + m_pg->send_cluster_message(peer.osd, m, epoch, false); + m_last_request_sent_at = clock::now(); + dout(10) << fmt::format( + "reserving {} (the {} of {} replicas)", *m_next_to_request, + active_requests_cnt()+1, m_sorted_secondaries.size()) + << dendl; + m_next_to_request++; + } +} + +// temporary comment: the part of handle_reserve_reject() that will not +// be delegated to the FSM in the following commits +void ReplicaReservations::verify_rejections_source( + OpRequestRef op, + pg_shard_t from) +{ + // a convenient log message for the reservation process conclusion + // (matches the one in send_next_reservation_or_complete()) + dout(10) << fmt::format( + "remote reservation failure. Rejected by {} ({})", from, + *op->get_req()) + << dendl; + + // verify that the denial is from the peer we expected. If not? + // we should treat it as though the *correct* peer has rejected the request, + // but remember to release that peer, too. + + ceph_assert(get_last_sent().has_value()); + const auto expected = *get_last_sent(); + if (from != expected) { + dout(1) << fmt::format( + "unexpected rejection from {} (expected {})", from, expected) + << dendl; + } else { + // correct peer, wrong answer... + m_next_to_request--; // no need to release this one + } +} + +// to be delegated to the FSM in the following commits +void ReplicaReservations::handle_reserve_reject( + OpRequestRef op, + pg_shard_t from) +{ + verify_rejections_source(op, from); + release_all(); + m_scrubber.flag_reservations_failure(); + m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority); +} + +std::optional ReplicaReservations::get_last_sent() const +{ + if (m_next_to_request == m_sorted_secondaries.cbegin()) { + return std::nullopt; + } + return *(m_next_to_request - 1); +} + +size_t ReplicaReservations::active_requests_cnt() const +{ + return m_next_to_request - m_sorted_secondaries.cbegin(); +} + +std::ostream& ReplicaReservations::gen_prefix( + std::ostream& out, + std::string fn) const +{ + return m_pg->gen_prefix(out) + << fmt::format("scrubber::ReplicaReservations:{}: ", fn); +} + +} // namespace Scrub diff --git a/src/osd/scrubber/scrub_reservations.h b/src/osd/scrubber/scrub_reservations.h new file mode 100644 index 000000000000..a52f385a4fdf --- /dev/null +++ b/src/osd/scrubber/scrub_reservations.h @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include +#include +#include +#include +#include + +#include "osd/scrubber_common.h" + +#include "osd_scrub_sched.h" +#include "scrub_machine_lstnr.h" + +namespace Scrub { + +/** + * Reserving/freeing scrub resources at the replicas. + * + * When constructed - sends reservation requests to the acting_set OSDs, one + * by one. + * Once a replica's OSD replies with a 'grant'ed reservation, we send a + * reservation request to the next replica. + * A rejection triggers a "couldn't acquire the replicas' scrub resources" + * event. All granted reservations are released. + * + * Reserved replicas should be released at the end of the scrub session. The + * one exception is if the scrub terminates upon an interval change. In that + * scenario - the replicas discard their reservations on their own accord + * when noticing the change in interval, and there is no need (and no + * guaranteed way) to send them the release message. + * + * Timeouts: + * + * Slow-Secondary Warning: + * Warn if a replica takes more than milliseconds to reply to a + * reservation request. Only one warning is issued per session. + * + * Reservation Timeout: + * We limit the total time we wait for the replicas to respond to the + * reservation request. If the reservation back-and-forth does not complete + * within milliseconds, we give up and release all the reservations + * that have been acquired until that moment. + * (Why? because we have encountered instances where a reservation request was + * lost - either due to a bug or due to a network issue.) + */ +class ReplicaReservations { + using clock = ceph::coarse_real_clock; + + ScrubMachineListener& m_scrubber; + PG* m_pg; + + /// shorthand for m_scrubber.get_spgid().pgid + const pg_t m_pgid; + + /// for dout && when queueing messages to the FSM + OSDService* m_osds; + + /// the acting set (not including myself), sorted by pg_shard_t + std::vector m_sorted_secondaries; + + /// the next replica to which we will send a reservation request + std::vector::const_iterator m_next_to_request; + + /// for logs, and for detecting slow peers + clock::time_point m_last_request_sent_at; + + /// used to prevent multiple "slow response" warnings + bool m_slow_response_warned{false}; + + public: + ReplicaReservations(ScrubMachineListener& scrubber); + + ~ReplicaReservations(); + + /** + * The OK received from the replica (after verifying that it is indeed + * the replica we are expecting a reply from) is noted, and triggers + * one of two: either sending a reservation request to the next replica, + * or notifying the scrubber that we have reserved them all. + */ + void handle_reserve_grant(OpRequestRef op, pg_shard_t from); + + /** + * Verify that the sender of the received rejection is the replica we + * were expecting a reply from. + * If this is so - just mark the fact that the specific peer need not + * be released. + * + * Note - the actual handling of scrub session termination and of + * releasing the reserved replicas is done by the caller (the FSM). + */ + void verify_rejections_source(OpRequestRef op, pg_shard_t from); + + void handle_reserve_reject(OpRequestRef op, pg_shard_t from); + + /** + * Notifies implementation that it is no longer responsible for releasing + * tracked remote reservations. + * + * The intended usage is upon interval change. In general, replicas are + * responsible for releasing their own resources upon interval change without + * coordination from the primary. + * + * Sends no messages. + */ + void discard_remote_reservations(); + + // note: 'public', as accessed via the 'standard' dout_prefix() macro + std::ostream& gen_prefix(std::ostream& out, std::string fn) const; + + private: + /// send 'release' messages to all replicas we have managed to reserve + void release_all(); + + /// the only replica we are expecting a reply from + std::optional get_last_sent() const; + + /// The number of requests that have been sent (and not rejected) so far. + size_t active_requests_cnt() const; + + /** + * Either send a reservation request to the next replica, or notify the + * scrubber that we have reserved all the replicas. + */ + void send_next_reservation_or_complete(); +}; + +} // namespace Scrub