scrubber/PrimaryLogScrub.cc
scrubber/scrub_job.cc
scrubber/scrub_machine.cc
+ scrubber/scrub_reservations.cc
scrubber/scrub_resources.cc
scrubber/ScrubStore.cc
scrubber/scrub_backend.cc
}
-// ///////////////////// ReplicaReservations //////////////////////////////////
-
-#undef dout_prefix
-#define dout_prefix _prefix_fn(_dout, this, __func__)
-template <class T>
-static std::ostream& _prefix_fn(std::ostream* _dout, T* t, std::string fn = "")
-{
- return t->gen_prefix(*_dout, fn);
-}
+// ///////////////////// LocalReservation //////////////////////////////////
namespace Scrub {
-ReplicaReservations::ReplicaReservations(ScrubMachineListener& scrbr)
-
- : m_scrubber{scrbr}
- , m_pg{m_scrubber.get_pg()}
- , m_pgid{m_scrubber.get_spgid().pgid}
- , m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
-{
- // the acting set is sorted by pg_shard_t. The reservations are to be issued
- // in this order, so that the OSDs will receive the requests in a consistent
- // order. This is done to reduce the chance of having two PGs that share some
- // of their acting-set OSDs, consistently interfering with each other's
- // reservation process.
- auto acting = m_pg->get_actingset();
- m_sorted_secondaries.reserve(acting.size());
- std::copy_if(
- acting.cbegin(), acting.cend(), std::back_inserter(m_sorted_secondaries),
- [whoami = m_pg->pg_whoami](const pg_shard_t& shard) {
- return shard != whoami;
- });
-
- m_next_to_request = m_sorted_secondaries.cbegin();
- // send out the 1'st request (unless we have no replicas)
- send_next_reservation_or_complete();
-}
-
-void ReplicaReservations::release_all()
-{
- std::span<const pg_shard_t> replicas{
- m_sorted_secondaries.cbegin(), m_next_to_request};
- dout(10) << fmt::format("releasing {}", replicas) << dendl;
- epoch_t epoch = m_pg->get_osdmap_epoch();
-
- // send 'release' messages to all replicas we have managed to reserve
- for (const auto& peer : replicas) {
- auto m = make_message<MOSDScrubReserve>(
- spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::RELEASE,
- m_pg->pg_whoami);
- m_pg->send_cluster_message(peer.osd, m, epoch, false);
- }
-
- m_sorted_secondaries.clear();
- m_next_to_request = m_sorted_secondaries.cbegin();
-}
-
-void ReplicaReservations::discard_remote_reservations()
-{
- dout(10) << "reset w/o issuing messages" << dendl;
- m_sorted_secondaries.clear();
- m_next_to_request = m_sorted_secondaries.cbegin();
-}
-
-ReplicaReservations::~ReplicaReservations()
-{
- release_all();
-}
-
-/**
- * @ATTN we would not reach here if the ReplicaReservation object managed by
- * the scrubber was reset.
- */
-void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
-{
- // verify that the grant is from the peer we expected. If not?
- // for now - abort the OSD. \todo reconsider the reaction.
- if (!get_last_sent().has_value() || from != *get_last_sent()) {
- dout(1) << fmt::format(
- "unexpected grant from {} (expected {})", from,
- get_last_sent().value_or(pg_shard_t{}))
- << dendl;
- ceph_assert(from == get_last_sent());
- return;
- }
-
- auto elapsed = clock::now() - m_last_request_sent_at;
-
- // log a warning if the response was slow to arrive
- auto warn_timeout = m_scrubber.get_pg_cct()->_conf.get_val<milliseconds>(
- "osd_scrub_slow_reservation_response");
- if (!m_slow_response_warned && (elapsed > warn_timeout)) {
- dout(1) << fmt::format(
- "slow reservation response from {} ({}ms)", from,
- duration_cast<milliseconds>(elapsed).count())
- << dendl;
- // prevent additional warnings
- m_slow_response_warned = true;
- }
- dout(10) << fmt::format(
- "granted by {} ({} of {}) in {}ms", from,
- active_requests_cnt(), m_sorted_secondaries.size(),
- duration_cast<milliseconds>(elapsed).count())
- << dendl;
- send_next_reservation_or_complete();
-}
-
-void ReplicaReservations::send_next_reservation_or_complete()
-{
- if (m_next_to_request == m_sorted_secondaries.cend()) {
- // granted by all replicas
- dout(10) << "remote reservation complete" << dendl;
- m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
-
- } else {
- // send the next reservation request
- const auto peer = *m_next_to_request;
- const auto epoch = m_pg->get_osdmap_epoch();
- auto m = make_message<MOSDScrubReserve>(
- spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST,
- m_pg->pg_whoami);
- m_pg->send_cluster_message(peer.osd, m, epoch, false);
- m_last_request_sent_at = clock::now();
- dout(10) << fmt::format(
- "reserving {} (the {} of {} replicas)", *m_next_to_request,
- active_requests_cnt()+1, m_sorted_secondaries.size())
- << dendl;
- m_next_to_request++;
- }
-}
-
-// temporary comment: the part of handle_reserve_reject() that will not
-// be delegated to the FSM in the following commits
-void ReplicaReservations::verify_rejections_source(
- OpRequestRef op,
- pg_shard_t from)
-{
- // a convenient log message for the reservation process conclusion
- // (matches the one in send_next_reservation_or_complete())
- dout(10) << fmt::format(
- "remote reservation failure. Rejected by {} ({})", from,
- *op->get_req())
- << dendl;
-
- // verify that the denial is from the peer we expected. If not?
- // we should treat it as though the *correct* peer has rejected the request,
- // but remember to release that peer, too.
-
- ceph_assert(get_last_sent().has_value());
- const auto expected = *get_last_sent();
- if (from != expected) {
- dout(1) << fmt::format(
- "unexpected rejection from {} (expected {})", from, expected)
- << dendl;
- } else {
- // correct peer, wrong answer...
- m_next_to_request--; // no need to release this one
- }
-}
-
-// to be delegated to the FSM in the following commits
-void ReplicaReservations::handle_reserve_reject(
- OpRequestRef op,
- pg_shard_t from)
-{
- verify_rejections_source(op, from);
- release_all();
- m_scrubber.flag_reservations_failure();
- m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
-}
-
-std::optional<pg_shard_t> ReplicaReservations::get_last_sent() const
-{
- if (m_next_to_request == m_sorted_secondaries.cbegin()) {
- return std::nullopt;
- }
- return *(m_next_to_request - 1);
-}
-
-size_t ReplicaReservations::active_requests_cnt() const
-{
- return m_next_to_request - m_sorted_secondaries.cbegin();
-}
-
-std::ostream& ReplicaReservations::gen_prefix(
- std::ostream& out,
- std::string fn) const
-{
- return m_pg->gen_prefix(out)
- << fmt::format("scrubber::ReplicaReservations:{}: ", fn);
-}
-
-
-// ///////////////////// LocalReservation //////////////////////////////////
-
// note: no dout()s in LocalReservation functions. Client logs interactions.
LocalReservation::LocalReservation(OSDService* osds) : m_osds{osds}
{
#include "osd_scrub_sched.h"
#include "scrub_backend.h"
#include "scrub_machine_lstnr.h"
+#include "scrub_reservations.h"
namespace Scrub {
class ScrubMachine;
struct BuildMap;
-/**
- * Reserving/freeing scrub resources at the replicas.
- *
- * When constructed - sends reservation requests to the acting_set OSDs, one
- * by one.
- * Once a replica's OSD replies with a 'grant'ed reservation, we send a
- * reservation request to the next replica.
- * A rejection triggers a "couldn't acquire the replicas' scrub resources"
- * event. All granted reservations are released.
- *
- * Reserved replicas should be released at the end of the scrub session. The
- * one exception is if the scrub terminates upon an interval change. In that
- * scenario - the replicas discard their reservations on their own accord
- * when noticing the change in interval, and there is no need (and no
- * guaranteed way) to send them the release message.
- *
- * Timeouts:
- *
- * Slow-Secondary Warning:
- * Warn if a replica takes more than <conf> milliseconds to reply to a
- * reservation request. Only one warning is issued per session.
- *
- * Reservation Timeout:
- * We limit the total time we wait for the replicas to respond to the
- * reservation request. If the reservation back-and-forth does not complete
- * within <conf> milliseconds, we give up and release all the reservations
- * that have been acquired until that moment.
- * (Why? because we have encountered instances where a reservation request was
- * lost - either due to a bug or due to a network issue.)
- */
-class ReplicaReservations {
- using clock = ceph::coarse_real_clock;
-
- ScrubMachineListener& m_scrubber;
- PG* m_pg;
-
- /// shorthand for m_scrubber.get_spgid().pgid
- const pg_t m_pgid;
-
- /// for dout && when queueing messages to the FSM
- OSDService* m_osds;
-
- /// the acting set (not including myself), sorted by pg_shard_t
- std::vector<pg_shard_t> m_sorted_secondaries;
-
- /// the next replica to which we will send a reservation request
- std::vector<pg_shard_t>::const_iterator m_next_to_request;
-
- /// for logs, and for detecting slow peers
- clock::time_point m_last_request_sent_at;
-
- /// used to prevent multiple "slow response" warnings
- bool m_slow_response_warned{false};
-
- public:
- ReplicaReservations(ScrubMachineListener& scrubber);
-
- ~ReplicaReservations();
-
- /**
- * The OK received from the replica (after verifying that it is indeed
- * the replica we are expecting a reply from) is noted, and triggers
- * one of two: either sending a reservation request to the next replica,
- * or notifying the scrubber that we have reserved them all.
- */
- void handle_reserve_grant(OpRequestRef op, pg_shard_t from);
-
- /**
- * Verify that the sender of the received rejection is the replica we
- * were expecting a reply from.
- * If this is so - just mark the fact that the specific peer need not
- * be released.
- *
- * Note - the actual handling of scrub session termination and of
- * releasing the reserved replicas is done by the caller (the FSM).
- */
- void verify_rejections_source(OpRequestRef op, pg_shard_t from);
-
- void handle_reserve_reject(OpRequestRef op, pg_shard_t from);
-
- /**
- * Notifies implementation that it is no longer responsible for releasing
- * tracked remote reservations.
- *
- * The intended usage is upon interval change. In general, replicas are
- * responsible for releasing their own resources upon interval change without
- * coordination from the primary.
- *
- * Sends no messages.
- */
- void discard_remote_reservations();
-
- // note: 'public', as accessed via the 'standard' dout_prefix() macro
- std::ostream& gen_prefix(std::ostream& out, std::string fn) const;
-
- private:
- /// send 'release' messages to all replicas we have managed to reserve
- void release_all();
-
- /// the only replica we are expecting a reply from
- std::optional<pg_shard_t> get_last_sent() const;
-
- /// The number of requests that have been sent (and not rejected) so far.
- size_t active_requests_cnt() const;
-
- /**
- * Either send a reservation request to the next replica, or notify the
- * scrubber that we have reserved all the replicas.
- */
- void send_next_reservation_or_complete();
-};
/**
* wraps the local OSD scrub resource reservation in an RAII wrapper
--- /dev/null
+++ b/src/osd/scrubber/scrub_reservations.cc
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "./scrub_reservations.h"
+
+#include <span>
+
+#include "common/ceph_time.h"
+#include "messages/MOSDScrubReserve.h"
+#include "osd/OSD.h"
+#include "osd/PG.h"
+#include "osd/osd_types_fmt.h"
+
+#include "pg_scrubber.h"
+
+using namespace Scrub;
+using namespace std::chrono;
+using namespace std::chrono_literals;
+
+#define dout_context (m_osds->cct)
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix_fn(_dout, this, __func__)
+template <class T>
+static std::ostream& _prefix_fn(std::ostream* _dout, T* t, std::string fn = "")
+{
+ return t->gen_prefix(*_dout, fn);
+}
+
+namespace Scrub {
+
+ReplicaReservations::ReplicaReservations(ScrubMachineListener& scrbr)
+ : m_scrubber{scrbr}
+ , m_pg{m_scrubber.get_pg()}
+ , m_pgid{m_scrubber.get_spgid().pgid}
+ , m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
+{
+ // the acting set is sorted by pg_shard_t. The reservations are to be issued
+ // in this order, so that the OSDs will receive the requests in a consistent
+ // order. This is done to reduce the chance of having two PGs that share some
+ // of their acting-set OSDs, consistently interfering with each other's
+ // reservation process.
+ auto acting = m_pg->get_actingset();
+ m_sorted_secondaries.reserve(acting.size());
+ std::copy_if(
+ acting.cbegin(), acting.cend(), std::back_inserter(m_sorted_secondaries),
+ [whoami = m_pg->pg_whoami](const pg_shard_t& shard) {
+ return shard != whoami;
+ });
+
+ m_next_to_request = m_sorted_secondaries.cbegin();
+  // send out the first request (unless we have no replicas)
+ send_next_reservation_or_complete();
+}
+
+void ReplicaReservations::release_all()
+{
+ std::span<const pg_shard_t> replicas{
+ m_sorted_secondaries.cbegin(), m_next_to_request};
+ dout(10) << fmt::format("releasing {}", replicas) << dendl;
+ epoch_t epoch = m_pg->get_osdmap_epoch();
+
+ // send 'release' messages to all replicas we have managed to reserve
+ for (const auto& peer : replicas) {
+ auto m = make_message<MOSDScrubReserve>(
+ spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::RELEASE,
+ m_pg->pg_whoami);
+ m_pg->send_cluster_message(peer.osd, m, epoch, false);
+ }
+
+ m_sorted_secondaries.clear();
+ m_next_to_request = m_sorted_secondaries.cbegin();
+}
+
+void ReplicaReservations::discard_remote_reservations()
+{
+ dout(10) << "reset w/o issuing messages" << dendl;
+ m_sorted_secondaries.clear();
+ m_next_to_request = m_sorted_secondaries.cbegin();
+}
+
+ReplicaReservations::~ReplicaReservations()
+{
+ release_all();
+}
+
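+/**
+ * @ATTN we would not reach here if the ReplicaReservations object
+ * managed by the scrubber was reset.
+ */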
+void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
+{
+  // verify that the grant is from the peer we expected. If it is not -
+  // for now - abort the OSD. \todo reconsider the reaction.
+ if (!get_last_sent().has_value() || from != *get_last_sent()) {
+ dout(1) << fmt::format(
+ "unexpected grant from {} (expected {})", from,
+ get_last_sent().value_or(pg_shard_t{}))
+ << dendl;
+ ceph_assert(from == get_last_sent());
+ return;
+ }
+
+ auto elapsed = clock::now() - m_last_request_sent_at;
+
+ // log a warning if the response was slow to arrive
+ auto warn_timeout = m_scrubber.get_pg_cct()->_conf.get_val<milliseconds>(
+ "osd_scrub_slow_reservation_response");
+ if (!m_slow_response_warned && (elapsed > warn_timeout)) {
+ dout(1) << fmt::format(
+ "slow reservation response from {} ({}ms)", from,
+ duration_cast<milliseconds>(elapsed).count())
+ << dendl;
+ // prevent additional warnings
+ m_slow_response_warned = true;
+ }
+ dout(10) << fmt::format(
+ "granted by {} ({} of {}) in {}ms", from,
+ active_requests_cnt(), m_sorted_secondaries.size(),
+ duration_cast<milliseconds>(elapsed).count())
+ << dendl;
+ send_next_reservation_or_complete();
+}
+
+void ReplicaReservations::send_next_reservation_or_complete()
+{
+ if (m_next_to_request == m_sorted_secondaries.cend()) {
+ // granted by all replicas
+ dout(10) << "remote reservation complete" << dendl;
+ m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
+
+ } else {
+ // send the next reservation request
+ const auto peer = *m_next_to_request;
+ const auto epoch = m_pg->get_osdmap_epoch();
+ auto m = make_message<MOSDScrubReserve>(
+ spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST,
+ m_pg->pg_whoami);
+ m_pg->send_cluster_message(peer.osd, m, epoch, false);
+ m_last_request_sent_at = clock::now();
+  dout(10) << fmt::format(
+		  "reserving {} ({} of {} replicas)", *m_next_to_request,
+		  active_requests_cnt() + 1, m_sorted_secondaries.size())
+	   << dendl;
+ m_next_to_request++;
+ }
+}
+
+// temporary comment: the part of handle_reserve_reject() that will not
+// be delegated to the FSM in the following commits
+void ReplicaReservations::verify_rejections_source(
+ OpRequestRef op,
+ pg_shard_t from)
+{
+ // a convenient log message for the reservation process conclusion
+ // (matches the one in send_next_reservation_or_complete())
+ dout(10) << fmt::format(
+ "remote reservation failure. Rejected by {} ({})", from,
+ *op->get_req())
+ << dendl;
+
+  // verify that the denial is from the peer we expected. If it is not -
+  // we treat it as though the *correct* peer has rejected the request,
+  // but remember that the expected peer, too, must be released.
+
+ ceph_assert(get_last_sent().has_value());
+ const auto expected = *get_last_sent();
+ if (from != expected) {
+ dout(1) << fmt::format(
+ "unexpected rejection from {} (expected {})", from, expected)
+ << dendl;
+  } else {
+    // correct peer, wrong answer... Step the iterator back, so that the
+    // subsequent release_all() skips this peer - it holds no reservation
+    // to release.
+    m_next_to_request--;
+  }
+}
+
+// to be delegated to the FSM in the following commits
+void ReplicaReservations::handle_reserve_reject(
+ OpRequestRef op,
+ pg_shard_t from)
+{
+ verify_rejections_source(op, from);
+ release_all();
+ m_scrubber.flag_reservations_failure();
+ m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
+}
+
+std::optional<pg_shard_t> ReplicaReservations::get_last_sent() const
+{
+ if (m_next_to_request == m_sorted_secondaries.cbegin()) {
+ return std::nullopt;
+ }
+ return *(m_next_to_request - 1);
+}
+
+size_t ReplicaReservations::active_requests_cnt() const
+{
+ return m_next_to_request - m_sorted_secondaries.cbegin();
+}
+
+std::ostream& ReplicaReservations::gen_prefix(
+ std::ostream& out,
+ std::string fn) const
+{
+ return m_pg->gen_prefix(out)
+ << fmt::format("scrubber::ReplicaReservations:{}: ", fn);
+}
+
+} // namespace Scrub
--- /dev/null
+++ b/src/osd/scrubber/scrub_reservations.h
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <cassert>
+#include <chrono>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "osd/scrubber_common.h"
+
+#include "osd_scrub_sched.h"
+#include "scrub_machine_lstnr.h"
+
+namespace Scrub {
+
+/**
+ * Reserving/freeing scrub resources at the replicas.
+ *
+ * When constructed - sends reservation requests to the acting_set OSDs, one
+ * by one.
+ * Once a replica's OSD replies with a 'grant'ed reservation, we send a
+ * reservation request to the next replica.
+ * A rejection triggers a "couldn't acquire the replicas' scrub resources"
+ * event. All granted reservations are released.
+ *
+ * Reserved replicas should be released at the end of the scrub session. The
+ * one exception is if the scrub terminates upon an interval change. In that
+ * scenario - the replicas discard their reservations of their own accord
+ * when noticing the change in interval, and there is no need (and no
+ * guaranteed way) to send them the release message. (A usage sketch
+ * appears after the class definition below.)
+ *
+ * Timeouts:
+ *
+ * Slow-Secondary Warning:
+ * Warn if a replica takes more than <conf> milliseconds to reply to a
+ * reservation request. Only one warning is issued per session.
+ *
+ * Reservation Timeout:
+ * We limit the total time we wait for the replicas to respond to the
+ * reservation request. If the reservation back-and-forth does not complete
+ * within <conf> milliseconds, we give up and release all the reservations
+ * that have been acquired until that moment.
+ * (Why? because we have encountered instances where a reservation request was
+ * lost - either due to a bug or due to a network issue.)
+ */
+class ReplicaReservations {
+ using clock = ceph::coarse_real_clock;
+
+ ScrubMachineListener& m_scrubber;
+ PG* m_pg;
+
+ /// shorthand for m_scrubber.get_spgid().pgid
+ const pg_t m_pgid;
+
+ /// for dout && when queueing messages to the FSM
+ OSDService* m_osds;
+
+ /// the acting set (not including myself), sorted by pg_shard_t
+ std::vector<pg_shard_t> m_sorted_secondaries;
+
+ /// the next replica to which we will send a reservation request
+ std::vector<pg_shard_t>::const_iterator m_next_to_request;
+
+ /// for logs, and for detecting slow peers
+ clock::time_point m_last_request_sent_at;
+
+ /// used to prevent multiple "slow response" warnings
+ bool m_slow_response_warned{false};
+
+ public:
+  explicit ReplicaReservations(ScrubMachineListener& scrubber);
+
+ ~ReplicaReservations();
+
+ /**
+ * The OK received from the replica (after verifying that it is indeed
+ * the replica we are expecting a reply from) is noted, and triggers
+ * one of two: either sending a reservation request to the next replica,
+ * or notifying the scrubber that we have reserved them all.
+ */
+ void handle_reserve_grant(OpRequestRef op, pg_shard_t from);
+
+ /**
+ * Verify that the sender of the received rejection is the replica we
+ * were expecting a reply from.
+ * If this is so - just mark the fact that the specific peer need not
+ * be released.
+ *
+ * Note - the actual handling of scrub session termination and of
+ * releasing the reserved replicas is done by the caller (the FSM).
+ */
+ void verify_rejections_source(OpRequestRef op, pg_shard_t from);
+
+ void handle_reserve_reject(OpRequestRef op, pg_shard_t from);
+
+ /**
+ * Notifies implementation that it is no longer responsible for releasing
+ * tracked remote reservations.
+ *
+ * The intended usage is upon interval change. In general, replicas are
+ * responsible for releasing their own resources upon interval change without
+ * coordination from the primary.
+ *
+ * Sends no messages.
+ */
+ void discard_remote_reservations();
+
+ // note: 'public', as accessed via the 'standard' dout_prefix() macro
+ std::ostream& gen_prefix(std::ostream& out, std::string fn) const;
+
+ private:
+ /// send 'release' messages to all replicas we have managed to reserve
+ void release_all();
+
+ /// the only replica we are expecting a reply from
+ std::optional<pg_shard_t> get_last_sent() const;
+
+ /// The number of requests that have been sent (and not rejected) so far.
+ size_t active_requests_cnt() const;
+
+ /**
+ * Either send a reservation request to the next replica, or notify the
+ * scrubber that we have reserved all the replicas.
+ */
+ void send_next_reservation_or_complete();
+};
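+
+/*
+ * Usage sketch - illustrative only, not part of this interface: the
+ * owning object and the names 'm_reservations', 'start_reserving()' and
+ * 'on_interval_change()' are assumptions made for the example. The
+ * primary's scrub FSM creates the object to kick off the reservation
+ * process, forwards the MOSDScrubReserve replies to
+ * handle_reserve_grant() / handle_reserve_reject(), and relies on the
+ * destructor to release whatever was granted:
+ *
+ *   std::optional<ReplicaReservations> m_reservations;  // hypothetical
+ *
+ *   void start_reserving(ScrubMachineListener& scrubber) {
+ *     // the ctor sends the request to the first replica (if any)
+ *     m_reservations.emplace(scrubber);
+ *   }
+ *
+ *   void on_interval_change() {
+ *     if (m_reservations) {
+ *       // the replicas release their resources of their own accord -
+ *       // send no messages, just drop the (now empty) object
+ *       m_reservations->discard_remote_reservations();
+ *       m_reservations.reset();
+ *     }
+ *   }
+ */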
+
+} // namespace Scrub