class MOSDScrubReserve : public MOSDFastDispatchOp {
private:
- static constexpr int HEAD_VERSION = 1;
+ static constexpr int HEAD_VERSION = 2;
static constexpr int COMPAT_VERSION = 1;
public:
+ using reservation_nonce_t = uint32_t;
+
spg_t pgid;
epoch_t map_epoch;
enum ReserveMsgOp {
};
int32_t type;
pg_shard_t from;
+ reservation_nonce_t reservation_nonce{0};
epoch_t get_map_epoch() const override {
return map_epoch;
MOSDScrubReserve(spg_t pgid,
epoch_t map_epoch,
int type,
- pg_shard_t from)
+ pg_shard_t from,
+ reservation_nonce_t nonce)
: MOSDFastDispatchOp{MSG_OSD_SCRUB_RESERVE, HEAD_VERSION, COMPAT_VERSION},
pgid(pgid), map_epoch(map_epoch),
- type(type), from(from) {}
+ type(type), from(from), reservation_nonce{nonce} {}
std::string_view get_type_name() const {
return "MOSDScrubReserve";
out << "RELEASE ";
break;
}
- out << "e" << map_epoch << ")";
+ out << "e" << map_epoch << " from: " << from
+ << " reservation_nonce: " << reservation_nonce << ")";
return;
}
decode(map_epoch, p);
decode(type, p);
decode(from, p);
+ if (header.version >= 2) {
+ decode(reservation_nonce, p);
+ } else {
+ // a zero nonce (identifying legacy senders) is ignored when
+ // checking the request for obsolescence
+ reservation_nonce = 0;
+ }
}
void encode_payload(uint64_t features) {
encode(map_epoch, payload);
encode(type, payload);
encode(from, payload);
+ encode(reservation_nonce, payload);
}
private:
template<class T, typename... Args>
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
// initiate the reservation process
- session.m_reservations.emplace(*scrbr, *session.m_perf_set);
+ session.m_reservations.emplace(
+ *scrbr, context<PrimaryActive>().last_request_sent_nonce,
+ *session.m_perf_set);
if (session.m_reservations->get_last_sent()) {
// the 1'st reservation request was sent
{
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
dout(10) << "ReservingReplicas::react(const ReplicaGrant&)" << dendl;
+ const auto& m = ev.m_op->get_req<MOSDScrubReserve>();
- if (context<Session>().m_reservations->handle_reserve_grant(
- ev.m_op, ev.m_from)) {
+ if (context<Session>().m_reservations->handle_reserve_grant(*m, ev.m_from)) {
// we are done with the reservation process
return transit<ActiveScrubbing>();
}
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
auto& session = context<Session>();
dout(10) << "ReservingReplicas::react(const ReplicaReject&)" << dendl;
- session.m_reservations->log_failure_and_duration(scrbcnt_resrv_rejected);
-
- // manipulate the 'next to reserve' iterator to exclude
- // the rejecting replica from the set of replicas requiring release
- session.m_reservations->verify_rejections_source(ev.m_op, ev.m_from);
+ const auto m = ev.m_op->get_req<MOSDScrubReserve>();
+
+ // Verify that the message is from the replica we were expecting a reply from,
+ // and that the message is not stale. If all is well - this is a real rejection:
+ // - log required details;
+ // - manipulate the 'next to reserve' iterator to exclude
+ // the rejecting replica from the set of replicas requiring release
+ if (!session.m_reservations->handle_reserve_rejection(*m, ev.m_from)) {
+ // stale or unexpected
+ return discard_event();
+ }
- // set 'reservation failure' as the scrub termination cause (affecting
+ // The rejection was carrying the correct reservation_nonce. It was
+ // logged by handle_reserve_rejection().
+ // Set 'reservation failure' as the scrub termination cause (affecting
// the rescheduling of this PG)
scrbr->flag_reservations_failure();
void ReplicaActive::on_reserve_req(const ReplicaReserveReq& ev)
{
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "ReplicaActive::on_reserve_req()" << dendl;
+ const auto m = ev.m_op->get_req<MOSDScrubReserve>();
+ const auto msg_nonce = m->reservation_nonce;
+ dout(10)
+ << fmt::format(
+ "ReplicaActive::on_reserve_req() from {} (reservation_nonce:{})",
+ ev.m_from, msg_nonce)
+ << dendl;
if (reserved_by_my_primary) {
dout(10) << "ReplicaActive::on_reserve_req(): already reserved" << dendl;
Message* reply = new MOSDScrubReserve(
spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch, ret.op,
- m_pg->pg_whoami);
+ m_pg->pg_whoami, msg_nonce);
m_osds->send_message_osd_cluster(reply, ev.m_op->get_req()->get_connection());
}
using reactions = mpl::list<
// when the interval ends - we may not be a primary anymore
sc::transition<IntervalChanged, NotActive>>;
+
+ /**
+ * Identifies a specific reservation request.
+ * The primary is permitted to cancel outstanding reservation requests without
+ * waiting for the pending response from the replica. Thus, we may, in general,
+ * see responses from prior reservation attempts that we need to ignore. Each
+ * reservation request is therefore associated with a nonce incremented within
+ * an interval with each reservation request. Any response with a non-matching
+ * nonce must be from a reservation request we canceled. Note that this check
+ * occurs after validating that the message is from the current interval, so
+ * reusing nonces between intervals is safe.
+ *
+ * 0 is a special value used to indicate that the sender did not include a nonce due
+ * to not being a sufficiently recent version.
+ */
+ reservation_nonce_t last_request_sent_nonce{1};
};
/**
#include <span>
#include "common/ceph_time.h"
-#include "messages/MOSDScrubReserve.h"
#include "osd/OSD.h"
#include "osd/PG.h"
#include "osd/osd_types_fmt.h"
ReplicaReservations::ReplicaReservations(
ScrubMachineListener& scrbr,
+ reservation_nonce_t& nonce,
PerfCounters& pc)
: m_scrubber{scrbr}
, m_pg{m_scrubber.get_pg()}
, m_pgid{m_scrubber.get_spgid().pgid}
, m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
+ , m_last_request_sent_nonce{nonce}
, m_perf_set{pc}
{
// the acting set is sorted by pg_shard_t. The reservations are to be issued
for (const auto& peer : replicas) {
auto m = make_message<MOSDScrubReserve>(
spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::RELEASE,
- m_pg->pg_whoami);
+ m_pg->pg_whoami, 0);
m_pg->send_cluster_message(peer.osd, m, epoch, false);
}
log_failure_and_duration(scrbcnt_resrv_aborted);
}
-bool ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
+bool ReplicaReservations::is_reservation_response_relevant(
+ reservation_nonce_t msg_nonce) const
{
- // verify that the grant is from the peer we expected. If not?
- // for now - abort the OSD. \todo reconsider the reaction.
- if (!get_last_sent().has_value() || from != *get_last_sent()) {
+ return (msg_nonce == 0) || (msg_nonce == m_last_request_sent_nonce);
+}
+
+bool ReplicaReservations::is_msg_source_correct(pg_shard_t from) const
+{
+ const auto exp_source = get_last_sent();
+ return exp_source && from == *exp_source;
+}
+
+bool ReplicaReservations::handle_reserve_grant(
+ const MOSDScrubReserve& msg,
+ pg_shard_t from)
+{
+ if (!is_reservation_response_relevant(msg.reservation_nonce)) {
+ // this is a stale response to a previous request (e.g. one that
+ // timed-out). See m_last_request_sent_nonce for details.
dout(1) << fmt::format(
- "unexpected grant from {} (expected {})", from,
- get_last_sent().value_or(pg_shard_t{}))
+ "stale reservation response from {} with nonce {} vs. "
+ "expected {} (e:{})",
+ from, msg.reservation_nonce, m_last_request_sent_nonce,
+ msg.map_epoch)
<< dendl;
- ceph_assert(from == get_last_sent());
+ return false;
+ }
+
+ // verify that the grant is from the peer we expected. If not?
+ // for now - abort the OSD. There is no known scenario in which a
+ // grant message with a correct nonce can arrive from the wrong peer.
+ // (we would not abort for arriving messages with nonce 0, as those
+ // are legacy messages, for which the nonce was not verified).
+ if (!is_msg_source_correct(from)) {
+ const auto error_text = fmt::format(
+ "unexpected reservation grant from {} vs. the expected {} (e:{} "
+ "message nonce:{})",
+ from, get_last_sent().value_or(pg_shard_t{}), msg.map_epoch,
+ msg.reservation_nonce);
+ dout(1) << error_text << dendl;
+ if (msg.reservation_nonce != 0) {
+ m_osds->clog->error() << error_text;
+ ceph_abort_msg(error_text);
+ }
return false;
}
// log a warning if the response was slow to arrive
if ((m_slow_response_warn_timeout > 0ms) &&
(elapsed > m_slow_response_warn_timeout)) {
- dout(1) << fmt::format(
- "slow reservation response from {} ({}ms)", from,
- duration_cast<milliseconds>(elapsed).count())
- << dendl;
+ m_osds->clog->warn() << fmt::format(
+ "slow reservation response from {} ({}ms)", from,
+ duration_cast<milliseconds>(elapsed).count());
// prevent additional warnings
m_slow_response_warn_timeout = 0ms;
}
dout(10) << fmt::format(
- "granted by {} ({} of {}) in {}ms", from,
+ "(e:{} nonce:{}) granted by {} ({} of {}) in {}ms",
+ msg.map_epoch, msg.reservation_nonce, from,
active_requests_cnt(), m_sorted_secondaries.size(),
duration_cast<milliseconds>(elapsed).count())
<< dendl;
// send the next reservation request
const auto peer = *m_next_to_request;
const auto epoch = m_pg->get_osdmap_epoch();
+ m_last_request_sent_nonce++;
+
auto m = make_message<MOSDScrubReserve>(
- spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST,
- m_pg->pg_whoami);
+ spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST, m_pg->pg_whoami,
+ m_last_request_sent_nonce);
m_pg->send_cluster_message(peer.osd, m, epoch, false);
m_last_request_sent_at = ScrubClock::now();
dout(10) << fmt::format(
- "reserving {} (the {} of {} replicas)", *m_next_to_request,
- active_requests_cnt() + 1, m_sorted_secondaries.size())
+ "reserving {} (the {} of {} replicas) e:{} nonce:{}",
+ *m_next_to_request, active_requests_cnt() + 1,
+ m_sorted_secondaries.size(), epoch, m_last_request_sent_nonce)
<< dendl;
m_next_to_request++;
return false;
}
-void ReplicaReservations::verify_rejections_source(
- OpRequestRef op,
+bool ReplicaReservations::handle_reserve_rejection(
+ const MOSDScrubReserve& msg,
pg_shard_t from)
{
// a convenient log message for the reservation process conclusion
// (matches the one in send_next_reservation_or_complete())
dout(10) << fmt::format(
- "remote reservation failure. Rejected by {} ({})", from,
- *op->get_req())
+ "remote reservation failure. Rejected by {} ({})", from, msg)
<< dendl;
+ if (!is_reservation_response_relevant(msg.reservation_nonce)) {
+ // this is a stale response to a previous request (e.g. one that
+ // timed-out). See m_last_request_sent_nonce for details.
+ dout(10) << fmt::format(
+ "stale reservation response from {} with reservation_nonce "
+ "{} vs. expected {} (e:{})",
+ from, msg.reservation_nonce, m_last_request_sent_nonce,
+ msg.map_epoch)
+ << dendl;
+ return false;
+ }
+
+ log_failure_and_duration(scrbcnt_resrv_rejected);
+
+ // we should never see a rejection carrying a valid
+ // reservation nonce - arriving while we have no pending requests
+ ceph_assert(get_last_sent().has_value() || msg.reservation_nonce == 0);
+
// verify that the denial is from the peer we expected. If not?
+ // There is no known scenario in which this can happen, but if it does -
// we should treat it as though the *correct* peer has rejected the request,
// but remember to release that peer, too.
-
- ceph_assert(get_last_sent().has_value());
- const auto expected = *get_last_sent();
- if (from != expected) {
- dout(1) << fmt::format(
- "unexpected rejection from {} (expected {})", from, expected)
- << dendl;
- } else {
- // correct peer, wrong answer...
+ if (is_msg_source_correct(from)) {
m_next_to_request--; // no need to release this one
+ } else {
+ m_osds->clog->warn() << fmt::format(
+ "unexpected reservation denial from {} vs the expected {} (e:{} "
+ "message reservation_nonce:{})",
+ from, get_last_sent().value_or(pg_shard_t{}), msg.map_epoch,
+ msg.reservation_nonce);
}
+ return true;
}
std::optional<pg_shard_t> ReplicaReservations::get_last_sent() const
#include <string_view>
#include <vector>
+#include "messages/MOSDScrubReserve.h"
#include "osd/scrubber_common.h"
#include "osd_scrub_sched.h"
namespace Scrub {
+using reservation_nonce_t = MOSDScrubReserve::reservation_nonce_t;
+
/**
* Reserving/freeing scrub resources at the replicas.
*
* that have been acquired until that moment.
* (Why? because we have encountered instances where a reservation request was
* lost - either due to a bug or due to a network issue.)
+ *
+ * Keeping primary & replica in sync:
+ *
+ * Reservation requests may be canceled by the primary independently of the
+ * replica's response. Depending on timing, a cancellation by the primary might
+ * or might not be processed by a replica prior to sending a response (either
+ * rejection or success). Thus, we associate each reservation request with a
+ * nonce incremented with each reservation during an interval and drop any
+ * responses that do not match our current nonce.
+ * This check occurs after rejecting any messages from prior intervals, so
+ * reusing nonces between intervals is not a problem. Note that epoch would
+ * not suffice as it is possible for this sequence to occur several times
+ * without a new map epoch.
+ * Note - 'release' messages, which are not replied to by the replica,
+ * do not need or use that field.
*/
class ReplicaReservations {
ScrubMachineListener& m_scrubber;
/// for logs, and for detecting slow peers
ScrubTimePoint m_last_request_sent_at;
+ /**
+ * A ref to PrimaryActive::last_request_sent_nonce.
+ * Identifies a specific request sent, to verify against grant/deny
+ * responses.
+ * See PrimaryActive::last_request_sent_nonce for details.
+ */
+ reservation_nonce_t& m_last_request_sent_nonce;
+
/// the 'slow response' timeout (in milliseconds) - as configured.
/// Doubles as a 'do once' flag for the warning.
std::chrono::milliseconds m_slow_response_warn_timeout;
std::optional<ScrubTimePoint> m_process_started_at;
public:
- ReplicaReservations(ScrubMachineListener& scrubber, PerfCounters& pc);
+ ReplicaReservations(
+ ScrubMachineListener& scrubber,
+ reservation_nonce_t& nonce,
+ PerfCounters& pc);
~ReplicaReservations();
* \returns true if there are no more replicas to send reservation requests
* (i.e., the scrubber should proceed to the next phase), false otherwise.
*/
- bool handle_reserve_grant(OpRequestRef op, pg_shard_t from);
+ bool handle_reserve_grant(const MOSDScrubReserve& msg, pg_shard_t from);
/**
+ * React to an incoming reservation rejection.
+ *
* Verify that the sender of the received rejection is the replica we
- * were expecting a reply from.
- * If this is so - just mark the fact that the specific peer need not
- * be released.
+ * were expecting a reply from, and that the message isn't stale (see
+ * m_last_request_sent_nonce for details).
+ * If a valid rejection: log it, and mark the fact that the specific peer
+ * need not be released.
*
* Note - the actual handling of scrub session termination and of
* releasing the reserved replicas is done by the caller (the FSM).
+ *
+ * Returns true if the rejection is valid, false otherwise.
*/
- void verify_rejections_source(OpRequestRef op, pg_shard_t from);
+ bool handle_reserve_rejection(const MOSDScrubReserve& msg, pg_shard_t from);
/**
* Notifies implementation that it is no longer responsible for releasing
*/
bool send_next_reservation_or_complete();
+ /**
+ * is this is a reply to our last request?
+ * Checks response once against m_last_request_sent_nonce. See
+ * m_last_request_sent_nonce for details.
+ */
+ bool is_reservation_response_relevant(reservation_nonce_t msg_nonce) const;
+
+ /**
+ * is this reply coming from the expected replica?
+ * Now that we check the nonce before checking the sender - this
+ * check should never fail.
+ */
+ bool is_msg_source_correct(pg_shard_t from) const;
+
// --- perf counters helpers
/**