m_reservations.reset();
}
+// Called by the FSM when the ReservationTimeout event fires: forward the
+// timeout to the ReplicaReservations object (if we are still in the
+// reservation phase), which handles it via handle_no_reply_timeout().
+void PgScrubber::on_replica_reservation_timeout()
+{
+ if (m_reservations) {
+ m_reservations->handle_no_reply_timeout();
+ }
+}
+
+
void PgScrubber::set_reserving_now()
{
m_osds->get_scrub_services().set_reserving_now();
send_all_done();
} else {
- // start a timer to handle the case of no replies
- m_no_reply = make_unique<ReplicaReservations::no_reply_t>(
- m_osds, m_conf, *this, m_log_msg_prefix);
-
// send the reservation requests
for (auto p : m_acting_set) {
if (p == whoami)
void ReplicaReservations::send_all_done()
{
- // stop any pending timeout timer
- m_no_reply.reset();
m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
}
void ReplicaReservations::send_reject()
{
- // stop any pending timeout timer
- m_no_reply.reset();
m_scrub_job->resources_failure = true;
m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
}
{
dout(10) << __func__ << ": " << m_reserved_peers << dendl;
- m_no_reply.reset();
m_had_rejections = true; // preventing late-coming responses from triggering
// events
m_reserved_peers.clear();
m_had_rejections = true; // preventing late-coming responses from triggering
// events
- // stop any pending timeout timer
- m_no_reply.reset();
-
// send un-reserve messages to all reserved replicas. We do not wait for
// answer (there wouldn't be one). Other incoming messages will be discarded
// on the way, by our owner.
"{}: timeout! no reply from {}", __func__, m_waited_for_peers)
<< dendl;
+ // treat reply timeout as if a REJECT was received
m_had_rejections = true; // preventing any additional notifications
send_reject();
}
return out << m_log_msg_prefix;
}
-ReplicaReservations::no_reply_t::no_reply_t(
- OSDService* osds,
- const ConfigProxy& conf,
- ReplicaReservations& parent,
- std::string_view log_prfx)
- : m_osds{osds}
- , m_conf{conf}
- , m_parent{parent}
- , m_log_prfx{log_prfx}
-{
- using namespace std::chrono;
- auto now_is = clock::now();
- auto timeout =
- conf.get_val<std::chrono::milliseconds>("osd_scrub_reservation_timeout");
-
- m_abort_callback = new LambdaContext([this, now_is]([[maybe_unused]] int r) {
- // behave as if a REJECT was received
- m_osds->clog->warn() << fmt::format(
- "{} timeout on replica reservations (since {})", m_log_prfx, now_is);
- m_parent.handle_no_reply_timeout();
- });
-
- std::lock_guard l(m_osds->sleep_lock);
- m_osds->sleep_timer.add_event_after(timeout, m_abort_callback);
-}
-
-ReplicaReservations::no_reply_t::~no_reply_t()
-{
- std::lock_guard l(m_osds->sleep_lock);
- if (m_abort_callback) {
- m_osds->sleep_timer.cancel_event(m_abort_callback);
- }
-}
// ///////////////////// LocalReservation //////////////////////////////////
using clock = std::chrono::system_clock;
using tpoint_t = std::chrono::time_point<clock>;
- /// a no-reply timeout handler
- struct no_reply_t {
- explicit no_reply_t(
- OSDService* osds,
- const ConfigProxy& conf,
- ReplicaReservations& parent,
- std::string_view log_prfx);
-
- ~no_reply_t();
- OSDService* m_osds;
- const ConfigProxy& m_conf;
- ReplicaReservations& m_parent;
- std::string m_log_prfx;
- Context* m_abort_callback{nullptr};
- };
-
PG* m_pg;
std::set<pg_shard_t> m_acting_set;
OSDService* m_osds;
std::chrono::milliseconds m_timeout;
std::optional<tpoint_t> m_timeout_point;
- // detecting & handling a "no show" of a replica
- std::unique_ptr<no_reply_t> m_no_reply;
-
void release_replica(pg_shard_t peer, epoch_t epoch);
void send_all_done(); ///< all reservations are granted
void discard_replica_reservations() final;
void clear_scrub_reservations() final; // PG::clear... fwds to here
void unreserve_replicas() final;
+ void on_replica_reservation_timeout() final;
+
// managing scrub op registration
// replicas resources
scrbr->set_reserving_now();
scrbr->reserve_replicas();
+
+ auto timeout = scrbr->get_cct()->_conf.get_val<
+ std::chrono::milliseconds>("osd_scrub_reservation_timeout");
+ if (timeout.count() > 0) {
+ // Start a timer to handle case where the replicas take a long time to
+ // ack the reservation. See ReservationTimeout handler below.
+ m_timeout_token = machine.schedule_timer_event_after<ReservationTimeout>(
+ timeout);
+ }
}
ReservingReplicas::~ReservingReplicas()
scrbr->clear_reserving_now();
}
+// Handles the reservation timeout scheduled when this state was entered:
+// logs a warning (both locally and to the cluster log) and treats the
+// timeout as if the replicas rejected our reservation request.
+sc::result ReservingReplicas::react(const ReservationTimeout&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "ReservingReplicas::react(const ReservationTimeout&)" << dendl;
+
+ dout(10)
+ << "PgScrubber: " << scrbr->get_spgid()
+ << " timeout on reserving replicas (since " << entered_at
+ << ")" << dendl;
+ // also warn in the cluster log, so the stalled reservation is visible
+ // to the administrator
+ scrbr->get_clog()->warn()
+ << "osd." << scrbr->get_whoami()
+ << " PgScrubber: " << scrbr->get_spgid()
+ << " timeout on reserving replicas (since " << entered_at
+ << ")";
+
+ scrbr->on_replica_reservation_timeout();
+ return discard_event();
+}
+
+
sc::result ReservingReplicas::react(const ReservationFailure&)
{
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
/// a reservation request has failed
MEV(ReservationFailure)
+/// reservations have timed out
+MEV(ReservationTimeout)
+
/// initiate a new scrubbing session (relevant if we are a Primary)
MEV(StartScrub)
struct ReservingReplicas : sc::state<ReservingReplicas, ScrubMachine>,
NamedSimply {
-
explicit ReservingReplicas(my_context ctx);
~ReservingReplicas();
using reactions = mpl::list<sc::custom_reaction<FullReset>,
// all replicas granted our resources request
sc::transition<RemotesReserved, ActiveScrubbing>,
+ sc::custom_reaction<ReservationTimeout>,
sc::custom_reaction<ReservationFailure>>;
+ // time this state was entered; reported when the reservation times out
+ ceph::coarse_real_clock::time_point entered_at =
+ ceph::coarse_real_clock::now();
+ // token for the scheduled ReservationTimeout event; presumably cancels
+ // the pending timer on destruction (state exit) - confirm against
+ // ScrubMachine::schedule_timer_event_after()
+ ScrubMachine::timer_event_token_t m_timeout_token;
+
sc::result react(const FullReset&);
+ sc::result react(const ReservationTimeout&);
+
/// at least one replica denied us the scrub resources we've requested
sc::result react(const ReservationFailure&);
};
virtual void unreserve_replicas() = 0;
+ virtual void on_replica_reservation_timeout() = 0;
+
virtual void set_scrub_begin_time() = 0;
virtual void set_scrub_duration() = 0;