also - move most time keeping from the Scrubber to the FSM.
Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
// create a separate set for each pool type & scrub level
for (const auto& label : perf_labels) {
PerfCounters* counters = build_scrub_labeled_perf(cct, label);
+ ceph_assert(counters);
cct->get_perfcounters_collection()->add(counters);
- m_perf_counters[*idx++] = counters;
+ m_perf_counters[*(idx++)] = counters;
}
}
void PgScrubber::select_range_n_notify()
{
+ get_counters_set().inc(scrbcnt_chunks_selected);
+
if (select_range()) {
// the next chunk to handle is not blocked
dout(20) << __func__ << ": selection OK" << dendl;
// we will wait for the objects range to become available for scrubbing
dout(10) << __func__ << ": selected chunk is busy" << dendl;
m_osds->queue_scrub_chunk_busy(m_pg, Scrub::scrub_prio_t::low_priority);
+ get_counters_set().inc(scrbcnt_chunks_busy);
}
}
return false;
}
+ get_counters_set().inc(scrbcnt_write_blocked);
dout(20) << __func__ << " " << soid << " can preempt? "
<< preemption_data.is_preemptable() << " already preempted? "
<< preemption_data.was_preempted() << dendl;
<< ". Displayed: " << m_mode_desc << dendl;
}
+std::string_view PgScrubber::get_op_mode_text() const
+{
+ return m_mode_desc;
+}
+
void PgScrubber::_request_scrub_map(pg_shard_t replica,
eversion_t version,
hobject_t start,
false};
} else {
- int32_t duration = (utime_t{now_is} - scrub_begin_stamp).sec();
+ int32_t dur_seconds =
+ duration_cast<seconds>(m_fsm->get_time_scrubbing()).count();
return pg_scrubbing_status_t{
- utime_t{},
- duration,
- pg_scrub_sched_status_t::active,
- true, // active
- (m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow),
- false /* is periodic? unknown, actually */};
+ utime_t{},
+ dur_seconds,
+ pg_scrub_sched_status_t::active,
+ true, // active
+ (m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow),
+ false /* is periodic? unknown, actually */};
}
}
if (m_scrub_job->state != Scrub::qu_state_t::registered) {
m_osds->cct, m_pg->pg_id, m_osds->get_nodeid());
}
-void PgScrubber::set_scrub_begin_time()
+void PgScrubber::set_scrub_duration(std::chrono::milliseconds duration)
{
- scrub_begin_stamp = ceph_clock_now();
- m_osds->clog->debug() << fmt::format(
- "{} {} starts",
- m_pg->info.pgid.pgid,
- m_mode_desc);
-}
-
-void PgScrubber::set_scrub_duration()
-{
- utime_t stamp = ceph_clock_now();
- utime_t duration = stamp - scrub_begin_stamp;
+ dout(20) << fmt::format("{}: to {}", __func__, duration) << dendl;
+ double dur_ms = double(duration.count());
m_pg->recovery_state.update_stats([=](auto& history, auto& stats) {
- stats.last_scrub_duration = ceill(duration.to_msec() / 1000.0);
- stats.scrub_duration = double(duration);
+ stats.last_scrub_duration = ceill(dur_ms / 1000.0);
+ stats.scrub_duration = dur_ms;
return true;
});
}
+PerfCounters& PgScrubber::get_counters_set() const
+{
+ return *m_osds->get_scrub_services().get_perf_counters(
+ (m_pg->pool.info.is_replicated() ? pg_pool_t::TYPE_REPLICATED
+ : pg_pool_t::TYPE_ERASURE),
+ (m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow));
+}
+
void PgScrubber::cleanup_on_finish()
{
dout(10) << __func__ << dendl;
int get_whoami() const final;
spg_t get_spgid() const final { return m_pg->get_pgid(); }
PG* get_pg() const final { return m_pg; }
+ PerfCounters& get_counters_set() const final;
// temporary interface (to be discarded in a follow-up PR)
/// set the 'resources_failure' flag in the scrub-job object
std::string dump_awaited_maps() const final;
- void set_scrub_begin_time() final;
+ void set_scrub_duration(std::chrono::milliseconds duration) final;
- void set_scrub_duration() final;
-
- utime_t scrub_begin_stamp;
std::ostream& gen_prefix(std::ostream& out) const final;
/// facilitate scrub-backend access to SnapMapper mappings
void update_op_mode_text();
+ std::string_view get_op_mode_text() const final;
+
private:
/**
* initiate a deep-scrub after the current scrub ended with errors.
return m_scrbr->gen_prefix(out) << "FSM: ";
}
+ceph::timespan ScrubMachine::get_time_scrubbing() const
+{
+ // note: the state_cast does not work in the Session ctor
+ auto session = state_cast<const Session*>();
+ if (!session) {
+ dout(20) << fmt::format("{}: not in session", __func__) << dendl;
+ return ceph::timespan{};
+ }
+
+ if (session && session->m_session_started_at != ScrubTimePoint{}) {
+ dout(20) << fmt::format(
+ "{}: session_started_at: {} d:{}", __func__,
+ session->m_session_started_at,
+ ScrubClock::now() - session->m_session_started_at)
+ << dendl;
+ return ScrubClock::now() - session->m_session_started_at;
+ }
+ dout(30) << fmt::format("{}: no session_start time", __func__) << dendl;
+ return ceph::timespan{};
+}
+
// ////////////// the actual actions
// ----------------------- NotActive -----------------------------------------
{
dout(10) << "NotActive::react(const StartScrub&)" << dendl;
DECLARE_LOCALS;
- scrbr->set_scrub_begin_time();
return transit<ReservingReplicas>();
}
{
dout(10) << "NotActive::react(const AfterRepairScrub&)" << dendl;
DECLARE_LOCALS;
- scrbr->set_scrub_begin_time();
return transit<ReservingReplicas>();
}
// occasions will cause no harm.
// We choose ignorance.
std::ignore = scrbr->set_reserving_now();
+
+ m_perf_set = &scrbr->get_counters_set();
+ m_perf_set->inc(scrbcnt_started);
}
Session::~Session()
{
dout(10) << "-- state -->> ActiveScrubbing" << dendl;
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ auto& session = context<Session>();
+
+ session.m_perf_set->inc(scrbcnt_active_started);
+ scrbr->get_clog()->debug() << fmt::format(
+ "{} {} starts", machine.m_pg_id, scrbr->get_op_mode_text());
+
scrbr->on_init();
}
ActiveScrubbing::~ActiveScrubbing()
{
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ auto& session = context<Session>();
dout(15) << __func__ << dendl;
+
+ // if the begin-time stamp was not set 'off' (as done if the scrubbing
+ // completed successfully), we use it now to set the 'failed scrub' duration.
+ if (session.m_session_started_at != ScrubTimePoint{}) {
+ auto logged_duration = ScrubClock::now() - session.m_session_started_at;
+ session.m_perf_set->tinc(scrbcnt_failed_elapsed, logged_duration);
+ session.m_perf_set->inc(scrbcnt_failed);
+ }
}
/*
m_timeout_token = machine.schedule_timer_event_after<RangeBlockedAlarm>(
grace);
}
+ context<Session>().m_perf_set->inc(scrbcnt_blocked);
}
sc::result RangeBlocked::react(const RangeBlockedAlarm&)
{
DECLARE_LOCALS;
char buf[50];
- std::time_t now_c = ceph::coarse_real_clock::to_time_t(entered_at);
+ std::time_t now_c = ScrubClock::to_time_t(entered_at);
strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", std::localtime(&now_c));
dout(10)
<< "PgScrubber: " << scrbr->get_spgid()
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
dout(10) << "PendingTimer::react(const SleepComplete&)" << dendl;
- auto slept_for = ceph::coarse_real_clock::now() - entered_at;
+ auto slept_for = ScrubClock::now() - entered_at;
dout(20) << "PgScrubber: " << scrbr->get_spgid()
<< " slept for " << slept_for << dendl;
{
dout(10) << " -- state -->> Session/Act/BuildMap" << dendl;
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ auto& session = context<Session>();
// no need to check for an epoch change, as all possible flows that brought
// us here have a check_interval() verification of their final event.
dout(10) << __func__ << " preempted!!!" << dendl;
scrbr->mark_local_map_ready();
post_event(IntBmPreempted{});
+ session.m_perf_set->inc(scrbcnt_preempted);
} else {
{
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
dout(10) << "WaitDigestUpdate::react(const ScrubFinished&)" << dendl;
- scrbr->set_scrub_duration();
+ auto& session = context<Session>();
+
+ session.m_perf_set->inc(scrbcnt_successful);
+
+ // set the 'scrub duration'
+ auto duration = machine.get_time_scrubbing();
+ session.m_perf_set->tinc(scrbcnt_successful_elapsed, duration);
+ scrbr->set_scrub_duration(duration_cast<milliseconds>(duration));
+ session.m_session_started_at = ScrubTimePoint{};
+
scrbr->scrub_finish();
return transit<NotActive>();
}
[[nodiscard]] bool is_reserving() const;
[[nodiscard]] bool is_accepting_updates() const;
+ // elapsed time for the currently active scrub.session
+ ceph::timespan get_time_scrubbing() const;
// ///////////////// aux declarations & functions //////////////////////// //
/// managing the scrub session's reservations (optional, as
/// it's an RAII wrapper around the state of 'holding reservations')
std::optional<ReplicaReservations> m_reservations{std::nullopt};
+
+ /// the relevant set of performance counters for this session
+ /// (relevant, i.e. for this pool type X scrub level)
+ PerfCounters* m_perf_set{nullptr};
+
+ /// the time when the session was initiated
+ ScrubTimePoint m_session_started_at{ScrubClock::now()};
};
struct ReservingReplicas : sc::state<ReservingReplicas, Session>,
sc::transition<RemotesReserved, ActiveScrubbing>,
sc::custom_reaction<ReservationTimeout>>;
- ceph::coarse_real_clock::time_point entered_at =
- ceph::coarse_real_clock::now();
+ ScrubTimePoint entered_at = ScrubClock::now();
ScrubMachine::timer_event_token_t m_timeout_token;
/// a "raw" event carrying a peer's grant response
sc::custom_reaction<RangeBlockedAlarm>,
sc::transition<Unblocked, PendingTimer>>;
- ceph::coarse_real_clock::time_point entered_at =
- ceph::coarse_real_clock::now();
+ ScrubTimePoint entered_at = ScrubClock::now();
ScrubMachine::timer_event_token_t m_timeout_token;
- sc::result react(const RangeBlockedAlarm &);
+ sc::result react(const RangeBlockedAlarm&);
};
/**
sc::transition<InternalSchedScrub, NewChunk>,
sc::custom_reaction<SleepComplete>>;
- ceph::coarse_real_clock::time_point entered_at =
- ceph::coarse_real_clock::now();
+ ScrubTimePoint entered_at = ScrubClock::now();
ScrubMachine::timer_event_token_t m_sleep_timer;
sc::result react(const SleepComplete&);
};
virtual spg_t get_spgid() const = 0;
virtual PG* get_pg() const = 0;
+ /**
+ * access the set of performance counters relevant to this scrub
+ * (one of the four sets of counters maintained by the OSD)
+ */
+ virtual PerfCounters& get_counters_set() const = 0;
+
using scrubber_callback_t = std::function<void(void)>;
using scrubber_callback_cancel_token_t = Context*;
/// state.
virtual void set_state_name(const char* name) = 0;
+ /// access the text specifying scrub level and whether it is a repair
+ virtual std::string_view get_op_mode_text() const = 0;
+
[[nodiscard]] virtual bool is_primary() const = 0;
virtual void select_range_n_notify() = 0;
*/
virtual void maps_compare_n_cleanup() = 0;
- virtual void set_scrub_begin_time() = 0;
-
- virtual void set_scrub_duration() = 0;
+ virtual void set_scrub_duration(std::chrono::milliseconds duration) = 0;
/**
* No new scrub session will start while a scrub was initiate on a PG,
#include <fmt/ranges.h>
+#include "common/ceph_time.h"
#include "common/scrub_types.h"
#include "include/types.h"
#include "os/ObjectStore.h"
}
struct PGPool;
+using ScrubClock = ceph::coarse_real_clock;
+using ScrubTimePoint = ScrubClock::time_point;
namespace Scrub {
class ReplicaReservations;