run_mon $dir a --osd_pool_default_size=3 || return 1
run_mgr $dir x || return 1
+ local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
+ ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
+ ceph_osd_args+="--osd_stats_update_period_scrubbing=2"
for osd in $(seq 0 $(expr $OSDS - 1))
do
- run_osd $dir $osd || return 1
+ run_osd $dir $osd $ceph_osd_args || return 1
done
# Create a pool with a single pg
run_mon $dir a --osd_pool_default_size=3 || return 1
run_mgr $dir x || return 1
+
+ local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
+ ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
+ ceph_osd_args+="--osd_stats_update_period_scrubbing=2 --osd_scrub_sleep=0 "
+ ceph_osd_args+="--osd_scrub_extended_sleep=20 --osd_scrub_begin_week_day=$DAY_START "
+ ceph_osd_args+="--osd_op_queue=wpq --osd_scrub_end_week_day=$DAY_END "
+ ceph_osd_args+="--bluestore_cache_autotune=false" # why needed?
+
for osd in $(seq 0 $(expr $OSDS - 1))
do
- run_osd $dir $osd --osd_scrub_sleep=0 \
- --osd_scrub_extended_sleep=20 \
- --bluestore_cache_autotune=false \
- --osd_deep_scrub_randomize_ratio=0.0 \
- --osd_scrub_interval_randomize_ratio=0 \
- --osd_scrub_begin_week_day=$DAY_START \
- --osd_scrub_end_week_day=$DAY_END \
- || return 1
+ run_osd $dir $osd $ceph_osd_args || return 1
done
# Create a pool with a single pg
--osd_scrub_interval_randomize_ratio=0 \
--osd_scrub_backoff_ratio=0.0 \
--osd_op_queue=wpq \
+ --osd_stats_update_period_not_scrubbing=3 \
+ --osd_stats_update_period_scrubbing=2 \
--osd_scrub_sleep=0.2"
for osd in $(seq 0 $(expr $OSDS - 1))
, m_resource_bookkeeper{[this](std::string msg) { log_fwd(msg); }, conf}
, m_queue{cct, m_osd_svc}
, m_log_prefix{fmt::format("osd.{}: osd-scrub:", m_osd_svc.get_nodeid())}
+ , m_load_tracker{cct, conf, m_osd_svc.get_nodeid()}
{}
std::ostream& OsdScrub::gen_prefix(std::ostream& out, std::string_view fn) const
// if there is a PG that is just now trying to reserve scrub replica resources -
// we should wait and not initiate a new scrub
- if (is_reserving_now()) {
+ if (m_queue.is_reserving_now()) {
dout(10) << "scrub resources reservation in progress" << dendl;
return;
}
- Scrub::OSDRestrictions env_conditions;
+ utime_t scrub_time = ceph_clock_now();
+ dout(10) << fmt::format(
+ "{}: time now:{}, recover is active?:{}", __func__,
+ scrub_time, is_recovery_active)
+ << dendl;
- if (is_recovery_active && !conf->osd_scrub_during_recovery) {
- if (!conf->osd_repair_during_recovery) {
- dout(15) << "not scheduling scrubs due to active recovery" << dendl;
- return;
- }
- dout(10) << "will only schedule explicitly requested repair due to active "
- "recovery"
- << dendl;
- env_conditions.allow_requested_repair_only = true;
+ // check the OSD-wide environment conditions (scrub resources, time, etc.).
+ // These may restrict the type of scrubs we are allowed to start, or just
+ // prevent us from starting any scrub at all.
+ auto env_restrictions =
+ restrictions_on_scrubbing(is_recovery_active, scrub_time);
+ if (!env_restrictions) {
+ return;
}
if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
}
}
- auto was_started = select_pg_and_scrub(env_conditions);
+ auto was_started = select_pg_and_scrub(*env_restrictions);
dout(20) << fmt::format(
"scrub scheduling done ({})",
ScrubQueue::attempt_res_text(was_started))
<< dendl;
}
+
+std::optional<Scrub::OSDRestrictions> OsdScrub::restrictions_on_scrubbing(
+ bool is_recovery_active,
+ utime_t scrub_clock_now) const
+{
+ // our local OSD may already be running too many scrubs
+ if (!m_resource_bookkeeper.can_inc_scrubs()) {
+ dout(10) << "OSD cannot inc scrubs" << dendl;
+ return std::nullopt;
+ }
+
+ // if there is a PG that is just now trying to reserve scrub replica resources
+ // - we should wait and not initiate a new scrub
+ if (m_queue.is_reserving_now()) {
+ dout(10) << "scrub resources reservation in progress" << dendl;
+ return std::nullopt;
+ }
+
+ Scrub::OSDRestrictions env_conditions;
+ env_conditions.time_permit = scrub_time_permit(scrub_clock_now);
+ env_conditions.load_is_low = m_load_tracker.scrub_load_below_threshold();
+ env_conditions.only_deadlined =
+ !env_conditions.time_permit || !env_conditions.load_is_low;
+
+ if (is_recovery_active && !conf->osd_scrub_during_recovery) {
+ if (!conf->osd_repair_during_recovery) {
+ dout(15) << "not scheduling scrubs due to active recovery" << dendl;
+ return std::nullopt;
+ }
+
+ dout(10) << "will only schedule explicitly requested repair due to active "
+ "recovery"
+ << dendl;
+ env_conditions.allow_requested_repair_only = true;
+ }
+
+ return env_conditions;
+}
+
+
// ////////////////////////////////////////////////////////////////////////// //
// scrub initiation - OSD code temporarily moved here from OSD.cc
* - try that one. If not suitable, discard from 'to_scrub_copy'
*/
Scrub::schedule_result_t ScrubQueue::select_pg_and_scrub(
- Scrub::OSDRestrictions& preconds)
+ Scrub::OSDRestrictions preconds)
{
dout(10) << " reg./pen. sizes: " << to_scrub.size() << " / "
<< penalized.size() << dendl;
utime_t now_is = time_now();
- preconds.time_permit = scrub_time_permit(now_is);
- preconds.load_is_low = scrub_load_below_threshold();
- preconds.only_deadlined = !preconds.time_permit || !preconds.load_is_low;
-
// create a list of candidates (copying, as otherwise we risk a deadlock):
// - possibly restore penalized
// - (if we didn't handle directly) remove invalid jobs
// not holding jobs_lock. 'group' is a copy of the actual list.
Scrub::schedule_result_t ScrubQueue::select_from_group(
Scrub::ScrubQContainer& group,
- const Scrub::OSDRestrictions& preconds,
+ Scrub::OSDRestrictions preconds,
utime_t now_is)
{
dout(15) << "jobs #: " << group.size() << dendl;
// ////////////////////////////////////////////////////////////////////////// //
// CPU load tracking and related
+OsdScrub::LoadTracker::LoadTracker(
+ CephContext* cct,
+ const ceph::common::ConfigProxy& config,
+ int node_id)
+ : cct{cct}
+ , conf{config}
+ , log_prefix{fmt::format("osd.{} scrub-queue::load-tracker::", node_id)}
+{
+ // initialize the daily loadavg with the current 15-minute loadavg (loadavgs[2])
+ if (double loadavgs[3]; getloadavg(loadavgs, 3) == 3) {
+ daily_loadavg = loadavgs[2];
+ } else {
+ derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
+ daily_loadavg = 1.0;
+ }
+}
/// \todo replace with Knuth's incremental-mean algorithm, to reduce the
/// numerical error (see the sketch after this function)
-std::optional<double> ScrubQueue::update_load_average()
+std::optional<double> OsdScrub::LoadTracker::update_load_average()
{
- int hb_interval = conf()->osd_heartbeat_interval;
+ int hb_interval = conf->osd_heartbeat_interval;
int n_samples = std::chrono::duration_cast<seconds>(24h).count();
if (hb_interval > 1) {
n_samples = std::max(n_samples / hb_interval, 1);
}
- // get CPU load avg
double loadavg;
if (getloadavg(&loadavg, 1) == 1) {
daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavg) / n_samples;
return 100 * loadavg;
}
- return std::nullopt;
+ return std::nullopt; // getloadavg() failed
}
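The \todo above refers to Knuth's incremental-mean update (TAOCP vol. 2, also
known as Welford's method). It is algebraically identical to the averaging line
above, but avoids forming the large intermediate product where the
floating-point error accumulates. A minimal sketch of the replacement (not part
of this patch):

  // Knuth-style incremental mean: algebraically equal to
  //   daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavg) / n_samples;
  // but without the large intermediate product
  daily_loadavg += (loadavg - daily_loadavg) / n_samples;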
-bool ScrubQueue::scrub_load_below_threshold() const
+bool OsdScrub::LoadTracker::scrub_load_below_threshold() const
{
double loadavgs[3];
if (getloadavg(loadavgs, 3) != 3) {
// allow scrub if below configured threshold
long cpus = sysconf(_SC_NPROCESSORS_ONLN);
double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
- if (loadavg_per_cpu < conf()->osd_scrub_load_threshold) {
+ if (loadavg_per_cpu < conf->osd_scrub_load_threshold) {
dout(20) << fmt::format(
"loadavg per cpu {:.3f} < max {:.3f} = yes",
- loadavg_per_cpu, conf()->osd_scrub_load_threshold)
+ loadavg_per_cpu, conf->osd_scrub_load_threshold)
<< dendl;
return true;
}
dout(10) << fmt::format(
"loadavg {:.3f} >= max {:.3f} and ( >= daily_loadavg {:.3f} "
"or >= 15m avg {:.3f} ) = no",
- loadavgs[0], conf()->osd_scrub_load_threshold, daily_loadavg,
+ loadavgs[0], conf->osd_scrub_load_threshold, daily_loadavg,
loadavgs[2])
<< dendl;
return false;
}
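A worked example of the per-CPU gate (hypothetical numbers; 0.5 is the default
osd_scrub_load_threshold): with 8 online CPUs and a 1-minute loadavg of 3.2,
loadavg_per_cpu = 3.2 / 8 = 0.4 < 0.5, so scrubbing is allowed immediately. The
same loadavg on a 4-CPU node gives 0.8, and the decision falls through to the
comparison against the daily and 15-minute averages (elided in this excerpt).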
+std::ostream& OsdScrub::LoadTracker::gen_prefix(
+ std::ostream& out,
+ std::string_view fn) const
+{
+ return out << log_prefix << fn << ": ";
+}
std::optional<double> OsdScrub::update_load_average()
{
- return m_queue.update_load_average();
+ return m_load_tracker.update_load_average();
}
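For context: OsdScrub::update_load_average() keeps the contract documented on
the (now removed) ScrubQueue method below, i.e. it is called every heartbeat
and returns a value for the OSD logger. A sketch of the assumed heartbeat-side
usage, where 'm_osd_scrub' and the perf-counter update are illustrative only:

  // assumed caller (OSD heartbeat); not part of this patch
  if (auto load = m_osd_scrub.update_load_average(); load) {
    logger->set(l_osd_loadavg, static_cast<uint64_t>(*load));
  }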
-
-
// ////////////////////////////////////////////////////////////////////////// //
// checks for half-closed ranges. Modify the (p < till) to '<=' to check for
}
Scrub::schedule_result_t OsdScrub::select_pg_and_scrub(
- Scrub::OSDRestrictions& preconds)
+ Scrub::OSDRestrictions preconds)
{
return m_queue.select_pg_and_scrub(preconds);
}
Scrub::ScrubSchedListener& m_osd_svc;
const ceph::common::ConfigProxy& conf;
+ /**
+ * check the OSD-wide environment conditions (scrub resources, time, etc.).
+ * These may restrict the type of scrubs we are allowed to start, or just
+ * prevent us from starting any scrub at all.
+ *
+ * Specifically:
+ * a nullopt is returned if we are not allowed to scrub at all, for any of
+ * the following reasons: no local resources (too many scrubs on this OSD);
+ * a dice roll says we will not scrub in this tick;
+ * a recovery is in progress, and we are not allowed to scrub during
+ * recovery; or a PG is trying to acquire replica resources.
+ *
+ * If we are allowed to scrub, the returned value specifies whether only
+ * high-priority scrubs, or only overdue ones, are allowed to proceed.
+ * (A usage sketch follows the class declaration below.)
+ */
+ std::optional<Scrub::OSDRestrictions> restrictions_on_scrubbing(
+ bool is_recovery_active,
+ utime_t scrub_clock_now) const;
+
/// resource reservation management
Scrub::ScrubResources m_resource_bookkeeper;
ScrubQueue m_queue;
public:
- // for this transitory commit only - to be removed
- bool can_inc_scrubs() { return m_resource_bookkeeper.can_inc_scrubs(); }
-
// for this transitory commit only - to be removed
Scrub::schedule_result_t select_pg_and_scrub(
- Scrub::OSDRestrictions& preconds);
+ Scrub::OSDRestrictions preconds);
// for this transitory commit only - to be moved elsewhere
/**
* \returns true with probability of osd_scrub_backoff_ratio.
*/
bool scrub_random_backoff() const;
+
+ /**
+ * Tracks the average CPU load. Used both by the OSD logger and by the
+ * scrub queue (no scrubbing is allowed when the load is too high).
+ */
+ class LoadTracker {
+ CephContext* cct;
+ const ceph::common::ConfigProxy& conf;
+ const std::string log_prefix;
+ double daily_loadavg{0.0};
+
+ public:
+ explicit LoadTracker(
+ CephContext* cct,
+ const ceph::common::ConfigProxy& config,
+ int node_id);
+
+ std::optional<double> update_load_average();
+
+ [[nodiscard]] bool scrub_load_below_threshold() const;
+
+ std::ostream& gen_prefix(std::ostream& out, std::string_view fn) const;
+ };
+ LoadTracker m_load_tracker;
};
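Taken together, the expected call pattern of the scheduling tick looks as
follows; a sketch assembled from the fragments shown earlier. The enclosing
function name (initiate_scrub) and the scrub_random_backoff() gating are
assumptions, not spelled out in this excerpt:

  void OsdScrub::initiate_scrub(bool is_recovery_active)
  {
    if (scrub_random_backoff()) {
      return;  // the dice roll says: not in this tick
    }
    // nullopt here means no scrubbing is allowed at all right now
    auto env_restrictions =
        restrictions_on_scrubbing(is_recovery_active, ceph_clock_now());
    if (!env_restrictions) {
      return;
    }
    auto was_started = select_pg_and_scrub(*env_restrictions);
    dout(20) << fmt::format(
                    "scrub scheduling done ({})",
                    ScrubQueue::attempt_res_text(was_started))
             << dendl;
  }

Note that OSDRestrictions is now passed by value throughout: it is a small
snapshot of the environment, computed once per tick, and no callee is expected
to modify the caller's copy.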
*
* locking: locks jobs_lock
*/
- Scrub::schedule_result_t select_pg_and_scrub(Scrub::OSDRestrictions& preconds);
+ Scrub::schedule_result_t select_pg_and_scrub(Scrub::OSDRestrictions preconds);
/**
* Translate attempt_ values into readable text
void clear_reserving_now();
bool is_reserving_now() const;
-
- bool can_inc_scrubs() const;
- bool inc_scrubs_local();
- void dec_scrubs_local();
- bool inc_scrubs_remote();
- void dec_scrubs_remote();
- void dump_scrub_reservations(ceph::Formatter* f) const;
-
/// counting the number of PGs stuck while scrubbing, waiting for objects
void mark_pg_scrub_blocked(spg_t blocked_pg);
void clear_pg_scrub_blocked(spg_t blocked_pg);
*/
std::chrono::milliseconds scrub_sleep_time(bool must_scrub) const;
- /**
- * called every heartbeat to update the "daily" load average
- *
- * @returns a load value for the logger
- */
- [[nodiscard]] std::optional<double> update_load_average();
-
private:
CephContext* cct;
Scrub::ScrubSchedListener& osd_service;
Scrub::ScrubQContainer penalized; ///< those that failed to reserve remote resources
bool restore_penalized{false};
- double daily_loadavg{0.0};
-
static inline constexpr auto registered_job = [](const auto& jobref) -> bool {
return jobref->state == Scrub::qu_state_t::registered;
};
std::atomic_bool a_pg_is_reserving{false};
- [[nodiscard]] bool scrub_load_below_threshold() const;
[[nodiscard]] bool scrub_time_permit(utime_t now) const;
/**
Scrub::schedule_result_t select_from_group(
Scrub::ScrubQContainer& group,
- const Scrub::OSDRestrictions& preconds,
+ Scrub::OSDRestrictions preconds,
utime_t now_is);
protected: // used by the unit-tests