setup $dir || return 1
run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1
run_mgr $dir x || return 1
- local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 "
+ local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0.1 "
ceph_osd_args+="--osd_scrub_backoff_ratio=0 "
ceph_osd_args+="--osd_stats_update_period_not_scrubbing=3 "
ceph_osd_args+="--osd_stats_update_period_scrubbing=2"
const auto env_restrictions =
restrictions_on_scrubbing(is_recovery_active, scrub_time);
- dout(10) << fmt::format("scrub scheduling (@tick) starts. "
- "time now:{:s}, recovery is active?:{} restrictions:{}",
- scrub_time, is_recovery_active, env_restrictions)
+ dout(10) << fmt::format(
+ "scrub scheduling (@tick) starts. "
+ "time now:{:s}, recovery is active?:{} restrictions:{}",
+ scrub_time, is_recovery_active, env_restrictions)
<< dendl;
if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>() &&
- !env_restrictions.high_priority_only) {
+ !env_restrictions.max_concurrency_reached &&
+ !env_restrictions.random_backoff_active) {
debug_log_all_jobs();
}
ScrubJob::observes_max_concurrency(e.urgency)) {
return false;
}
- if (!r.load_is_low && ScrubJob::observes_random_backoff(e.urgency)) {
+ if (r.random_backoff_active &&
+ ScrubJob::observes_random_backoff(e.urgency)) {
return false;
}
if (!r.time_permit && ScrubJob::observes_allowed_hours(e.urgency)) {
if (!r.load_is_low && ScrubJob::observes_load_limit(e.urgency)) {
return false;
}
- // recovery?
+ if (r.recovery_in_progress && ScrubJob::observes_recovery(e.urgency)) {
+ return false;
+ }
return true;
}
if (!m_resource_bookkeeper.can_inc_scrubs()) {
// our local OSD is already running too many scrubs
dout(15) << "OSD cannot inc scrubs" << dendl;
- env_conditions.high_priority_only = true;
+ env_conditions.max_concurrency_reached = true;
} else if (scrub_random_backoff()) {
// dice-roll says we should not scrub now
- dout(15) << "Lost in dice. Only high priority scrubs allowed."
- << dendl;
- env_conditions.high_priority_only = true;
+ dout(15) << "Lost in dice. Only high priority scrubs allowed." << dendl;
+ env_conditions.random_backoff_active = true;
} else if (is_recovery_active && !conf->osd_scrub_during_recovery) {
if (conf->osd_repair_during_recovery) {
env_conditions.allow_requested_repair_only = true;
} else {
- dout(15) << "recovery in progress. Only high priority scrubs allowed."
+ dout(15) << "recovery in progress. Operator-initiated scrubs only."
<< dendl;
- env_conditions.high_priority_only = true;
+ env_conditions.recovery_in_progress = true;
}
} else {
}
+/**
+ * mark for a deep-scrub after the current scrub ended with errors.
+ * Note that no need to requeue the target, as it will be requeued
+ * when the scrub ends.
+ */
void PgScrubber::request_rescrubbing(requested_scrub_t& request_flags)
{
dout(10) << __func__ << " flags: " << request_flags << dendl;
request_flags.need_auto = true;
- update_scrub_job(delay_ready_t::no_delay);
+ auto& trgt = m_scrub_job->get_target(scrub_level_t::deep);
+ trgt.up_urgency_to(urgency_t::must_repair);
+ trgt.sched_info.schedule.scheduled_at = {0, 0};
+ trgt.sched_info.schedule.not_before = ceph_clock_now();
+ // no need to requeue, as scrub_finish() will do that.
}
m_epoch_start = m_pg->get_osdmap_epoch();
m_flags.check_repair = m_active_target->urgency() == urgency_t::after_repair;
- m_flags.auto_repair = request.auto_repair || request.need_auto;
+ m_flags.auto_repair = false;
m_flags.required = request.req_scrub || request.must_scrub;
m_flags.priority = (request.must_scrub || request.need_auto)
m_is_deep = m_active_target->sched_info.level == scrub_level_t::deep;
if (m_is_deep) {
state_set(PG_STATE_DEEP_SCRUB);
+ if (pg_cond.can_autorepair || request.auto_repair) {
+ m_flags.auto_repair = true;
+ }
} else {
ceph_assert(!request.must_deep_scrub);
ceph_assert(!request.need_auto);
ceph_assert(tr == 0);
}
- // determine the next scrub time
- update_scrub_job(delay_ready_t::delay_ready);
-
if (has_error) {
m_pg->queue_peering_event(PGPeeringEventRef(
std::make_shared<PGPeeringEvent>(get_osdmap_epoch(),
if (do_auto_scrub) {
request_rescrubbing(m_planned_scrub);
}
+ // determine the next scrub time
+ update_scrub_job(delay_ready_t::delay_ready);
if (m_pg->is_active() && m_pg->is_primary()) {
m_pg->recovery_state.share_pg_info();
{
return urgency < urgency_t::after_repair;
}
+
+bool ScrubJob::observes_recovery(urgency_t urgency)
+{
+ return urgency < urgency_t::operator_requested;
+}
* | noscrub | yes | no? | no | no |
* | max-scrubs | yes | yes? | no | no |
* | backoff | yes | no | no | no |
+ * | recovery | yes | no | no | no |
* +------------+------------+--------------+----------+-------------+
*/
static bool observes_max_concurrency(urgency_t urgency);
- static bool observes_random_backoff(urgency_t urgency);};
+ static bool observes_random_backoff(urgency_t urgency);
+
+ static bool observes_recovery(urgency_t urgency);
+};
} // namespace Scrub
namespace std {
/// (note: struct size should be kept small, as it is copied around)
struct OSDRestrictions {
/// high local OSD concurrency. Thus - only high priority scrubs are allowed
- bool high_priority_only{false};
- bool allow_requested_repair_only{false};
- bool only_deadlined{false};
+ bool max_concurrency_reached{false};
+
+ /// rolled a dice, and decided not to scrub in this tick
+ bool random_backoff_active{false};
+
+ /// the OSD is performing recovery & osd_repair_during_recovery is 'true'
+ bool allow_requested_repair_only:1{false};
+
+ /// the load is high, or the time is not right. For periodic scrubs,
+ /// only the overdue ones are allowed.
+ bool only_deadlined:1{false};
bool load_is_low:1{true};
bool time_permit:1{true};
- bool max_concurrency_reached:1{false};
+ /// the OSD is performing a recovery, osd_scrub_during_recovery is 'false',
+ /// and so is osd_repair_during_recovery
bool recovery_in_progress:1{false};
};
static_assert(sizeof(Scrub::OSDRestrictions) <= sizeof(uint32_t));
auto format(const Scrub::OSDRestrictions& conds, FormatContext& ctx) const
{
return fmt::format_to(
- ctx.out(),
- "priority-only:{},overdue-only:{},load:{},time:{},repair-only:{}",
- conds.high_priority_only,
- conds.only_deadlined,
- conds.load_is_low ? "ok" : "high",
- conds.time_permit ? "ok" : "no",
- conds.allow_requested_repair_only);
+ ctx.out(), "<{}.{}.{}.{}.{}.{}>",
+ conds.max_concurrency_reached ? "max-scrubs" : "",
+ conds.random_backoff_active ? "backoff" : "",
+ conds.load_is_low ? "" : "high-load",
+ conds.time_permit ? "" : "time-restrict",
+ conds.recovery_in_progress ? "recovery" : "",
+ conds.allow_requested_repair_only ? "repair-only" : "");
}
};