}
}
+bool PgScrubber::downgrade_on_operator_abort(
+ Scrub::SchedTarget& targ,
+ utime_t scrub_clock_now)
+{
+ if (targ.urgency() != urgency_t::operator_requested &&
+ targ.urgency() != urgency_t::must_repair) {
+ return false; // no need to downgrade
+ }
+
+ targ.sched_info.urgency = urgency_t::periodic_regular;
+ targ.sched_info.schedule.scheduled_at = scrub_clock_now;
+ targ.sched_info.schedule.not_before = scrub_clock_now;
+ dout(10)
+ << fmt::format(
+ "{}: removing operator-requested urgency from target. Updated: {}",
+ __func__, targ)
+ << dendl;
+ return true;
+}
+
/**
* The scrub session was aborted. We are left with two sets of parameters
* have had its priority, flags, or schedule modified in the meantime.
* And - it does not (at least initially, i.e. immediately after
* set_op_parameters()), have high priority.
+ *
+ * Updated functionality ('Tentacle', 2025): if the abort cause was an explicit
+ * operator request - make sure we are not left with a high-priority
+ * target (one that would immediately restart, against the operator wishes).
*/
void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue)
{
dout(10) << fmt::format(
"{}: executing target: {}. Session flags: {} up-to-date job: "
- "{}",
- __func__, *m_active_target, m_flags, *m_scrub_job)
+ "{}. Abort cause: {}",
+ __func__, *m_active_target, m_flags, *m_scrub_job, issue)
<< dendl;
// copy the aborted target
- const auto aborted_target = *m_active_target;
+ auto aborted_target = *m_active_target;
m_active_target.reset();
const auto scrub_clock_now = ceph_clock_now();
auto& current_targ = m_scrub_job->get_target(aborted_target.level());
ceph_assert(!current_targ.queued);
- // merge the aborted target with the current one
- auto& curr_sched = current_targ.sched_info.schedule;
- auto& abrt_sched = aborted_target.sched_info.schedule;
-
- current_targ.sched_info.urgency =
- std::max(current_targ.urgency(), aborted_target.urgency());
- curr_sched.scheduled_at =
- std::min(curr_sched.scheduled_at, abrt_sched.scheduled_at);
- curr_sched.not_before =
- std::min(curr_sched.not_before, abrt_sched.not_before);
-
- dout(10) << fmt::format(
- "{}: merged target (before delay): {}", __func__,
- current_targ)
- << dendl;
+ // if the abort trigger was an explicit operator abort command, and the
+ // aborted target had operator-initiated urgency:
+ // - do not perform a 'merge' of the aborted target and the 'next'
+ // target in the scrub-job. Instead - just reinstate the 'next' target.
+ // (the aborted target has more than its urgency attribute wrong. The
+ // scheduled-at was also made irrelevant by the original operator
+ // command that initiated the aborted scrub).
+ bool should_merge = true;
+ if (issue == delay_cause_t::operator_abort) {
+ should_merge = !downgrade_on_operator_abort(aborted_target, scrub_clock_now);
+ }
+
+ if (should_merge) {
+ // the regular case. merge the aborted target with the current one
+ auto& curr_sched = current_targ.sched_info.schedule;
+ auto& abrt_sched = aborted_target.sched_info.schedule;
+
+ current_targ.sched_info.urgency =
+ std::max(current_targ.urgency(), aborted_target.urgency());
+ curr_sched.scheduled_at =
+ std::min(curr_sched.scheduled_at, abrt_sched.scheduled_at);
+ curr_sched.not_before =
+ std::min(curr_sched.not_before, abrt_sched.not_before);
+ dout(10) << fmt::format(
+ "{}: merged target (before delay): {}", __func__,
+ current_targ)
+ << dendl;
+ } else {
+ dout(10) << fmt::format(
+ "{}: aborted oper-urgency target discarded: {}",
+ __func__, current_targ)
+ << dendl;
+ }
// affect a delay, as there was a failure mid-scrub
m_scrub_job->delay_on_failure(current_targ.level(), issue, scrub_clock_now);
-
- // reinstate both targets in the queue
m_osds->get_scrub_services().enqueue_target(current_targ);
current_targ.queued = true;
auto& sister = m_scrub_job->get_target(
aborted_target.level() == scrub_level_t::deep ? scrub_level_t::shallow
: scrub_level_t::deep);
+ // if 'operator-aborted' - that one should be downgraded, too (the scenario
+ // we are trying to help the operator with: trying to recover from a set of
+ // scrub requests issued by mistake).
if (!sister.queued) {
+ if (issue == delay_cause_t::operator_abort) {
+ downgrade_on_operator_abort(sister, scrub_clock_now);
+ }
m_osds->get_scrub_services().enqueue_target(sister);
sister.queued = true;
}