osd/scrub: support an operator-abort command

author Ronen Friedman <rfriedma@redhat.com>

Thu, 4 Dec 2025 14:49:29 +0000 (08:49 -0600)

committer Ronen Friedman <rfriedma@redhat.com>

Tue, 20 Jan 2026 16:40:03 +0000 (16:40 +0000)
author Ronen Friedman <rfriedma@redhat.com>
Thu, 4 Dec 2025 14:49:29 +0000 (08:49 -0600)
committer Ronen Friedman <rfriedma@redhat.com>
Tue, 20 Jan 2026 16:40:03 +0000 (16:40 +0000)
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc

index 1d6b8898c92b5ce745f8bf41f338af0629ca93fc..1bd1ffc309feb1fef5dc91f65e510871859e3cc6 100644 (file)
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -2065,6 +2065,26 @@ void PgScrubber::on_digest_updates()
    }
  }
  
+bool PgScrubber::downgrade_on_operator_abort(
+    Scrub::SchedTarget& targ,
+    utime_t scrub_clock_now)
+{
+  if (targ.urgency() != urgency_t::operator_requested &&
+      targ.urgency() != urgency_t::must_repair) {
+    return false;  // no need to downgrade
+  }
+
+  targ.sched_info.urgency = urgency_t::periodic_regular;
+  targ.sched_info.schedule.scheduled_at = scrub_clock_now;
+  targ.sched_info.schedule.not_before = scrub_clock_now;
+  dout(10)
+      << fmt::format(
+             "{}: removing operator-requested urgency from target. Updated: {}",
+             __func__, targ)
+      << dendl;
+  return true;
+}
+
  
  /**
   * The scrub session was aborted. We are left with two sets of parameters
@@ -2076,6 +2096,10 @@ void PgScrubber::on_digest_updates()
   * have had its priority, flags, or schedule modified in the meantime.
   * And - it does not (at least initially, i.e. immediately after
   * set_op_parameters()), have high priority.
+ *
+ * Updated functionality ('Tentacle', 2025): if the abort cause was an explicit
+ * operator request - make sure we are not left with a high-priority
+ * target (one that would immediately restart, against the operator's wishes).
   */
  void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue)
  {
@@ -2090,38 +2114,54 @@ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue)
  
    dout(10) << fmt::format(
                   "{}: executing target: {}. Session flags: {} up-to-date job: "
-                 "{}",
-                 __func__, *m_active_target, m_flags, *m_scrub_job)
+                 "{}. Abort cause: {}",
+                 __func__, *m_active_target, m_flags, *m_scrub_job, issue)
            << dendl;
  
    // copy the aborted target
-  const auto aborted_target = *m_active_target;
+  auto aborted_target = *m_active_target;
    m_active_target.reset();
  
    const auto scrub_clock_now = ceph_clock_now();
    auto& current_targ = m_scrub_job->get_target(aborted_target.level());
    ceph_assert(!current_targ.queued);
  
-  // merge the aborted target with the current one
-  auto& curr_sched = current_targ.sched_info.schedule;
-  auto& abrt_sched = aborted_target.sched_info.schedule;
-
-  current_targ.sched_info.urgency =
-      std::max(current_targ.urgency(), aborted_target.urgency());
-  curr_sched.scheduled_at =
-      std::min(curr_sched.scheduled_at, abrt_sched.scheduled_at);
-  curr_sched.not_before =
-      std::min(curr_sched.not_before, abrt_sched.not_before);
-
-  dout(10) << fmt::format(
-                 "{}: merged target (before delay): {}", __func__,
-                 current_targ)
-          << dendl;
+  // if the abort trigger was an explicit operator abort command, and the
+  // aborted target had operator-initiated urgency:
+  // - do not perform a 'merge' of the aborted target and the 'next'
+  //   target in the scrub-job. Instead - just reinstate the 'next' target.
+  //   (the aborted target has more than its urgency attribute wrong. The
+  //   scheduled-at was also made irrelevant by the original operator
+  //   command that initiated the aborted scrub).
+  bool should_merge = true;
+  if (issue == delay_cause_t::operator_abort) {
+    should_merge = !downgrade_on_operator_abort(aborted_target, scrub_clock_now);
+  }
+
+  if (should_merge) {
+    // the regular case. merge the aborted target with the current one
+    auto& curr_sched = current_targ.sched_info.schedule;
+    auto& abrt_sched = aborted_target.sched_info.schedule;
+
+    current_targ.sched_info.urgency =
+        std::max(current_targ.urgency(), aborted_target.urgency());
+    curr_sched.scheduled_at =
+        std::min(curr_sched.scheduled_at, abrt_sched.scheduled_at);
+    curr_sched.not_before =
+        std::min(curr_sched.not_before, abrt_sched.not_before);
+    dout(10) << fmt::format(
+                   "{}: merged target (before delay): {}", __func__,
+                   current_targ)
+            << dendl;
+  } else {
+    dout(10) << fmt::format(
+                    "{}: aborted oper-urgency target discarded: {}",
+                    __func__, current_targ)
+             << dendl;
+  }
  
    // affect a delay, as there was a failure mid-scrub
    m_scrub_job->delay_on_failure(current_targ.level(), issue, scrub_clock_now);
-
-  // reinstate both targets in the queue
    m_osds->get_scrub_services().enqueue_target(current_targ);
    current_targ.queued = true;
  
@@ -2129,7 +2169,13 @@ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue)
    auto& sister = m_scrub_job->get_target(
        aborted_target.level() == scrub_level_t::deep ? scrub_level_t::shallow
                                                     : scrub_level_t::deep);
+  // on an operator-requested abort, a not-yet-queued sister target is
+  // downgraded, too (the scenario we are trying to help the operator with:
+  // recovering from a set of scrub requests issued by mistake).
    if (!sister.queued) {
+    if (issue == delay_cause_t::operator_abort) {
+      downgrade_on_operator_abort(sister, scrub_clock_now);
+    }
      m_osds->get_scrub_services().enqueue_target(sister);
      sister.queued = true;
    }
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h

index e88e604083e648da5c111d42b7e9a8bf60ed1ab8..1f5ec95e841e5ada1f98e6ff5d8dfc323423d122 100644 (file)
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -516,6 +516,17 @@ class PgScrubber : public ScrubPgIF,
  
    void on_mid_scrub_abort(Scrub::delay_cause_t issue) final;
  
+  /**
+   *  An auxiliary function used by on_mid_scrub_abort().
+   *  If the target has operator-initiated urgency (either 'must_repair' -
+   *  operator-requested repair or 'operator_requested' - operator-requested
+   *  scrub) - downgrade it to regular periodic.
+   *  \retval true: the urgency was downgraded
+   */
+  bool downgrade_on_operator_abort(
+    Scrub::SchedTarget& targ,
+    utime_t scrub_clock_now);
+
    ScrubMachineListener::MsgAndEpoch prep_replica_map_msg(
      Scrub::PreemptionNoted was_preempted) final;
  
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc

index 3400641b428dcf323d59645c20d94aa8b9811bed..9f9c50d7289e4a426699b07993c2a17a028880e3 100644 (file)
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -224,6 +224,15 @@ sc::result Session::react(const IntervalChanged&)
    return transit<NotActive>();
  }
  
+sc::result Session::react(const OperatorAbort&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "Session::react(const OperatorAbort&)" << dendl;
+  ceph_assert(m_reservations);
+  m_abort_reason = delay_cause_t::operator_abort;
+  return transit<PrimaryIdle>();
+}
+
  std::optional<pg_scrubbing_status_t> Session::get_reservation_status() const
  {
    if (!m_reservations) {
diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h

index 8c2d99275be41cea243138b1a4bff5efe3acde35..e55d051360cf029cdec1086f493d6c5dd16e10c2 100644 (file)
--- a/src/osd/scrubber/scrub_machine.h
+++ b/src/osd/scrubber/scrub_machine.h
@@ -248,6 +248,12 @@ MEV(IntervalChanged)
   */
  MEV(FullReset)
  
+/**
+ * (Primary only) stops the running scrub. Removes operator-initiated
+ * 'urgency' attributes ('operator_requested' / 'must_repair').
+ */
+MEV(OperatorAbort)
+
  /// finished handling this chunk. Go get the next one
  MEV(NextChunk)
  
@@ -558,10 +564,13 @@ struct Session : sc::state<Session, PrimaryActive, ReservingReplicas>,
    ~Session();
  
    using reactions = mpl::list<sc::transition<FullReset, PrimaryIdle>,
-                              sc::custom_reaction<IntervalChanged>>;
+                              sc::custom_reaction<IntervalChanged>,
+                              sc::custom_reaction<OperatorAbort>>;
  
    sc::result react(const IntervalChanged&);
  
+  sc::result react(const OperatorAbort&);
+
    /// managing the scrub session's reservations (optional, as
    /// it's an RAII wrapper around the state of 'holding reservations')
    std::optional<ReplicaReservations> m_reservations{std::nullopt};
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h

index 9b054f7ab3199c298a2c3d5e16288bb3a370a7f1..b57eec5d81c58281d8805b163002d99826890f2c 100644 (file)
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -219,6 +219,7 @@ enum class delay_cause_t {
    aborted,         ///< scrub was aborted w/ unspecified reason
    interval,        ///< the interval had ended mid-scrub
    scrub_params,     ///< the specific scrub type is not allowed
+  operator_abort    ///< operator-requested abort
  };
  }  // namespace Scrub
  
@@ -242,6 +243,7 @@ struct formatter<Scrub::delay_cause_t> : ::fmt::formatter<std::string_view> {
        case aborted:             desc = "aborted"; break;
        case interval:            desc = "interval"; break;
        case scrub_params:        desc = "scrub-mode"; break;
+      case operator_abort:      desc = "operator-abort"; break;
        // better to not have a default case, so that the compiler will warn
      }
      return ::fmt::formatter<string_view>::format(desc, ctx);
author	Ronen Friedman <rfriedma@redhat.com>
	Thu, 4 Dec 2025 14:49:29 +0000 (08:49 -0600)
committer	Ronen Friedman <rfriedma@redhat.com>
	Tue, 20 Jan 2026 16:40:03 +0000 (16:40 +0000)
src/osd/scrubber/pg_scrubber.cc		patch \| blob \| history
src/osd/scrubber/pg_scrubber.h		patch \| blob \| history
src/osd/scrubber/scrub_machine.cc		patch \| blob \| history
src/osd/scrubber/scrub_machine.h		patch \| blob \| history
src/osd/scrubber_common.h		patch \| blob \| history