osd: modified handling of scrub unexpected events

author Ronen Friedman <rfriedma@redhat.com>

Wed, 2 Dec 2020 08:58:49 +0000 (10:58 +0200)

committer Ronen Friedman <rfriedma@redhat.com>

Thu, 10 Dec 2020 13:21:53 +0000 (15:21 +0200)
author Ronen Friedman <rfriedma@redhat.com>
Wed, 2 Dec 2020 08:58:49 +0000 (10:58 +0200)
committer Ronen Friedman <rfriedma@redhat.com>
Thu, 10 Dec 2020 13:21:53 +0000 (15:21 +0200)
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc

index d0d1f8a6097c272a67c9588f9d4531a30af4826a..8c60f5ea3b96a1694d953e320d88913af99af4c7 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1806,6 +1806,7 @@ void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priorit
  
  void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
  {
+  // Resulting scrub event: 'InternalSchedScrub'
    queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
  }
  
@@ -7433,28 +7434,6 @@ bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs)
    return pgid < rhs.pgid;
  }
  
-// this one is only moved here (from the header) temporarily, for debugging:
-void OSDService::unreg_pg_scrub(spg_t pgid, utime_t t)
-{
-  std::lock_guard l{OSDService::sched_scrub_lock};
-  size_t removed = sched_scrub_pg.erase(ScrubJob{cct, pgid, t});
-  ceph_assert(removed);
-  dout(10) << __func__ << " scrub-set removed: " << pgid << " T(" << t << ")" << dendl;
-}
-
-// this one is only moved here (from the header) temporarily, for debugging:
-utime_t OSDService::reg_pg_scrub(spg_t pgid, utime_t t, double pool_scrub_min_interval,
-                     double pool_scrub_max_interval, bool must)
-{
-  ScrubJob scrub_job(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval,
-                     must);
-  std::lock_guard l(OSDService::sched_scrub_lock);
-  auto [x, inserted] = sched_scrub_pg.insert(scrub_job);
-  dout(10) << __func__ << " scrub-set inserted: " << pgid << " T(" << t << ")" << " must: " << must << " inserted "
-    << inserted << dendl;
-  return scrub_job.sched_time;
-}
-
  void OSDService::dumps_scrub(ceph::Formatter *f)
  {
    ceph_assert(f != nullptr);
diff --git a/src/osd/OSD.h b/src/osd/OSD.h

index b64e35c7075798980c14258e71b420d7046ec83b..25d7359657dd1b828a07b5385333b4982117473b 100644 (file)
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -291,10 +291,25 @@ public:
    };
    std::set<ScrubJob> sched_scrub_pg;
  
-  /// @returns the scrub_reg_stamp used for unregister'ing the scrub job
-  utime_t reg_pg_scrub(spg_t pgid, utime_t t, double pool_scrub_min_interval,
-                      double pool_scrub_max_interval, bool must);
-  void unreg_pg_scrub(spg_t pgid, utime_t t);
+  /// @returns the scrub_reg_stamp used for unregistering the scrub job
+  utime_t reg_pg_scrub(spg_t pgid,
+                      utime_t t,
+                      double pool_scrub_min_interval,
+                      double pool_scrub_max_interval,
+                      bool must) {
+    ScrubJob scrub_job(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval,
+                      must);
+    std::lock_guard l(OSDService::sched_scrub_lock);
+    sched_scrub_pg.insert(scrub_job);
+    return scrub_job.sched_time;
+  }
+
+  void unreg_pg_scrub(spg_t pgid, utime_t t) {
+    std::lock_guard l(sched_scrub_lock);
+    size_t removed = sched_scrub_pg.erase(ScrubJob(cct, pgid, t));
+    ceph_assert(removed);
+  }
+
    bool first_scrub_stamp(ScrubJob *out) {
      std::lock_guard l(sched_scrub_lock);
      if (sched_scrub_pg.empty())
diff --git a/src/osd/PG.cc b/src/osd/PG.cc

index c06187b3106e9fde9b85132d8c7d30a730193244..2c82aac4dbda538ff680cd762efbf3c9af397584 100644 (file)
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -2064,80 +2064,65 @@ void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle)
    m_scrubber->replica_scrub_op(op);
  }
  
-void PG::scrub(epoch_t queued, ThreadPool::TPHandle& handle)
+void PG::scrub(epoch_t epoch_queued, ThreadPool::TPHandle& handle)
  {
-  dout(10) << __func__ << (is_primary() ? " (primary)" : " (replica)") << dendl;
+  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
  
    scrub_queued = false;
+  ceph_assert(is_primary());
+  ceph_assert(!m_scrubber->is_scrub_active());
  
-  if (pg_has_reset_since(queued)) {
-    dout(10) << " pg::scrub reset_since " << __func__ << " " << queued << dendl;
-    dout(10) << " pg::scrub reset_since " << __func__ << " "
-           << recovery_state.get_last_peering_reset() << dendl;
-    m_scrubber->scrub_clear_state(false);
-    return;
-  }
+  // a new scrub
  
-  ceph_assert(
-    is_primary());  // as the replica request should have reached PG::replica_scrub()
+  m_scrubber->reset_epoch(epoch_queued);
  
-  ceph_assert(!m_scrubber->is_scrub_active());
-  // a new scrub
-  m_scrubber->reset_epoch(queued);
-  m_scrubber->send_start_scrub();
+  // note: send_start_scrub() will verify 'epoch queued' against our current interval
+  m_scrubber->send_start_scrub(epoch_queued);
  }
  
  // note: no need to secure OSD resources for a recovery scrub
  void PG::recovery_scrub(epoch_t epoch_queued, ThreadPool::TPHandle& handle)
  {
-  dout(10) << "pg::" << __func__ << " queued at: " << epoch_queued << dendl;
+  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
  
    scrub_queued = false;
-
-  if (pg_has_reset_since(epoch_queued)) {
-    dout(10) << " reset_since " << __func__ << " " << epoch_queued << dendl;
-    dout(10) << " reset_since " << __func__ << " "
-           << recovery_state.get_last_peering_reset() << dendl;
-    return;
-  }
-
    ceph_assert(is_primary());
    ceph_assert(!m_scrubber->is_scrub_active());
  
    // a new scrub
    m_scrubber->reset_epoch(epoch_queued);
-  m_scrubber->send_start_after_repair();
+  m_scrubber->send_start_after_repair(epoch_queued);
  }
  
  void PG::replica_scrub(epoch_t epoch_queued,
                        [[maybe_unused]] ThreadPool::TPHandle& handle)
  {
-  dout(10) << "pg::" << __func__ << " queued at: " << epoch_queued
+  dout(10) << __func__ << " queued at: " << epoch_queued
            << (is_primary() ? " (primary)" : " (replica)") << dendl;
    scrub_queued = false;
-  m_scrubber->replica_scrub(epoch_queued);
+  m_scrubber->send_start_replica(epoch_queued);
  }
  
  void PG::scrub_send_scrub_resched(epoch_t epoch_queued,
                                   [[maybe_unused]] ThreadPool::TPHandle& handle)
  {
-  dout(10) << __func__ << (is_primary() ? " (primary)" : " (replica)") << dendl;
+  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
    scrub_queued = false;
-  m_scrubber->send_scrub_resched();
+  m_scrubber->send_scrub_resched(epoch_queued);
  }
  
  void PG::scrub_send_resources_granted(epoch_t epoch_queued,
                                       [[maybe_unused]] ThreadPool::TPHandle& handle)
  {
    dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
-  m_scrubber->send_remotes_reserved();
+  m_scrubber->send_remotes_reserved(epoch_queued);
  }
  
  void PG::scrub_send_resources_denied(epoch_t epoch_queued,
                                      [[maybe_unused]] ThreadPool::TPHandle& handle)
  {
    dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
-  m_scrubber->send_reservation_failure();
+  m_scrubber->send_reservation_failure(epoch_queued);
  }
  
  void PG::replica_scrub_resched(epoch_t epoch_queued,
@@ -2145,64 +2130,49 @@ void PG::replica_scrub_resched(epoch_t epoch_queued,
  {
    dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
    scrub_queued = false;
-  m_scrubber->replica_scrub_resched(epoch_queued);
+  m_scrubber->send_sched_replica(epoch_queued);
  }
  
  void PG::scrub_send_pushes_update(epoch_t epoch_queued,
                                   [[maybe_unused]] ThreadPool::TPHandle& handle)
  {
    dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
-  if (pg_has_reset_since(epoch_queued)) {
-    dout(10) << __func__ << " been reset at "
-           << recovery_state.get_last_peering_reset() << dendl;
-    return;
-  }
-  m_scrubber->active_pushes_notification();
+  m_scrubber->active_pushes_notification(epoch_queued);
  }
  
  void PG::scrub_send_replica_pushes(epoch_t epoch_queued,
                                    [[maybe_unused]] ThreadPool::TPHandle& handle)
  {
-  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
-  m_scrubber->send_replica_pushes_upd();
+  dout(15) << __func__ << " queued at: " << epoch_queued << dendl;
+  m_scrubber->send_replica_pushes_upd(epoch_queued);
  }
  
  void PG::scrub_send_applied_update(epoch_t epoch_queued,
                                    [[maybe_unused]] ThreadPool::TPHandle& handle)
  {
-  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
-  if (pg_has_reset_since(epoch_queued)) {
-    dout(10) << __func__ << " been reset at "
-           << recovery_state.get_last_peering_reset() << dendl;
-    return;
-  }
+  dout(15) << __func__ << " queued at: " << epoch_queued << dendl;
    m_scrubber->update_applied_notification(epoch_queued);
  }
  
  void PG::scrub_send_unblocking(epoch_t epoch_queued,
                                [[maybe_unused]] ThreadPool::TPHandle& handle)
  {
-  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
-  if (pg_has_reset_since(epoch_queued)) {
-    dout(10) << __func__ << " been reset at "
-           << recovery_state.get_last_peering_reset() << dendl;
-    return;
-  }
-  m_scrubber->send_scrub_unblock();
+  dout(15) << __func__ << " queued at: " << epoch_queued << dendl;
+  m_scrubber->send_scrub_unblock(epoch_queued);
  }
  
  void PG::scrub_send_digest_update(epoch_t epoch_queued,
                                   [[maybe_unused]] ThreadPool::TPHandle& handle)
  {
-  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
-  m_scrubber->digest_update_notification();
+  dout(15) << __func__ << " queued at: " << epoch_queued << dendl;
+  m_scrubber->digest_update_notification(epoch_queued);
  }
  
  void PG::scrub_send_replmaps_ready(epoch_t epoch_queued,
                                    [[maybe_unused]] ThreadPool::TPHandle& handle)
  {
-  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
-  m_scrubber->send_replica_maps_ready();
+  dout(15) << __func__ << " queued at: " << epoch_queued << dendl;
+  m_scrubber->send_replica_maps_ready(epoch_queued);
  }
  
  bool PG::ops_blocked_by_scrub() const
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc

index 2df61761b0dcbb2466c87432d8fd751f91521fa3..8cb6495a51a33525154ebd9a2a31511a27ae5fa6 100644 (file)
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -1601,12 +1601,13 @@ int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
      r = -EAGAIN;
    } else {
      bool store_queried = m_scrubber->get_store_errors(arg, result);
-    if (!store_queried) {
+    if (store_queried) {
+      encode(result, osd_op->outdata); 
+    } else {
        // the scrubber's store is not initialized
        r = -ENOENT;
      }
    }
-  encode(result, osd_op->outdata);  // RRR really? even if no store?
  
    return r;
  }
diff --git a/src/osd/PrimaryLogScrub.cc b/src/osd/PrimaryLogScrub.cc

index 6cafa256a5e236bce64e9004b7242777d67eb392..39a16a3a3166fcf4c1e005dd1976ff14ea5cc0c1 100644 (file)
--- a/src/osd/PrimaryLogScrub.cc
+++ b/src/osd/PrimaryLogScrub.cc
@@ -554,6 +554,7 @@ void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap,
      }
      m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
  
+    ++num_digest_updates_pending;
      ctx->register_on_success([this]() {
        dout(20) << "updating scrub digest " << num_digest_updates_pending << dendl;
        if (--num_digest_updates_pending <= 0) {
@@ -561,20 +562,17 @@ void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap,
        }
      });
  
-    ++num_digest_updates_pending;
      m_pl_pg->simple_opc_submit(std::move(ctx));
    }
  
    dout(10) << __func__ << " (" << mode << ") finish" << dendl;
  }
  
-PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg)
-    : PgScrubber{pg}, m_pl_pg{pg}
-{}
+PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg) : PgScrubber{pg}, m_pl_pg{pg} {}
  
  void PrimaryLogScrub::_scrub_clear_state()
  {
-  dout(7) << __func__ << " - pg(" << m_pl_pg->pg_id << dendl;
+  dout(15) << __func__ << dendl;
    m_scrub_cstat = object_stat_collection_t();
  }
  
diff --git a/src/osd/pg_scrubber.cc b/src/osd/pg_scrubber.cc

index 2afa689aed4b764ecd25857e41d3e80f126551c0..8894106dca09449ab48529a08da43ac8c8a871d8 100644 (file)
--- a/src/osd/pg_scrubber.cc
+++ b/src/osd/pg_scrubber.cc
@@ -1,5 +1,5 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
+// -*- mode:C++; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=2 sw=2 smarttab
  
  #include "pg_scrubber.h"
  
@@ -72,123 +72,245 @@ ostream& operator<<(ostream& out, const requested_scrub_t& sf)
    return out;
  }
  
-bool PgScrubber::is_event_relevant(epoch_t queued) const
+// returns false if the message should be discarded. Handles the notification of interval
+// change, if not done already. called only for active scrub? not sure.
+
+// let's first make this a Primary-only function
+bool PgScrubber::check_interval(epoch_t epoch_to_verify)
  {
-  return is_primary() && m_pg->is_active() && m_pg->is_clean() && is_scrub_active() &&
-        !was_epoch_changed() && (!queued || !m_pg->pg_has_reset_since(queued));
+  const auto current_interval = m_pg->get_same_interval_since();
+
+  if (epoch_to_verify < current_interval) {
+    // the event will not be delivered. If we have already noticed and handled
+    // the change of seasons, it will be silently discarded. Otherwise - we
+    // reset the scrubber and its FSM.
+    dout(10) << __func__ << " stale message. epoch: " << epoch_to_verify << " vs. "
+            << current_interval << " (handled: " << m_last_dead_interval << ")" << dendl;
+
+    if (epoch_to_verify > m_last_dead_interval) {
+
+      // we have not seen this interval change yet.
+      // The remote reservations are no longer relevant.
+
+      m_last_dead_interval = current_interval;
+
+      // clear the remote reservations. No need to send messages.
+      if (m_reservations) {
+       m_reservations->discard_all();
+      }
+
+      // stop the scrub and send a reset message to the FSM
+      scrub_clear_state();
+    }
+    return false;
+  }
+
+  return true;
  }
  
-bool PgScrubber::should_abort_scrub(epoch_t queued) const
+bool PgScrubber::check_interval_replica(epoch_t epoch_to_verify)
  {
-  dout(10) << __func__ << "(): queued:" << queued << " required: " << m_flags.required
-          << " noscrub: " << get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) << " / "
-          << m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB) << dendl;
+  const auto current_interval = m_pg->get_same_interval_since();
+
+  if (epoch_to_verify < current_interval) {
+    // the event will not be delivered. If we have already noticed and handled
+    // the change of seasons, it will be silently discarded. Otherwise - we
+    // reset the scrubber and its FSM.
+    dout(10) << __func__ << " stale message. epoch: " << epoch_to_verify << " vs. "
+            << current_interval << " (handled: " << m_last_dead_interval << ")" << dendl;
  
-  if (!is_primary() || !m_pg->is_active() ||
-      (queued && m_pg->pg_has_reset_since(queued))) {
+    if (epoch_to_verify > m_last_dead_interval) {
+
+      // we have not seen this interval change yet.
+      // The remote reservations are no longer relevant.
+
+      m_last_dead_interval = current_interval;
+
+      // clear the remote reservations. No need to send messages
+      m_remote_osd_resource.reset();
+
+      // stop the scrub and send a reset message to the FSM
+      // replica_handling_done();
+      send_interval_changed();
+    }
+    return false;
+  }
+
+  // verify that we are reserved by the primary
+  // not true anymore (see rapair scrubs) ceph_assert(m_remote_osd_resource &&
+  // m_remote_osd_resource->is_reserved());
+
+  return true;
+}
+
+bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify)
+{
+  if (!m_active) {
+    // not scrubbing. We can assume that the scrub was already terminated, and we
+    // can silently discard the incoming event.
+    return false;
+  }
+
+  // is this a message from before we started this scrub?
+  if (epoch_to_verify < m_epoch_start) {
+    return false;
+  }
+
+  // check for reasons to abort this scrub
+
+  // has a new interval started?
+  if (!check_interval(epoch_to_verify)) {
+    // if this is a new interval, check_interval() just discarded
+    // remote resources and then killed the scrub
+    return false;
+  }
+
+  ceph_assert(is_primary());
+
+  // were we instructed to abort?
+  return verify_against_abort(epoch_to_verify);
+}
+
+
+// false if the message was discarded because of an abort flag.
+// Reset everything if the abort was not handled before.
+bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify)
+{
+  if (!should_abort()) {
      return true;
    }
  
+  dout(15) << __func__ << " aborting. incoming epoch: " << epoch_to_verify
+          << "vs last-aborted: " << m_last_aborted << dendl;
+
+  // if we were not aware of the abort before - kill the scrub.
+  if (epoch_to_verify > m_last_aborted) {
+    scrub_clear_state();
+    m_last_aborted = std::max(epoch_to_verify, m_epoch_start);
+  }
+  return false;
+}
+
+bool PgScrubber::should_abort() const
+{
    if (m_flags.required) {
      return false;  // not stopping 'required' scrubs for configuration changes
    }
  
-  if (state_test(PG_STATE_DEEP_SCRUB)) {
+  if (m_is_deep) {
      if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
         m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
        dout(10) << "nodeep_scrub set, aborting" << dendl;
        return true;
      }
-  } else if (state_test(PG_STATE_SCRUBBING)) {
-    if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
-       m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) {
-      dout(10) << "noscrub set, aborting" << dendl;
-      return true;
-    }
+  }
+
+  if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+      m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) {
+    dout(10) << "noscrub set, aborting" << dendl;
+    return true;
    }
  
    return false;
  }
  
-void PgScrubber::send_start_scrub()
+//   sending (processing) state-machine events --------------------------------
+
+/*
+ * a note re the checks performed before sending scrub-initiating messages:
+ *
+ * For those ('StartScrub', 'AfterRepairScrub') scrub-initiation messages that
+ * possibly were in the queue while the PG changed state and became unavailable for
+ * scrubbing:
+ *
+ * The check_interval() catches all major changes to the PG. As for the other conditions
+ * we may check (and see is_message_relevant() above):
+ *
+ * - we are not 'active' yet, so must check against is_active(), andL
+ *
+ * - the 'abort' flags were just verified (when the triggering message was queued). As
+ *   those are only modified in human speeds - they need not be queried again.
+ *
+ * Some of the considerations above are also relevant to the replica-side initiation
+ * ('StartReplica' & 'StartReplicaNoWait').
+ */
+
+
+void PgScrubber::send_start_scrub(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  if (should_abort_scrub(epoch_t(0))) {
-    dout(10) << __func__ << " aborting!" << dendl;
-    scrub_clear_state(false);
-  } else {
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  if (check_interval(epoch_queued)) {
      m_fsm->my_states();
      m_fsm->process_event(StartScrub{});
    }
    dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-void PgScrubber::send_start_after_repair()
+void PgScrubber::send_start_after_repair(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  m_fsm->my_states();
-  m_fsm->process_event(AfterRepairScrub{});
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  if (check_interval(epoch_queued)) {
+    m_fsm->my_states();
+    m_fsm->process_event(AfterRepairScrub{});
+  }
    dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-void PgScrubber::send_scrub_unblock()
+void PgScrubber::send_scrub_unblock(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  if (should_abort_scrub(epoch_t(0))) {
-
-    dout(10) << __func__ << " aborting!" << dendl;
-    scrub_clear_state(false);
-
-  } else if (is_scrub_active()) {
-
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  if (is_message_relevant(epoch_queued)) {
      m_fsm->my_states();
      m_fsm->process_event(Unblocked{});
-
-  } else {
-    dout(10) << __func__ << " ignored as scrub not active" << dendl;
    }
    dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-void PgScrubber::send_scrub_resched()
+void PgScrubber::send_scrub_resched(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  if (should_abort_scrub(epoch_t(0))) {
-    dout(10) << __func__ << " aborting!" << dendl;
-    scrub_clear_state(false);
-  } else if (is_scrub_active()) {
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  if (is_message_relevant(epoch_queued)) {
      m_fsm->my_states();
      m_fsm->process_event(InternalSchedScrub{});
-  } else {
-    // no need to send anything
-    dout(10) << __func__ << " event no longer relevant" << dendl;
    }
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-void PgScrubber::send_start_replica()
+void PgScrubber::send_start_replica(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  m_fsm->my_states();
-  m_fsm->process_event(StartReplica{});
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  if (is_primary()) {
+    // shouldn't happen. Ignore
+    dout(1) << "got a replica scrub request while Primary!" << dendl;
+    return;
+  }
+  if (check_interval_replica(epoch_queued)) {
+    m_fsm->my_states();
+    // buy us some time by not waiting for updates if there are none
+    // to wait for. Affects the transition from NotActive into either
+    // ReplicaWaitUpdates or ActiveReplica.
+    if (pending_active_pushes())
+      m_fsm->process_event(StartReplica{});
+    else
+      m_fsm->process_event(StartReplicaNoWait{});
+  }
    dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-void PgScrubber::send_sched_replica()
+void PgScrubber::send_sched_replica(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  m_fsm->my_states();
-  m_fsm->process_event(SchedReplica{});         // retest for map availability
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  if (check_interval_replica(epoch_queued)) {
+    m_fsm->my_states();
+    m_fsm->process_event(SchedReplica{});  // retest for map availability
+  }
    dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-void PgScrubber::active_pushes_notification()
+void PgScrubber::active_pushes_notification(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  if (should_abort_scrub(epoch_t(0))) {
-    dout(10) << __func__ << " aborting!" << dendl;
-    scrub_clear_state(false);
-  } else {
+  // note: Primary only
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  if (is_message_relevant(epoch_queued)) {
      m_fsm->my_states();
      m_fsm->process_event(ActivePushesUpd{});
    }
@@ -197,82 +319,76 @@ void PgScrubber::active_pushes_notification()
  
  void PgScrubber::update_applied_notification(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << "() epoch: " << epoch_queued << dendl;
-  if (should_abort_scrub(epoch_queued)) {
-    dout(10) << __func__ << " aborting!" << dendl;
-    scrub_clear_state(false);
-  } else {
+  // note: Primary only
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  if (is_message_relevant(epoch_queued)) {
      m_fsm->my_states();
      m_fsm->process_event(UpdatesApplied{});
    }
    dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-void PgScrubber::digest_update_notification()
+void PgScrubber::digest_update_notification(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  m_fsm->my_states();
-  if (is_event_relevant(epoch_t(0))) {
+  // note: Primary only
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  if (is_message_relevant(epoch_queued)) {
+    m_fsm->my_states();
      m_fsm->process_event(DigestUpdate{});
-  } else {
-    // no need to send anything
-    dout(10) << __func__ << " event no longer relevant" << dendl;
    }
    dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-void PgScrubber::send_epoch_changed()
+// no checks should be performed here
+void PgScrubber::send_interval_changed()
  {
    dout(10) << "scrubber event -->> " << __func__ << dendl;
-  if (is_scrub_active()) {
-    m_fsm->my_states();
-    m_fsm->process_event(EpochChanged{});
-  }
+  m_fsm->my_states();
+  m_fsm->process_event(IntervalChanged{});
    dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-void PgScrubber::send_replica_maps_ready()
+void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  m_fsm->my_states();
-  if (is_scrub_active()) {
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  if (is_message_relevant(epoch_queued)) {
+    m_fsm->my_states();
      m_fsm->process_event(GotReplicas{});
    }
    dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-void PgScrubber::send_replica_pushes_upd()
+void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  m_fsm->my_states();
-  if (is_scrub_active()) {
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  if (check_interval_replica(epoch_queued)) {
+    m_fsm->my_states();
      m_fsm->process_event(ReplicaPushesUpd{});
    }
    dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-void PgScrubber::send_remotes_reserved()
+void PgScrubber::send_remotes_reserved(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  m_fsm->my_states();
-  m_fsm->process_event(RemotesReserved{});  // note: too early to check for 'active'!
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  // note: scrub is not active yet
+  if (check_interval(epoch_queued)) {
+    m_fsm->my_states();
+    m_fsm->process_event(RemotesReserved{});
+  }
    dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-void PgScrubber::send_reservation_failure()
+void PgScrubber::send_reservation_failure(epoch_t epoch_queued)
  {
-  dout(10) << "scrubber event -->> " << __func__ << dendl;
-  m_fsm->my_states();
-  m_fsm->process_event(ReservationFailure{});  // do not check for 'active'!
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  if (check_interval(epoch_queued)) {  // do not check for 'active'!
+    m_fsm->my_states();
+    m_fsm->process_event(ReservationFailure{});
+  }
    dout(10) << "scrubber event --<< " << __func__ << dendl;
  }
  
-bool PgScrubber::is_scrub_active() const
-{
-  dout(10) << " " << __func__ << " actv? " << m_active << "pg:" << m_pg->pg_id << dendl;
-  return m_active;
-}
-
  bool PgScrubber::is_reserving() const
  {
    return m_fsm->is_reserving();
@@ -280,16 +396,11 @@ bool PgScrubber::is_reserving() const
  
  void PgScrubber::reset_epoch(epoch_t epoch_queued)
  {
-  dout(10) << __func__ << " PG( " << m_pg->pg_id
-          << (m_pg->is_primary() ? ") prm" : ") rpl") << " epoch: " << epoch_queued
-          << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl;
-
-  dout(10) << __func__ << " STATE_SCRUBBING? " << state_test(PG_STATE_SCRUBBING) << dendl;
-  m_epoch_queued = epoch_queued;
-  m_needs_sleep = true;
-
+  dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl;
    m_fsm->assert_not_active();
  
+  m_epoch_start = epoch_queued;
+  m_needs_sleep = true;
    m_is_deep = state_test(PG_STATE_DEEP_SCRUB);
  }
  
@@ -315,7 +426,7 @@ unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priorit
  }
  
  // ///////////////////////////////////////////////////////////////////// //
-// scrub op registration handling
+// scrub-op registration handling
  
  bool PgScrubber::is_scrub_registered() const
  {
@@ -350,9 +461,9 @@ void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags)
      reg_stamp = m_pg->info.history.last_scrub_stamp;
    }
  
-  dout(9) << __func__ << " pg(" << m_pg_id << ") must: " << must
-         << " required:" << m_flags.required << " flags: " << request_flags
-         << " stamp: " << reg_stamp << dendl;
+  dout(15) << __func__ << " pg(" << m_pg_id << ") must: " << must
+          << " required:" << m_flags.required << " flags: " << request_flags
+          << " stamp: " << reg_stamp << dendl;
  
    // note down the sched_time, so we can locate this scrub, and remove it
    // later on.
@@ -401,7 +512,7 @@ void PgScrubber::scrub_requested(scrub_level_t scrub_level,
  
  void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags)
  {
-  dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << " ## "
+  dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? "
            << is_scrub_registered() << dendl;
  
    unreg_next_scrub();
@@ -437,6 +548,7 @@ bool PgScrubber::has_pg_marked_new_updates() const
  void PgScrubber::set_subset_last_update(eversion_t e)
  {
    m_subset_last_update = e;
+  dout(15) << __func__ << " last-update: " << e << dendl;
  }
  
  /*
@@ -618,9 +730,6 @@ void PgScrubber::add_delayed_scheduling()
    }
  }
  
-/**
- *  walk the log to find the latest update that affects our chunk
- */
  eversion_t PgScrubber::search_log_for_updates() const
  {
    auto& projected = m_pg->projected_log.log;
@@ -646,7 +755,8 @@ eversion_t PgScrubber::search_log_for_updates() const
  
  bool PgScrubber::get_replicas_maps(bool replica_can_preempt)
  {
-  dout(10) << __func__ << " epoch_start: " << m_epoch_start
+  dout(10) << __func__ << " started in epoch/interval: " << m_epoch_start << "/"
+          << m_interval_start
            << " pg same_interval_since: " << m_pg->info.history.same_interval_since
            << dendl;
  
@@ -673,10 +783,10 @@ bool PgScrubber::get_replicas_maps(bool replica_can_preempt)
  bool PgScrubber::was_epoch_changed() const
  {
    // for crimson we have m_pg->get_info().history.same_interval_since
-  dout(10) << __func__ << " epoch_start: " << m_epoch_start
+  dout(10) << __func__ << " epoch_start: " << m_interval_start
            << " from pg: " << m_pg->get_history().same_interval_since << dendl;
  
-  return m_epoch_start < m_pg->get_history().same_interval_since;
+  return m_interval_start < m_pg->get_history().same_interval_since;
  }
  
  void PgScrubber::mark_local_map_ready()
@@ -705,10 +815,10 @@ void PgScrubber::_request_scrub_map(pg_shard_t replica,
    dout(10) << __func__ << " scrubmap from osd." << replica
            << (deep ? " deep" : " shallow") << dendl;
  
-  auto repscrubop = new MOSDRepScrub(
-    spg_t(m_pg->info.pgid.pgid, replica.shard), version, m_pg->get_osdmap_epoch(),
-    m_pg->get_last_peering_reset(), start, end, deep, allow_preemption, m_flags.priority,
-    m_pg->ops_blocked_by_scrub());
+  auto repscrubop =
+    new MOSDRepScrub(spg_t(m_pg->info.pgid.pgid, replica.shard), version,
+                    get_osdmap_epoch(), m_pg->get_last_peering_reset(), start, end, deep,
+                    allow_preemption, m_flags.priority, m_pg->ops_blocked_by_scrub());
  
    // default priority. We want the replica-scrub processed prior to any recovery
    // or client io messages (we are holding a lock!)
@@ -738,9 +848,10 @@ void PgScrubber::on_init()
  
    preemption_data.reset();
    m_pg->publish_stats_to_osd();
-  m_epoch_start = m_pg->get_history().same_interval_since;
+  m_interval_start = m_pg->get_history().same_interval_since;
+  // m_epoch_started = m_pg->get_osdmap_epoch();
  
-  dout(10) << __func__ << " start same_interval:" << m_epoch_start << dendl;
+  dout(10) << __func__ << " start same_interval:" << m_interval_start << dendl;
  
    //  create a new store
    {
@@ -752,6 +863,7 @@ void PgScrubber::on_init()
    }
  
    m_start = m_pg->info.pgid.pgid.get_hobj_start();
+  m_last_dead_interval = get_osdmap_epoch();
    m_active = true;
  }
  
@@ -759,6 +871,8 @@ void PgScrubber::on_replica_init()
  {
    ceph_assert(!m_active);
    m_active = true;
+  m_last_dead_interval = get_osdmap_epoch();  // so that check_interval_replica() won't
+                                             // kill a scrub for stale messages
  }
  
  void PgScrubber::_scan_snaps(ScrubMap& smap)
@@ -766,9 +880,9 @@ void PgScrubber::_scan_snaps(ScrubMap& smap)
    hobject_t head;
    SnapSet snapset;
  
-  // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
-  // caller using clean_meta_map(), and it works properly.
-  dout(15) << __func__ << " starts" << dendl;
+  // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings
+  // in this function
+  dout(15) << "_scan_snaps starts" << dendl;
  
    for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) {
  
@@ -866,16 +980,15 @@ int PgScrubber::build_primary_map_chunk()
                                    m_end, m_is_deep);
  
    if (ret == -EINPROGRESS)
-    m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority);
+    m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::low_priority);
  
    return ret;
  }
  
  int PgScrubber::build_replica_map_chunk()
  {
-  dout(10) << __func__ << " epoch start: " << m_epoch_start << " ep q: " << m_epoch_queued
-          << dendl;
-  dout(10) << __func__ << " deep: " << m_is_deep << dendl;
+  dout(10) << __func__ << " interval start: " << m_interval_start
+          << " epoch: " << m_epoch_start << " deep: " << m_is_deep << dendl;
  
    auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end,
                                    m_is_deep);
@@ -885,10 +998,9 @@ int PgScrubber::build_replica_map_chunk()
      // finished!
      // In case we restarted smaller chunk, clear old data
  
-    ScrubMap for_meta_scrub;
      m_cleaned_meta_map.clear_from(m_start);
      m_cleaned_meta_map.insert(replica_scrubmap);
-    clean_meta_map(for_meta_scrub);
+    auto for_meta_scrub = clean_meta_map();
      _scan_snaps(for_meta_scrub);
    }
  
@@ -949,13 +1061,18 @@ int PgScrubber::build_scrub_map_chunk(
    return 0;
  }
  
-/**
- * \todo describe what we are doing here
+/*
+ * Process:
+ * Building a map of objects suitable for snapshot validation.
+ * The data in m_cleaned_meta_map is the left over partial items that need to
+ * be completed before they can be processed.
   *
- * @param for_meta_scrub
+ * Snapshots in maps precede the head object, which is why we are scanning backwards.
   */
-void PgScrubber::clean_meta_map(ScrubMap& for_meta_scrub)
+ScrubMap PgScrubber::clean_meta_map()
  {
+  ScrubMap for_meta_scrub;
+
    if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) {
      m_cleaned_meta_map.swap(for_meta_scrub);
    } else {
@@ -976,6 +1093,8 @@ void PgScrubber::clean_meta_map(ScrubMap& for_meta_scrub)
      for_meta_scrub.objects.insert(begin, iter);
      m_cleaned_meta_map.objects.erase(begin, iter);
    }
+
+  return for_meta_scrub;
  }
  
  void PgScrubber::run_callbacks()
@@ -996,9 +1115,9 @@ void PgScrubber::maps_compare_n_cleanup()
    requeue_waiting();
  }
  
-Scrub::preemption_t* PgScrubber::get_preemptor()
+Scrub::preemption_t& PgScrubber::get_preemptor()
  {
-  return &preemption_data;
+  return preemption_data;
  }
  
  void PgScrubber::requeue_replica(Scrub::scrub_prio_t is_high_priority)
@@ -1018,23 +1137,29 @@ void PgScrubber::replica_scrub_op(OpRequestRef op)
    dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch
            << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl;
  
+  // are we still processing a previous scrub-map request without noticing that the
+  // interval changed? won't see it here, but rather at the reservation stage.
+
+
    if (msg->map_epoch < m_pg->info.history.same_interval_since) {
      dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch
              << " < " << m_pg->info.history.same_interval_since << dendl;
+
+    // is there a general sync issue? are we holding a stale reservation?
+    // not checking now - assuming we will actively react to interval change.
+
      return;
    }
  
    replica_scrubmap = ScrubMap{};
    replica_scrubmap_pos = ScrubMapBuilder{};
  
-  // m_replica_epoch_start is overwritten if requeued waiting for active pushes
-  m_replica_epoch_start = m_pg->info.history.same_interval_since;
    m_replica_min_epoch = msg->min_epoch;
    m_start = msg->start;
    m_end = msg->end;
    m_max_end = msg->end;
    m_is_deep = msg->deep;
-  m_epoch_start = m_pg->info.history.same_interval_since;
+  m_interval_start = m_pg->info.history.same_interval_since;
    m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority
                                                   : Scrub::scrub_prio_t::low_priority;
    m_flags.priority = msg->priority ? msg->priority : m_pg->get_scrub_priority();
@@ -1050,55 +1175,14 @@ void PgScrubber::replica_scrub_op(OpRequestRef op)
    m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority);
  }
  
-void PgScrubber::replica_scrub(epoch_t epoch_queued)
-{
-  dout(10) << __func__ << ": " << m_pg->pg_id << " epoch queued: " << epoch_queued
-          << dendl;
-  dout(20) << __func__ << " m_epoch_start: " << m_epoch_start
-          << " better be >= " << m_pg->info.history.same_interval_since << dendl;
-  dout(20) << __func__ << " m_is_deep: " << m_is_deep << dendl;
-
-  if (m_pg->pg_has_reset_since(epoch_queued)) {
-    dout(10) << "replica_scrub(epoch,) - reset!" << dendl;
-    send_epoch_changed();
-    return;
-  }
-
-  if (was_epoch_changed()) {
-    dout(10) << "replica_scrub(epoch,) - epoch!" << dendl;
-    send_epoch_changed();
-    return;
-  }
-  ceph_assert(!is_primary());  // as should have been caught by the epoch-changed check
-
-  send_start_replica();
-}
-
-void PgScrubber::replica_scrub_resched(epoch_t epoch_queued)
-{
-  dout(10) << __func__ << ": " << m_pg->pg_id << " epoch queued: " << epoch_queued
-          << dendl;
-
-  if (m_pg->pg_has_reset_since(epoch_queued)) {
-    dout(10) << "replica_scrub(epoch,) - reset!" << dendl;
-    send_epoch_changed();
-    return;
-  }
-
-  if (was_epoch_changed()) {
-    dout(10) << __func__ << " epoch changed!" << dendl;
-    send_epoch_changed();
-    return;
-  }
-  ceph_assert(!is_primary());  // as should have been caught by the epoch-changed check
-
-  send_sched_replica();
-}
-
  void PgScrubber::set_op_parameters(requested_scrub_t& request)
  {
    dout(10) << __func__ << " input: " << request << dendl;
  
+  // write down the epoch of starting a new scrub. Will be used
+  // to discard stale messages from previous aborted scrubs.
+  m_epoch_start = m_pg->get_osdmap_epoch();
+
    m_flags.check_repair = request.check_repair;
    m_flags.auto_repair = request.auto_repair || request.need_auto;
    m_flags.required = request.req_scrub || request.must_scrub;
@@ -1124,10 +1208,6 @@ void PgScrubber::set_op_parameters(requested_scrub_t& request)
    request = requested_scrub_t{};
  }
  
-/**
- *  RRR \todo ask why we collect from acting+recovery+backfill, but use the size of
- *  only the acting set
- */
  void PgScrubber::scrub_compare_maps()
  {
    dout(10) << __func__ << " has maps, analyzing" << dendl;
@@ -1163,7 +1243,7 @@ void PgScrubber::scrub_compare_maps()
      m_osds->clog->warn(ss);
    }
  
-  if (m_pg->recovery_state.get_acting().size() > 1) {
+  if (m_pg->recovery_state.get_acting_recovery_backfill().size() > 1) {
  
      dout(10) << __func__ << "  comparing replica scrub maps" << dendl;
  
@@ -1202,8 +1282,7 @@ void PgScrubber::scrub_compare_maps()
      }
    }
  
-  ScrubMap for_meta_scrub;
-  clean_meta_map(for_meta_scrub);
+  auto for_meta_scrub = clean_meta_map();
  
    // ok, do the pg-type specific scrubbing
  
@@ -1227,38 +1306,30 @@ void PgScrubber::scrub_compare_maps()
    }
  }
  
-void PgScrubber::replica_update_start_epoch()
-{
-  dout(10) << __func__ << " start:" << m_pg->info.history.same_interval_since << dendl;
-  m_replica_epoch_start = m_pg->info.history.same_interval_since;
-}
-
  /**
   * Send the requested map back to the primary (or - if we
   * were preempted - let the primary know).
   */
-void PgScrubber::send_replica_map(bool was_preempted)
+void PgScrubber::send_replica_map(PreemptionNoted was_preempted)
  {
-  dout(10) << __func__ << " min epoch:" << m_replica_min_epoch
-          << " epoch_start:" << m_replica_epoch_start << dendl;
+  dout(10) << __func__ << " min epoch:" << m_replica_min_epoch << dendl;
  
    auto reply = new MOSDRepScrubMap(spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard),
                                    m_replica_min_epoch, m_pg_whoami);
  
-  reply->preempted = was_preempted;
+  reply->preempted = (was_preempted == PreemptionNoted::preempted);
    ::encode(replica_scrubmap, reply->get_data());
  
    m_osds->send_message_osd_cluster(m_pg->get_primary().osd, reply, m_replica_min_epoch);
  }
  
-/**
+/*
   *  - if the replica lets us know it was interrupted, we mark the chunk as interrupted.
   *    The state-machine will react to that when all replica maps are received.
   *  - when all maps are received, we signal the FSM with the GotReplicas event (see
   *    scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the
   *    FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to
- *    handle (well - the incoming message is marked for fast dispatching, which is an
- *    even better reason for handling it via the queue).
+ *    handle.
   */
  void PgScrubber::map_from_replica(OpRequestRef op)
  {
@@ -1276,8 +1347,14 @@ void PgScrubber::map_from_replica(OpRequestRef op)
    m_received_maps[m->from].decode(p, m_pg->info.pgid.pool());
    dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl;
  
-  [[maybe_unused]] auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from);
-  ceph_assert(is_ok);  // and not an error message, following the original code
+  auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from);
+  if (!is_ok) {
+    // previously an unexpected map was triggering an assert. Now, as scrubs can be
+    // aborted at any time, the chances of this happening have increased, and aborting is
+    // not justified
+    dout(1) << __func__ << err_txt << " from OSD " << m->from << dendl;
+    return;
+  }
  
    if (m->preempted) {
      dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
@@ -1286,30 +1363,50 @@ void PgScrubber::map_from_replica(OpRequestRef op)
    }
  
    if (m_maps_status.are_all_maps_available()) {
-    dout(10) << __func__ << " osd-queuing GotReplicas" << dendl;
+    dout(15) << __func__ << " all repl-maps available" << dendl;
      m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops());
    }
  }
  
-/**
- *  we are a replica being asked by the Primary to reserve OSD resources for
- * scrubbing
- */
  void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
  {
    dout(10) << __func__ << " " << *op->get_req() << dendl;
    op->mark_started();
+  auto request_ep = op->get_req<MOSDScrubReserve>()->get_map_epoch();
+
+  /*
+   *  if we are currently holding a reservation, then:
+   *  either (1) we, the scrubber, did not yet notice an interval change. The remembered
+   *  reservation epoch is from before our interval, and we can silently discard the
+   *  reservation (no message is required).
+   *  or:
+   *  (2) the interval hasn't changed, but the same Primary that (we think) holds the
+   *  lock just sent us a new request. Note that we know it's the same Primary, as
+   *  otherwise the interval would have changed.
+   *  Ostensibly we can discard & redo the reservation. But then we
+   *  will be temporarily releasing the OSD resource - and might not be able to grab it
+   *  again. Thus we simple treat this as a successful new request.
+   */
+
+  if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) {
+    // we are holding a stale reservation from a past epoch
+    m_remote_osd_resource.reset();
+  }
  
-  if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_reserved()) {
-    dout(10) << __func__ << " ignoring reserve request: Already reserved" << dendl;
+  if (request_ep < m_pg->get_same_interval_since()) {
+    // will not ack stale requests
      return;
    }
  
    bool granted{false};
+  if (m_remote_osd_resource.has_value()) {
  
-  if (m_pg->cct->_conf->osd_scrub_during_recovery || !m_osds->is_recovery_active()) {
+    dout(10) << __func__ << " already reserved." << dendl;
+    granted = true;
  
-    m_remote_osd_resource.emplace(m_pg, m_osds);
+  } else if (m_pg->cct->_conf->osd_scrub_during_recovery ||
+            !m_osds->is_recovery_active()) {
+    m_remote_osd_resource.emplace(m_pg, m_osds, request_ep);
      // OSD resources allocated?
      granted = m_remote_osd_resource->is_reserved();
      if (!granted) {
@@ -1321,9 +1418,8 @@ void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
  
    dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
  
-  auto m = op->get_req<MOSDScrubReserve>();
    Message* reply = new MOSDScrubReserve(
-    spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), m->map_epoch,
+    spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), request_ep,
      granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami);
  
    m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
@@ -1337,7 +1433,8 @@ void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
    if (m_reservations.has_value()) {
      m_reservations->handle_reserve_grant(op, from);
    } else {
-    derr << __func__ << ": replica scrub reservations that will be leaked!" << dendl;
+    derr << __func__ << ": received unsolicited reservation grant from osd " << from
+        << " (" << op << ")" << dendl;
    }
  }
  
@@ -1632,7 +1729,7 @@ Scrub::FsmNext PgScrubber::on_digest_updates()
  void PgScrubber::dump(ceph::Formatter* f) const
  {
    f->open_object_section("scrubber");
-  f->dump_stream("epoch_start") << m_epoch_start;
+  f->dump_stream("epoch_start") << m_interval_start;
    f->dump_bool("active", m_active);
    if (m_active) {
      f->dump_stream("start") << m_start;
@@ -1671,7 +1768,7 @@ void PgScrubber::handle_query_state(ceph::Formatter* f)
    dout(10) << __func__ << dendl;
  
    f->open_object_section("scrub");
-  f->dump_stream("scrubber.epoch_start") << m_epoch_start;
+  f->dump_stream("scrubber.epoch_start") << m_interval_start;
    f->dump_bool("scrubber.active", m_active);
    f->dump_stream("scrubber.start") << m_start;
    f->dump_stream("scrubber.end") << m_end;
@@ -1701,7 +1798,6 @@ PgScrubber::PgScrubber(PG* pg)
      , m_pg_id{pg->pg_id}
      , m_osds{m_pg->osd}
      , m_pg_whoami{pg->pg_whoami}
-    , m_epoch_queued{0}
      , preemption_data{pg}
  {
    dout(20) << " creating PgScrubber for " << pg->pg_id << " / " << m_pg_whoami << dendl;
@@ -1715,7 +1811,6 @@ void PgScrubber::reserve_replicas()
    m_reservations.emplace(m_pg, m_pg_whoami);
  }
  
-//  called only for normal end-of-scrub, and only for a Primary
  void PgScrubber::cleanup_on_finish()
  {
    dout(10) << __func__ << dendl;
@@ -1728,7 +1823,7 @@ void PgScrubber::cleanup_on_finish()
    m_reservations.reset();
    m_local_osd_resource.reset();
  
-  m_pg->requeue_ops(m_pg->waiting_for_scrub);
+  requeue_waiting();
  
    reset_internal_state();
    // type-specific state clear
@@ -1736,31 +1831,31 @@ void PgScrubber::cleanup_on_finish()
  }
  
  // uses process_event(), so must be invoked externally
-void PgScrubber::scrub_clear_state(bool keep_repair_state)
+void PgScrubber::scrub_clear_state()
  {
    dout(10) << __func__ << dendl;
  
-  clear_pgscrub_state(keep_repair_state);
+  clear_pgscrub_state();
    m_fsm->process_event(FullReset{});
  }
  
  /*
   * note: does not access the state-machine
   */
-void PgScrubber::clear_pgscrub_state(bool keep_repair_state)
+void PgScrubber::clear_pgscrub_state()
  {
    dout(10) << __func__ << dendl;
    ceph_assert(m_pg->is_locked());
  
    state_clear(PG_STATE_SCRUBBING);
    state_clear(PG_STATE_DEEP_SCRUB);
-  if (!keep_repair_state)
-    state_clear(PG_STATE_REPAIR);
+
+  state_clear(PG_STATE_REPAIR);
  
    clear_scrub_reservations();
    m_pg->publish_stats_to_osd();
  
-  m_pg->requeue_ops(m_pg->waiting_for_scrub);
+  requeue_waiting();
  
    reset_internal_state();
  
@@ -1775,8 +1870,6 @@ void PgScrubber::replica_handling_done()
    state_clear(PG_STATE_SCRUBBING);
    state_clear(PG_STATE_DEEP_SCRUB);
  
-  // make sure we cleared the reservations!
-
    preemption_data.reset();
    m_maps_status.reset();
    m_received_maps.clear();
@@ -1858,7 +1951,7 @@ ostream& operator<<(ostream& out, const PgScrubber& scrubber)
  
  ostream& PgScrubber::show(ostream& out) const
  {
-  return out << " [ " << m_pg_id << ": " << /*for now*/ m_flags << " ] ";
+  return out << " [ " << m_pg_id << ": " << m_flags << " ] ";
  }
  
  // ///////////////////// preemption_data_t //////////////////////////////////
@@ -1930,26 +2023,12 @@ void ReplicaReservations::send_reject()
    m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
  }
  
-void ReplicaReservations::release_all()
+void ReplicaReservations::discard_all()
  {
    dout(10) << __func__ << " " << m_reserved_peers << dendl;
  
    m_had_rejections = true;  // preventing late-coming responses from triggering events
-  epoch_t epoch = m_pg->get_osdmap_epoch();
-
-  for (auto p : m_reserved_peers) {
-    release_replica(p, epoch);
-  }
    m_reserved_peers.clear();
-
-  // note: the release will follow on the heels of the request. When tried otherwise,
-  // grants that followed a reject arrived after the whole scrub machine-state was
-  // reset, causing leaked reservations.
-  if (m_pending) {
-    for (auto p : m_waited_for_peers) {
-      release_replica(p, epoch);
-    }
-  }
    m_waited_for_peers.clear();
  }
  
@@ -1960,7 +2039,22 @@ ReplicaReservations::~ReplicaReservations()
    // send un-reserve messages to all reserved replicas. We do not wait for answer (there
    // wouldn't be one). Other incoming messages will be discarded on the way, by our
    // owner.
-  release_all();
+  dout(10) << __func__ << " " << m_reserved_peers << dendl;
+
+  epoch_t epoch = m_pg->get_osdmap_epoch();
+
+  for (auto& p : m_reserved_peers) {
+    release_replica(p, epoch);
+  }
+  m_reserved_peers.clear();
+
+  // note: the release will follow on the heels of the request. When tried otherwise,
+  // grants that followed a reject arrived after the whole scrub machine-state was
+  // reset, causing leaked reservations.
+  for (auto& p : m_waited_for_peers) {
+    release_replica(p, epoch);
+  }
+  m_waited_for_peers.clear();
  }
  
  /**
@@ -2021,17 +2115,17 @@ void ReplicaReservations::handle_reserve_reject(OpRequestRef op, pg_shard_t from
    } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
              m_reserved_peers.end()) {
  
-    dout(15) << " already had osd." << from << " reserved" << dendl;
+    dout(10) << " already had osd." << from << " reserved" << dendl;
  
    } else {
  
      dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
      m_had_rejections = true;  // preventing any additional notifications
-    --m_pending;             // not sure we need this bookkeeping anymore
      send_reject();
    }
  }
  
+
  // ///////////////////// LocalReservation //////////////////////////////////
  
  LocalReservation::LocalReservation(PG* pg, OSDService* osds)
@@ -2048,7 +2142,7 @@ LocalReservation::LocalReservation(PG* pg, OSDService* osds)
    m_holding_local_reservation = true;
  }
  
-void LocalReservation::early_release()
+LocalReservation::~LocalReservation()
  {
    if (m_holding_local_reservation) {
      m_holding_local_reservation = false;
@@ -2057,17 +2151,11 @@ void LocalReservation::early_release()
    }
  }
  
-LocalReservation::~LocalReservation()
-{
-  early_release();
-}
-
  
  // ///////////////////// ReservedByRemotePrimary ///////////////////////////////
  
-ReservedByRemotePrimary::ReservedByRemotePrimary(PG* pg, OSDService* osds)
-    : m_pg{pg} // holding the "whole PG" for dout() sake
-    , m_osds{osds}
+ReservedByRemotePrimary::ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch)
+    : m_pg{pg}, m_osds{osds}, m_reserved_at{epoch}
  {
    if (!m_osds->inc_scrubs_remote()) {
      dout(10) << __func__ << ": failed to reserve at Primary request" << dendl;
@@ -2079,10 +2167,13 @@ ReservedByRemotePrimary::ReservedByRemotePrimary(PG* pg, OSDService* osds)
    m_reserved_by_remote_primary = true;
  }
  
-void ReservedByRemotePrimary::early_release()
+bool ReservedByRemotePrimary::is_stale() const
+{
+  return m_reserved_at < m_pg->get_same_interval_since();
+}
+
+ReservedByRemotePrimary::~ReservedByRemotePrimary()
  {
-  dout(20) << "ReservedByRemotePrimary::" << __func__ << ": "
-          << m_reserved_by_remote_primary << dendl;
    if (m_reserved_by_remote_primary) {
      m_reserved_by_remote_primary = false;
      m_osds->dec_scrubs_remote();
@@ -2090,11 +2181,6 @@ void ReservedByRemotePrimary::early_release()
    }
  }
  
-ReservedByRemotePrimary::~ReservedByRemotePrimary()
-{
-  early_release();
-}
-
  // ///////////////////// MapsCollectionStatus ////////////////////////////////
  
  auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from)
@@ -2106,7 +2192,7 @@ auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from)
      m_maps_awaited_for.erase(fe);
      return std::tuple{true, ""sv};
    } else {
-    return std::tuple{false, "unsolicited scrub-map"sv};
+    return std::tuple{false, " unsolicited scrub-map"sv};
    }
  }
  
@@ -2136,4 +2222,4 @@ ostream& operator<<(ostream& out, const MapsCollectionStatus& sf)
    return out << " ] ";
  }
  
-}  // namespace Scrub
+}  // namespace Scrub
+\ No newline at end of file
diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h

index e3936691578fcc416040f8ed2f59975b58cc0aca..f6536dba6ef0b7bb352969fa3e37dbe05fa3d098 100644 (file)
--- a/src/osd/pg_scrubber.h
+++ b/src/osd/pg_scrubber.h
@@ -53,9 +53,15 @@ class ReplicaReservations {
    /// notify the scrubber that we have failed to reserve replicas' resources
    void send_reject();
  
-  void release_all();
-
   public:
+  /**
+   *  quietly discard all knowledge about existing reservations. No messages
+   *  are sent to peers.
+   *  To be used upon interval change, as we know the the running scrub is no longer
+   *  relevant, and that the replicas had reset the reservations on their side.
+   */
+  void discard_all();
+
    ReplicaReservations(PG* pg, pg_shard_t whoami);
  
    ~ReplicaReservations();
@@ -77,7 +83,6 @@ class LocalReservation {
    LocalReservation(PG* pg, OSDService* osds);
    ~LocalReservation();
    bool is_reserved() const { return m_holding_local_reservation; }
-  void early_release();
  };
  
  /**
@@ -87,12 +92,15 @@ class ReservedByRemotePrimary {
    PG* m_pg;
    OSDService* m_osds;
    bool m_reserved_by_remote_primary{false};
+  const epoch_t m_reserved_at;
  
   public:
-  ReservedByRemotePrimary(PG* pg, OSDService* osds);
+  ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch);
    ~ReservedByRemotePrimary();
    [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; }
-  void early_release();
+
+  /// compare the remembered reserved-at epoch to the current interval
+  [[nodiscard]] bool is_stale() const;
  };
  
  /**
@@ -183,23 +191,27 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
    /// are we waiting for resource reservation grants form our replicas?
    [[nodiscard]] bool is_reserving() const final;
  
-  void send_start_scrub() final;
+  void send_start_scrub(epoch_t epoch_queued) final;
  
-  void send_start_after_repair() final;
+  void send_start_after_repair(epoch_t epoch_queued) final;
  
-  void send_scrub_resched() final;
+  void send_scrub_resched(epoch_t epoch_queued) final;
  
-  void active_pushes_notification() final;
+  void active_pushes_notification(epoch_t epoch_queued) final;
  
    void update_applied_notification(epoch_t epoch_queued) final;
  
-  void send_scrub_unblock() final;
+  void send_scrub_unblock(epoch_t epoch_queued) final;
+
+  void digest_update_notification(epoch_t epoch_queued) final;
  
-  void digest_update_notification() final;
+  void send_replica_maps_ready(epoch_t epoch_queued) final;
  
-  void send_replica_maps_ready() final;
+  void send_start_replica(epoch_t epoch_queued) final;
  
-  void send_replica_pushes_upd() final;
+  void send_sched_replica(epoch_t epoch_queued) final;
+
+  void send_replica_pushes_upd(epoch_t epoch_queued) final;
  
    void reset_epoch(epoch_t epoch_queued) final;
  
@@ -213,7 +225,12 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
    /// true if the given range intersects the scrub interval in any way
    bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;
  
+  /**
+   *  we are a replica being asked by the Primary to reserve OSD resources for
+   *  scrubbing
+   */
    void handle_scrub_reserve_request(OpRequestRef op) final;
+
    void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
    void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
    void handle_scrub_reserve_release(OpRequestRef op) final;
@@ -245,8 +262,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
    // used if we are a replica
  
    void replica_scrub_op(OpRequestRef op) final;
-  void replica_scrub(epoch_t epoch_queued) final;
-  void replica_scrub_resched(epoch_t epoch_queued) final;
  
    /// the op priority, taken from the primary's request message
    Scrub::scrub_prio_t replica_op_priority() const final
@@ -279,7 +294,7 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
      return false;
    }
  
-  void scrub_clear_state(bool keep_repair_state = false) final;
+  void scrub_clear_state() final;
  
    /**
     *  add to scrub statistics, but only if the soid is below the scrub start
@@ -319,8 +334,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
      return m_pg->recovery_state.get_last_update_applied();
    }
  
-  void requeue_waiting() const final { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
-
    int pending_active_pushes() const final { return m_pg->active_pushes; }
  
    void scrub_compare_maps() final;
@@ -331,7 +344,7 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
  
    /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
    /// (thus can be called from FSM reactions)
-  void clear_pgscrub_state(bool keep_repair_state) final;
+  void clear_pgscrub_state() final;
  
    void add_delayed_scheduling() final;
  
@@ -344,10 +357,10 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
  
    Scrub::FsmNext on_digest_updates() final;
  
-  void send_replica_map(bool was_preempted) final;
+  void send_replica_map(Scrub::PreemptionNoted was_preempted) final;
  
-  void send_remotes_reserved() final;
-  void send_reservation_failure() final;
+  void send_remotes_reserved(epoch_t epoch_queued) final;
+  void send_reservation_failure(epoch_t epoch_queued) final;
  
    /**
     *  does the PG have newer updates than what we (the scrubber) know?
@@ -356,11 +369,9 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
  
    void set_subset_last_update(eversion_t e) final;
  
-  void replica_update_start_epoch() final;
-
    void maps_compare_n_cleanup() final;
  
-  Scrub::preemption_t* get_preemptor() final;
+  Scrub::preemption_t& get_preemptor() final;
  
    int build_primary_map_chunk() final;
  
@@ -400,32 +411,61 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
  
    virtual ~PgScrubber();  // must be defined separately, in the .cc file
  
-  [[nodiscard]] bool is_scrub_active() const final;
+  [[nodiscard]] bool is_scrub_active() const final { return m_active; }
  
   private:
    void reset_internal_state();
  
-  void _scan_snaps(ScrubMap& smap);  // note that the (non-standard for a
-                                    // non-virtual) name of the function is searched
-                                    // for by the QA standalone tests. Do not modify.
+  void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
  
-  void clean_meta_map(ScrubMap& for_meta_scrub);
+  void _scan_snaps(ScrubMap& smap);
+
+  ScrubMap clean_meta_map();
  
    void run_callbacks();
  
+  void send_interval_changed();
+
+  // -----     methods used to verify the relevance of incoming events:
+
    /**
-   * are we still a clean & healthy scrubbing primary?
+   *  is the incoming event still relevant, and should be processed?
     *
-   * relevant only after the initial sched_scrub
+   *  It isn't if:
+   *  - (1) we are no longer 'actively scrubbing'; or
+   *  - (2) the message is from an epoch prior to when we started the current scrub
+   * session; or
+   *  - (3) the message epoch is from a previous interval; or
+   *  - (4) the 'abort' configuration flags were set.
+   *
+   *  For (1) & (2) - teh incoming message is discarded, w/o further action.
+   *
+   *  For (3): (see check_interval() for a full description) if we have not reacted yet
+   *  to this specific new interval, we do now:
+   *  - replica reservations are silently discarded (we count on the replicas to notice
+   *        the interval change and un-reserve themselves);
+   *  - the scrubbing is halted.
+   *
+   *  For (4): the message will be discarded, but also:
+   *    if this is the first time we've noticed the 'abort' request, we perform the abort.
+   *
+   *  \returns should the incoming event be processed?
     */
-  [[nodiscard]] bool is_event_relevant(epoch_t queued) const;
+  bool is_message_relevant(epoch_t epoch_to_verify);
  
    /**
     * check the 'no scrub' configuration options.
     */
-  [[nodiscard]] bool should_abort_scrub(epoch_t queued) const;
+  [[nodiscard]] bool should_abort() const;
+  [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify);
+
+  bool check_interval(epoch_t epoch_to_verify);
+  bool check_interval_replica(epoch_t epoch_to_verify);
+
+  epoch_t m_last_dead_interval{};
+  epoch_t m_last_aborted{};  // last time we've noticed a request to abort
+
  
-  void send_epoch_changed();
  
    /**
     * return true if any inconsistency/missing is repaired, false otherwise
@@ -435,6 +475,8 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
    bool m_needs_sleep{true};  ///< should we sleep before being rescheduled? always
                              ///< 'true', unless we just got out of a sleep period
  
+  utime_t m_sleep_started_at;
+
  
    // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed'
    // to guarantee un-reserving when deleted.
@@ -450,8 +492,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
    /// the part that actually finalizes a scrub
    void scrub_finish();
  
-  utime_t m_sleep_started_at;
-
   protected:
    PG* const m_pg;
  
@@ -478,13 +518,18 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
    OSDService* const m_osds;
    const pg_shard_t m_pg_whoami;         ///< a local copy of m_pg->pg_whoami;
  
-  epoch_t m_epoch_start;  ///< epoch when scrubbing was first scheduled
-  epoch_t m_epoch_queued;
+  epoch_t m_interval_start;  ///< interval's 'from' of when scrubbing was first scheduled
+  /*
+   * the exact epoch when the scrubbing actually started (started here - cleared checks
+   *  for no-scrub conf). Incoming events are verified against this, with stale events
+   *  discarded.
+   */
+  epoch_t m_epoch_start{0};  ///< the actual epoch when scrubbing started
    scrub_flags_t m_flags;
  
    bool m_active{false};
  
-  eversion_t m_subset_last_update;
+  eversion_t m_subset_last_update{};
  
    std::unique_ptr<Scrub::Store> m_store;
  
@@ -499,10 +544,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
  
    CephContext* get_pg_cct() const { return m_pg->cct; }
  
-  void send_start_replica();
-
-  void send_sched_replica();
-
    // collected statistics
    int m_shallow_errors{0};
    int m_deep_errors{0};
@@ -522,8 +563,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
     */
    bool m_is_deep{false};
  
-  inline static int fake_count{2};  // unit-tests. To be removed
-
    /**
     * initiate a deep-scrub after the current scrub ended with errors.
     */
@@ -568,11 +607,10 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
  
    // ------------ members used if we are a replica
  
-  epoch_t m_replica_epoch_start;
    epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message
  
-  ScrubMapBuilder replica_scrubmap_pos;         /// \todo document
-  ScrubMap replica_scrubmap;            /// \todo document
+  ScrubMapBuilder replica_scrubmap_pos;
+  ScrubMap replica_scrubmap;
    /**
     * we mark the request priority as it arrived. It influences the queuing priority
     * when we wait for local updates
diff --git a/src/osd/scrub_machine.cc b/src/osd/scrub_machine.cc

index 53c4427d489f6f9917bcc70cb0d52c7de97a2d94..f9c3a0495fa99f055b376e3168d5e6256cb376f5 100644 (file)
--- a/src/osd/scrub_machine.cc
+++ b/src/osd/scrub_machine.cc
@@ -30,13 +30,11 @@ namespace Scrub {
  
  // --------- trace/debug auxiliaries -------------------------------
  
-// development code. To be removed
  void on_event_creation(std::string_view nm)
  {
    dout(20) << " scrubberFSM event: --vvvv---- " << nm << dendl;
  }
  
-// development code. To be removed
  void on_event_discard(std::string_view nm)
  {
    dout(20) << " scrubberFSM event: --^^^^---- " << nm << dendl;
@@ -74,12 +72,12 @@ template <class T> static ostream& _prefix(std::ostream* _dout, T* t)
  
  NotActive::NotActive(my_context ctx) : my_base(ctx)
  {
-  dout(10) << " -- state -->> NotActive" << dendl;
+  dout(10) << "-- state -->> NotActive" << dendl;
  }
  
-sc::result NotActive::react(const EpochChanged&)
+sc::result NotActive::react(const IntervalChanged&)
  {
-  dout(15) << "NotActive::react(const EpochChanged&)" << dendl;
+  dout(15) << "NotActive::react(const IntervalChanged&)" << dendl;
    return discard_event();
  }
  
@@ -87,31 +85,18 @@ sc::result NotActive::react(const EpochChanged&)
  
  ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx)
  {
-  dout(10) << " -- state -->> ReservingReplicas" << dendl;
+  dout(10) << "-- state -->> ReservingReplicas" << dendl;
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
    scrbr->reserve_replicas();
  }
  
-/**
- *  at least one replica denied us the scrub resources we've requested
- */
  sc::result ReservingReplicas::react(const ReservationFailure&)
  {
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
    dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl;
  
    // the Scrubber must release all resources and abort the scrubbing
-  scrbr->clear_pgscrub_state(false);
-  return transit<NotActive>();
-}
-
-sc::result ReservingReplicas::react(const EpochChanged&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "ReservingReplicas::react(const EpochChanged&)" << dendl;
-
-  // the Scrubber must release all resources and abort the scrubbing
-  scrbr->clear_pgscrub_state(false);
+  scrbr->clear_pgscrub_state();
    return transit<NotActive>();
  }
  
@@ -128,7 +113,7 @@ sc::result ReservingReplicas::react(const FullReset&)
  
  ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx)
  {
-  dout(10) << " -- state -->> ActiveScrubbing" << dendl;
+  dout(10) << "-- state -->> ActiveScrubbing" << dendl;
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
    scrbr->on_init();
  }
@@ -143,25 +128,23 @@ ActiveScrubbing::~ActiveScrubbing()
    scrbr->unreserve_replicas();
  }
  
-void ScrubMachine::down_on_epoch_change(const EpochChanged&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << __func__ << dendl;
-  scrbr->unreserve_replicas();
-}
-
-void ScrubMachine::on_epoch_changed(const EpochChanged&)
+/*
+ * The only source of an InternalError event as of now is the BuildMap state,
+ * when encountering a backend error.
+ * We kill the scrub and reset the FSM.
+ */
+sc::result ActiveScrubbing::react(const InternalError&)
  {
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
    dout(10) << __func__ << dendl;
-  // the Scrubber must release all resources and abort the scrubbing
-  scrbr->clear_pgscrub_state(false);
+  scrbr->clear_pgscrub_state();
+  return transit<NotActive>();
  }
  
  sc::result ActiveScrubbing::react(const FullReset&)
  {
    dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl;
-  // caller takes care of this: scrbr->clear_pgscrub_state(false);
+  // caller takes care of clearing the scrubber & FSM states
    return transit<NotActive>();
  }
  
@@ -173,7 +156,7 @@ sc::result ActiveScrubbing::react(const FullReset&)
   */
  RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx)
  {
-  dout(10) << " -- state -->> Act/RangeBlocked" << dendl;
+  dout(10) << "-- state -->> Act/RangeBlocked" << dendl;
  }
  
  // ----------------------- PendingTimer -----------------------------------
@@ -183,7 +166,7 @@ RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx)
   */
  PendingTimer::PendingTimer(my_context ctx) : my_base(ctx)
  {
-  dout(10) << " -- state -->> Act/PendingTimer" << dendl;
+  dout(10) << "-- state -->> Act/PendingTimer" << dendl;
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
  
    scrbr->add_delayed_scheduling();
@@ -198,10 +181,10 @@ PendingTimer::PendingTimer(my_context ctx) : my_base(ctx)
   */
  NewChunk::NewChunk(my_context ctx) : my_base(ctx)
  {
-  dout(10) << " -- state -->> Act/NewChunk" << dendl;
+  dout(10) << "-- state -->> Act/NewChunk" << dendl;
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
  
-  scrbr->get_preemptor()->adjust_parameters();
+  scrbr->get_preemptor().adjust_parameters();
  
    //  choose range to work on
    bool got_a_chunk = scrbr->select_range();
@@ -278,15 +261,7 @@ sc::result WaitLastUpdate::react(const InternalAllUpdates&)
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
    dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl;
  
-  if (scrbr->was_epoch_changed()) {
-    dout(10) << "WaitLastUpdate: epoch!" << dendl;
-    post_event(boost::intrusive_ptr<EpochChanged>(new EpochChanged{}));
-    return discard_event();
-  }
-
-  dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&) "
-          << scrbr->get_preemptor()->is_preemptable() << dendl;
-  scrbr->get_replicas_maps(scrbr->get_preemptor()->is_preemptable());
+  scrbr->get_replicas_maps(scrbr->get_preemptor().is_preemptable());
    return transit<BuildMap>();
  }
  
@@ -296,14 +271,11 @@ BuildMap::BuildMap(my_context ctx) : my_base(ctx)
  {
    dout(10) << " -- state -->> Act/BuildMap" << dendl;
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(15) << __func__ << " same epoch? " << (scrbr->was_epoch_changed() ? "no" : "yes")
-          << dendl;
-
-  if (scrbr->was_epoch_changed()) {
  
-    post_event(boost::intrusive_ptr<EpochChanged>(new EpochChanged{}));
+  // no need to check for an epoch change, as all possible flows that brought us here have
+  // an check_interval() verification of their final event.
  
-  } else if (scrbr->get_preemptor()->was_preempted()) {
+  if (scrbr->get_preemptor().was_preempted()) {
  
      // we were preempted, either directly or by a replica
      dout(10) << __func__ << " preempted!!!" << dendl;
@@ -322,7 +294,7 @@ BuildMap::BuildMap(my_context ctx) : my_base(ctx)
      } else if (ret < 0) {
  
        dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl;
-      scrbr->mark_local_map_ready();
+      // scrbr->mark_local_map_ready();
        post_event(boost::intrusive_ptr<InternalError>(new InternalError{}));
  
      } else {
@@ -346,7 +318,7 @@ sc::result BuildMap::react(const IntLocalMapDone&)
  
  DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx)
  {
-  dout(10) << " -- state -->> Act/DrainReplMaps" << dendl;
+  dout(10) << "-- state -->> Act/DrainReplMaps" << dendl;
    // we may have received all maps already. Send the event that will make us check.
    post_event(boost::intrusive_ptr<GotReplicas>(new GotReplicas{}));
  }
@@ -361,7 +333,7 @@ sc::result DrainReplMaps::react(const GotReplicas&)
      return transit<PendingTimer>();
    }
  
-  dout(10) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: "
+  dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: "
            << scrbr->dump_awaited_maps() << dendl;
    return discard_event();
  }
@@ -370,7 +342,7 @@ sc::result DrainReplMaps::react(const GotReplicas&)
  
  WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx)
  {
-  dout(10) << " -- state -->> Act/WaitReplicas" << dendl;
+  dout(10) << "-- state -->> Act/WaitReplicas" << dendl;
    post_event(boost::intrusive_ptr<GotReplicas>(new GotReplicas{}));
  }
  
@@ -383,7 +355,7 @@ sc::result WaitReplicas::react(const GotReplicas&)
      dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl;
  
      // were we preempted?
-    if (scrbr->get_preemptor()->disable_and_test()) {  // a test&set
+    if (scrbr->get_preemptor().disable_and_test()) {  // a test&set
  
  
        dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" << dendl;
@@ -391,7 +363,6 @@ sc::result WaitReplicas::react(const GotReplicas&)
  
      } else {
  
-      dout(8) << "got the replicas!" << dendl;
        scrbr->maps_compare_n_cleanup();
        return transit<WaitDigestUpdate>();
      }
@@ -404,7 +375,7 @@ sc::result WaitReplicas::react(const GotReplicas&)
  
  WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx)
  {
-  dout(10) << " -- state -->> Act/WaitDigestUpdate" << dendl;
+  dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl;
    // perform an initial check: maybe we already
    // have all the updates we need:
    // (note that DigestUpdate is usually an external event)
@@ -453,16 +424,18 @@ ScrubMachine::~ScrubMachine()
  
  ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx)
  {
-  dout(10) << " -- state -->> ReplicaWaitUpdates" << dendl;
+  dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl;
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-
    scrbr->on_replica_init();
-  post_event(boost::intrusive_ptr<ReplicaPushesUpd>(new ReplicaPushesUpd{}));
  }
  
-sc::result ReplicaWaitUpdates::react(const EpochChanged&)
+sc::result ReplicaWaitUpdates::react(const IntervalChanged&)
  {
-  dout(10) << "ReplicaWaitUpdates::react(const EpochChanged&)" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "ReplicaWaitUpdates::react(const IntervalChanged&)" << dendl;
+
+  // note: the master's reservation of us was just discarded by our caller
+  scrbr->replica_handling_done();
    return transit<NotActive>();
  }
  
@@ -474,27 +447,32 @@ sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&)
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
    dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): "
            << scrbr->pending_active_pushes() << dendl;
-  dout(8) << "same epoch? " << !scrbr->was_epoch_changed() << dendl;
  
-  if (scrbr->was_epoch_changed()) {
-
-    post_event(boost::intrusive_ptr<EpochChanged>(new EpochChanged{}));
-
-  } else if (scrbr->pending_active_pushes() == 0) {
+  if (scrbr->pending_active_pushes() == 0) {
  
      // done waiting
-    scrbr->replica_update_start_epoch();
      return transit<ActiveReplica>();
    }
  
    return discard_event();
  }
  
+/**
+ * the event poster is handling the scrubber reset
+ */
+sc::result ReplicaWaitUpdates::react(const FullReset&)
+{
+  dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl;
+  return transit<NotActive>();
+}
+
  // ----------------------- ActiveReplica -----------------------------------
  
  ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx)
  {
-  dout(10) << " -- state -->> ActiveReplica" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "-- state -->> ActiveReplica" << dendl;
+  scrbr->on_replica_init();  // as we might have skipped ReplicaWaitUpdates
    post_event(boost::intrusive_ptr<SchedReplica>(new SchedReplica{}));
  }
  
@@ -502,82 +480,52 @@ sc::result ActiveReplica::react(const SchedReplica&)
  {
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
    dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? "
-          << scrbr->get_preemptor()->is_preemptable() << dendl;
-
-  if (scrbr->was_epoch_changed()) {
-
-    dout(10) << "epoch changed" << dendl;
-    post_event(boost::intrusive_ptr<EpochChanged>(new EpochChanged{}));
-
-  } else if (scrbr->get_preemptor()->was_preempted()) {
+          << scrbr->get_preemptor().is_preemptable() << dendl;
  
+  if (scrbr->get_preemptor().was_preempted()) {
      dout(10) << "replica scrub job preempted" << dendl;
  
-    scrbr->send_replica_map(true);
-    post_event(boost::intrusive_ptr<IntLocalMapDone>(new IntLocalMapDone{}));
-
-  } else {
-
-    // start or check progress of build_replica_map_chunk()
-
-    auto ret = scrbr->build_replica_map_chunk();
-    dout(15) << "ActiveReplica::react(const SchedReplica&) Ret: " << ret << dendl;
-
-    if (ret == -EINPROGRESS) {
-
-      // must wait for the backend to finish. No external event source.
-      // build_replica_map_chunk() has already requeued a SchedReplica
-      // event.
-
-      dout(20) << "waiting for the backend..." << dendl;
-
-    } else if (ret < 0) {
-
-      //  the existing code ignores this option, treating an error
-      //  report as a success.
-      ///  \todo what should we do here?
+    scrbr->send_replica_map(PreemptionNoted::preempted);
+    scrbr->replica_handling_done();
+    return transit<NotActive>();
+  }
  
-      dout(1) << "Error! Aborting. ActiveReplica::react(const "
-                "SchedReplica&) Ret: "
-             << ret << dendl;
-      post_event(boost::intrusive_ptr<IntLocalMapDone>(new IntLocalMapDone{}));
+  // start or check progress of build_replica_map_chunk()
  
-    } else {
+  auto ret = scrbr->build_replica_map_chunk();
+  dout(15) << "ActiveReplica::react(const SchedReplica&) Ret: " << ret << dendl;
  
-      // the local map was created. Send it to the primary.
+  if (ret == -EINPROGRESS) {
+    // must wait for the backend to finish. No external event source.
+    // build_replica_map_chunk() has already requeued a SchedReplica
+    // event.
  
-      scrbr->send_replica_map(false);  // 'false' == not preempted
-      post_event(boost::intrusive_ptr<IntLocalMapDone>(new IntLocalMapDone{}));
-    }
+    dout(20) << "waiting for the backend..." << dendl;
+    return discard_event();
    }
-  return discard_event();
-}
  
-sc::result ActiveReplica::react(const IntLocalMapDone&)
-{
-  dout(10) << "ActiveReplica::react(const IntLocalMapDone&)" << dendl;
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-
-  scrbr->replica_handling_done();
-  return transit<NotActive>();
-}
+  if (ret < 0) {
+    //  the existing code ignores this option, treating an error
+    //  report as a success.
+    dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret
+           << dendl;
+    scrbr->replica_handling_done();
+    return transit<NotActive>();
+  }
  
-sc::result ActiveReplica::react(const InternalError&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(1) << "Error! Aborting."
-         << " ActiveReplica::react(const InternalError&) " << dendl;
  
+  // the local map was created. Send it to the primary.
+  scrbr->send_replica_map(PreemptionNoted::no_preemption);
    scrbr->replica_handling_done();
    return transit<NotActive>();
  }
  
-sc::result ActiveReplica::react(const EpochChanged&)
+sc::result ActiveReplica::react(const IntervalChanged&)
  {
    DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "ActiveReplica::react(const EpochChanged&) " << dendl;
+  dout(10) << "ActiveReplica::react(const IntervalChanged&) " << dendl;
  
-  scrbr->send_replica_map(false);
+  scrbr->send_replica_map(PreemptionNoted::no_preemption);
    scrbr->replica_handling_done();
    return transit<NotActive>();
  }
@@ -588,7 +536,6 @@ sc::result ActiveReplica::react(const EpochChanged&)
  sc::result ActiveReplica::react(const FullReset&)
  {
    dout(10) << "ActiveReplica::react(const FullReset&)" << dendl;
-  // caller takes care of this: scrbr->clear_pgscrub_state(false);
    return transit<NotActive>();
  }
  
diff --git a/src/osd/scrub_machine.h b/src/osd/scrub_machine.h

index ae1f7e812a1e649e4d62accee428ff473021e823..7bbb7db37fe180422665dbb6a9a7dd94a3a52ebc 100644 (file)
--- a/src/osd/scrub_machine.h
+++ b/src/osd/scrub_machine.h
@@ -53,7 +53,9 @@ void on_event_discard(std::string_view nm);
  
  MEV(RemotesReserved)    ///< all replicas have granted our reserve request
  MEV(ReservationFailure)         ///< a reservation request has failed
-MEV(EpochChanged)       ///< ... from what it was when this chunk started
+
+MEV(IntervalChanged)  ///< ... from what it was when this chunk started
+
  MEV(StartScrub)         ///< initiate a new scrubbing session (relevant if we are a Primary)
  MEV(AfterRepairScrub)  ///< initiate a new scrubbing session. Only triggered at Recovery
                        ///< completion.
@@ -78,8 +80,10 @@ MEV(DigestUpdate)  ///< external. called upon success of a MODIFY op. See
                    ///< scrub_snapshot_metadata()
  MEV(AllChunksDone)
  
-MEV(StartReplica)  ///< initiating replica scrub. replica_scrub_op() -> OSD Q ->
-                  ///< replica_scrub()
+MEV(StartReplica)       ///< initiating replica scrub. replica_scrub_op() -> OSD Q ->
+                        ///< replica_scrub()
+MEV(StartReplicaNoWait)         ///< 'start replica' when there are no pending updates
+
  MEV(SchedReplica)
  MEV(ReplicaPushesUpd)  ///< Update to active_pushes. 'active_pushes' represents recovery
                        ///< that is in-flight to the local ObjectStore
@@ -105,17 +109,10 @@ class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
  
    PG* m_pg;  // only used for dout messages
    spg_t m_pg_id;
-
    ScrubMachineListener* m_scrbr;
  
-  void down_on_epoch_change(const EpochChanged&);
-
-  void on_epoch_changed(const EpochChanged&);
-
    void my_states() const;
-
    void assert_not_active() const;
-
    [[nodiscard]] bool is_reserving() const;
  };
  
@@ -127,37 +124,34 @@ class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
   *    queued "PGScrub" op.
   *  - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is
   *    not required to reserve resources.
- *  - (for a replica) 'StartReplica', triggered by an incoming MOSDRepScrub msg.
+ *  - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming
+ *    MOSDRepScrub message.
   */
  struct NotActive : sc::state<NotActive, ScrubMachine> {
    explicit NotActive(my_context ctx);
  
-  using reactions = mpl::list<sc::custom_reaction<EpochChanged>,
+  using reactions = mpl::list<sc::custom_reaction<IntervalChanged>,
                               sc::transition<StartScrub, ReservingReplicas>,
                               // a scrubbing that was initiated at recovery completion,
                               // and requires no resource reservations:
                               sc::transition<AfterRepairScrub, ActiveScrubbing>,
                               sc::transition<StartReplica, ReplicaWaitUpdates>,
-                             sc::custom_reaction<sc::event_base>>;
+                             sc::transition<StartReplicaNoWait, ActiveReplica>>;
  
-  sc::result react(const EpochChanged&);
-  sc::result react(const sc::event_base&)  // in the future: assert here
-  {
-    return discard_event();
-  }
+  sc::result react(const IntervalChanged&);
  };
  
  struct ReservingReplicas : sc::state<ReservingReplicas, ScrubMachine> {
  
    explicit ReservingReplicas(my_context ctx);
-  using reactions = mpl::list<sc::custom_reaction<EpochChanged>,
+  using reactions = mpl::list<sc::custom_reaction<FullReset>,
                               // all replicas granted our resources request
                               sc::transition<RemotesReserved, ActiveScrubbing>,
-                             sc::custom_reaction<FullReset>,
                               sc::custom_reaction<ReservationFailure>>;
  
-  sc::result react(const EpochChanged&);
    sc::result react(const FullReset&);
+
+  /// at least one replica denied us the scrub resources we've requested
    sc::result react(const ReservationFailure&);
  };
  
@@ -184,14 +178,12 @@ struct ActiveScrubbing : sc::state<ActiveScrubbing, ScrubMachine, PendingTimer>
      // done scrubbing
      sc::transition<AllChunksDone, NotActive>,
  
-    sc::transition<EpochChanged,
-                  NotActive,
-                  ScrubMachine,
-                  &ScrubMachine::down_on_epoch_change>,
+    sc::custom_reaction<InternalError>,
      sc::custom_reaction<FullReset>>;
  
    sc::result react(const AllChunksDone&);
    sc::result react(const FullReset&);
+  sc::result react(const InternalError&);
  };
  
  struct RangeBlocked : sc::state<RangeBlocked, ActiveScrubbing> {
@@ -250,13 +242,19 @@ struct WaitLastUpdate : sc::state<WaitLastUpdate, ActiveScrubbing> {
  struct BuildMap : sc::state<BuildMap, ActiveScrubbing> {
    explicit BuildMap(my_context ctx);
  
+  // possible error scenarios:
+  // - an error reported by the backend will trigger an 'InternalError' event,
+  //   handled by our parent state;
+  // - if preempted, we switch to DrainReplMaps, where we will wait for all
+  //   replicas to send their maps before acknowledging the preemption;
+  // - an interval change will be handled by the relevant 'send-event' functions,
+  //   and will translated into a 'FullReset' event.
    using reactions =
      mpl::list<sc::transition<IntBmPreempted, DrainReplMaps>,
               sc::transition<InternalSchedScrub, BuildMap>,  // looping, waiting
                                                              // for the backend to
                                                              // finish
-             sc::custom_reaction<IntLocalMapDone>,
-             sc::transition<InternalError, NotActive>>;  // to discuss RRR
+             sc::custom_reaction<IntLocalMapDone>>;
  
    sc::result react(const IntLocalMapDone&);
  };
@@ -301,27 +299,25 @@ struct WaitDigestUpdate : sc::state<WaitDigestUpdate, ActiveScrubbing> {
   */
  struct ReplicaWaitUpdates : sc::state<ReplicaWaitUpdates, ScrubMachine> {
    explicit ReplicaWaitUpdates(my_context ctx);
-  using reactions =
-    mpl::list<sc::custom_reaction<ReplicaPushesUpd>, sc::custom_reaction<EpochChanged>>;
+  using reactions = mpl::list<sc::custom_reaction<ReplicaPushesUpd>,
+                             sc::custom_reaction<FullReset>,
+                             sc::custom_reaction<IntervalChanged>>;
  
    sc::result react(const ReplicaPushesUpd&);
-  sc::result react(const EpochChanged&);
+  sc::result react(const IntervalChanged&);
+  sc::result react(const FullReset&);
  };
  
  
  struct ActiveReplica : sc::state<ActiveReplica, ScrubMachine> {
    explicit ActiveReplica(my_context ctx);
-  using reactions = mpl::list<sc::custom_reaction<EpochChanged>,
+  using reactions = mpl::list<sc::custom_reaction<IntervalChanged>,
                               sc::custom_reaction<SchedReplica>,
-                             sc::custom_reaction<IntLocalMapDone>,
-                             sc::custom_reaction<FullReset>,
-                             sc::custom_reaction<InternalError>>;
+                             sc::custom_reaction<FullReset>>;
  
    sc::result react(const SchedReplica&);
-  sc::result react(const EpochChanged&);
-  sc::result react(const IntLocalMapDone&);
+  sc::result react(const IntervalChanged&);
    sc::result react(const FullReset&);
-  sc::result react(const InternalError&);
  };
  
  }  // namespace Scrub
diff --git a/src/osd/scrub_machine_lstnr.h b/src/osd/scrub_machine_lstnr.h

index cfaca4b10f25925510360fc90a7f6839ea4b7583..2b96161215474759da1b398bae71797d0d6efe27 100644 (file)
--- a/src/osd/scrub_machine_lstnr.h
+++ b/src/osd/scrub_machine_lstnr.h
@@ -15,6 +15,7 @@ namespace Scrub {
  /// used when PgScrubber is called by the scrub-machine, to tell the FSM
  /// how to continue
  enum class FsmNext { do_discard, next_chunk, goto_notactive };
+enum class PreemptionNoted { no_preemption, preempted };
  
  /// the interface exposed by the PgScrubber into its internal
  /// preemption_data object
@@ -55,8 +56,6 @@ struct ScrubMachineListener {
  
    virtual eversion_t get_last_update_applied() const = 0;
  
-  virtual void requeue_waiting() const = 0;
-
    virtual int pending_active_pushes() const = 0;
  
    virtual int build_primary_map_chunk() = 0;
@@ -71,9 +70,11 @@ struct ScrubMachineListener {
  
    virtual void replica_handling_done() = 0;
  
+  // no virtual void discard_reservation_by_primary() = 0;
+
    /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
    /// (thus can be called from FSM reactions)
-  virtual void clear_pgscrub_state(bool keep_repair_state) = 0;
+  virtual void clear_pgscrub_state() = 0;
  
    virtual void add_delayed_scheduling() = 0;
  
@@ -86,9 +87,7 @@ struct ScrubMachineListener {
  
    virtual Scrub::FsmNext on_digest_updates() = 0;
  
-  virtual void send_replica_map(bool was_preempted) = 0;
-
-  virtual void replica_update_start_epoch() = 0;
+  virtual void send_replica_map(Scrub::PreemptionNoted was_preempted) = 0;
  
    [[nodiscard]] virtual bool has_pg_marked_new_updates() const = 0;
  
@@ -96,7 +95,7 @@ struct ScrubMachineListener {
  
    [[nodiscard]] virtual bool was_epoch_changed() const = 0;
  
-  virtual Scrub::preemption_t* get_preemptor() = 0;
+  virtual Scrub::preemption_t& get_preemptor() = 0;
  
    /**
     *  a "technical" collection of the steps performed once all
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h

index d736319af1be4de66ae3d3fba49ded8c0e2a255c..510c4285469489c258514df6aff44ac5fdc63291 100644 (file)
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -116,25 +116,27 @@ struct ScrubPgIF {
  
    // --------------- triggering state-machine events:
  
-  virtual void send_start_scrub() = 0;
+  virtual void send_start_scrub(epoch_t epoch_queued) = 0;
  
-  virtual void send_start_after_repair() = 0;
+  virtual void send_start_after_repair(epoch_t epoch_queued) = 0;
  
-  virtual void send_scrub_resched() = 0;
+  virtual void send_scrub_resched(epoch_t epoch_queued) = 0;
  
-  virtual void replica_scrub_resched(epoch_t epoch_queued) = 0;
-
-  virtual void active_pushes_notification() = 0;
+  virtual void active_pushes_notification(epoch_t epoch_queued) = 0;
  
    virtual void update_applied_notification(epoch_t epoch_queued) = 0;
  
-  virtual void digest_update_notification() = 0;
+  virtual void digest_update_notification(epoch_t epoch_queued) = 0;
+
+  virtual void send_scrub_unblock(epoch_t epoch_queued) = 0;
+
+  virtual void send_replica_maps_ready(epoch_t epoch_queued) = 0;
  
-  virtual void send_scrub_unblock() = 0;
+  virtual void send_replica_pushes_upd(epoch_t epoch_queued) = 0;
  
-  virtual void send_replica_maps_ready() = 0;
+  virtual void send_start_replica(epoch_t epoch_queued) = 0;
  
-  virtual void send_replica_pushes_upd() = 0;
+  virtual void send_sched_replica(epoch_t epoch_queued) = 0;
  
    // --------------------------------------------------
  
@@ -158,24 +160,25 @@ struct ScrubPgIF {
  
    virtual void replica_scrub_op(OpRequestRef op) = 0;
  
-  virtual void replica_scrub(epoch_t epoch_queued) = 0;
-
    virtual void set_op_parameters(requested_scrub_t&) = 0;
  
-  virtual void scrub_clear_state(bool keep_repair_state = false) = 0;
+  virtual void scrub_clear_state() = 0;
  
    virtual void handle_query_state(ceph::Formatter* f) = 0;
  
    virtual void dump(ceph::Formatter* f) const = 0;
  
    /**
-   * we allow some number of preemptions of the scrub, which mean we do
-   *  not block.  Then we start to block.  Once we start blocking, we do
-   *  not stop until the scrub range is completed.
+   * Return true if soid is currently being scrubbed and pending IOs should block.
+   * May have a side effect of preempting an in-progress scrub -- will return false
+   * in that case.
+   *
+   * @param soid object to check for ongoing scrub
+   * @return boolean whether a request on soid should block until scrub completion
     */
    virtual bool write_blocked_by_scrub(const hobject_t& soid) = 0;
  
-  /// true if the given range intersects the scrub interval in any way
+  /// Returns whether any objects in the range [begin, end] are being scrubbed
    virtual bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) = 0;
  
    /// the op priority, taken from the primary's request message
@@ -201,19 +204,19 @@ struct ScrubPgIF {
     * the version of 'scrub_clear_state()' that does not try to invoke FSM services
     * (thus can be called from FSM reactions)
     */
-  virtual void clear_pgscrub_state(bool keep_repair_state) = 0;
+  virtual void clear_pgscrub_state() = 0;
  
    /**
     *  triggers the 'RemotesReserved' (all replicas granted scrub resources)
     *  state-machine event
     */
-  virtual void send_remotes_reserved() = 0;
+  virtual void send_remotes_reserved(epoch_t epoch_queued) = 0;
  
    /**
     * triggers the 'ReservationFailure' (at least one replica denied us the requested
     * resources) state-machine event
     */
-  virtual void send_reservation_failure() = 0;
+  virtual void send_reservation_failure(epoch_t epoch_queued) = 0;
  
    virtual void cleanup_store(ObjectStore::Transaction* t) = 0;
author	Ronen Friedman <rfriedma@redhat.com>
	Wed, 2 Dec 2020 08:58:49 +0000 (10:58 +0200)
committer	Ronen Friedman <rfriedma@redhat.com>
	Thu, 10 Dec 2020 13:21:53 +0000 (15:21 +0200)
src/osd/OSD.cc		patch \| blob \| history
src/osd/OSD.h		patch \| blob \| history
src/osd/PG.cc		patch \| blob \| history
src/osd/PrimaryLogPG.cc		patch \| blob \| history
src/osd/PrimaryLogScrub.cc		patch \| blob \| history
src/osd/pg_scrubber.cc		patch \| blob \| history
src/osd/pg_scrubber.h		patch \| blob \| history
src/osd/scrub_machine.cc		patch \| blob \| history
src/osd/scrub_machine.h		patch \| blob \| history
src/osd/scrub_machine_lstnr.h		patch \| blob \| history
src/osd/scrubber_common.h		patch \| blob \| history