git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/scrub: collecting scrub-related files into a separate directory
author Ronen Friedman <rfriedma@redhat.com>
Sun, 11 Apr 2021 18:17:41 +0000 (21:17 +0300)
committer Ronen Friedman <rfriedma@redhat.com>
Tue, 14 Sep 2021 11:30:55 +0000 (11:30 +0000)
Moving the scrub implementation files out of src/osd. Triggered by:
- the matching Crimson scrub structure;
- the proliferation of scrub-related code files (including in upcoming PRs);

scrubber_common.h, which defines the scrubber's interface, remains
in src/osd.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
23 files changed:
src/osd/CMakeLists.txt
src/osd/OSD.cc
src/osd/PG.cc
src/osd/PGBackend.cc
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogScrub.cc [deleted file]
src/osd/PrimaryLogScrub.h [deleted file]
src/osd/ScrubStore.cc [deleted file]
src/osd/ScrubStore.h [deleted file]
src/osd/pg_scrubber.cc [deleted file]
src/osd/pg_scrubber.h [deleted file]
src/osd/scrub_machine.cc [deleted file]
src/osd/scrub_machine.h [deleted file]
src/osd/scrub_machine_lstnr.h [deleted file]
src/osd/scrubber/PrimaryLogScrub.cc [new file with mode: 0644]
src/osd/scrubber/PrimaryLogScrub.h [new file with mode: 0644]
src/osd/scrubber/ScrubStore.cc [new file with mode: 0644]
src/osd/scrubber/ScrubStore.h [new file with mode: 0644]
src/osd/scrubber/pg_scrubber.cc [new file with mode: 0644]
src/osd/scrubber/pg_scrubber.h [new file with mode: 0644]
src/osd/scrubber/scrub_machine.cc [new file with mode: 0644]
src/osd/scrubber/scrub_machine.h [new file with mode: 0644]
src/osd/scrubber/scrub_machine_lstnr.h [new file with mode: 0644]

index 373456fc65d451107fc963a458597063d7881ed1..82a5451804a717c257feaba2722e38ff37a3b5b3 100644 (file)
@@ -11,9 +11,6 @@ endif()
 
 set(osd_srcs
   OSD.cc
-  pg_scrubber.cc
-  scrub_machine.cc
-  PrimaryLogScrub.cc
   Watch.cc
   ClassHandler.cc
   PG.cc
@@ -24,10 +21,13 @@ set(osd_srcs
   ECTransaction.cc
   PGBackend.cc
   OSDCap.cc
+  scrubber/pg_scrubber.cc
+  scrubber/PrimaryLogScrub.cc
+  scrubber/scrub_machine.cc
+  scrubber/ScrubStore.cc
   Watch.cc
   Session.cc
   SnapMapper.cc
-  ScrubStore.cc
   osd_types.cc
   ECUtil.cc
   ExtentCache.cc
index 425e27a3df131573a1a52b4553276b07c622045a..62aa2f767a4501f3eb70a182d2257749a4845830 100644 (file)
@@ -35,8 +35,8 @@
 #endif
 
 #include "osd/PG.h"
-#include "osd/scrub_machine.h"
-#include "osd/pg_scrubber.h"
+#include "osd/scrubber/scrub_machine.h"
+#include "osd/scrubber/pg_scrubber.h"
 
 #include "include/types.h"
 #include "include/compat.h"
index 644a6a9be9aa710ab82d34bb77157d9da90b3802..0f992d652834d9a3ce052e62cd6ea811caad30cf 100644 (file)
@@ -20,8 +20,8 @@
 #include "common/config.h"
 #include "OSD.h"
 #include "OpRequest.h"
-#include "ScrubStore.h"
-#include "pg_scrubber.h"
+#include "scrubber/ScrubStore.h"
+#include "scrubber/pg_scrubber.h"
 #include "Session.h"
 #include "osd/scheduler/OpSchedulerItem.h"
 
index 4e8c74c487ee087004ecdcf60eea48f6424a6a5f..cca28a8941d3b3cd7fc8ceffb20deb0407183ef6 100644 (file)
@@ -19,7 +19,7 @@
 #include "common/errno.h"
 #include "common/scrub_types.h"
 #include "ReplicatedBackend.h"
-#include "ScrubStore.h"
+#include "scrubber/ScrubStore.h"
 #include "ECBackend.h"
 #include "PGBackend.h"
 #include "OSD.h"
index 6b1a3e52f45b6b8cd93d7cdd4e52798d328240e5..6899c5ea6b05d46f7b9c83d640db9b75809689e4 100644 (file)
  *
  */
 
-#include <errno.h>
-
-#include <charconv>
-#include <sstream>
-#include <utility>
+#include "PrimaryLogPG.h"
 
 #include <boost/intrusive_ptr.hpp>
-#include <boost/tuple/tuple.hpp>
-
-#include "PG.h"
-#include "pg_scrubber.h"
-#include "PrimaryLogPG.h"
-#include "OSD.h"
-#include "PrimaryLogScrub.h"
-#include "OpRequest.h"
-#include "ScrubStore.h"
-#include "Session.h"
-#include "objclass/objclass.h"
-#include "osd/ClassHandler.h"
 
 #include "cls/cas/cls_cas_ops.h"
+#include "common/EventTrace.h"
 #include "common/ceph_crypto.h"
+#include "common/CDC.h"
 #include "common/config.h"
 #include "common/errno.h"
-#include "common/scrub_types.h"
-#include "common/perf_counters.h"
-#include "common/CDC.h"
 #include "common/EventTrace.h"
-
-#include "messages/MOSDOp.h"
+#include "common/perf_counters.h"
+#include "common/scrub_types.h"
+#include "include/compat.h"
+#include "messages/MCommandReply.h"
 #include "messages/MOSDBackoff.h"
-#include "messages/MOSDPGTrim.h"
-#include "messages/MOSDPGScan.h"
-#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDOp.h"
 #include "messages/MOSDPGBackfill.h"
 #include "messages/MOSDPGBackfillRemove.h"
 #include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGScan.h"
+#include "messages/MOSDPGTrim.h"
 #include "messages/MOSDPGUpdateLogMissing.h"
 #include "messages/MOSDPGUpdateLogMissingReply.h"
-#include "messages/MCommandReply.h"
+#include "messages/MOSDRepScrub.h"
 #include "messages/MOSDScrubReserve.h"
-
-#include "include/compat.h"
 #include "mon/MonClient.h"
+#include "objclass/objclass.h"
+#include "osd/ClassHandler.h"
+#include "osd/OpRequest.h"
+#include "osd/Session.h"
 #include "osdc/Objecter.h"
+#include "scrubber/PrimaryLogScrub.h"
+
+// required includes order:
 #include "json_spirit/json_spirit_value.h"
 #include "json_spirit/json_spirit_reader.h"
 #include "include/ceph_assert.h"  // json_spirit clobbers it
diff --git a/src/osd/PrimaryLogScrub.cc b/src/osd/PrimaryLogScrub.cc
deleted file mode 100644 (file)
index ac4049a..0000000
+++ /dev/null
@@ -1,589 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "PrimaryLogScrub.h"
-
-#include "common/scrub_types.h"
-#include "osd/osd_types_fmt.h"
-
-#include "PeeringState.h"
-#include "PrimaryLogPG.h"
-#include "scrub_machine.h"
-
-#define dout_context (m_pg->get_cct())
-#define dout_subsys ceph_subsys_osd
-#undef dout_prefix
-#define dout_prefix _prefix(_dout, this->m_pg)
-
-using std::vector;
-
-template <class T> static ostream& _prefix(std::ostream* _dout, T* t)
-{
-  return t->gen_prefix(*_dout) << " PrimaryLog scrubber pg(" << t->pg_id << ") ";
-}
-
-using namespace Scrub;
-using Scrub::ScrubMachine;
-
-bool PrimaryLogScrub::get_store_errors(const scrub_ls_arg_t& arg,
-                                      scrub_ls_result_t& res_inout) const
-{
-  if (!m_store) {
-    return false;
-  }
-
-  if (arg.get_snapsets) {
-    res_inout.vals =
-      m_store->get_snap_errors(m_pg->get_pgid().pool(), arg.start_after, arg.max_return);
-  } else {
-    res_inout.vals = m_store->get_object_errors(m_pg->get_pgid().pool(), arg.start_after,
-                                               arg.max_return);
-  }
-  return true;
-}
-
-void PrimaryLogScrub::_scrub_finish()
-{
-  auto& info = m_pg->get_pg_info(ScrubberPasskey{});  ///< a temporary alias
-
-  dout(10) << __func__
-          << " info stats: " << (info.stats.stats_invalid ? "invalid" : "valid")
-          << dendl;
-
-  if (info.stats.stats_invalid) {
-    m_pl_pg->recovery_state.update_stats([=](auto& history, auto& stats) {
-      stats.stats = m_scrub_cstat;
-      stats.stats_invalid = false;
-      return false;
-    });
-
-    if (m_pl_pg->agent_state)
-      m_pl_pg->agent_choose_mode();
-  }
-
-  dout(10) << m_mode_desc << " got " << m_scrub_cstat.sum.num_objects << "/"
-          << info.stats.stats.sum.num_objects << " objects, "
-          << m_scrub_cstat.sum.num_object_clones << "/"
-          << info.stats.stats.sum.num_object_clones << " clones, "
-          << m_scrub_cstat.sum.num_objects_dirty << "/"
-          << info.stats.stats.sum.num_objects_dirty << " dirty, "
-          << m_scrub_cstat.sum.num_objects_omap << "/"
-          << info.stats.stats.sum.num_objects_omap << " omap, "
-          << m_scrub_cstat.sum.num_objects_pinned << "/"
-          << info.stats.stats.sum.num_objects_pinned << " pinned, "
-          << m_scrub_cstat.sum.num_objects_hit_set_archive << "/"
-          << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
-          << m_scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes
-          << " bytes, " << m_scrub_cstat.sum.num_objects_manifest << "/"
-          << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
-          << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/"
-          << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
-          << dendl;
-
-  if (m_scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
-      m_scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
-      (m_scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
-       !info.stats.dirty_stats_invalid) ||
-      (m_scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
-       !info.stats.omap_stats_invalid) ||
-      (m_scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
-       !info.stats.pin_stats_invalid) ||
-      (m_scrub_cstat.sum.num_objects_hit_set_archive !=
-        info.stats.stats.sum.num_objects_hit_set_archive &&
-       !info.stats.hitset_stats_invalid) ||
-      (m_scrub_cstat.sum.num_bytes_hit_set_archive !=
-        info.stats.stats.sum.num_bytes_hit_set_archive &&
-       !info.stats.hitset_bytes_stats_invalid) ||
-      (m_scrub_cstat.sum.num_objects_manifest !=
-        info.stats.stats.sum.num_objects_manifest &&
-       !info.stats.manifest_stats_invalid) ||
-      m_scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
-      m_scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
-    m_osds->clog->error() << info.pgid << " " << m_mode_desc << " : stat mismatch, got "
-                         << m_scrub_cstat.sum.num_objects << "/"
-                         << info.stats.stats.sum.num_objects << " objects, "
-                         << m_scrub_cstat.sum.num_object_clones << "/"
-                         << info.stats.stats.sum.num_object_clones << " clones, "
-                         << m_scrub_cstat.sum.num_objects_dirty << "/"
-                         << info.stats.stats.sum.num_objects_dirty << " dirty, "
-                         << m_scrub_cstat.sum.num_objects_omap << "/"
-                         << info.stats.stats.sum.num_objects_omap << " omap, "
-                         << m_scrub_cstat.sum.num_objects_pinned << "/"
-                         << info.stats.stats.sum.num_objects_pinned << " pinned, "
-                         << m_scrub_cstat.sum.num_objects_hit_set_archive << "/"
-                         << info.stats.stats.sum.num_objects_hit_set_archive
-                         << " hit_set_archive, " << m_scrub_cstat.sum.num_whiteouts
-                         << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
-                         << m_scrub_cstat.sum.num_bytes << "/"
-                         << info.stats.stats.sum.num_bytes << " bytes, "
-                         << m_scrub_cstat.sum.num_objects_manifest << "/"
-                         << info.stats.stats.sum.num_objects_manifest
-                         << " manifest objects, "
-                         << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/"
-                         << info.stats.stats.sum.num_bytes_hit_set_archive
-                         << " hit_set_archive bytes.";
-    ++m_shallow_errors;
-
-    if (m_is_repair) {
-      ++m_fixed_count;
-      m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) {
-       stats.stats = m_scrub_cstat;
-       stats.dirty_stats_invalid = false;
-       stats.omap_stats_invalid = false;
-       stats.hitset_stats_invalid = false;
-       stats.hitset_bytes_stats_invalid = false;
-       stats.pin_stats_invalid = false;
-       stats.manifest_stats_invalid = false;
-       return false;
-      });
-      m_pl_pg->publish_stats_to_osd();
-      m_pl_pg->recovery_state.share_pg_info();
-    }
-  }
-  // Clear object context cache to get repair information
-  if (m_is_repair)
-    m_pl_pg->object_contexts.clear();
-}
-
-static bool doing_clones(const std::optional<SnapSet>& snapset,
-                        const vector<snapid_t>::reverse_iterator& curclone)
-{
-  return snapset && curclone != snapset->clones.rend();
-}
-
-void PrimaryLogScrub::log_missing(int missing,
-                                 const std::optional<hobject_t>& head,
-                                 LogChannelRef clog,
-                                 const spg_t& pgid,
-                                 const char* func,
-                                 bool allow_incomplete_clones)
-{
-  ceph_assert(head);
-  if (allow_incomplete_clones) {
-    dout(20) << func << " " << m_mode_desc << " " << pgid << " " << *head << " skipped "
-            << missing << " clone(s) in cache tier" << dendl;
-  } else {
-    clog->info() << m_mode_desc << " " << pgid << " " << *head << " : " << missing
-                << " missing clone(s)";
-  }
-}
-
-int PrimaryLogScrub::process_clones_to(const std::optional<hobject_t>& head,
-                                      const std::optional<SnapSet>& snapset,
-                                      LogChannelRef clog,
-                                      const spg_t& pgid,
-                                      bool allow_incomplete_clones,
-                                      std::optional<snapid_t> target,
-                                      vector<snapid_t>::reverse_iterator* curclone,
-                                      inconsistent_snapset_wrapper& e)
-{
-  ceph_assert(head);
-  ceph_assert(snapset);
-  int missing_count = 0;
-
-  // NOTE: clones are in descending order, thus **curclone > target test here
-  hobject_t next_clone(*head);
-  while (doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
-
-    ++missing_count;
-    // it is okay to be missing one or more clones in a cache tier.
-    // skip higher-numbered clones in the list.
-    if (!allow_incomplete_clones) {
-      next_clone.snap = **curclone;
-      clog->error() << m_mode_desc << " " << pgid << " " << *head << " : expected clone "
-                   << next_clone << " " << m_missing << " missing";
-      ++m_shallow_errors;
-      e.set_clone_missing(next_clone.snap);
-    }
-    // Clones are descending
-    ++(*curclone);
-  }
-  return missing_count;
-}
-
-/*
- * Validate consistency of the object info and snap sets.
- *
- * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
- * the comparison of the objects is against multiple snapset.clones. There are
- * multiple clone lists and in between lists we expect head.
- *
- * Example
- *
- * objects              expected
- * =======              =======
- * obj1 snap 1          head, unexpected obj1 snap 1
- * obj2 head            head, match
- *              [SnapSet clones 6 4 2 1]
- * obj2 snap 7          obj2 snap 6, unexpected obj2 snap 7
- * obj2 snap 6          obj2 snap 6, match
- * obj2 snap 4          obj2 snap 4, match
- * obj3 head            obj2 snap 2 (expected), obj2 snap 1 (expected), match
- *              [Snapset clones 3 1]
- * obj3 snap 3          obj3 snap 3 match
- * obj3 snap 1          obj3 snap 1 match
- * obj4 head            head, match
- *              [Snapset clones 4]
- * EOL                  obj4 snap 4, (expected)
- */
-void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap,
-                                             const missing_map_t& missing_digest)
-{
-  dout(10) << __func__ << " num stat obj " << m_pl_pg->info.stats.stats.sum.num_objects
-          << dendl;
-
-  auto& info = m_pl_pg->info;
-  const PGPool& pool = m_pl_pg->pool;
-  bool allow_incomplete_clones = pool.info.allow_incomplete_clones();
-
-  std::optional<snapid_t> all_clones;  // Unspecified snapid_t or std::nullopt
-
-  // traverse in reverse order.
-  std::optional<hobject_t> head;
-  std::optional<SnapSet> snapset;              // If initialized so will head (above)
-  vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
-  int missing = 0;
-  inconsistent_snapset_wrapper soid_error, head_error;
-  int soid_error_count = 0;
-
-  for (auto p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
-
-    const hobject_t& soid = p->first;
-    ceph_assert(!soid.is_snapdir());
-    soid_error = inconsistent_snapset_wrapper{soid};
-    object_stat_sum_t stat;
-    std::optional<object_info_t> oi;
-
-    stat.num_objects++;
-
-    if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
-      stat.num_objects_hit_set_archive++;
-
-    if (soid.is_snap()) {
-      // it's a clone
-      stat.num_object_clones++;
-    }
-
-    // basic checks.
-    if (p->second.attrs.count(OI_ATTR) == 0) {
-      oi = std::nullopt;
-      m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
-                           << OI_ATTR << "' attr";
-      ++m_shallow_errors;
-      soid_error.set_info_missing();
-    } else {
-      bufferlist bv;
-      bv.push_back(p->second.attrs[OI_ATTR]);
-      try {
-       oi = object_info_t(bv);
-      } catch (ceph::buffer::error& e) {
-       oi = std::nullopt;
-       m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
-                             << " : can't decode '" << OI_ATTR << "' attr " << e.what();
-       ++m_shallow_errors;
-       soid_error.set_info_corrupted();
-       soid_error.set_info_missing();  // Not available too
-      }
-    }
-
-    if (oi) {
-      if (m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
-       m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
-                             << " : on disk size (" << p->second.size
-                             << ") does not match object info size (" << oi->size
-                             << ") adjusted for ondisk to ("
-                             << m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) << ")";
-       soid_error.set_size_mismatch();
-       ++m_shallow_errors;
-      }
-
-      dout(20) << m_mode_desc << "  " << soid << " " << *oi << dendl;
-
-      // A clone num_bytes will be added later when we have snapset
-      if (!soid.is_snap()) {
-       stat.num_bytes += oi->size;
-      }
-      if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
-       stat.num_bytes_hit_set_archive += oi->size;
-
-      if (oi->is_dirty())
-       ++stat.num_objects_dirty;
-      if (oi->is_whiteout())
-       ++stat.num_whiteouts;
-      if (oi->is_omap())
-       ++stat.num_objects_omap;
-      if (oi->is_cache_pinned())
-       ++stat.num_objects_pinned;
-      if (oi->has_manifest())
-       ++stat.num_objects_manifest;
-    }
-
-    // Check for any problems while processing clones
-    if (doing_clones(snapset, curclone)) {
-      std::optional<snapid_t> target;
-      // Expecting an object with snap for current head
-      if (soid.has_snapset() || soid.get_head() != head->get_head()) {
-
-       dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid << " new object " << soid
-                << " while processing " << *head << dendl;
-
-       target = all_clones;
-      } else {
-       ceph_assert(soid.is_snap());
-       target = soid.snap;
-      }
-
-      // Log any clones we were expecting to be there up to target
-      // This will set missing, but will be a no-op if snap.soid == *curclone.
-      missing +=
-       process_clones_to(head, snapset, m_osds->clog, info.pgid,
-                         allow_incomplete_clones, target, &curclone, head_error);
-    }
-
-    bool expected;
-    // Check doing_clones() again in case we ran process_clones_to()
-    if (doing_clones(snapset, curclone)) {
-      // A head would have processed all clones above
-      // or all greater than *curclone.
-      ceph_assert(soid.is_snap() && *curclone <= soid.snap);
-
-      // After processing above clone snap should match the expected curclone
-      expected = (*curclone == soid.snap);
-    } else {
-      // If we aren't doing clones any longer, then expecting head
-      expected = soid.has_snapset();
-    }
-    if (!expected) {
-      // If we couldn't read the head's snapset, just ignore clones
-      if (head && !snapset) {
-       m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
-                             << " : clone ignored due to missing snapset";
-      } else {
-       m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
-                             << " : is an unexpected clone";
-      }
-      ++m_shallow_errors;
-      soid_error.set_headless();
-      m_store->add_snap_error(pool.id, soid_error);
-      ++soid_error_count;
-      if (head && soid.get_head() == head->get_head())
-       head_error.set_clone(soid.snap);
-      continue;
-    }
-
-    // new snapset?
-    if (soid.has_snapset()) {
-
-      if (missing) {
-       log_missing(missing, head, m_osds->clog, info.pgid, __func__,
-                   pool.info.allow_incomplete_clones());
-      }
-
-      // Save previous head error information
-      if (head && (head_error.errors || soid_error_count))
-       m_store->add_snap_error(pool.id, head_error);
-      // Set this as a new head object
-      head = soid;
-      missing = 0;
-      head_error = soid_error;
-      soid_error_count = 0;
-
-      dout(20) << __func__ << " " << m_mode_desc << " new head " << head << dendl;
-
-      if (p->second.attrs.count(SS_ATTR) == 0) {
-       m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
-                             << SS_ATTR << "' attr";
-       ++m_shallow_errors;
-       snapset = std::nullopt;
-       head_error.set_snapset_missing();
-      } else {
-       bufferlist bl;
-       bl.push_back(p->second.attrs[SS_ATTR]);
-       auto blp = bl.cbegin();
-       try {
-         snapset = SnapSet();  // Initialize optional<> before decoding into it
-         decode(*snapset, blp);
-         head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
-       } catch (ceph::buffer::error& e) {
-         snapset = std::nullopt;
-         m_osds->clog->error()
-           << m_mode_desc << " " << info.pgid << " " << soid << " : can't decode '" << SS_ATTR
-           << "' attr " << e.what();
-         ++m_shallow_errors;
-         head_error.set_snapset_corrupted();
-       }
-      }
-
-      if (snapset) {
-       // what will be next?
-       curclone = snapset->clones.rbegin();
-
-       if (!snapset->clones.empty()) {
-         dout(20) << "  snapset " << *snapset << dendl;
-         if (snapset->seq == 0) {
-           m_osds->clog->error()
-             << m_mode_desc << " " << info.pgid << " " << soid << " : snaps.seq not set";
-           ++m_shallow_errors;
-           head_error.set_snapset_error();
-         }
-       }
-      }
-    } else {
-      ceph_assert(soid.is_snap());
-      ceph_assert(head);
-      ceph_assert(snapset);
-      ceph_assert(soid.snap == *curclone);
-
-      dout(20) << __func__ << " " << m_mode_desc << " matched clone " << soid << dendl;
-
-      if (snapset->clone_size.count(soid.snap) == 0) {
-       m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
-                             << " : is missing in clone_size";
-       ++m_shallow_errors;
-       soid_error.set_size_mismatch();
-      } else {
-       if (oi && oi->size != snapset->clone_size[soid.snap]) {
-         m_osds->clog->error()
-           << m_mode_desc << " " << info.pgid << " " << soid << " : size " << oi->size
-           << " != clone_size " << snapset->clone_size[*curclone];
-         ++m_shallow_errors;
-         soid_error.set_size_mismatch();
-       }
-
-       if (snapset->clone_overlap.count(soid.snap) == 0) {
-         m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
-                               << " : is missing in clone_overlap";
-         ++m_shallow_errors;
-         soid_error.set_size_mismatch();
-       } else {
-         // This checking is based on get_clone_bytes().  The first 2 asserts
-         // can't happen because we know we have a clone_size and
-         // a clone_overlap.  Now we check that the interval_set won't
-         // cause the last assert.
-         uint64_t size = snapset->clone_size.find(soid.snap)->second;
-         const interval_set<uint64_t>& overlap =
-           snapset->clone_overlap.find(soid.snap)->second;
-         bool bad_interval_set = false;
-         for (interval_set<uint64_t>::const_iterator i = overlap.begin();
-              i != overlap.end(); ++i) {
-           if (size < i.get_len()) {
-             bad_interval_set = true;
-             break;
-           }
-           size -= i.get_len();
-         }
-
-         if (bad_interval_set) {
-           m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
-                                 << " : bad interval_set in clone_overlap";
-           ++m_shallow_errors;
-           soid_error.set_size_mismatch();
-         } else {
-           stat.num_bytes += snapset->get_clone_bytes(soid.snap);
-         }
-       }
-      }
-
-      // what's next?
-      ++curclone;
-      if (soid_error.errors) {
-       m_store->add_snap_error(pool.id, soid_error);
-       ++soid_error_count;
-      }
-    }
-    m_scrub_cstat.add(stat);
-  }
-
-  if (doing_clones(snapset, curclone)) {
-    dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid
-            << " No more objects while processing " << *head << dendl;
-
-    missing +=
-      process_clones_to(head, snapset, m_osds->clog, info.pgid,
-                       allow_incomplete_clones, all_clones, &curclone, head_error);
-  }
-
-  // There could be missing found by the test above or even
-  // before dropping out of the loop for the last head.
-  if (missing) {
-    log_missing(missing, head, m_osds->clog, info.pgid, __func__,
-               allow_incomplete_clones);
-  }
-  if (head && (head_error.errors || soid_error_count))
-    m_store->add_snap_error(pool.id, head_error);
-
-  dout(20) << __func__ << " - " << missing << " (" << missing_digest.size() << ") missing"
-          << dendl;
-  for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
-
-    ceph_assert(!p->first.is_snapdir());
-    dout(10) << __func__ << " recording digests for " << p->first << dendl;
-
-    ObjectContextRef obc = m_pl_pg->get_object_context(p->first, false);
-    if (!obc) {
-      m_osds->clog->error() << info.pgid << " " << m_mode_desc
-                           << " cannot get object context for object " << p->first;
-      continue;
-    }
-    if (obc->obs.oi.soid != p->first) {
-      m_osds->clog->error() << info.pgid << " " << m_mode_desc << " " << p->first
-                           << " : object has a valid oi attr with a mismatched name, "
-                           << " obc->obs.oi.soid: " << obc->obs.oi.soid;
-      continue;
-    }
-    PrimaryLogPG::OpContextUPtr ctx = m_pl_pg->simple_opc_create(obc);
-    ctx->at_version = m_pl_pg->get_next_version();
-    ctx->mtime = utime_t();  // do not update mtime
-    if (p->second.first) {
-      ctx->new_obs.oi.set_data_digest(*p->second.first);
-    } else {
-      ctx->new_obs.oi.clear_data_digest();
-    }
-    if (p->second.second) {
-      ctx->new_obs.oi.set_omap_digest(*p->second.second);
-    } else {
-      ctx->new_obs.oi.clear_omap_digest();
-    }
-    m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
-
-    ++num_digest_updates_pending;
-    ctx->register_on_success([this]() {
-      dout(20) << "updating scrub digest " << num_digest_updates_pending << dendl;
-      if (--num_digest_updates_pending <= 0) {
-       m_osds->queue_scrub_digest_update(m_pl_pg, m_pl_pg->is_scrub_blocking_ops());
-      }
-    });
-
-    m_pl_pg->simple_opc_submit(std::move(ctx));
-  }
-
-  dout(10) << __func__ << " (" << m_mode_desc << ") finish" << dendl;
-}
-
-PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg) : PgScrubber{pg}, m_pl_pg{pg} {}
-
-void PrimaryLogScrub::_scrub_clear_state()
-{
-  dout(15) << __func__ << dendl;
-  m_scrub_cstat = object_stat_collection_t();
-}
-
-void PrimaryLogScrub::stats_of_handled_objects(const object_stat_sum_t& delta_stats,
-                                              const hobject_t& soid)
-{
-  // We scrub objects in hobject_t order, so objects before m_start have already been
-  // scrubbed and their stats have already been added to the scrubber. Objects after that
-  // point haven't been included in the scrubber's stats accounting yet, so they will be
-  // included when the scrubber gets to that object.
-  if (is_primary() && is_scrub_active()) {
-    if (soid < m_start) {
-
-      dout(20) << fmt::format("{} {} < [{},{})", __func__, soid, m_start, m_end) << dendl;
-      m_scrub_cstat.add(delta_stats);
-
-    } else {
-
-      dout(25) << fmt::format("{} {} >= [{},{})", __func__, soid, m_start, m_end) << dendl;
-    }
-  }
-}
diff --git a/src/osd/PrimaryLogScrub.h b/src/osd/PrimaryLogScrub.h
deleted file mode 100644 (file)
index 78353d6..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#pragma once
-
-// the './' includes are marked this way to affect clang-format
-#include "./pg_scrubber.h"
-
-#include <iostream>
-#include <sstream>
-#include <vector>
-
-#include "debug.h"
-
-#include "common/errno.h"
-#include "common/scrub_types.h"
-#include "messages/MOSDOp.h"
-#include "messages/MOSDRepScrub.h"
-#include "messages/MOSDRepScrubMap.h"
-#include "messages/MOSDScrub.h"
-#include "messages/MOSDScrubReserve.h"
-
-#include "OSD.h"
-#include "scrub_machine.h"
-
-class PrimaryLogPG;
-
-/**
- * The derivative of PgScrubber that is used by PrimaryLogPG.
- */
-class PrimaryLogScrub : public PgScrubber {
- public:
-  explicit PrimaryLogScrub(PrimaryLogPG* pg);
-
-  void _scrub_finish() final;
-
-  bool get_store_errors(const scrub_ls_arg_t& arg,
-                       scrub_ls_result_t& res_inout) const final;
-
-  void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
-                               const hobject_t& soid) final;
-
- private:
-  // we know our PG is actually a PrimaryLogPG. Let's alias the pointer to that object:
-  PrimaryLogPG* const m_pl_pg;
-
-  /**
-   * Validate consistency of the object info and snap sets.
-   */
-  void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) final;
-
-  void log_missing(int missing,
-                  const std::optional<hobject_t>& head,
-                  LogChannelRef clog,
-                  const spg_t& pgid,
-                  const char* func,
-                  bool allow_incomplete_clones);
-
-  int process_clones_to(const std::optional<hobject_t>& head,
-                       const std::optional<SnapSet>& snapset,
-                       LogChannelRef clog,
-                       const spg_t& pgid,
-                       bool allow_incomplete_clones,
-                       std::optional<snapid_t> target,
-                       std::vector<snapid_t>::reverse_iterator* curclone,
-                       inconsistent_snapset_wrapper& snap_error);
-
-
-  // handle our part in stats collection
-  object_stat_collection_t m_scrub_cstat;
-  void _scrub_clear_state() final;  // which just clears the stats
-};
diff --git a/src/osd/ScrubStore.cc b/src/osd/ScrubStore.cc
deleted file mode 100644 (file)
index a692a44..0000000
+++ /dev/null
@@ -1,198 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-
-#include "ScrubStore.h"
-#include "osd_types.h"
-#include "common/scrub_types.h"
-#include "include/rados/rados_types.hpp"
-
-using std::ostringstream;
-using std::string;
-using std::vector;
-
-using ceph::bufferlist;
-
-namespace {
-ghobject_t make_scrub_object(const spg_t& pgid)
-{
-  ostringstream ss;
-  ss << "scrub_" << pgid;
-  return pgid.make_temp_ghobject(ss.str());
-}
-
-string first_object_key(int64_t pool)
-{
-  auto hoid = hobject_t(object_t(),
-                       "",
-                       0,
-                       0x00000000,
-                       pool,
-                       "");
-  hoid.build_hash_cache();
-  return "SCRUB_OBJ_" + hoid.to_str();
-}
-
-// the object_key should be unique across pools
-string to_object_key(int64_t pool, const librados::object_id_t& oid)
-{
-  auto hoid = hobject_t(object_t(oid.name),
-                       oid.locator, // key
-                       oid.snap,
-                       0,              // hash
-                       pool,
-                       oid.nspace);
-  hoid.build_hash_cache();
-  return "SCRUB_OBJ_" + hoid.to_str();
-}
-
-string last_object_key(int64_t pool)
-{
-  auto hoid = hobject_t(object_t(),
-                       "",
-                       0,
-                       0xffffffff,
-                       pool,
-                       "");
-  hoid.build_hash_cache();
-  return "SCRUB_OBJ_" + hoid.to_str();
-}
-
-string first_snap_key(int64_t pool)
-{
-  // scrub object is per spg_t object, so we can misuse the hash (pg.seed) for
-  // the representing the minimal and maximum keys. and this relies on how
-  // hobject_t::to_str() works: hex(pool).hex(revhash).
-  auto hoid = hobject_t(object_t(),
-                       "",
-                       0,
-                       0x00000000,
-                       pool,
-                       "");
-  hoid.build_hash_cache();
-  return "SCRUB_SS_" + hoid.to_str();
-}
-
-string to_snap_key(int64_t pool, const librados::object_id_t& oid)
-{
-  auto hoid = hobject_t(object_t(oid.name),
-                       oid.locator, // key
-                       oid.snap,
-                       0x77777777, // hash
-                       pool,
-                       oid.nspace);
-  hoid.build_hash_cache();
-  return "SCRUB_SS_" + hoid.to_str();
-}
-
-string last_snap_key(int64_t pool)
-{
-  auto hoid = hobject_t(object_t(),
-                       "",
-                       0,
-                       0xffffffff,
-                       pool,
-                       "");
-  hoid.build_hash_cache();
-  return "SCRUB_SS_" + hoid.to_str();
-}
-}
-
-namespace Scrub {
-
-Store*
-Store::create(ObjectStore* store,
-             ObjectStore::Transaction* t,
-             const spg_t& pgid,
-             const coll_t& coll)
-{
-  ceph_assert(store);
-  ceph_assert(t);
-  ghobject_t oid = make_scrub_object(pgid);
-  t->touch(coll, oid);
-  return new Store{coll, oid, store};
-}
-
-Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store)
-  : coll(coll),
-    hoid(oid),
-    driver(store, coll, hoid),
-    backend(&driver)
-{}
-
-Store::~Store()
-{
-  ceph_assert(results.empty());
-}
-
-void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e)
-{
-  bufferlist bl;
-  e.encode(bl);
-  results[to_object_key(pool, e.object)] = bl;
-}
-
-void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e)
-{
-  bufferlist bl;
-  e.encode(bl);
-  results[to_snap_key(pool, e.object)] = bl;
-}
-
-bool Store::empty() const
-{
-  return results.empty();
-}
-
-void Store::flush(ObjectStore::Transaction* t)
-{
-  if (t) {
-    OSDriver::OSTransaction txn = driver.get_transaction(t);
-    backend.set_keys(results, &txn);
-  }
-  results.clear();
-}
-
-void Store::cleanup(ObjectStore::Transaction* t)
-{
-  t->remove(coll, hoid);
-}
-
-std::vector<bufferlist>
-Store::get_snap_errors(int64_t pool,
-                      const librados::object_id_t& start,
-                      uint64_t max_return) const
-{
-  const string begin = (start.name.empty() ?
-                       first_snap_key(pool) : to_snap_key(pool, start));
-  const string end = last_snap_key(pool);
-  return get_errors(begin, end, max_return);
-}
-
-std::vector<bufferlist>
-Store::get_object_errors(int64_t pool,
-                        const librados::object_id_t& start,
-                        uint64_t max_return) const
-{
-  const string begin = (start.name.empty() ?
-                       first_object_key(pool) : to_object_key(pool, start));
-  const string end = last_object_key(pool);
-  return get_errors(begin, end, max_return);
-}
-
-std::vector<bufferlist>
-Store::get_errors(const string& begin,
-                 const string& end,
-                 uint64_t max_return) const
-{
-  vector<bufferlist> errors;
-  auto next = std::make_pair(begin, bufferlist{});
-  while (max_return && !backend.get_next(next.first, &next)) {
-    if (next.first >= end)
-      break;
-    errors.push_back(next.second);
-    max_return--;
-  }
-  return errors;
-}
-
-} // namespace Scrub
diff --git a/src/osd/ScrubStore.h b/src/osd/ScrubStore.h
deleted file mode 100644 (file)
index 721aae0..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_SCRUB_RESULT_H
-#define CEPH_SCRUB_RESULT_H
-
-#include "SnapMapper.h"                // for OSDriver
-#include "common/map_cacher.hpp"
-
-namespace librados {
-  struct object_id_t;
-}
-
-struct inconsistent_obj_wrapper;
-struct inconsistent_snapset_wrapper;
-
-namespace Scrub {
-
-/**
- * Holds the inconsistencies found while scrubbing, keyed by strings.
- * Results are accumulated in memory ('results') by the add_*() calls, and
- * are handed to the 'backend' map-cacher when flush() is called.
- */
-class Store {
-public:
-  ~Store();
-  // the only way to obtain a Store (the constructor is private). Returns a
-  // raw pointer.
-  static Store* create(ObjectStore* store,
-                      ObjectStore::Transaction* t,
-                      const spg_t& pgid,
-                      const coll_t& coll);
-  void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e);
-  void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e);
-  bool empty() const;
-  void flush(ObjectStore::Transaction *);
-  void cleanup(ObjectStore::Transaction *);
-  // the two getters return (at most 'max_return') encoded error entries,
-  // starting at 'start'
-  std::vector<ceph::buffer::list> get_snap_errors(int64_t pool,
-                                         const librados::object_id_t& start,
-                                         uint64_t max_return) const;
-  std::vector<ceph::buffer::list> get_object_errors(int64_t pool,
-                                           const librados::object_id_t& start,
-                                           uint64_t max_return) const;
-private:
-  Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store);
-  // shared implementation of the two public getters, working on raw keys
-  std::vector<ceph::buffer::list> get_errors(const std::string& start, const std::string& end,
-                                    uint64_t max_return) const;
-private:
-  const coll_t coll;
-  const ghobject_t hoid;
-  // a temp object holding mappings from seq-id to inconsistencies found in
-  // scrubbing
-  // (declaration order matters: 'backend' is handed a pointer to 'driver')
-  OSDriver driver;
-  mutable MapCacher::MapCacher<std::string, ceph::buffer::list> backend;
-  // errors collected so far, not yet handed to 'backend'
-  std::map<std::string, ceph::buffer::list> results;
-};
-}
-
-#endif // CEPH_SCRUB_RESULT_H
diff --git a/src/osd/pg_scrubber.cc b/src/osd/pg_scrubber.cc
deleted file mode 100644 (file)
index 12f07ca..0000000
+++ /dev/null
@@ -1,2392 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=2 sw=2 smarttab
-
-#include "./pg_scrubber.h"  // the '.' notation used to affect clang-format order
-
-#include <iostream>
-#include <vector>
-
-#include "debug.h"
-
-#include "common/errno.h"
-#include "messages/MOSDOp.h"
-#include "messages/MOSDRepScrub.h"
-#include "messages/MOSDRepScrubMap.h"
-#include "messages/MOSDScrub.h"
-#include "messages/MOSDScrubReserve.h"
-
-#include "OSD.h"
-#include "ScrubStore.h"
-#include "scrub_machine.h"
-
-using std::list;
-using std::map;
-using std::pair;
-using std::set;
-using std::stringstream;
-using std::vector;
-using namespace Scrub;
-using namespace std::chrono;
-using namespace std::chrono_literals;
-using namespace std::literals;
-
-#define dout_context (m_pg->get_cct())
-#define dout_subsys ceph_subsys_osd
-#undef dout_prefix
-#define dout_prefix _prefix(_dout, this->m_pg)
-
-// Log-line prefix used by dout_prefix: whatever 't' itself generates,
-// followed by a scrubber tag carrying the PG id.
-template <class T> static ostream& _prefix(std::ostream* _dout, T* t)
-{
-  std::ostream& prefixed = t->gen_prefix(*_dout);
-  prefixed << " scrubber pg(" << t->pg_id << ") ";
-  return prefixed;
-}
-
-// Appends the name of each flag that is set in 'sf' (every name is preceded
-// by a single space). Nothing is written if no flag is set.
-ostream& operator<<(ostream& out, const scrub_flags_t& sf)
-{
-  const std::pair<bool, const char*> named_flags[] = {
-    {sf.auto_repair, " AUTO_REPAIR"},
-    {sf.check_repair, " CHECK_REPAIR"},
-    {sf.deep_scrub_on_error, " DEEP_SCRUB_ON_ERROR"},
-    {sf.required, " REQ_SCRUB"},
-  };
-
-  for (const auto& [is_set, label] : named_flags) {
-    if (is_set) {
-      out << label;
-    }
-  }
-  return out;
-}
-
-ostream& operator<<(ostream& out, const requested_scrub_t& sf)
-{
-  if (sf.must_repair)
-    out << " MUST_REPAIR";
-  if (sf.auto_repair)
-    out << " planned AUTO_REPAIR";
-  if (sf.check_repair)
-    out << " planned CHECK_REPAIR";
-  if (sf.deep_scrub_on_error)
-    out << " planned DEEP_SCRUB_ON_ERROR";
-  if (sf.must_deep_scrub)
-    out << " MUST_DEEP_SCRUB";
-  if (sf.must_scrub)
-    out << " MUST_SCRUB";
-  if (sf.time_for_deep)
-    out << " TIME_FOR_DEEP";
-  if (sf.need_auto)
-    out << " NEED_AUTO";
-  if (sf.req_scrub)
-    out << " planned REQ_SCRUB";
-
-  return out;
-}
-
-/*
- * A message stamped with an epoch that precedes the current interval is
- * stale: PrimaryLogPG::on_change() must have been called when that interval
- * ended, and the message can be safely discarded.
- *
- * Returns true if 'epoch_to_verify' is not older than the start of the
- * current interval.
- */
-bool PgScrubber::check_interval(epoch_t epoch_to_verify)
-{
-  const auto interval_start = m_pg->get_same_interval_since();
-  return epoch_to_verify >= interval_start;
-}
-
-// Should an event queued at 'epoch_to_verify' still be delivered to the
-// (Primary's) scrub state machine?
-bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify)
-{
-  // The event is silently dropped if any of the following holds:
-  //  - we are not scrubbing (the scrub was already terminated);
-  //  - the message predates the start of this scrub;
-  //  - a new interval has started (on_change() has already terminated that
-  //    old scrub).
-  if (!m_active || epoch_to_verify < m_epoch_start || !check_interval(epoch_to_verify)) {
-    return false;
-  }
-
-  ceph_assert(is_primary());
-
-  // the last test: were we instructed to abort?
-  return verify_against_abort(epoch_to_verify);
-}
-
-// Returns true if the scrub may go on. If an abort is called for, the scrub
-// state is cleared (once per abort - tracked via m_last_aborted) and 'false'
-// is returned.
-bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify)
-{
-  if (should_abort()) {
-    dout(10) << __func__ << " aborting. incoming epoch: " << epoch_to_verify
-             << " vs last-aborted: " << m_last_aborted << dendl;
-
-    // kill the scrub, unless this abort was already handled
-    if (epoch_to_verify > m_last_aborted) {
-      scrub_clear_state();
-      m_last_aborted = std::max(epoch_to_verify, m_epoch_start);
-    }
-    return false;
-  }
-
-  return true;
-}
-
-bool PgScrubber::should_abort() const
-{
-  if (m_flags.required) {
-    return false;  // not stopping 'required' scrubs for configuration changes
-  }
-
-  if (m_is_deep) {
-    if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
-       m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
-      dout(10) << "nodeep_scrub set, aborting" << dendl;
-      return true;
-    }
-  }
-
-  if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
-      m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) {
-    dout(10) << "noscrub set, aborting" << dendl;
-    return true;
-  }
-
-  return false;
-}
-
-//   initiating state-machine events --------------------------------
-
-/*
- * a note re the checks performed before sending scrub-initiating messages:
- *
- * For those ('StartScrub', 'AfterRepairScrub') scrub-initiation messages that
- * possibly were in the queue while the PG changed state and became unavailable for
- * scrubbing:
- *
- * The check_interval() catches all major changes to the PG. As for the other conditions
- * we may check (and see is_message_relevant() above):
- *
- * - we are not 'active' yet, so must not check against is_active(), and:
- *
- * - the 'abort' flags were just verified (when the triggering message was queued). As
- *   those are only modified at human speeds - they need not be queried again.
- *
- * Some of the considerations above are also relevant to the replica-side initiation
- * ('StartReplica' & 'StartReplicaNoWait').
- */
-
-// Deliver 'StartScrub' to the state machine, unless the interval has
-// changed since the request was queued.
-void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
-{
-  dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
-
-  // we may have lost our Primary status while the message languished in the queue
-  if (!check_interval(epoch_queued)) {
-    return;
-  }
-
-  dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued << dendl;
-  reset_epoch(epoch_queued);
-  m_fsm->my_states();
-  m_fsm->process_event(StartScrub{});
-  dout(10) << "scrubber event --<< StartScrub" << dendl;
-}
-
-// Deliver 'AfterRepairScrub' to the state machine, unless the interval has
-// changed since the request was queued.
-void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued)
-{
-  dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
-
-  // we may have lost our Primary status while the message languished in the queue
-  if (!check_interval(epoch_queued)) {
-    return;
-  }
-
-  dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued << dendl;
-  reset_epoch(epoch_queued);
-  m_fsm->my_states();
-  m_fsm->process_event(AfterRepairScrub{});
-  dout(10) << "scrubber event --<< AfterRepairScrub" << dendl;
-}
-
-// Forward 'Unblocked' to the state machine, if the queued event is still
-// relevant to the active scrub.
-void PgScrubber::send_scrub_unblock(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  const bool still_relevant = is_message_relevant(epoch_queued);
-  if (still_relevant) {
-    m_fsm->my_states();
-    m_fsm->process_event(Unblocked{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Forward 'InternalSchedScrub' to the state machine, if the queued event is
-// still relevant to the active scrub.
-void PgScrubber::send_scrub_resched(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  const bool still_relevant = is_message_relevant(epoch_queued);
-  if (still_relevant) {
-    m_fsm->my_states();
-    m_fsm->process_event(InternalSchedScrub{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
-          << " token: " << token << dendl;
-  if (is_primary()) {
-    // shouldn't happen. Ignore
-    dout(1) << "got a replica scrub request while Primary!" << dendl;
-    return;
-  }
-
-  if (check_interval(epoch_queued) && is_token_current(token)) {
-    m_fsm->my_states();
-    // save us some time by not waiting for updates if there are none
-    // to wait for. Affects the transition from NotActive into either
-    // ReplicaWaitUpdates or ActiveReplica.
-    if (pending_active_pushes())
-      m_fsm->process_event(StartReplica{});
-    else
-      m_fsm->process_event(StartReplicaNoWait{});
-  }
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Replica side: forward 'SchedReplica', provided both the interval and the
-// activation token are still current.
-void PgScrubber::send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
-           << " token: " << token << dendl;
-
-  const bool still_current = check_interval(epoch_queued) && is_token_current(token);
-  if (still_current) {
-    m_fsm->my_states();
-    m_fsm->process_event(SchedReplica{});  // retest for map availability
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// (Primary only) forward 'ActivePushesUpd' to the state machine, if the
-// queued event is still relevant to the active scrub.
-void PgScrubber::active_pushes_notification(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  const bool still_relevant = is_message_relevant(epoch_queued);
-  if (still_relevant) {
-    m_fsm->my_states();
-    m_fsm->process_event(ActivePushesUpd{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// (Primary only) forward 'UpdatesApplied' to the state machine, if the
-// queued event is still relevant to the active scrub.
-void PgScrubber::update_applied_notification(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  const bool still_relevant = is_message_relevant(epoch_queued);
-  if (still_relevant) {
-    m_fsm->my_states();
-    m_fsm->process_event(UpdatesApplied{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// (Primary only) forward 'DigestUpdate' to the state machine, if the queued
-// event is still relevant to the active scrub.
-void PgScrubber::digest_update_notification(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  const bool still_relevant = is_message_relevant(epoch_queued);
-  if (still_relevant) {
-    m_fsm->my_states();
-    m_fsm->process_event(DigestUpdate{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Forward 'IntLocalMapDone' to the state machine, if the queued event is
-// still relevant to the active scrub.
-void PgScrubber::send_local_map_done(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  const bool still_relevant = is_message_relevant(epoch_queued);
-  if (still_relevant) {
-    m_fsm->my_states();
-    m_fsm->process_event(Scrub::IntLocalMapDone{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Forward 'GotReplicas' to the state machine, if the queued event is still
-// relevant to the active scrub.
-void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  const bool still_relevant = is_message_relevant(epoch_queued);
-  if (still_relevant) {
-    m_fsm->my_states();
-    m_fsm->process_event(GotReplicas{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Forward 'ReplicaPushesUpd' to the state machine, unless the interval has
-// changed since the event was queued.
-void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  const bool same_interval = check_interval(epoch_queued);
-  if (same_interval) {
-    m_fsm->my_states();
-    m_fsm->process_event(ReplicaPushesUpd{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Forward 'RemotesReserved' to the state machine, unless the interval has
-// changed since the event was queued.
-void PgScrubber::send_remotes_reserved(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  // note: scrub is not active yet. Only the interval can be verified.
-  const bool same_interval = check_interval(epoch_queued);
-  if (same_interval) {
-    m_fsm->my_states();
-    m_fsm->process_event(RemotesReserved{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Forward 'ReservationFailure' to the state machine, unless the interval
-// has changed since the event was queued.
-void PgScrubber::send_reservation_failure(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  // do not check for 'active'! Only the interval is verified.
-  const bool same_interval = check_interval(epoch_queued);
-  if (same_interval) {
-    m_fsm->my_states();
-    m_fsm->process_event(ReservationFailure{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Unconditionally forward 'FullReset' to the state machine. 'epoch_queued'
-// is only logged - no relevance check is performed here.
-void PgScrubber::send_full_reset(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-  m_fsm->my_states();
-  m_fsm->process_event(Scrub::FullReset{});
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Forward 'SelectedChunkFree' to the state machine, unless the interval has
-// changed since the event was queued.
-void PgScrubber::send_chunk_free(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  const bool same_interval = check_interval(epoch_queued);
-  if (same_interval) {
-    m_fsm->my_states();
-    m_fsm->process_event(Scrub::SelectedChunkFree{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Forward 'ChunkIsBusy' to the state machine, unless the interval has
-// changed since the event was queued.
-void PgScrubber::send_chunk_busy(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  const bool same_interval = check_interval(epoch_queued);
-  if (same_interval) {
-    m_fsm->my_states();
-    m_fsm->process_event(Scrub::ChunkIsBusy{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Forward 'NextChunk' to the state machine, if the queued event is still
-// relevant to the active scrub.
-void PgScrubber::send_get_next_chunk(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
-  const bool still_relevant = is_message_relevant(epoch_queued);
-  if (still_relevant) {
-    m_fsm->my_states();
-    m_fsm->process_event(Scrub::NextChunk{});
-  }
-
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Unconditionally forward 'ScrubFinished' to the state machine.
-// No relevance test is made here: we can't check for "active".
-void PgScrubber::send_scrub_is_finished(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-  m_fsm->my_states();
-  m_fsm->process_event(Scrub::ScrubFinished{});
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// Unconditionally forward 'MapsCompared' to the state machine.
-// 'epoch_queued' is only logged - no relevance check is performed here.
-void PgScrubber::send_maps_compared(epoch_t epoch_queued)
-{
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-  m_fsm->my_states();
-  m_fsm->process_event(Scrub::MapsCompared{});
-  dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// -----------------
-
-// The answer is delegated to the scrub state machine.
-bool PgScrubber::is_reserving() const
-{
-  const bool reserving = m_fsm->is_reserving();
-  return reserving;
-}
-
-// Called when a new scrub is initiated: asserts that the state machine is
-// not active, then records the starting epoch and the scrub depth.
-void PgScrubber::reset_epoch(epoch_t epoch_queued)
-{
-  dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl;
-  m_fsm->assert_not_active();
-
-  m_is_deep = state_test(PG_STATE_DEEP_SCRUB);
-  m_needs_sleep = true;
-  m_epoch_start = epoch_queued;
-
-  // kept last, i.e. only called after the members above were set
-  update_op_mode_text();
-}
-
-// The priority to use when re-queueing the scrub: the scrub's own priority,
-// raised to (at least) the client-op priority for 'high_priority' requests.
-unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
-{
-  const unsigned int own_priority = m_flags.priority;
-  if (with_priority != Scrub::scrub_prio_t::high_priority) {
-    return own_priority;
-  }
-
-  const auto client_priority =
-    static_cast<unsigned int>(m_pg->get_cct()->_conf->osd_client_op_priority);
-  return std::max(own_priority, client_priority);
-}
-
-unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
-                                               unsigned int suggested_priority) const
-{
-  if (with_priority == Scrub::scrub_prio_t::high_priority) {
-    suggested_priority = std::max(suggested_priority,
-                                 (unsigned int)m_pg->cct->_conf->osd_client_op_priority);
-  }
-  return suggested_priority;
-}
-
-// ///////////////////////////////////////////////////////////////////// //
-// scrub-op registration handling
-
-// A zero registration stamp means 'not registered'.
-bool PgScrubber::is_scrub_registered() const
-{
-  const bool no_stamp = m_scrub_reg_stamp.is_zero();
-  return !no_stamp;
-}
-
-void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags)
-{
-  if (!is_primary()) {
-    // normal. No warning is required.
-    return;
-  }
-
-  dout(10) << __func__ << " planned: must? " << request_flags.must_scrub << " need-auto? "
-          << request_flags.need_auto << " stamp: " << m_pg->info.history.last_scrub_stamp
-          << dendl;
-
-  ceph_assert(!is_scrub_registered());
-
-  utime_t reg_stamp;
-  bool must = false;
-
-  if (request_flags.must_scrub || request_flags.need_auto) {
-    // Set the smallest time that isn't utime_t()
-    reg_stamp = PgScrubber::scrub_must_stamp();
-    must = true;
-  } else if (m_pg->info.stats.stats_invalid &&
-            m_pg->cct->_conf->osd_scrub_invalid_stats) {
-    reg_stamp = ceph_clock_now();
-    must = true;
-  } else {
-    reg_stamp = m_pg->info.history.last_scrub_stamp;
-  }
-
-  dout(15) << __func__ << " pg(" << m_pg_id << ") must: " << must
-          << " required:" << m_flags.required << " flags: " << request_flags
-          << " stamp: " << reg_stamp << dendl;
-
-  const double scrub_min_interval =
-    m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MIN_INTERVAL, 0.0);
-  const double scrub_max_interval =
-    m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0);
-
-  // note the sched_time, so we can locate this scrub, and remove it later
-  m_scrub_reg_stamp = m_osds->reg_pg_scrub(m_pg->info.pgid, reg_stamp, scrub_min_interval,
-                                          scrub_max_interval, must);
-  dout(15) << __func__ << " pg(" << m_pg_id << ") register next scrub, scrub time "
-          << m_scrub_reg_stamp << ", must = " << (int)must << dendl;
-}
-
-// Remove our registration (if any) from the OSD, and zero the stamp.
-void PgScrubber::unreg_next_scrub()
-{
-  if (!is_scrub_registered()) {
-    return;
-  }
-
-  dout(15) << __func__ << " existing-" << m_scrub_reg_stamp << dendl;
-  m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp);
-  m_scrub_reg_stamp = utime_t{};
-}
-
-void PgScrubber::scrub_requested(scrub_level_t scrub_level,
-                                scrub_type_t scrub_type,
-                                requested_scrub_t& req_flags)
-{
-  dout(10) << __func__ << (scrub_level == scrub_level_t::deep ? " deep " : " shallow ")
-          << (scrub_type == scrub_type_t::do_repair ? " repair-scrub " : " not-repair ")
-          << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered()
-          << dendl;
-
-  unreg_next_scrub();
-
-  req_flags.must_scrub = true;
-  req_flags.must_deep_scrub =
-    (scrub_level == scrub_level_t::deep) || (scrub_type == scrub_type_t::do_repair);
-  req_flags.must_repair = (scrub_type == scrub_type_t::do_repair);
-  // User might intervene, so clear this
-  req_flags.need_auto = false;
-  req_flags.req_scrub = true;
-
-  dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl;
-
-  reg_next_scrub(req_flags);
-}
-
-// Mark 'need_auto' in the planned-scrub flags, and re-register the scrub
-// (dropping the existing registration, if there is one).
-void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags)
-{
-  dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? "
-           << is_scrub_registered() << dendl;
-
-  unreg_next_scrub();
-
-  req_flags.need_auto = true;
-
-  reg_next_scrub(req_flags);
-}
-
-// Try to secure the local OSD's scrub resource.
-// Returns true (keeping the reservation object) on success.
-bool PgScrubber::reserve_local()
-{
-  // creating the reservation object translates into asking the OSD for
-  // the local scrub resource
-  m_local_osd_resource.emplace(m_pg, m_osds);
-
-  if (m_local_osd_resource->is_reserved()) {
-    return true;
-  }
-
-  // failed. Undo immediately.
-  m_local_osd_resource.reset();
-  return false;
-}
-
-// ----------------------------------------------------------------------------
-
-// Has the PG already applied all updates up to (and including) the version
-// the scrubber is waiting for (m_subset_last_update)?
-bool PgScrubber::has_pg_marked_new_updates() const
-{
-  const auto applied_by_pg = m_pg->recovery_state.get_last_update_applied();
-
-  dout(10) << __func__ << " recovery last: " << applied_by_pg
-           << " vs. scrub's: " << m_subset_last_update << dendl;
-
-  return applied_by_pg >= m_subset_last_update;
-}
-
-// Record (and log) the version that must be applied before the selected
-// chunk can be scrubbed.
-void PgScrubber::set_subset_last_update(eversion_t e)
-{
-  m_subset_last_update = e;
-
-  dout(15) << __func__ << " last-update: " << e << dendl;
-}
-
-// An update was applied. If the state machine is waiting for updates, and
-// the applied version has reached the one we are waiting for - queue an
-// 'update applied' scrub event.
-void PgScrubber::on_applied_when_primary(const eversion_t& applied_version)
-{
-  // we are only interested in updates if we are the Primary, and in state
-  // WaitLastUpdate
-  if (!m_fsm->is_accepting_updates() || !(applied_version >= m_subset_last_update)) {
-    return;
-  }
-
-  m_osds->queue_scrub_applied_update(m_pg, m_pg->is_scrub_blocking_ops());
-  dout(15) << __func__ << " update: " << applied_version
-           << " vs. required: " << m_subset_last_update << dendl;
-}
-
-/*
- * Select the next chunk of objects to scrub, starting at 'm_start'.
- *
- * On every call:
- * - 'm_primary_scrubmap' and 'm_received_maps' are reset.
- *
- * If the candidate range is available for scrubbing:
- * - 'm_end' is set to the end of the selected range;
- * - 'm_max_end' is advanced, if the new 'm_end' is past it.
- *
- * Returns 'false' if the range is not available (leaving 'm_end' and
- * 'm_max_end' untouched), or if the debug 'block range' counter is set.
- * Note that in the latter case 'm_end'/'m_max_end' were already updated.
- */
-bool PgScrubber::select_range()
-{
-  m_primary_scrubmap = ScrubMap{};
-  m_received_maps.clear();
-
-  /* get the start and end of our scrub chunk
-   *
-   * Our scrub chunk has an important restriction we're going to need to
-   * respect. We can't let head be start or end.
-   * Using a half-open interval means that if end == head,
-   * we'd scrub/lock head and the clone right next to head in different
-   * chunks which would allow us to miss clones created between
-   * scrubbing that chunk and scrubbing the chunk including head.
-   * This isn't true for any of the other clones since clones can
-   * only be created "just to the left of" head.  There is one exception
-   * to this: promotion of clones which always happens to the left of the
-   * left-most clone, but promote_object checks the scrubber in that
-   * case, so it should be ok.  Also, it's ok to "miss" clones at the
-   * left end of the range if we are a tier because they may legitimately
-   * not exist (see _scrub).
-   */
-  // the configured chunk sizes are scaled down by the preemption divisor;
-  // the minimum is never allowed to go below 3
-  int min_idx = std::max<int64_t>(
-    3, m_pg->get_cct()->_conf->osd_scrub_chunk_min / preemption_data.chunk_divisor());
-
-  int max_idx = std::max<int64_t>(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max /
-                                            preemption_data.chunk_divisor());
-
-  dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx
-          << " Div: " << preemption_data.chunk_divisor() << dendl;
-
-  hobject_t start = m_start;
-  hobject_t candidate_end;
-  std::vector<hobject_t> objects;
-  int ret = m_pg->get_pgbackend()->objects_list_partial(start, min_idx, max_idx, &objects,
-                                                       &candidate_end);
-  ceph_assert(ret >= 0);
-
-  if (!objects.empty()) {
-
-    // as long as the candidate end is the head of the last object listed,
-    // pull the end back (dropping that last object from the chunk), so that
-    // a head and its clones are never split between chunks
-    hobject_t back = objects.back();
-    while (candidate_end.is_head() && candidate_end == back.get_head()) {
-      candidate_end = back;
-      objects.pop_back();
-      if (objects.empty()) {
-       ceph_assert(0 ==
-                   "Somehow we got more than 2 objects which"
-                   "have the same head but are not clones");
-      }
-      back = objects.back();
-    }
-
-    if (candidate_end.is_head()) {
-      ceph_assert(candidate_end != back.get_head());
-      candidate_end = candidate_end.get_object_boundary();
-    }
-
-  } else {
-    // nothing was listed: the candidate end is expected to be 'max'
-    ceph_assert(candidate_end.is_max());
-  }
-
-  // is that range free for us? if not - we will be rescheduled later by whoever
-  // triggered us this time
-
-  if (!m_pg->_range_available_for_scrub(m_start, candidate_end)) {
-    // we'll be requeued by whatever made us unavailable for scrub
-    dout(10) << __func__ << ": scrub blocked somewhere in range "
-            << "[" << m_start << ", " << candidate_end << ")" << dendl;
-    return false;
-  }
-
-  // the range is ours: commit the selection
-  m_end = candidate_end;
-  if (m_end > m_max_end)
-    m_max_end = m_end;
-
-  dout(15) << __func__ << " range selected: " << m_start << " //// " << m_end << " //// "
-          << m_max_end << dendl;
-
-  // debug: be 'blocked' if told so by the 'pg scrub_debug block' asok command
-  // (each call consumes one unit of the counter)
-  if (m_debug_blockrange > 0) {
-    m_debug_blockrange--;
-    return false;
-  }
-  return true;
-}
-
-// Select the next chunk, and queue the matching (low priority) scrub
-// event: 'chunk free' if the selection succeeded, 'chunk busy' otherwise.
-void PgScrubber::select_range_n_notify()
-{
-  const bool chunk_selected = select_range();
-
-  if (!chunk_selected) {
-    // we will wait for the objects range to become available for scrubbing
-    dout(10) << __func__ << ": selected chunk is busy" << dendl;
-    m_osds->queue_scrub_chunk_busy(m_pg, Scrub::scrub_prio_t::low_priority);
-    return;
-  }
-
-  // the next chunk to handle is not blocked
-  dout(20) << __func__ << ": selection OK" << dendl;
-  m_osds->queue_scrub_chunk_free(m_pg, Scrub::scrub_prio_t::low_priority);
-}
-
-bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid)
-{
-  if (soid < m_start || soid >= m_end) {
-    return false;
-  }
-
-  dout(20) << __func__ << " " << soid << " can preempt? "
-          << preemption_data.is_preemptable() << " already preempted? "
-          << preemption_data.was_preempted() << dendl;
-
-  if (preemption_data.was_preempted()) {
-    // otherwise - write requests arriving while 'already preempted' is set
-    // but 'preemptable' is not - will not be allowed to continue, and will
-    // not be requeued on time.
-    return false;
-  }
-
-  if (preemption_data.is_preemptable()) {
-
-    dout(10) << __func__ << " " << soid << " preempted" << dendl;
-
-    // signal the preemption
-    preemption_data.do_preempt();
-    m_end = m_start;  // free the range we were scrubbing
-
-    return false;
-  }
-  return true;
-}
-
-// does [start, end] intersect [scrubber.start, scrubber.m_max_end) ?
-bool PgScrubber::range_intersects_scrub(const hobject_t& start, const hobject_t& end)
-{
-  if (!(start < m_max_end)) {
-    // the queried range begins at or after the end of the scrubbed one
-    return false;
-  }
-  return end >= m_start;
-}
-
-// Create a 'blocked range' warning object for this PG, configured with a
-// 300 seconds time span.
-Scrub::BlockedRangeWarning PgScrubber::acquire_blocked_alarm()
-{
-  auto alarm = std::make_unique<blocked_range_t>(m_osds, ceph::timespan{300s}, m_pg_id);
-  return alarm;
-}
-
-/**
- *  if we are required to sleep:
- *     arrange a callback sometimes later.
- *     be sure to be able to identify a stale callback.
- *  Otherwise: perform a requeue (i.e. - rescheduling thru the OSD queue)
- *    anyway.
- *
- *  Note the different priorities used: the post-sleep callback re-queues the
- *  scrub with 'low_priority', while the immediate requeue uses
- *  'high_priority'.
- */
-void PgScrubber::add_delayed_scheduling()
-{
-  m_end = m_start;  // not blocking any range now
-
-  // the sleep time is only fetched if 'm_needs_sleep' is set. It is 0 otherwise.
-  // NOTE(review): the '1000.0 *' suggests scrub_sleep_time() returns seconds - confirm
-  milliseconds sleep_time{0ms};
-  if (m_needs_sleep) {
-    double scrub_sleep = 1000.0 * m_osds->osd->scrub_sleep_time(m_flags.required);
-    sleep_time = milliseconds{long(scrub_sleep)};
-  }
-  dout(15) << __func__ << " sleep: " << sleep_time.count() << "ms. needed? "
-          << m_needs_sleep << dendl;
-
-  if (sleep_time.count()) {
-    // schedule a transition for some 'sleep_time' ms in the future
-
-    // 'm_needs_sleep' is cleared here, and only set again by the callback
-    m_needs_sleep = false;
-    m_sleep_started_at = ceph_clock_now();
-
-    // the following log line is used by osd-scrub-test.sh
-    dout(20) << __func__ << " scrub state is PendingTimer, sleeping" << dendl;
-
-    // the 'delayer' for crimson is different. Will be factored out.
-
-    // The callback captures the PG id (not the PG pointer) and looks the PG
-    // up when fired. The scrubber itself is captured as a raw pointer, and is
-    // only accessed after the PG lookup has succeeded.
-    // NOTE(review): this presumes the scrubber lives as long as its PG - confirm
-    spg_t pgid = m_pg->get_pgid();
-    auto callbk = new LambdaContext([osds = m_osds, pgid,
-                                    scrbr = this]([[maybe_unused]] int r) mutable {
-      PGRef pg = osds->osd->lookup_lock_pg(pgid);
-      if (!pg) {
-       lgeneric_subdout(g_ceph_context, osd, 10)
-         << "scrub_requeue_callback: Could not find "
-         << "PG " << pgid << " can't complete scrub requeue after sleep" << dendl;
-       return;
-      }
-      scrbr->m_needs_sleep = true;
-      lgeneric_dout(scrbr->get_pg_cct(), 7)
-       << "scrub_requeue_callback: slept for "
-       << ceph_clock_now() - scrbr->m_sleep_started_at << ", re-queuing scrub" << dendl;
-
-      scrbr->m_sleep_started_at = utime_t{};
-      osds->queue_for_scrub_resched(&(*pg), Scrub::scrub_prio_t::low_priority);
-      // releases the PG (presumably locked by lookup_lock_pg() above)
-      pg->unlock();
-    });
-
-    // the timer is only manipulated while holding 'sleep_lock'.
-    // The delay is converted from milliseconds back to (fractional) seconds.
-    std::lock_guard l(m_osds->sleep_lock);
-    m_osds->sleep_timer.add_event_after(sleep_time.count() / 1000.0f, callbk);
-
-  } else {
-    // just a requeue
-    m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority);
-  }
-}
-
-eversion_t PgScrubber::search_log_for_updates() const
-{
-  auto& projected = m_pg->projected_log.log;
-  auto pi = find_if(
-    projected.crbegin(), projected.crend(),
-    [this](const auto& e) -> bool { return e.soid >= m_start && e.soid < m_end; });
-
-  if (pi != projected.crend())
-    return pi->version;
-
-  // there was no relevant update entry in the log
-
-  auto& log = m_pg->recovery_state.get_pg_log().get_log().log;
-  auto p = find_if(log.crbegin(), log.crend(), [this](const auto& e) -> bool {
-    return e.soid >= m_start && e.soid < m_end;
-  });
-
-  if (p == log.crend())
-    return eversion_t{};
-  else
-    return p->version;
-}
-
-void PgScrubber::get_replicas_maps(bool replica_can_preempt)
-{
-  dout(10) << __func__ << " started in epoch/interval: " << m_epoch_start << "/"
-          << m_interval_start
-          << " pg same_interval_since: " << m_pg->info.history.same_interval_since
-          << dendl;
-
-  m_primary_scrubmap_pos.reset();
-
-  // ask replicas to scan and send maps
-  for (const auto& i : m_pg->get_acting_recovery_backfill()) {
-
-    if (i == m_pg_whoami)
-      continue;
-
-    m_maps_status.mark_replica_map_request(i);
-    _request_scrub_map(i, m_subset_last_update, m_start, m_end, m_is_deep,
-                      replica_can_preempt);
-  }
-
-  dout(10) << __func__ << " awaiting" << m_maps_status << dendl;
-}
-
-bool PgScrubber::was_epoch_changed() const
-{
-  // for crimson we have m_pg->get_info().history.same_interval_since
-  dout(10) << __func__ << " epoch_start: " << m_interval_start
-          << " from pg: " << m_pg->get_history().same_interval_since << dendl;
-
-  return m_interval_start < m_pg->get_history().same_interval_since;
-}
-
-void PgScrubber::mark_local_map_ready()
-{
-  m_maps_status.mark_local_map_ready();
-}
-
-bool PgScrubber::are_all_maps_available() const
-{
-  return m_maps_status.are_all_maps_available();
-}
-
-std::string PgScrubber::dump_awaited_maps() const
-{
-  return m_maps_status.dump();
-}
-
-void PgScrubber::update_op_mode_text()
-{
-  auto visible_repair = state_test(PG_STATE_REPAIR);
-  m_mode_desc = (visible_repair ? "repair" : (m_is_deep ? "deep-scrub" : "scrub"));
-
-  dout(10) << __func__ << ": repair: visible: " << (visible_repair ? "true" : "false")
-          << ", internal: " << (m_is_repair ? "true" : "false")
-          << ". Displayed: " << m_mode_desc << dendl;
-}
-
-void PgScrubber::_request_scrub_map(pg_shard_t replica,
-                                   eversion_t version,
-                                   hobject_t start,
-                                   hobject_t end,
-                                   bool deep,
-                                   bool allow_preemption)
-{
-  ceph_assert(replica != m_pg_whoami);
-  dout(10) << __func__ << " scrubmap from osd." << replica
-          << (deep ? " deep" : " shallow") << dendl;
-
-  auto repscrubop =
-    new MOSDRepScrub(spg_t(m_pg->info.pgid.pgid, replica.shard), version,
-                    get_osdmap_epoch(), m_pg->get_last_peering_reset(), start, end, deep,
-                    allow_preemption, m_flags.priority, m_pg->ops_blocked_by_scrub());
-
-  // default priority. We want the replica-scrub processed prior to any recovery
-  // or client io messages (we are holding a lock!)
-  m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch());
-}
-
-void PgScrubber::cleanup_store(ObjectStore::Transaction* t)
-{
-  if (!m_store)
-    return;
-
-  struct OnComplete : Context {
-    std::unique_ptr<Scrub::Store> store;
-    explicit OnComplete(std::unique_ptr<Scrub::Store>&& store) : store(std::move(store))
-    {}
-    void finish(int) override {}
-  };
-  m_store->cleanup(t);
-  t->register_on_complete(new OnComplete(std::move(m_store)));
-  ceph_assert(!m_store);
-}
-
-void PgScrubber::on_init()
-{
-  // going upwards from 'inactive'
-  ceph_assert(!is_scrub_active());
-
-  preemption_data.reset();
-  m_pg->publish_stats_to_osd();
-  m_interval_start = m_pg->get_history().same_interval_since;
-
-  dout(10) << __func__ << " start same_interval:" << m_interval_start << dendl;
-
-  //  create a new store
-  {
-    ObjectStore::Transaction t;
-    cleanup_store(&t);
-    m_store.reset(
-      Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll));
-    m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
-  }
-
-  m_start = m_pg->info.pgid.pgid.get_hobj_start();
-  m_active = true;
-}
-
-void PgScrubber::on_replica_init()
-{
-  m_active = true;
-}
-
-void PgScrubber::_scan_snaps(ScrubMap& smap)
-{
-  hobject_t head;
-  SnapSet snapset;
-
-  // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings
-  // in this function
-  dout(15) << "_scan_snaps starts" << dendl;
-
-  for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) {
-
-    const hobject_t& hoid = i->first;
-    ScrubMap::object& o = i->second;
-
-    dout(20) << __func__ << " " << hoid << dendl;
-
-    ceph_assert(!hoid.is_snapdir());
-    if (hoid.is_head()) {
-      // parse the SnapSet
-      bufferlist bl;
-      if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
-       continue;
-      }
-      bl.push_back(o.attrs[SS_ATTR]);
-      auto p = bl.cbegin();
-      try {
-       decode(snapset, p);
-      } catch (...) {
-       continue;
-      }
-      head = hoid.get_head();
-      continue;
-    }
-
-    if (hoid.snap < CEPH_MAXSNAP) {
-      // check and if necessary fix snap_mapper
-      if (hoid.get_head() != head) {
-       derr << __func__ << " no head for " << hoid << " (have " << head << ")" << dendl;
-       continue;
-      }
-      set<snapid_t> obj_snaps;
-      auto p = snapset.clone_snaps.find(hoid.snap);
-      if (p == snapset.clone_snaps.end()) {
-       derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset << dendl;
-       continue;
-      }
-      obj_snaps.insert(p->second.begin(), p->second.end());
-      set<snapid_t> cur_snaps;
-      int r = m_pg->snap_mapper.get_snaps(hoid, &cur_snaps);
-      if (r != 0 && r != -ENOENT) {
-       derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
-       ceph_abort();
-      }
-      if (r == -ENOENT || cur_snaps != obj_snaps) {
-       ObjectStore::Transaction t;
-       OSDriver::OSTransaction _t(m_pg->osdriver.get_transaction(&t));
-       if (r == 0) {
-         r = m_pg->snap_mapper.remove_oid(hoid, &_t);
-         if (r != 0) {
-           derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
-           ceph_abort();
-         }
-         m_pg->osd->clog->error()
-           << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
-           << m_pg->info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps
-           << ", oi: " << obj_snaps << "...repaired";
-       } else {
-         m_pg->osd->clog->error()
-           << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
-           << m_pg->info.pgid << " oid " << hoid << " snaps missing in mapper"
-           << ", should be: " << obj_snaps << " was " << cur_snaps << " r " << r
-           << "...repaired";
-       }
-       m_pg->snap_mapper.add_oid(hoid, obj_snaps, &_t);
-
-       // wait for repair to apply to avoid confusing other bits of the system.
-       {
-         dout(15) << __func__ << " wait on repair!" << dendl;
-
-         ceph::condition_variable my_cond;
-         ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock");
-         int e = 0;
-         bool done;
-
-         t.register_on_applied_sync(new C_SafeCond(my_lock, my_cond, &done, &e));
-
-         e = m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t));
-         if (e != 0) {
-           derr << __func__ << ": queue_transaction got " << cpp_strerror(e) << dendl;
-         } else {
-           std::unique_lock l{my_lock};
-           my_cond.wait(l, [&done] { return done; });
-         }
-       }
-      }
-    }
-  }
-}
-
-int PgScrubber::build_primary_map_chunk()
-{
-  epoch_t map_building_since = m_pg->get_osdmap_epoch();
-  dout(20) << __func__ << ": initiated at epoch " << map_building_since << dendl;
-
-  auto ret = build_scrub_map_chunk(m_primary_scrubmap, m_primary_scrubmap_pos, m_start,
-                                  m_end, m_is_deep);
-
-  if (ret == -EINPROGRESS) {
-    // reschedule another round of asking the backend to collect the scrub data
-    m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::low_priority);
-  }
-  return ret;
-}
-
-int PgScrubber::build_replica_map_chunk()
-{
-  dout(10) << __func__ << " interval start: " << m_interval_start
-          << " current token: " << m_current_token << " epoch: " << m_epoch_start
-          << " deep: " << m_is_deep << dendl;
-
-  auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end,
-                                  m_is_deep);
-
-  switch (ret) {
-
-    case -EINPROGRESS:
-      // must wait for the backend to finish. No external event source.
-      // (note: previous version used low priority here. Now switched to using the
-      // priority of the original message)
-      m_osds->queue_for_rep_scrub_resched(m_pg, m_replica_request_priority,
-                                         m_flags.priority, m_current_token);
-      break;
-
-    case 0: {
-      // finished!
-      m_cleaned_meta_map.clear_from(m_start);
-      m_cleaned_meta_map.insert(replica_scrubmap);
-      auto for_meta_scrub = clean_meta_map();
-      _scan_snaps(for_meta_scrub);
-
-      // the local map has been created. Send it to the primary.
-      // Note: once the message reaches the Primary, it may ask us for another
-      // chunk - and we better be done with the current scrub. Thus - the preparation of
-      // the reply message is separate, and we clear the scrub state before actually
-      // sending it.
-
-      auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption);
-      replica_handling_done();
-      dout(15) << __func__ << " chunk map sent " << dendl;
-      send_replica_map(reply);
-    } break;
-
-    default:
-      // negative retval: build_scrub_map_chunk() signalled an error
-      // Pre-Pacific code ignored this option, treating it as a success.
-      // \todo Add an error flag in the returning message.
-      dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret
-             << dendl;
-      replica_handling_done();
-      // only in debug mode for now:
-      assert(false && "backend error");
-      break;
-  };
-
-  return ret;
-}
-
-int PgScrubber::build_scrub_map_chunk(
-  ScrubMap& map, ScrubMapBuilder& pos, hobject_t start, hobject_t end, bool deep)
-{
-  dout(10) << __func__ << " [" << start << "," << end << ") "
-          << " pos " << pos << " Deep: " << deep << dendl;
-
-  // start
-  while (pos.empty()) {
-
-    pos.deep = deep;
-    map.valid_through = m_pg->info.last_update;
-
-    // objects
-    vector<ghobject_t> rollback_obs;
-    pos.ret =
-      m_pg->get_pgbackend()->objects_list_range(start, end, &pos.ls, &rollback_obs);
-    dout(10) << __func__ << " while pos empty " << pos.ret << dendl;
-    if (pos.ret < 0) {
-      dout(5) << "objects_list_range error: " << pos.ret << dendl;
-      return pos.ret;
-    }
-    dout(10) << __func__ << " pos.ls.empty()? " << (pos.ls.empty() ? "+" : "-") << dendl;
-    if (pos.ls.empty()) {
-      break;
-    }
-    m_pg->_scan_rollback_obs(rollback_obs);
-    pos.pos = 0;
-    return -EINPROGRESS;
-  }
-
-  // scan objects
-  while (!pos.done()) {
-
-    int r = m_pg->get_pgbackend()->be_scan_list(map, pos);
-    dout(30) << __func__ << " BE returned " << r << dendl;
-    if (r == -EINPROGRESS) {
-      dout(20) << __func__ << " in progress" << dendl;
-      return r;
-    }
-  }
-
-  // finish
-  dout(20) << __func__ << " finishing" << dendl;
-  ceph_assert(pos.done());
-  m_pg->_repair_oinfo_oid(map);
-
-  dout(20) << __func__ << " done, got " << map.objects.size() << " items" << dendl;
-  return 0;
-}
-
-/*
- * Process:
- * Building a map of objects suitable for snapshot validation.
- * The data in m_cleaned_meta_map is the left over partial items that need to
- * be completed before they can be processed.
- *
- * Snapshots in maps precede the head object, which is why we are scanning backwards.
- */
-ScrubMap PgScrubber::clean_meta_map()
-{
-  ScrubMap for_meta_scrub;
-
-  if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) {
-    m_cleaned_meta_map.swap(for_meta_scrub);
-  } else {
-    auto iter = m_cleaned_meta_map.objects.end();
-    --iter;  // not empty, see 'if' clause
-    auto begin = m_cleaned_meta_map.objects.begin();
-    if (iter->first.has_snapset()) {
-      ++iter;
-    } else {
-      while (iter != begin) {
-       auto next = iter--;
-       if (next->first.get_head() != iter->first.get_head()) {
-         ++iter;
-         break;
-       }
-      }
-    }
-    for_meta_scrub.objects.insert(begin, iter);
-    m_cleaned_meta_map.objects.erase(begin, iter);
-  }
-
-  return for_meta_scrub;
-}
-
-void PgScrubber::run_callbacks()
-{
-  std::list<Context*> to_run;
-  to_run.swap(m_callbacks);
-
-  for (auto& tr : to_run) {
-    tr->complete(0);
-  }
-}
-
-void PgScrubber::maps_compare_n_cleanup()
-{
-  scrub_compare_maps();
-  m_start = m_end;
-  run_callbacks();
-  requeue_waiting();
-  m_osds->queue_scrub_maps_compared(m_pg, Scrub::scrub_prio_t::low_priority);
-}
-
-Scrub::preemption_t& PgScrubber::get_preemptor()
-{
-  return preemption_data;
-}
-
-/*
- * Process note: called for the arriving "give me your map, replica!" request. Unlike
- * the original implementation, we do not requeue the Op waiting for
- * updates. Instead - we trigger the FSM.
- */
-void PgScrubber::replica_scrub_op(OpRequestRef op)
-{
-  op->mark_started();
-  auto msg = op->get_req<MOSDRepScrub>();
-  dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch
-          << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl;
-
-  // are we still processing a previous scrub-map request without noticing that the
-  // interval changed? won't see it here, but rather at the reservation stage.
-
-  if (msg->map_epoch < m_pg->info.history.same_interval_since) {
-    dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch
-            << " < " << m_pg->info.history.same_interval_since << dendl;
-
-    // is there a general sync issue? are we holding a stale reservation?
-    // not checking now - assuming we will actively react to interval change.
-
-    return;
-  }
-
-  replica_scrubmap = ScrubMap{};
-  replica_scrubmap_pos = ScrubMapBuilder{};
-
-  m_replica_min_epoch = msg->min_epoch;
-  m_start = msg->start;
-  m_end = msg->end;
-  m_max_end = msg->end;
-  m_is_deep = msg->deep;
-  m_interval_start = m_pg->info.history.same_interval_since;
-  m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority
-                                                 : Scrub::scrub_prio_t::low_priority;
-  m_flags.priority = msg->priority ? msg->priority : m_pg->get_scrub_priority();
-
-  preemption_data.reset();
-  preemption_data.force_preemptability(msg->allow_preemption);
-
-  replica_scrubmap_pos.reset();
-
-  // make sure the FSM is at NotActive
-  m_fsm->assert_not_active();
-
-  m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority,
-                             m_current_token);
-}
-
-void PgScrubber::set_op_parameters(requested_scrub_t& request)
-{
-  dout(10) << __func__ << " input: " << request << dendl;
-
-  // write down the epoch of starting a new scrub. Will be used
-  // to discard stale messages from previous aborted scrubs.
-  m_epoch_start = m_pg->get_osdmap_epoch();
-
-  m_flags.check_repair = request.check_repair;
-  m_flags.auto_repair = request.auto_repair || request.need_auto;
-  m_flags.required = request.req_scrub || request.must_scrub;
-
-  m_flags.priority = (request.must_scrub || request.need_auto)
-                      ? get_pg_cct()->_conf->osd_requested_scrub_priority
-                      : m_pg->get_scrub_priority();
-
-  state_set(PG_STATE_SCRUBBING);
-
-  // will we be deep-scrubbing?
-  if (request.must_deep_scrub || request.need_auto || request.time_for_deep) {
-    state_set(PG_STATE_DEEP_SCRUB);
-  }
-
-  // m_is_repair is set for either 'must_repair' or 'repair-on-the-go' (i.e.
-  // deep-scrub with the auto_repair configuration flag set). m_is_repair value
-  // determines the scrubber behavior.
-  // PG_STATE_REPAIR, on the other hand, is only used for status reports (inc. the
-  // PG status as appearing in the logs).
-  m_is_repair = request.must_repair || m_flags.auto_repair;
-  if (request.must_repair) {
-    state_set(PG_STATE_REPAIR);
-    // not calling update_op_mode_text() yet, as m_is_deep not set yet
-  }
-
-  // the publishing here seems to be required for tests synchronization
-  m_pg->publish_stats_to_osd();
-  m_flags.deep_scrub_on_error = request.deep_scrub_on_error;
-}
-
-void PgScrubber::scrub_compare_maps()
-{
-  dout(10) << __func__ << " has maps, analyzing" << dendl;
-
-  // construct authoritative scrub map for type-specific scrubbing
-  m_cleaned_meta_map.insert(m_primary_scrubmap);
-  map<hobject_t, pair<std::optional<uint32_t>, std::optional<uint32_t>>> missing_digest;
-
-  map<pg_shard_t, ScrubMap*> maps;
-  maps[m_pg_whoami] = &m_primary_scrubmap;
-
-  for (const auto& i : m_pg->get_acting_recovery_backfill()) {
-    if (i == m_pg_whoami)
-      continue;
-    dout(2) << __func__ << " replica " << i << " has "
-           << m_received_maps[i].objects.size() << " items" << dendl;
-    maps[i] = &m_received_maps[i];
-  }
-
-  set<hobject_t> master_set;
-
-  // Construct master set
-  for (const auto& map : maps) {
-    for (const auto& i : map.second->objects) {
-      master_set.insert(i.first);
-    }
-  }
-
-  stringstream ss;
-  m_pg->get_pgbackend()->be_omap_checks(maps, master_set, m_omap_stats, ss);
-
-  if (!ss.str().empty()) {
-    m_osds->clog->warn(ss);
-  }
-
-  if (m_pg->recovery_state.get_acting_recovery_backfill().size() > 1) {
-
-    dout(10) << __func__ << "  comparing replica scrub maps" << dendl;
-
-    // Map from object with errors to good peer
-    map<hobject_t, list<pg_shard_t>> authoritative;
-
-    dout(2) << __func__ << ": primary (" << m_pg->get_primary() << ") has "
-           << m_primary_scrubmap.objects.size() << " items" << dendl;
-
-    ss.str("");
-    ss.clear();
-
-    m_pg->get_pgbackend()->be_compare_scrubmaps(
-      maps, master_set, m_is_repair, m_missing, m_inconsistent,
-      authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(),
-      m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss);
-
-    if (!ss.str().empty()) {
-      m_osds->clog->error(ss);
-    }
-
-    for (auto& i : authoritative) {
-      list<pair<ScrubMap::object, pg_shard_t>> good_peers;
-      for (list<pg_shard_t>::const_iterator j = i.second.begin(); j != i.second.end();
-          ++j) {
-       good_peers.emplace_back(maps[*j]->objects[i.first], *j);
-      }
-      m_authoritative.emplace(i.first, good_peers);
-    }
-
-    for (auto i = authoritative.begin(); i != authoritative.end(); ++i) {
-      m_cleaned_meta_map.objects.erase(i->first);
-      m_cleaned_meta_map.objects.insert(
-       *(maps[i->second.back()]->objects.find(i->first)));
-    }
-  }
-
-  auto for_meta_scrub = clean_meta_map();
-
-  // ok, do the pg-type specific scrubbing
-
-  // (Validates consistency of the object info and snap sets)
-  scrub_snapshot_metadata(for_meta_scrub, missing_digest);
-
-  // Called here on the primary can use an authoritative map if it isn't the primary
-  _scan_snaps(for_meta_scrub);
-
-  if (!m_store->empty()) {
-
-    if (m_is_repair) {
-      dout(10) << __func__ << ": discarding scrub results" << dendl;
-      m_store->flush(nullptr);
-    } else {
-      dout(10) << __func__ << ": updating scrub object" << dendl;
-      ObjectStore::Transaction t;
-      m_store->flush(&t);
-      m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
-    }
-  }
-}
-
-ScrubMachineListener::MsgAndEpoch PgScrubber::prep_replica_map_msg(
-  PreemptionNoted was_preempted)
-{
-  dout(10) << __func__ << " min epoch:" << m_replica_min_epoch << dendl;
-
-  auto reply =
-    make_message<MOSDRepScrubMap>(spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard),
-                                 m_replica_min_epoch, m_pg_whoami);
-
-  reply->preempted = (was_preempted == PreemptionNoted::preempted);
-  ::encode(replica_scrubmap, reply->get_data());
-
-  return ScrubMachineListener::MsgAndEpoch{reply, m_replica_min_epoch};
-}
-
-void PgScrubber::send_replica_map(const MsgAndEpoch& preprepared)
-{
-  m_pg->send_cluster_message(m_pg->get_primary().osd, preprepared.m_msg,
-                            preprepared.m_epoch, false);
-}
-
-void PgScrubber::send_preempted_replica()
-{
-  auto reply =
-    make_message<MOSDRepScrubMap>(spg_t{m_pg->info.pgid.pgid, m_pg->get_primary().shard},
-                                 m_replica_min_epoch, m_pg_whoami);
-
-  reply->preempted = true;
-  ::encode(replica_scrubmap, reply->get_data()); // must not skip this
-  m_pg->send_cluster_message(m_pg->get_primary().osd, reply, m_replica_min_epoch, false);
-}
-
-/*
- *  - if the replica lets us know it was interrupted, we mark the chunk as interrupted.
- *    The state-machine will react to that when all replica maps are received.
- *  - when all maps are received, we signal the FSM with the GotReplicas event (see
- *    scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the
- *    FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to
- *    handle.
- */
-void PgScrubber::map_from_replica(OpRequestRef op)
-{
-  auto m = op->get_req<MOSDRepScrubMap>();
-  dout(15) << __func__ << " " << *m << dendl;
-
-  if (m->map_epoch < m_pg->info.history.same_interval_since) {
-    dout(10) << __func__ << " discarding old from " << m->map_epoch << " < "
-            << m_pg->info.history.same_interval_since << dendl;
-    return;
-  }
-
-  auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
-
-  m_received_maps[m->from].decode(p, m_pg->info.pgid.pool());
-  dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl;
-
-  auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from);
-  if (!is_ok) {
-    // previously an unexpected map was triggering an assert. Now, as scrubs can be
-    // aborted at any time, the chances of this happening have increased, and aborting is
-    // not justified
-    dout(1) << __func__ << err_txt << " from OSD " << m->from << dendl;
-    return;
-  }
-
-  if (m->preempted) {
-    dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
-    preemption_data.do_preempt();
-  }
-
-  if (m_maps_status.are_all_maps_available()) {
-    dout(15) << __func__ << " all repl-maps available" << dendl;
-    m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops());
-  }
-}
-
-void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
-{
-  dout(10) << __func__ << " " << *op->get_req() << dendl;
-  op->mark_started();
-  auto request_ep = op->get_req<MOSDScrubReserve>()->get_map_epoch();
-
-  /*
-   *  if we are currently holding a reservation, then:
-   *  either (1) we, the scrubber, did not yet notice an interval change. The remembered
-   *  reservation epoch is from before our interval, and we can silently discard the
-   *  reservation (no message is required).
-   *  or:
-   *  (2) the interval hasn't changed, but the same Primary that (we think) holds the
-   *  lock just sent us a new request. Note that we know it's the same Primary, as
-   *  otherwise the interval would have changed.
-   *  Ostensibly we can discard & redo the reservation. But then we
-   *  will be temporarily releasing the OSD resource - and might not be able to grab it
-   *  again. Thus, we simply treat this as a successful new request
-   *  (but mark the fact that if there is a previous request from the primary to
-   *  scrub a specific chunk - that request is now defunct).
-   */
-
-  if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) {
-    // we are holding a stale reservation from a past epoch
-    m_remote_osd_resource.reset();
-    dout(10) << __func__ << " stale reservation request" << dendl;
-  }
-
-  if (request_ep < m_pg->get_same_interval_since()) {
-    // will not ack stale requests
-    return;
-  }
-
-  bool granted{false};
-  if (m_remote_osd_resource.has_value()) {
-
-    dout(10) << __func__ << " already reserved." << dendl;
-
-    /*
-     * it might well be that we did not yet finish handling the latest scrub-op from
-     * our primary. This happens, for example, if 'noscrub' was set via a command, then
-     * reset. The primary in this scenario will remain in the same interval, but we do need
-     * to reset our internal state (otherwise - the first renewed 'give me your scrub map'
-     * from the primary will see us in active state, crashing the OSD).
-     */
-    advance_token();
-    granted = true;
-
-  } else if (m_pg->cct->_conf->osd_scrub_during_recovery ||
-            !m_osds->is_recovery_active()) {
-    m_remote_osd_resource.emplace(m_pg, m_osds, request_ep);
-    // OSD resources allocated?
-    granted = m_remote_osd_resource->is_reserved();
-    if (!granted) {
-      // just forget it
-      m_remote_osd_resource.reset();
-      dout(20) << __func__ << ": failed to reserve remotely" << dendl;
-    }
-  }
-
-  dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
-
-  Message* reply = new MOSDScrubReserve(
-    spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), request_ep,
-    granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami);
-
-  m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
-}
-
-void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
-{
-  dout(10) << __func__ << " " << *op->get_req() << dendl;
-  op->mark_started();
-
-  if (m_reservations.has_value()) {
-    m_reservations->handle_reserve_grant(op, from);
-  } else {
-    derr << __func__ << ": received unsolicited reservation grant from osd " << from
-        << " (" << op << ")" << dendl;
-  }
-}
-
-void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
-{
-  dout(10) << __func__ << " " << *op->get_req() << dendl;
-  op->mark_started();
-
-  if (m_reservations.has_value()) {
-    // there is an active reservation process. No action is required otherwise.
-    m_reservations->handle_reserve_reject(op, from);
-  }
-}
-
-void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
-{
-  dout(10) << __func__ << " " << *op->get_req() << dendl;
-  op->mark_started();
-
-  /*
-   * this specific scrub session has terminated. All incoming events carrying the old
-   * tag will be discarded.
-   */
-  advance_token();
-  m_remote_osd_resource.reset();
-}
-
-void PgScrubber::discard_replica_reservations()
-{
-  dout(10) << __func__ << dendl;
-  if (m_reservations.has_value()) {
-    m_reservations->discard_all();
-  }
-}
-
-void PgScrubber::clear_scrub_reservations()
-{
-  dout(10) << __func__ << dendl;
-  m_reservations.reset();        // the remote reservations
-  m_local_osd_resource.reset();          // the local reservation
-  m_remote_osd_resource.reset();  // we as replica reserved for a Primary
-}
-
-void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text)
-{
-  ceph_assert(m_pg->recovery_state.get_backfill_targets().empty());
-
-  std::vector<std::pair<int, Message*>> messages;
-  messages.reserve(m_pg->get_actingset().size());
-
-  epoch_t epch = get_osdmap_epoch();
-
-  for (auto& p : m_pg->get_actingset()) {
-
-    if (p == m_pg_whoami)
-      continue;
-
-    dout(10) << "scrub requesting " << op_text << " from osd." << p << " Epoch: " << epch
-            << dendl;
-    Message* m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epch, opcode,
-                                     m_pg_whoami);
-    messages.push_back(std::make_pair(p.osd, m));
-  }
-
-  if (!messages.empty()) {
-    m_osds->send_message_osd_cluster(messages, epch);
-  }
-}
-
-void PgScrubber::unreserve_replicas()
-{
-  dout(10) << __func__ << dendl;
-  m_reservations.reset();
-}
-
-[[nodiscard]] bool PgScrubber::scrub_process_inconsistent()
-{
-  dout(10) << __func__ << ": checking authoritative (mode="
-          << m_mode_desc << ", auth remaining #: " << m_authoritative.size()
-          << ")" << dendl;
-
-  // authoritative only store objects which are missing or inconsistent.
-  if (!m_authoritative.empty()) {
-
-    stringstream ss;
-    ss << m_pg->info.pgid << " " << m_mode_desc << " " << m_missing.size() << " missing, "
-       << m_inconsistent.size() << " inconsistent objects";
-    dout(2) << ss.str() << dendl;
-    m_osds->clog->error(ss);
-
-    if (m_is_repair) {
-      state_clear(PG_STATE_CLEAN);
-      // we know we have a problem, so it's OK to set the user-visible flag
-      // even if we only reached here via auto-repair
-      state_set(PG_STATE_REPAIR);
-      update_op_mode_text();
-
-      for (const auto& [hobj, shrd_list] : m_authoritative) {
-
-       auto missing_entry = m_missing.find(hobj);
-
-       if (missing_entry != m_missing.end()) {
-         m_pg->repair_object(hobj, shrd_list, missing_entry->second);
-         m_fixed_count += missing_entry->second.size();
-       }
-
-       if (m_inconsistent.count(hobj)) {
-         m_pg->repair_object(hobj, shrd_list, m_inconsistent[hobj]);
-         m_fixed_count += m_inconsistent[hobj].size();
-       }
-      }
-    }
-  }
-  return (!m_authoritative.empty() && m_is_repair);
-}
-
-/*
- * note: only called for the Primary.
- *
- * Wraps up a scrub session. In order:
- *  - resets the PG's planned-scrub flags;
- *  - undoes an auto-repair if more than osd_scrub_auto_repair_num_errors
- *    authoritative entries were collected;
- *  - remembers (in do_auto_scrub) whether a re-scrub should be requested, as a
- *    'deep_scrub_on_error' scrub ended with a tolerable number of errors;
- *  - runs the type-specific _scrub_finish(), then scrub_process_inconsistent();
- *  - reports the errors/fixed counts to the cluster log;
- *  - updates the PG history & stats, and queues the resulting transaction;
- *  - either queues a DoRecovery peering event (if has_error), or clears the
- *    repair state;
- *  - calls cleanup_on_finish() and - if still active & primary - shares the PG info.
- */
-void PgScrubber::scrub_finish()
-{
-  dout(10) << __func__ << " before flags: " << m_flags
-          << ". repair state: " << (state_test(PG_STATE_REPAIR) ? "repair" : "no-repair")
-          << ". deep_scrub_on_error: " << m_flags.deep_scrub_on_error << dendl;
-
-  ceph_assert(m_pg->is_locked());
-
-  m_pg->m_planned_scrub = requested_scrub_t{};  // back to a default-constructed flag set
-
-  // if the repair request comes from auto-repair and large number of errors,
-  // we would like to cancel auto-repair
-  if (m_is_repair && m_flags.auto_repair &&
-      m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
-
-    dout(10) << __func__ << " undoing the repair" << dendl;
-    state_clear(PG_STATE_REPAIR); // not expected to be set, anyway
-    m_is_repair = false;
-    update_op_mode_text();
-  }
-
-  bool do_auto_scrub = false;  // consumed after cleanup_on_finish(), below
-
-  // if a regular scrub had errors within the limit, do a deep scrub to auto repair
-  if (m_flags.deep_scrub_on_error && !m_authoritative.empty() &&
-      m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
-    ceph_assert(!m_is_deep);
-    do_auto_scrub = true;
-    dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl;
-  }
-
-  m_flags.deep_scrub_on_error = false;
-
-  // type-specific finish (can tally more errors)
-  _scrub_finish();
-
-  bool has_error = scrub_process_inconsistent();
-
-  {
-    // the summary line sent to the cluster log: 'error' level if any errors were
-    // counted, 'debug' level otherwise
-    stringstream oss;
-    oss << m_pg->info.pgid.pgid << " " << m_mode_desc << " ";
-    int total_errors = m_shallow_errors + m_deep_errors;
-    if (total_errors)
-      oss << total_errors << " errors";
-    else
-      oss << "ok";
-    if (!m_is_deep && m_pg->info.stats.stats.sum.num_deep_scrub_errors)
-      oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors
-         << " remaining deep scrub error details lost)";
-    if (m_is_repair)
-      oss << ", " << m_fixed_count << " fixed";
-    if (total_errors)
-      m_osds->clog->error(oss);
-    else
-      m_osds->clog->debug(oss);
-  }
-
-  // Since we don't know which errors were fixed, we can only clear them
-  // when every one has been fixed.
-  if (m_is_repair) {
-    if (m_fixed_count == m_shallow_errors + m_deep_errors) {
-
-      ceph_assert(m_is_deep);
-      m_shallow_errors = 0;
-      m_deep_errors = 0;
-      dout(20) << __func__ << " All may be fixed" << dendl;
-
-    } else if (has_error) {
-
-      // Deep scrub in order to get corrected error counts
-      m_pg->scrub_after_recovery = true;
-      m_pg->m_planned_scrub.req_scrub =
-       m_pg->m_planned_scrub.req_scrub || m_flags.required;
-
-      dout(20) << __func__ << " Current 'required': " << m_flags.required
-              << " Planned 'req_scrub': " << m_pg->m_planned_scrub.req_scrub << dendl;
-
-    } else if (m_shallow_errors || m_deep_errors) {
-
-      // We have errors but nothing can be fixed, so there is no repair
-      // possible.
-      state_set(PG_STATE_FAILED_REPAIR);
-      dout(10) << __func__ << " " << (m_shallow_errors + m_deep_errors)
-              << " error(s) present with no repair possible" << dendl;
-    }
-  }
-
-  {
-    // finish up
-    ObjectStore::Transaction t;
-    m_pg->recovery_state.update_stats(
-      [this](auto& history, auto& stats) {
-       dout(10) << "m_pg->recovery_state.update_stats()" << dendl;
-       utime_t now = ceph_clock_now();
-       history.last_scrub = m_pg->recovery_state.get_info().last_update;
-       history.last_scrub_stamp = now;
-       if (m_is_deep) {
-         history.last_deep_scrub = m_pg->recovery_state.get_info().last_update;
-         history.last_deep_scrub_stamp = now;
-       }
-
-       if (m_is_deep) {
-         // a deep scrub refreshes both error counters and the omap statistics
-         if ((m_shallow_errors == 0) && (m_deep_errors == 0))
-           history.last_clean_scrub_stamp = now;
-         stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
-         stats.stats.sum.num_deep_scrub_errors = m_deep_errors;
-         stats.stats.sum.num_large_omap_objects = m_omap_stats.large_omap_objects;
-         stats.stats.sum.num_omap_bytes = m_omap_stats.omap_bytes;
-         stats.stats.sum.num_omap_keys = m_omap_stats.omap_keys;
-         dout(25) << "scrub_finish shard " << m_pg_whoami
-                  << " num_omap_bytes = " << stats.stats.sum.num_omap_bytes
-                  << " num_omap_keys = " << stats.stats.sum.num_omap_keys << dendl;
-       } else {
-         // a shallow scrub only refreshes the shallow-errors counter
-         stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
-         // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
-         // because of deep-scrub errors
-         if (m_shallow_errors == 0)
-           history.last_clean_scrub_stamp = now;
-       }
-       stats.stats.sum.num_scrub_errors = stats.stats.sum.num_shallow_scrub_errors +
-                                          stats.stats.sum.num_deep_scrub_errors;
-       if (m_flags.check_repair) {
-         m_flags.check_repair = false;
-         if (m_pg->info.stats.stats.sum.num_scrub_errors) {
-           state_set(PG_STATE_FAILED_REPAIR);
-           dout(10) << "scrub_finish " << m_pg->info.stats.stats.sum.num_scrub_errors
-                    << " error(s) still present after re-scrub" << dendl;
-         }
-       }
-       return true;
-      },
-      &t);
-    int tr = m_osds->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
-    ceph_assert(tr == 0);
-
-    if (!m_pg->snap_trimq.empty()) {
-      dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
-      m_pg->snap_trimmer_scrub_complete();
-    }
-  }
-
-  if (has_error) {
-    m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared<PGPeeringEvent>(
-      get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery())));
-  } else {
-    m_is_repair = false;
-    state_clear(PG_STATE_REPAIR);
-    update_op_mode_text();
-  }
-
-  cleanup_on_finish();  // note: resets m_flags and the internal scrub state
-  if (do_auto_scrub) {
-    request_rescrubbing(m_pg->m_planned_scrub);
-  }
-
-  if (m_pg->is_active() && m_pg->is_primary()) {
-    m_pg->recovery_state.share_pg_info();
-  }
-}
-
-void PgScrubber::on_digest_updates()
-{
-  dout(10) << __func__ << " #pending: " << num_digest_updates_pending << " pending? "
-          << num_digest_updates_pending
-          << (m_end.is_max() ? " <last chunk> " : " <mid chunk> ") << dendl;
-
-  if (num_digest_updates_pending > 0) {
-    // do nothing for now. We will be called again when new updates arrive
-    return;
-  }
-
-  // got all updates, and finished with this chunk. Any more?
-  if (m_end.is_max()) {
-
-    scrub_finish();
-    m_osds->queue_scrub_is_finished(m_pg);
-
-  } else {
-    // go get a new chunk (via "requeue")
-    preemption_data.reset();
-    m_osds->queue_scrub_next_chunk(m_pg, m_pg->is_scrub_blocking_ops());
-  }
-}
-
-
-/*
- * note that the flags-set fetched from the PG (m_pg->m_planned_scrub)
- * is cleared once scrubbing starts; Some of the values dumped here are
- * thus transitory.
- */
-void PgScrubber::dump(ceph::Formatter* f) const
-{
-  f->open_object_section("scrubber");
-  f->dump_stream("epoch_start") << m_interval_start;
-  f->dump_bool("active", m_active);
-  if (m_active) {
-    f->dump_stream("start") << m_start;
-    f->dump_stream("end") << m_end;
-    f->dump_stream("m_max_end") << m_max_end;
-    f->dump_stream("subset_last_update") << m_subset_last_update;
-    f->dump_bool("deep", m_is_deep);
-    f->dump_bool("must_scrub", (m_pg->m_planned_scrub.must_scrub || m_flags.required));
-    f->dump_bool("must_deep_scrub", m_pg->m_planned_scrub.must_deep_scrub);
-    f->dump_bool("must_repair", m_pg->m_planned_scrub.must_repair);
-    f->dump_bool("need_auto", m_pg->m_planned_scrub.need_auto);
-    f->dump_bool("req_scrub", m_flags.required);
-    f->dump_bool("time_for_deep", m_pg->m_planned_scrub.time_for_deep);
-    f->dump_bool("auto_repair", m_flags.auto_repair);
-    f->dump_bool("check_repair", m_flags.check_repair);
-    f->dump_bool("deep_scrub_on_error", m_flags.deep_scrub_on_error);
-    f->dump_stream("scrub_reg_stamp") << m_scrub_reg_stamp;  // utime_t
-    f->dump_unsigned("priority", m_flags.priority);
-    f->dump_int("shallow_errors", m_shallow_errors);
-    f->dump_int("deep_errors", m_deep_errors);
-    f->dump_int("fixed", m_fixed_count);
-    {
-      f->open_array_section("waiting_on_whom");
-      for (const auto& p : m_maps_status.get_awaited()) {
-       f->dump_stream("shard") << p;
-      }
-      f->close_section();
-    }
-  }
-  f->close_section();
-}
-
-
-void PgScrubber::handle_query_state(ceph::Formatter* f)
-{
-  dout(10) << __func__ << dendl;
-
-  f->open_object_section("scrub");
-  f->dump_stream("scrubber.epoch_start") << m_interval_start;
-  f->dump_bool("scrubber.active", m_active);
-  f->dump_stream("scrubber.start") << m_start;
-  f->dump_stream("scrubber.end") << m_end;
-  f->dump_stream("scrubber.m_max_end") << m_max_end;
-  f->dump_stream("scrubber.m_subset_last_update") << m_subset_last_update;
-  f->dump_bool("scrubber.deep", m_is_deep);
-  {
-    f->open_array_section("scrubber.waiting_on_whom");
-    for (const auto& p : m_maps_status.get_awaited()) {
-      f->dump_stream("shard") << p;
-    }
-    f->close_section();
-  }
-
-  f->dump_string("comment", "DEPRECATED - may be removed in the next release");
-
-  f->close_section();
-}
-
-PgScrubber::~PgScrubber() = default;  // NOTE(review): presumably out-of-line so that ScrubMachine (only forward-declared in the header) is complete here - confirm
-
-PgScrubber::PgScrubber(PG* pg)
-    : m_pg{pg}
-    , m_pg_id{pg->pg_id}
-    , m_osds{m_pg->osd}
-    , m_pg_whoami{pg->pg_whoami}
-    , preemption_data{pg}
-{
-  m_fsm = std::make_unique<ScrubMachine>(m_pg, this);
-  m_fsm->initiate();
-}
-
-void PgScrubber::reserve_replicas()  // (re)create the replicas' reservation object
-{
-  dout(10) << __func__ << dendl;
-  m_reservations.emplace(m_pg, m_pg_whoami);  // the ReplicaReservations ctor sends the requests
-}
-
-void PgScrubber::cleanup_on_finish()  // end-of-scrub teardown; called from scrub_finish()
-{
-  dout(10) << __func__ << dendl;
-  ceph_assert(m_pg->is_locked());  // must be called with the PG locked
-
-  state_clear(PG_STATE_SCRUBBING);
-  state_clear(PG_STATE_DEEP_SCRUB);  // note: PG_STATE_REPAIR is not touched here (compare clear_pgscrub_state())
-  m_pg->publish_stats_to_osd();
-
-  clear_scrub_reservations();
-  m_pg->publish_stats_to_osd();  // NOTE(review): second publish in this function - possibly redundant; confirm
-
-  requeue_waiting();
-
-  reset_internal_state();
-  m_flags = scrub_flags_t{};  // all flags back to their defaults
-
-  // type-specific state clear
-  _scrub_clear_state();
-}
-
-// clears the scrub state and resets the FSM. Uses process_event(), so must be invoked externally (see clear_pgscrub_state() for the variant that does not access the FSM)
-void PgScrubber::scrub_clear_state()
-{
-  dout(10) << __func__ << dendl;
-
-  clear_pgscrub_state();
-  m_fsm->process_event(FullReset{});  // only after the scrubber's own state was cleared
-}
-
-/*
- * Clears the scrub-related PG state flags (including PG_STATE_REPAIR), drops
- * the scrub reservations, publishes the PG stats, requeues whatever was
- * waiting, and resets the scrubber's internal state and flags.
- *
- * note: does not access the state-machine
- */
-void PgScrubber::clear_pgscrub_state()
-{
-  dout(10) << __func__ << dendl;
-  ceph_assert(m_pg->is_locked());  // must be called with the PG locked
-
-  state_clear(PG_STATE_SCRUBBING);
-  state_clear(PG_STATE_DEEP_SCRUB);
-
-  state_clear(PG_STATE_REPAIR);
-
-  clear_scrub_reservations();
-  m_pg->publish_stats_to_osd();
-
-  requeue_waiting();
-
-  reset_internal_state();
-  m_flags = scrub_flags_t{};  // all flags back to their defaults
-
-  // type-specific state clear
-  _scrub_clear_state();
-}
-
-void PgScrubber::replica_handling_done()  // clears the scrubbing state flags & the internal state
-{
-  dout(10) << __func__ << dendl;
-
-  state_clear(PG_STATE_SCRUBBING);
-  state_clear(PG_STATE_DEEP_SCRUB);
-
-  reset_internal_state();  // note: m_flags and the reservations are not touched here
-
-  m_pg->publish_stats_to_osd();
-}
-
-/*
- * note: performs run_callbacks()
- * note: reservations-related variables are not reset here
- */
-void PgScrubber::reset_internal_state()
-{
-  dout(10) << __func__ << dendl;
-
-  preemption_data.reset();
-  m_maps_status.reset();
-  m_received_maps.clear();
-
-  m_start = hobject_t{};
-  m_end = hobject_t{};
-  m_max_end = hobject_t{};
-  m_subset_last_update = eversion_t{};
-  m_shallow_errors = 0;
-  m_deep_errors = 0;
-  m_fixed_count = 0;
-  m_omap_stats = (const struct omap_stat_t){0};
-
-  run_callbacks();
-
-  m_inconsistent.clear();
-  m_missing.clear();
-  m_authoritative.clear();
-  num_digest_updates_pending = 0;
-  m_primary_scrubmap = ScrubMap{};
-  m_primary_scrubmap_pos.reset();
-  replica_scrubmap = ScrubMap{};
-  replica_scrubmap_pos.reset();
-  m_cleaned_meta_map = ScrubMap{};
-  m_needs_sleep = true;
-  m_sleep_started_at = utime_t{};
-
-  m_active = false;
-}
-
-// Replica only: move on to the next 'current token', clearing any scrub state
-// that might still be around.
-void PgScrubber::advance_token()
-{
-  dout(10) << __func__ << " was: " << m_current_token << dendl;
-  ++m_current_token;
-
-  // No scrubbing is expected to be in progress when we are called. Still - we
-  // do not rely on that: if a stale request is being handled, the following
-  // clears both our internal state and the state of the FSM.
-  replica_handling_done();
-  m_fsm->process_event(FullReset{});
-}
-
-/**
- * Is the token carried by a request still relevant?
- *
- * \returns true if the received token is either 0 or equal to our current
- *          token. Otherwise the mismatch is logged, and 'false' is returned.
- */
-bool PgScrubber::is_token_current(Scrub::act_token_t received_token)
-{
-  if (received_token == 0 || received_token == m_current_token) {
-    return true;
-  }
-
-  // the parenthesis opened before the received token is closed after the current one
-  dout(5) << __func__ << " obsolete token (" << received_token
-          << " vs current " << m_current_token << ")" << dendl;
-
-  return false;
-}
-
-const OSDMapRef& PgScrubber::get_osdmap() const  // forwards to the PG's get_osdmap()
-{
-  return m_pg->get_osdmap();
-}
-
-ostream& operator<<(ostream& out, const PgScrubber& scrubber)  // prints the scrubber's flags only
-{
-  return out << scrubber.m_flags;
-}
-
-/// prints " [ <pg-id>: <scrub flags> ] " into the stream
-ostream& PgScrubber::show(ostream& out) const
-{
-  out << " [ " << m_pg_id << ": " << m_flags << " ] ";
-  return out;
-}
-
-int PgScrubber::asok_debug(std::string_view cmd,
-                          std::string param,
-                          Formatter* f,
-                          stringstream& ss)
-{
-  dout(10) << __func__ << " cmd: " << cmd << " param: " << param << dendl;
-
-  if (cmd == "block") {
-    // set a flag that will cause the next 'select_range' to report a blocked object
-    m_debug_blockrange = 1;
-  } else if (cmd == "unblock") {
-    // send an 'unblock' event, as if a blocked range was freed
-    m_debug_blockrange = 0;
-    m_fsm->process_event(Unblocked{});
-  }
-  return 0;
-}
-// ///////////////////// preemption_data_t //////////////////////////////////
-
-/// the number of preemptions left is initialized from 'osd_scrub_max_preemptions'
-PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg}
-{
-  const auto max_preemptions =
-    m_pg->get_cct()->_conf.get_val<uint64_t>("osd_scrub_max_preemptions");
-  m_left = static_cast<int>(max_preemptions);
-}
-
-/**
- * Under the preemption lock: not preemptable, not preempted, a size divisor
- * of 1, and a preemptions budget re-read from 'osd_scrub_max_preemptions'.
- */
-void PgScrubber::preemption_data_t::reset()
-{
-  std::lock_guard<std::mutex> guard{m_preemption_lock};
-
-  m_preemptable = false;
-  m_preempted = false;
-
-  const auto max_preemptions =
-    m_pg->cct->_conf.get_val<uint64_t>("osd_scrub_max_preemptions");
-  m_left = static_cast<int>(max_preemptions);
-
-  m_size_divisor = 1;
-}
-
-
-// ///////////////////// ReplicaReservations //////////////////////////////////
-namespace Scrub {
-
-/// send a 'release the scrub reservation' message to the named peer
-void ReplicaReservations::release_replica(pg_shard_t peer, epoch_t epoch)
-{
-  const spg_t peer_pgid(m_pg_info.pgid.pgid, peer.shard);
-  auto release_msg =
-    new MOSDScrubReserve(peer_pgid, epoch, MOSDScrubReserve::RELEASE, m_pg->pg_whoami);
-
-  m_osds->send_message_osd_cluster(peer.osd, release_msg, epoch);
-}
-
-ReplicaReservations::ReplicaReservations(PG* pg, pg_shard_t whoami)
-    : m_pg{pg}
-    , m_acting_set{pg->get_actingset()}
-    , m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
-    , m_pending{static_cast<int>(m_acting_set.size()) - 1}
-    , m_pg_info{m_pg->get_pg_info(ScrubberPasskey())}
-{
-  epoch_t epoch = m_pg->get_osdmap_epoch();
-
-  // handle the special case of no replicas
-  if (m_pending <= 0) {
-    // just signal the scrub state-machine to continue
-    send_all_done();
-
-  } else {
-
-    for (auto p : m_acting_set) {
-      if (p == whoami)
-       continue;
-      auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, p.shard), epoch,
-                                   MOSDScrubReserve::REQUEST, m_pg->pg_whoami);
-      m_osds->send_message_osd_cluster(p.osd, m, epoch);
-      m_waited_for_peers.push_back(p);
-      dout(10) << __func__ << " <ReplicaReservations> reserve<-> " << p.osd << dendl;
-    }
-  }
-}
-
-void ReplicaReservations::send_all_done()  // all replicas granted: queue the 'scrub granted' event
-{
-  m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
-}
-
-void ReplicaReservations::send_reject()  // a replica refused: queue the 'scrub denied' event
-{
-  m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
-}
-
-void ReplicaReservations::discard_all()  // forget all reservations; no message is sent to the peers
-{
-  dout(10) << __func__ << " " << m_reserved_peers << dendl;
-
-  m_had_rejections = true;  // preventing late-coming responses from triggering events
-  m_reserved_peers.clear();
-  m_waited_for_peers.clear();  // emptied lists: the destructor will have no one to send a release to
-}
-
-/**
- * Releases the replicas: an 'un-reserve' message is sent to each replica that
- * had granted our request, and then to each replica that had not answered yet.
- * No answer is waited for (there wouldn't be one). Other incoming messages
- * will be discarded on the way, by our owner.
- */
-ReplicaReservations::~ReplicaReservations()
-{
-  m_had_rejections = true;  // preventing late-coming responses from triggering events
-
-  const epoch_t epoch = m_pg->get_osdmap_epoch();
-
-  // send a release message to all members of the list, then empty it
-  auto release_all = [this, epoch](std::vector<pg_shard_t>& peers) {
-    for (const auto& peer : peers) {
-      release_replica(peer, epoch);
-    }
-    peers.clear();
-  };
-
-  release_all(m_reserved_peers);
-
-  // note: the release will follow on the heels of the request. When tried otherwise,
-  // grants that followed a reject arrived after the whole scrub machine-state was
-  // reset, causing leaked reservations.
-  release_all(m_waited_for_peers);
-}
-
-/**
- *  @ATTN we would not reach here if the ReplicaReservation object managed by the
- * scrubber was reset.
- */
-void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
-{
-  dout(10) << __func__ << " <ReplicaReservations> granted-> " << from << dendl;
-  op->mark_started();
-
-  {
-    // reduce the amount of extra release messages. Not a must, but the log is cleaner
-    auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from);
-    if (w != m_waited_for_peers.end())
-      m_waited_for_peers.erase(w);
-  }
-
-  // are we forced to reject the reservation?
-  if (m_had_rejections) {
-
-    dout(10) << " rejecting late-coming reservation from " << from << dendl;
-    release_replica(from, m_pg->get_osdmap_epoch());
-
-  } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
-            m_reserved_peers.end()) {
-
-    dout(10) << " already had osd." << from << " reserved" << dendl;
-
-  } else {
-
-    dout(10) << " osd." << from << " scrub reserve = success" << dendl;
-    m_reserved_peers.push_back(from);
-    if (--m_pending == 0) {
-      send_all_done();
-    }
-  }
-}
-
-void ReplicaReservations::handle_reserve_reject(OpRequestRef op, pg_shard_t from)
-{
-  dout(10) << __func__ << " <ReplicaReservations> rejected-> " << from << dendl;
-  dout(10) << __func__ << " " << *op->get_req() << dendl;
-  op->mark_started();
-
-  {
-    // reduce the amount of extra release messages. Not a must, but the log is cleaner
-    auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from);
-    if (w != m_waited_for_peers.end())
-      m_waited_for_peers.erase(w);
-  }
-
-  if (m_had_rejections) {
-
-    // our failure was already handled when the first rejection arrived
-    dout(15) << " ignoring late-coming rejection from " << from << dendl;
-
-  } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
-            m_reserved_peers.end()) {
-
-    dout(10) << " already had osd." << from << " reserved" << dendl;
-
-  } else {
-
-    dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
-    m_had_rejections = true;  // preventing any additional notifications
-    send_reject();
-  }
-}
-
-
-// ///////////////////// LocalReservation //////////////////////////////////
-
-/**
- * Tries to reserve the local OSD's scrub resources.
- * A failure is only signalled by m_holding_local_reservation remaining unset.
- */
-LocalReservation::LocalReservation(PG* pg, OSDService* osds)
-    : m_pg{pg} // holding the "whole PG" for dout() sake
-    , m_osds{osds}
-{
-  if (m_osds->inc_scrubs_local()) {
-    dout(20) << __func__ << ": local OSD scrub resources reserved" << dendl;
-    m_holding_local_reservation = true;
-  } else {
-    dout(10) << __func__ << ": failed to reserve locally " << dendl;
-  }
-}
-
-/// gives back the local scrub resources - but only if we were holding them
-LocalReservation::~LocalReservation()
-{
-  if (!m_holding_local_reservation) {
-    return;
-  }
-
-  m_holding_local_reservation = false;
-  m_osds->dec_scrubs_local();
-}
-
-
-// ///////////////////// ReservedByRemotePrimary ///////////////////////////////
-
-/**
- * Tries to reserve the OSD's 'remote' scrub resources, remembering the epoch
- * of the request.
- * A failure is only signalled by m_reserved_by_remote_primary remaining unset.
- */
-ReservedByRemotePrimary::ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch)
-    : m_pg{pg}, m_osds{osds}, m_reserved_at{epoch}
-{
-  if (m_osds->inc_scrubs_remote()) {
-    dout(20) << __func__ << ": scrub resources reserved at Primary request" << dendl;
-    m_reserved_by_remote_primary = true;
-  } else {
-    dout(10) << __func__ << ": failed to reserve at Primary request" << dendl;
-  }
-}
-
-bool ReservedByRemotePrimary::is_stale() const  // true if reserved before the PG's 'same interval since' epoch
-{
-  return m_reserved_at < m_pg->get_same_interval_since();
-}
-
-/// gives back the 'remote' scrub resources - but only if we were holding them
-ReservedByRemotePrimary::~ReservedByRemotePrimary()
-{
-  if (!m_reserved_by_remote_primary) {
-    return;
-  }
-
-  m_reserved_by_remote_primary = false;
-  m_osds->dec_scrubs_remote();
-}
-
-// ///////////////////// MapsCollectionStatus ////////////////////////////////
-
-/**
- * A scrub map has arrived from a replica. If we were indeed waiting for that
- * replica - it is removed from the 'awaited' list.
- *
- * \returns {true, ""} if the map was expected. Otherwise: 'false' and an
- *          error text.
- */
-auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from)
-  -> std::tuple<bool, std::string_view>
-{
-  const auto awaited =
-    std::find(m_maps_awaited_for.begin(), m_maps_awaited_for.end(), from);
-
-  if (awaited == m_maps_awaited_for.end()) {
-    return std::tuple{false, " unsolicited scrub-map"sv};
-  }
-
-  // we are indeed waiting for a map from this replica
-  m_maps_awaited_for.erase(awaited);
-  return std::tuple{true, ""sv};
-}
-
-void MapsCollectionStatus::reset()  // back to the default state: local map not ready, no replica awaited
-{
-  *this = MapsCollectionStatus{};
-}
-
-/// \returns the OSDs we are still waiting for, each followed by a space
-std::string MapsCollectionStatus::dump() const
-{
-  std::string awaited_osds;
-  for (const auto& shard : m_maps_awaited_for) {
-    awaited_osds += shard.get_osd() + " "s;
-  }
-  return awaited_osds;
-}
-
-/// prints the awaited OSDs (and " local ", if our own map is not ready yet), in brackets
-ostream& operator<<(ostream& out, const MapsCollectionStatus& sf)
-{
-  out << " [ ";
-
-  for (const auto& shard : sf.m_maps_awaited_for) {
-    out << shard.get_osd() << " ";
-  }
-
-  if (!sf.m_local_map_ready) {
-    out << " local ";
-  }
-
-  out << " ] ";
-  return out;
-}
-
-// ///////////////////// blocked_range_t ///////////////////////////////
-
-blocked_range_t::blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id)
-    : m_osds{osds}
-{
-  auto now_is = std::chrono::system_clock::now();
-  m_callbk = new LambdaContext([now_is, pg_id, osds]([[maybe_unused]] int r) {
-    std::time_t now_c = std::chrono::system_clock::to_time_t(now_is);
-    char buf[50];
-    strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", std::localtime(&now_c));
-    lgeneric_subdout(g_ceph_context, osd, 10)
-      << "PgScrubber: " << pg_id << " blocked on an object for too long (since " << buf
-      << ")" << dendl;
-    osds->clog->warn() << "osd." << osds->whoami << " PgScrubber: " << pg_id << " blocked on an object for too long (since " << buf << ")";
-    return;
-  });
-
-  std::lock_guard l(m_osds->sleep_lock);
-  m_osds->sleep_timer.add_event_after(waittime, m_callbk);
-}
-
-blocked_range_t::~blocked_range_t()  // cancels the 'blocked for too long' timer event
-{
-  std::lock_guard l(m_osds->sleep_lock);  // same lock as used when the event was added
-  m_osds->sleep_timer.cancel_event(m_callbk);
-}
-
-}  // namespace Scrub
diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h
deleted file mode 100644 (file)
index e02b173..0000000
+++ /dev/null
@@ -1,800 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#pragma once
-
-#include <cassert>
-#include <chrono>
-#include <memory>
-#include <mutex>
-#include <optional>
-#include <string>
-#include <string_view>
-#include <vector>
-
-#include "PG.h"
-#include "ScrubStore.h"
-#include "scrub_machine_lstnr.h"
-#include "scrubber_common.h"
-
-class Callback;
-
-namespace Scrub {
-class ScrubMachine;
-struct BuildMap;
-
-/**
- * Reserving/freeing scrub resources at the replicas.
- *
- *  When constructed - sends reservation requests to the acting_set.
- *  A rejection triggers a "couldn't acquire the replicas' scrub resources" event.
- *  All previous requests, whether already granted or not, are explicitly released.
- *
- *  A note re performance: I've measured a few container alternatives for
- *  m_reserved_peers, with its specific usage pattern. Std::set is extremely slow, as
- *  expected. flat_set is only slightly better. Surprisingly - std::vector (with no
- *  sorting) is better than boost::small_vec. And for std::vector: no need to pre-reserve.
- */
-class ReplicaReservations {
-  using OrigSet = decltype(std::declval<PG>().get_actingset());
-
-  PG* m_pg;
-  OrigSet m_acting_set;
-  OSDService* m_osds;
-  std::vector<pg_shard_t> m_waited_for_peers;  ///< requests sent, no response yet
-  std::vector<pg_shard_t> m_reserved_peers;  ///< the replicas that granted our request
-  bool m_had_rejections{false};  ///< once set - late responses do not trigger events
-  int m_pending{-1};  ///< number of grants still awaited (initialized from m_acting_set, which must be declared before it)
-  const pg_info_t& m_pg_info;
-
-  void release_replica(pg_shard_t peer, epoch_t epoch);
-
-  void send_all_done();         ///< all reservations are granted
-
-  /// notify the scrubber that we have failed to reserve replicas' resources
-  void send_reject();
-
- public:
-  /**
-   *  quietly discard all knowledge about existing reservations. No messages
-   *  are sent to peers.
-   *  To be used upon interval change, as we know that the running scrub is no longer
-   *  relevant, and that the replicas had reset the reservations on their side.
-   */
-  void discard_all();
-
-  ReplicaReservations(PG* pg, pg_shard_t whoami);  ///< sends the reservation requests
-
-  ~ReplicaReservations();  ///< sends release messages to the reserved & the waited-for replicas
-
-  void handle_reserve_grant(OpRequestRef op, pg_shard_t from);
-
-  void handle_reserve_reject(OpRequestRef op, pg_shard_t from);
-};
-
-/**
- *  wraps the local OSD scrub resource reservation in an RAII wrapper
- *
- *  The reservation is attempted by the constructor (check is_reserved() for
- *  the result), and is released by the destructor. As a copy would release
- *  the same reservation twice, the object is neither copyable nor movable.
- */
-class LocalReservation {
-  PG* m_pg;
-  OSDService* m_osds;
-  bool m_holding_local_reservation{false};
-
- public:
-  LocalReservation(PG* pg, OSDService* osds);
-  ~LocalReservation();
-
-  LocalReservation(const LocalReservation&) = delete;
-  LocalReservation& operator=(const LocalReservation&) = delete;
-
-  /// were the local resources successfully reserved?
-  [[nodiscard]] bool is_reserved() const { return m_holding_local_reservation; }
-};
-
-/**
- *  wraps the OSD resource we are using when reserved as a replica by a scrubbing master.
- *
- *  The reservation is attempted by the constructor (check is_reserved() for
- *  the result), and is released by the destructor. As a copy would release
- *  the same reservation twice, the object is neither copyable nor movable.
- */
-class ReservedByRemotePrimary {
-  PG* m_pg;
-  OSDService* m_osds;
-  bool m_reserved_by_remote_primary{false};
-  const epoch_t m_reserved_at;  ///< the epoch handed to the constructor
-
- public:
-  ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch);
-  ~ReservedByRemotePrimary();
-
-  ReservedByRemotePrimary(const ReservedByRemotePrimary&) = delete;
-  ReservedByRemotePrimary& operator=(const ReservedByRemotePrimary&) = delete;
-
-  /// were the resources successfully reserved?
-  [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; }
-
-  /// compare the remembered reserved-at epoch to the current interval
-  [[nodiscard]] bool is_stale() const;
-};
-
-/**
- * Once all replicas' scrub maps are received, we go on to compare the maps. That is -
- * unless we we have not yet completed building our own scrub map. MapsCollectionStatus
- * combines the status of waiting for both the local map and the replicas, without
- * resorting to adding dummy entries into a list.
- */
-class MapsCollectionStatus {
-
-  bool m_local_map_ready{false};
-  std::vector<pg_shard_t> m_maps_awaited_for;
-
- public:
-  [[nodiscard]] bool are_all_maps_available() const
-  {
-    return m_local_map_ready && m_maps_awaited_for.empty();
-  }
-
-  void mark_local_map_ready() { m_local_map_ready = true; }
-
-  void mark_replica_map_request(pg_shard_t from_whom)
-  {
-    m_maps_awaited_for.push_back(from_whom);
-  }
-
-  /// @returns true if indeed waiting for this one. Otherwise: an error string
-  auto mark_arriving_map(pg_shard_t from) -> std::tuple<bool, std::string_view>;
-
-  std::vector<pg_shard_t> get_awaited() const { return m_maps_awaited_for; }
-
-  void reset();
-
-  std::string dump() const;
-
-  friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf);
-};
-
-
-}  // namespace Scrub
-
-
-/**
- * the scrub operation flags. Primary only.
- * Set at scrub start. Checked in multiple locations - mostly
- * at finish.
- * All the flags default to 'false' (and the priority - to 0).
- */
-struct scrub_flags_t {
-
-  unsigned int priority{0};  // reported as "priority" by PgScrubber::dump()
-
-  /**
-   * set by queue_scrub() if either planned_scrub.auto_repair or
-   * need_auto were set.
-   * Tested at scrub end.
-   */
-  bool auto_repair{false};
-
-  /// this flag indicates that we are scrubbing post repair to verify everything is fixed
-  bool check_repair{false};
-
-  /// checked at the end of the scrub, to possibly initiate a deep-scrub
-  bool deep_scrub_on_error{false};
-
-  /**
-   * scrub must not be aborted.
-   * Set for explicitly requested scrubs, and for scrubs originated by the pairing
-   * process with the 'repair' flag set (in the RequestScrub event).
-   * NOTE(review): 'pairing' above probably should read 'peering' - confirm.
-   */
-  bool required{false};
-};
-
-ostream& operator<<(ostream& out, const scrub_flags_t& sf);
-
-
-/**
- * The part of PG-scrubbing code that isn't state-machine wiring.
- *
- * Why the separation? I wish to move to a different FSM implementation. Thus I
- * am forced to strongly decouple the state-machine implementation details from
- * the actual scrubbing code.
- */
-class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
-
- public:
-  explicit PgScrubber(PG* pg);
-
-  //  ------------------  the I/F exposed to the PG (ScrubPgIF) -------------
-
-  /// are we waiting for resource reservation grants from our replicas?
-  [[nodiscard]] bool is_reserving() const final;
-
-  void initiate_regular_scrub(epoch_t epoch_queued) final;
-
-  void initiate_scrub_after_repair(epoch_t epoch_queued) final;
-
-  void send_scrub_resched(epoch_t epoch_queued) final;
-
-  void active_pushes_notification(epoch_t epoch_queued) final;
-
-  void update_applied_notification(epoch_t epoch_queued) final;
-
-  void send_scrub_unblock(epoch_t epoch_queued) final;
-
-  void digest_update_notification(epoch_t epoch_queued) final;
-
-  void send_replica_maps_ready(epoch_t epoch_queued) final;
-
-  void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
-
-  void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
-
-  void send_replica_pushes_upd(epoch_t epoch_queued) final;
-  /**
-   *  The PG has updated its 'applied version'. It might be that we are waiting for this
-   *  information: after selecting a range of objects to scrub, we've marked the latest
-   *  version of these objects in m_subset_last_update. We will not start the map building
-   *  before we know that the PG has reached this version.
-   */
-  void on_applied_when_primary(const eversion_t& applied_version) final;
-
-  void send_full_reset(epoch_t epoch_queued) final;
-
-  void send_chunk_free(epoch_t epoch_queued) final;
-
-  void send_chunk_busy(epoch_t epoch_queued) final;
-
-  void send_local_map_done(epoch_t epoch_queued) final;
-
-  void send_maps_compared(epoch_t epoch_queued) final;
-
-  void send_get_next_chunk(epoch_t epoch_queued) final;
-
-  void send_scrub_is_finished(epoch_t epoch_queued) final;
-
-  /**
-   *  we allow some number of preemptions of the scrub, which mean we do
-   *  not block.  Then we start to block.  Once we start blocking, we do
-   *  not stop until the scrub range is completed.
-   */
-  bool write_blocked_by_scrub(const hobject_t& soid) final;
-
-  /// true if the given range intersects the scrub interval in any way
-  bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;
-
-  /**
-   *  we are a replica being asked by the Primary to reserve OSD resources for
-   *  scrubbing
-   */
-  void handle_scrub_reserve_request(OpRequestRef op) final;
-
-  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
-  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
-  void handle_scrub_reserve_release(OpRequestRef op) final;
-  void discard_replica_reservations() final;
-  void clear_scrub_reservations() final;  // PG::clear... fwds to here
-  void unreserve_replicas() final;
-
-  // managing scrub op registration
-
-  void reg_next_scrub(const requested_scrub_t& request_flags) final;
-
-  void unreg_next_scrub() final;
-
-  void scrub_requested(scrub_level_t scrub_level,
-                      scrub_type_t scrub_type,
-                      requested_scrub_t& req_flags) final;
-
-  /**
-   * Reserve local scrub resources (managed by the OSD)
-   *
-   * Fails if OSD's local-scrubs budget was exhausted
-   * \returns were local resources reserved?
-   */
-  bool reserve_local() final;
-
-  void handle_query_state(ceph::Formatter* f) final;
-
-  void dump(ceph::Formatter* f) const override;
-
-  // used if we are a replica
-
-  void replica_scrub_op(OpRequestRef op) final;
-
-  /// the op priority, taken from the primary's request message
-  Scrub::scrub_prio_t replica_op_priority() const final
-  {
-    return m_replica_request_priority;
-  };
-
-  unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
-                                     unsigned int suggested_priority) const final;
-  /// the version that refers to m_flags.priority
-  unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final;
-
-  void add_callback(Context* context) final { m_callbacks.push_back(context); }
-
-  [[nodiscard]] bool are_callbacks_pending() const final  // used for an assert in PG.cc
-  {
-    return !m_callbacks.empty();
-  }
-
-  /// handle a message carrying a replica map
-  void map_from_replica(OpRequestRef op) final;
-
-  void scrub_clear_state() final;
-
-  /**
-   *  add to scrub statistics, but only if the soid is below the scrub start
-   *
-   *  Note: the implementation given here does none of that - it unconditionally
-   *  fails a ceph_assert(). Presumably a derived scrubber is expected to provide
-   *  the real implementation (TODO confirm against the derived classes).
-   */
-  virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
-                                       const hobject_t& soid) override
-  {
-    ceph_assert(false);
-  }
-
-  /**
-   * finalize the parameters of the initiated scrubbing session:
-   *
-   * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set;
-   * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set.
-   */
-  void set_op_parameters(requested_scrub_t& request) final;
-
-  void cleanup_store(ObjectStore::Transaction* t) final;
-
-  bool get_store_errors(const scrub_ls_arg_t& arg,
-                       scrub_ls_result_t& res_inout) const override
-  {
-    return false;  // this implementation reports nothing: 'arg' & 'res_inout' are not touched
-  }
-
-  int asok_debug(std::string_view cmd,
-                std::string param,
-                Formatter* f,
-                std::stringstream& ss) override;
-  int m_debug_blockrange{0};
-
-  // -------------------------------------------------------------------------------------------
-  // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener)
-
-  [[nodiscard]] bool is_primary() const final { return m_pg->recovery_state.is_primary(); }
-
-  void select_range_n_notify() final;
-
-  Scrub::BlockedRangeWarning acquire_blocked_alarm() final;
-
-  /// walk the log to find the latest update that affects our chunk
-  eversion_t search_log_for_updates() const final;
-
-  eversion_t get_last_update_applied() const final
-  {
-    return m_pg->recovery_state.get_last_update_applied();
-  }
-
-  int pending_active_pushes() const final { return m_pg->active_pushes; }
-
-  void on_init() final;
-  void on_replica_init() final;
-  void replica_handling_done() final;
-
-  /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
-  /// (thus can be called from FSM reactions)
-  void clear_pgscrub_state() final;
-
-  /*
-   * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_needs_sleep'
-   * is asserted - after a configuration-dependent timeout.
-   */
-  void add_delayed_scheduling() final;
-
-  void get_replicas_maps(bool replica_can_preempt) final;
-
-  void on_digest_updates() final;
-
-  ScrubMachineListener::MsgAndEpoch
-  prep_replica_map_msg(Scrub::PreemptionNoted was_preempted) final;
-
-  void send_replica_map(const ScrubMachineListener::MsgAndEpoch& preprepared) final;
-
-  void send_preempted_replica() final;
-
-  void send_remotes_reserved(epoch_t epoch_queued) final;
-  void send_reservation_failure(epoch_t epoch_queued) final;
-
-  /**
-   *  does the PG have newer updates than what we (the scrubber) know?
-   */
-  [[nodiscard]] bool has_pg_marked_new_updates() const final;
-
-  void set_subset_last_update(eversion_t e) final;
-
-  void maps_compare_n_cleanup() final;
-
-  Scrub::preemption_t& get_preemptor() final;
-
-  int build_primary_map_chunk() final;
-
-  int build_replica_map_chunk() final;
-
-  void reserve_replicas() final;
-
-  [[nodiscard]] bool was_epoch_changed() const final;
-
-  void mark_local_map_ready() final;
-
-  [[nodiscard]] bool are_all_maps_available() const final;
-
-  std::string dump_awaited_maps() const final;
-
- protected:
-  bool state_test(uint64_t m) const { return m_pg->state_test(m); }
-  void state_set(uint64_t m) { m_pg->state_set(m); }
-  void state_clear(uint64_t m) { m_pg->state_clear(m); }
-
-  [[nodiscard]] bool is_scrub_registered() const;
-
-  virtual void _scrub_clear_state() {}
-
-  utime_t m_scrub_reg_stamp;  ///< stamp we registered for
-
-  ostream& show(ostream& out) const override;
-
- public:
-  // -------------------------------------------------------------------------------------------
-
-  friend ostream& operator<<(ostream& out, const PgScrubber& scrubber);
-
-  static utime_t scrub_must_stamp() { return utime_t(1, 1); }
-
-  virtual ~PgScrubber();  // must be defined separately, in the .cc file
-
-  [[nodiscard]] bool is_scrub_active() const final { return m_active; }
-
- private:
-  void reset_internal_state();
-
-  /**
-   *  the current scrubbing operation is done. We should mark that fact, so that
-   *  all events related to the previous operation can be discarded.
-   */
-  void advance_token();
-
-  bool is_token_current(Scrub::act_token_t received_token);
-
-  void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
-
-  void _scan_snaps(ScrubMap& smap);
-
-  ScrubMap clean_meta_map();
-
-  /**
-   *  mark down some parameters of the initiated scrub:
-   *  - the epoch when started;
-   *  - the depth of the scrub requested (from the PG_STATE variable)
-   */
-  void reset_epoch(epoch_t epoch_queued);
-
-  void run_callbacks();
-
-  // -----     methods used to verify the relevance of incoming events:
-
-  /**
-   *  is the incoming event still relevant, and should be processed?
-   *
-   *  It isn't if:
-   *  - (1) we are no longer 'actively scrubbing'; or
-   *  - (2) the message is from an epoch prior to when we started the current scrub
-   * session; or
-   *  - (3) the message epoch is from a previous interval; or
-   *  - (4) the 'abort' configuration flags were set.
-   *
-   *  For (1) & (2) - the incoming message is discarded, w/o further action.
-   *
-   *  For (3): (see check_interval() for a full description) if we have not reacted yet
-   *  to this specific new interval, we do now:
-   *  - replica reservations are silently discarded (we count on the replicas to notice
-   *        the interval change and un-reserve themselves);
-   *  - the scrubbing is halted.
-   *
-   *  For (4): the message will be discarded, but also:
-   *    if this is the first time we've noticed the 'abort' request, we perform the abort.
-   *
-   *  \returns should the incoming event be processed?
-   */
-  bool is_message_relevant(epoch_t epoch_to_verify);
-
-  /**
-   * check the 'no scrub' configuration options.
-   */
-  [[nodiscard]] bool should_abort() const;
-
-  /**
-   * Check the 'no scrub' configuration flags.
-   *
-   * Reset everything if the abort was not handled before.
-   * @returns false if the message was discarded due to abort flag.
-   */
-  [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify);
-
-  [[nodiscard]] bool check_interval(epoch_t epoch_to_verify);
-
-  epoch_t m_last_aborted{};  // last time we've noticed a request to abort
-
-  /**
-   * return true if any inconsistency/missing is repaired, false otherwise
-   */
-  [[nodiscard]] bool scrub_process_inconsistent();
-
-  void scrub_compare_maps();
-
-  bool m_needs_sleep{true};  ///< should we sleep before being rescheduled? always
-                            ///< 'true', unless we just got out of a sleep period
-
-  utime_t m_sleep_started_at;
-
-
-  // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed'
-  // to guarantee un-reserving when deleted.
-  std::optional<Scrub::ReplicaReservations> m_reservations;
-  std::optional<Scrub::LocalReservation> m_local_osd_resource;
-
-  /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing
-  std::optional<Scrub::ReservedByRemotePrimary> m_remote_osd_resource;
-
-  void cleanup_on_finish();  // scrub_clear_state() as called for a Primary when
-                            // Active->NotActive
-
-  /// the part that actually finalizes a scrub
-  void scrub_finish();
-
- protected:
-  PG* const m_pg;
-
-  /**
-   * the derivative-specific scrub-finishing touches:
-   */
-  virtual void _scrub_finish() {}
-
-  /**
-   * Validate consistency of the object info and snap sets.
-   */
-  virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest)
-  {}
-
-  // common code used by build_primary_map_chunk() and build_replica_map_chunk():
-  int build_scrub_map_chunk(ScrubMap& map,  // primary or replica?
-                           ScrubMapBuilder& pos,
-                           hobject_t start,
-                           hobject_t end,
-                           bool deep);
-
-  std::unique_ptr<Scrub::ScrubMachine> m_fsm;
-  const spg_t m_pg_id; ///< a local copy of m_pg->pg_id
-  OSDService* const m_osds;
-  const pg_shard_t m_pg_whoami;         ///< a local copy of m_pg->pg_whoami;
-
-  epoch_t m_interval_start{0};  ///< interval's 'from' of when scrubbing was first scheduled
-  /*
-   * the exact epoch when the scrubbing actually started (started here - cleared checks
-   *  for no-scrub conf). Incoming events are verified against this, with stale events
-   *  discarded.
-   */
-  epoch_t m_epoch_start{0};  ///< the actual epoch when scrubbing started
-
-  /**
-   *  (replica) a tag identifying a specific scrub "session". Incremented whenever the
-   *  Primary releases the replica scrub resources.
-   *  When the scrub session is terminated (even if the interval remains unchanged, as
-   *  might happen following an asok no-scrub command), stale scrub-resched messages
-   *  triggered by the backend will be discarded.
-   */
-  Scrub::act_token_t m_current_token{1};
-
-  scrub_flags_t m_flags;
-
-  bool m_active{false};
-
-  eversion_t m_subset_last_update{};
-
-  std::unique_ptr<Scrub::Store> m_store;
-
-  int num_digest_updates_pending{0};
-  hobject_t m_start, m_end;  ///< note: half-closed: [start,end)
-
-  /// Returns reference to current osdmap
-  const OSDMapRef& get_osdmap() const;
-
-  /// Returns epoch of current osdmap
-  epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); }
-
-  CephContext* get_pg_cct() const { return m_pg->cct; }
-
-  // collected statistics
-  int m_shallow_errors{0};
-  int m_deep_errors{0};
-  int m_fixed_count{0};
-
-  /// Maps from objects with errors to missing peers
-  HobjToShardSetMapping m_missing;
-
- protected:
-  /**
-   * 'm_is_deep' - is the running scrub a deep one?
-   *
-   * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is
-   * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is
-   * meaningful both for the primary and the replicas, and is used as a parameter when
-   * building the scrub maps.
-   */
-  bool m_is_deep{false};
-
-  /**
-   * If set: affects the backend & scrubber-backend functions called after all
-   * scrub maps are available.
-   *
-   * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be
-   * a "user facing" status display only).
-   */
-  bool m_is_repair{false};
-
-  /**
-   * User-readable summary of the scrubber's current mode of operation. Used for
-   * both osd.*.log and the cluster log.
-   * One of:
-   *    "repair",
-   *    "deep-scrub",
-   *    "scrub"
-   *
-   * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. for
-   * auto_repair will show as "deep-scrub" and not as "repair" (until the first error
-   * is detected).
-   */
-  std::string_view m_mode_desc;
-
-  void update_op_mode_text();
-
-private:
-
-  /**
-   * initiate a deep-scrub after the current scrub ended with errors.
-   */
-  void request_rescrubbing(requested_scrub_t& req_flags);
-
-  /*
-   * Select a range of objects to scrub.
-   *
-   * By:
-   * - setting tentative range based on conf and divisor
-   * - requesting a partial list of elements from the backend;
-   * - handling some head/clones issues
-   *
-   * The selected range is set directly into 'm_start' and 'm_end'
-   */
-  bool select_range();
-
-  std::list<Context*> m_callbacks;
-
-  /**
-   * send a replica (un)reservation request to the acting set
-   *
-   * @param opcode - one of MOSDScrubReserve::REQUEST
-   *                  or MOSDScrubReserve::RELEASE
-   */
-  void message_all_replicas(int32_t opcode, std::string_view op_text);
-
-  hobject_t m_max_end; ///< Largest end that may have been sent to replicas
-  ScrubMap m_primary_scrubmap;
-  ScrubMapBuilder m_primary_scrubmap_pos;
-
-  std::map<pg_shard_t, ScrubMap> m_received_maps;
-
-  /// Cleaned map pending snap metadata scrub
-  ScrubMap m_cleaned_meta_map;
-
-  void _request_scrub_map(pg_shard_t replica,
-                         eversion_t version,
-                         hobject_t start,
-                         hobject_t end,
-                         bool deep,
-                         bool allow_preemption);
-
-
-  Scrub::MapsCollectionStatus m_maps_status;
-
-  omap_stat_t m_omap_stats = (const struct omap_stat_t){0};
-
-  /// Maps from objects with errors to inconsistent peers
-  HobjToShardSetMapping m_inconsistent;
-
-  /// Maps from object with errors to good peers
-  std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t>>> m_authoritative;
-
-  // ------------ members used if we are a replica
-
-  epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message
-
-  ScrubMapBuilder replica_scrubmap_pos;
-  ScrubMap replica_scrubmap;
-
-  /**
-   * we mark the request priority as it arrived. It influences the queuing priority
-   * when we wait for local updates
-   */
-  Scrub::scrub_prio_t m_replica_request_priority;
-
-  /**
-   * the 'preemption' "state-machine".
-   * Note: I was considering an orthogonal sub-machine implementation, but as
-   * the state diagram is extremely simple, the added complexity wasn't justified.
-   */
-  class preemption_data_t : public Scrub::preemption_t {  // the 'preemptable' & 'preempted' flags, plus the count of preemptions left
-   public:
-    preemption_data_t(PG* pg); // the PG access is used for conf access (and logs)
-
-    [[nodiscard]] bool is_preemptable() const final { return m_preemptable; }
-
-    bool do_preempt() final  // \returns true only if this call is the one that marked the preemption
-    {
-      if (m_preempted || !m_preemptable)  // an unlocked early-out; m_preemptable is re-tested under the lock
-       return false;
-
-      std::lock_guard<std::mutex> lk{m_preemption_lock};
-      if (!m_preemptable)
-       return false;
-
-      m_preempted = true;
-      return true;
-    }
-
-    /// same as 'do_preempt()' but w/o checks (as once a replica
-    /// was preempted, we cannot continue). Note: the lock is not taken here.
-    void replica_preempted() { m_preempted = true; }
-
-    void enable_preemption()  // allows preemption - but only if some are left, and none is already noted
-    {
-      std::lock_guard<std::mutex> lk{m_preemption_lock};
-      if (are_preemptions_left() && !m_preempted) {
-       m_preemptable = true;
-      }
-    }
-
-    /// used by a replica to set preemptability state according to the Primary's request
-    void force_preemptability(bool is_allowed)
-    {
-      // note: no need to lock for a replica
-      m_preempted = false;
-      m_preemptable = is_allowed;
-    }
-
-    bool disable_and_test() final  // under the lock: forbids further preemptions; \returns was one already noted?
-    {
-      std::lock_guard<std::mutex> lk{m_preemption_lock};
-      m_preemptable = false;
-      return m_preempted;
-    }
-
-    [[nodiscard]] bool was_preempted() const { return m_preempted; }
-
-    [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; }
-
-    void reset();
-
-    void adjust_parameters() final  // recomputes 'm_preemptable' (under the lock)
-    {
-      std::lock_guard<std::mutex> lk{m_preemption_lock};
-
-      if (m_preempted) {
-       m_preempted = false;  // the noted preemption is consumed here
-       m_preemptable = adjust_left();
-      } else {
-       m_preemptable = are_preemptions_left();
-      }
-    }
-
-   private:
-    PG* m_pg;
-    mutable std::mutex m_preemption_lock;
-    bool m_preemptable{false};
-    bool m_preempted{false};
-    int m_left;  // preemptions still allowed. No in-class initializer - presumably set by the ctor / reset() (not shown here)
-    size_t m_size_divisor{1};  // doubled by adjust_left() for each preemption used up
-    bool are_preemptions_left() const { return m_left > 0; }
-
-    bool adjust_left()  // uses up one preemption (if any is left); \returns are there more left?
-    {
-      if (m_left > 0) {
-       --m_left;
-       m_size_divisor *= 2;
-      }
-      return m_left > 0;
-    }
-  };
-
-  preemption_data_t preemption_data;
-};
diff --git a/src/osd/scrub_machine.cc b/src/osd/scrub_machine.cc
deleted file mode 100644 (file)
index edee613..0000000
+++ /dev/null
@@ -1,522 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "scrub_machine.h"
-
-#include <chrono>
-#include <typeinfo>
-
-#include <boost/core/demangle.hpp>
-
-#include "OSD.h"
-#include "OpRequest.h"
-#include "ScrubStore.h"
-#include "scrub_machine_lstnr.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_osd
-#undef dout_prefix
-#define dout_prefix *_dout << " scrubberFSM "
-
-using namespace std::chrono;
-using namespace std::chrono_literals;
-namespace sc = boost::statechart;
-
-#define DECLARE_LOCALS                                           \
-  ScrubMachineListener* scrbr = context<ScrubMachine>().m_scrbr; \
-  std::ignore = scrbr;                                           \
-  auto pg_id = context<ScrubMachine>().m_pg_id;                  \
-  std::ignore = pg_id;  // (the std::ignore assignments presumably silence 'unused variable' warnings)
-
-namespace Scrub {
-
-// --------- trace/debug auxiliaries -------------------------------
-
-void on_event_creation(std::string_view nm)  // called by the MEV()-generated event constructors
-{
-  dout(20) << " event: --vvvv---- " << nm << dendl;  // a level-20 trace line carrying the event's name
-}
-
-void on_event_discard(std::string_view nm)  // called by the MEV()-generated event destructors
-{
-  dout(20) << " event: --^^^^---- " << nm << dendl;  // a level-20 trace line carrying the event's name
-}
-
-void ScrubMachine::my_states() const  // debug aid: logs (level 20) the demangled type name of each state in [state_begin(), state_end())
-{
-  for (auto si = state_begin(); si != state_end(); ++si) {
-    const auto& siw{*si};  // prevents a warning re side-effects
-    dout(20) << " state: " << boost::core::demangle(typeid(siw).name()) << dendl;
-  }
-}
-
-void ScrubMachine::assert_not_active() const
-{
-  ceph_assert(state_cast<const NotActive*>());  // a null result of the pointer state_cast fails the assert
-}
-
-bool ScrubMachine::is_reserving() const
-{
-  return state_cast<const ReservingReplicas*>();  // pointer-to-bool: 'true' iff the cast yields a non-null state pointer
-}
-
-bool ScrubMachine::is_accepting_updates() const
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  ceph_assert(scrbr->is_primary());  // this query is only legal when the scrubber reports being the Primary
-
-  return state_cast<const WaitLastUpdate*>();  // pointer-to-bool: 'true' iff the cast yields a non-null state pointer
-}
-
-// for the rest of the code in this file - we know what PG we are dealing with:
-#undef dout_prefix
-#define dout_prefix _prefix(_dout, this->context<ScrubMachine>().m_pg)
-template <class T> static ostream& _prefix(std::ostream* _dout, T* t)  // the log-line prefix used by the 'dout_prefix' defined just above
-{
-  return t->gen_prefix(*_dout) << " scrubberFSM pg(" << t->pg_id << ") ";  // 't' must supply gen_prefix() and pg_id
-}
-
-// ////////////// the actual actions
-
-// ----------------------- NotActive -----------------------------------------
-
-NotActive::NotActive(my_context ctx) : my_base(ctx)
-{
-  dout(10) << "-- state -->> NotActive" << dendl;  // entering this state only logs; no scrubber call is made
-}
-
-// ----------------------- ReservingReplicas ---------------------------------
-
-ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx)
-{
-  dout(10) << "-- state -->> ReservingReplicas" << dendl;
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  scrbr->reserve_replicas();  // invoked once, on entry into this state
-}
-
-sc::result ReservingReplicas::react(const ReservationFailure&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl;
-
-  // the Scrubber must release all resources and abort the scrubbing
-  scrbr->clear_pgscrub_state();
-  return transit<NotActive>();  // a failed reservation ends this scrub attempt: back to NotActive
-}
-
-/**
- * note: the event poster is handling the scrubber reset
- * (this reaction only logs, and moves the FSM to NotActive)
- */
-sc::result ReservingReplicas::react(const FullReset&)
-{
-  dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl;
-  return transit<NotActive>();
-}
-
-// ----------------------- ActiveScrubbing -----------------------------------
-
-ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx)
-{
-  dout(10) << "-- state -->> ActiveScrubbing" << dendl;
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  scrbr->on_init();  // invoked once, on entry into ActiveScrubbing
-}
-
-/**
- *  upon exiting the Active state
- *  (whatever the reason for leaving ActiveScrubbing - the scrubber is told to
- *  un-reserve the replicas)
- */
-ActiveScrubbing::~ActiveScrubbing()
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(15) << __func__ << dendl;
-  scrbr->unreserve_replicas();
-}
-
-/*
- * The only source of an InternalError event as of now is the BuildMap state,
- * when encountering a backend error.
- * We kill the scrub and reset the FSM.
- * (i.e. - clear_pgscrub_state() is called, and the FSM transitions to NotActive)
- */
-sc::result ActiveScrubbing::react(const InternalError&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << __func__ << dendl;
-  scrbr->clear_pgscrub_state();
-  return transit<NotActive>();
-}
-
-sc::result ActiveScrubbing::react(const FullReset&)
-{
-  dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl;
-  // caller takes care of clearing the scrubber & FSM states
-  return transit<NotActive>();  // note: unlike the InternalError reaction, clear_pgscrub_state() is not called here
-}
-
-// ----------------------- RangeBlocked -----------------------------------
-
-/*
- * Blocked. Will be released by kick_object_context_blocked() (or upon
- * an abort)
- *
- * Note: we are never expected to be waiting for long for a blocked object.
- * Unfortunately we know from experience that a bug elsewhere might result
- * in an indefinite wait in this state, for an object that is never released.
- * If that happens, all we can do is to issue a warning message to help
- * with the debugging.
- *
- * The object returned by acquire_blocked_alarm() is stored in the state's
- * 'm_timeout' member.
- */
-RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx)
-{
-  dout(10) << "-- state -->> Act/RangeBlocked" << dendl;
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-
-  // arrange to have a warning message issued if we are stuck in this
-  // state for longer than some reasonable number of minutes.
-  m_timeout = scrbr->acquire_blocked_alarm();
-}
-
-// ----------------------- PendingTimer -----------------------------------
-
-/**
- *  Sleeping till timer reactivation - or just requeuing
- *  (the choice between the two is made inside the scrubber's
- *  add_delayed_scheduling(), which is the only thing called here)
- */
-PendingTimer::PendingTimer(my_context ctx) : my_base(ctx)
-{
-  dout(10) << "-- state -->> Act/PendingTimer" << dendl;
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-
-  scrbr->add_delayed_scheduling();
-}
-
-// ----------------------- NewChunk -----------------------------------
-
-/**
- *  Preconditions:
- *  - preemption data was set
- *  - epoch start was updated
- *
- *  On entry: the preemption parameters are adjusted, and then the scrubber is
- *  asked to select the next range of objects.
- */
-NewChunk::NewChunk(my_context ctx) : my_base(ctx)
-{
-  dout(10) << "-- state -->> Act/NewChunk" << dendl;
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-
-  scrbr->get_preemptor().adjust_parameters();
-
-  //  choose range to work on
-  //  select_range_n_notify() will signal either SelectedChunkFree or
-  //  ChunkIsBusy. If 'busy', we transition to Blocked, and wait for the
-  //  range to become available.
-  scrbr->select_range_n_notify();
-}
-
-sc::result NewChunk::react(const SelectedChunkFree&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "NewChunk::react(const SelectedChunkFree&)" << dendl;
-
-  scrbr->set_subset_last_update(scrbr->search_log_for_updates());  // the log search result is recorded via set_subset_last_update()
-  return transit<WaitPushes>();
-}
-
-// ----------------------- WaitPushes -----------------------------------
-
-WaitPushes::WaitPushes(my_context ctx) : my_base(ctx)
-{
-  dout(10) << " -- state -->> Act/WaitPushes" << dendl;
-  post_event(ActivePushesUpd{});  // self-posted, so that the number of pending pushes is checked right on entry
-}
-
-/*
- * Triggered externally, by the entity that had an update re pushes
- * (and also posted by the WaitPushes constructor, for an initial check).
- * Moves on to WaitLastUpdate once no active pushes are pending; otherwise
- * the event is discarded, and we remain in this state.
- */
-sc::result WaitPushes::react(const ActivePushesUpd&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "WaitPushes::react(const ActivePushesUpd&) pending_active_pushes: "
-          << scrbr->pending_active_pushes() << dendl;
-
-  if (!scrbr->pending_active_pushes()) {
-    // done waiting
-    return transit<WaitLastUpdate>();
-  }
-
-  return discard_event();
-}
-
-// ----------------------- WaitLastUpdate -----------------------------------
-
-WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx)
-{
-  dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl;
-  post_event(UpdatesApplied{});  // self-posted for an initial check (presumably handled by on_new_updates() - TODO confirm in scrub_machine.h)
-}
-
-/**
- *  Note:
- *  Updates are locally readable immediately. Thus, on the replicas we do need
- *  to wait for the update notifications before scrubbing. For the Primary it's
- *  a bit different: on EC (and only there) rmw operations have an additional
- *  read roundtrip. That means that on the Primary we need to wait for
- *  last_update_applied (the replica side, even on EC, is still safe
- *  since the actual transaction will already be readable by commit time).
- *
- *  If the scrubber reports that the PG has marked new updates, an
- *  InternalAllUpdates event is posted. Otherwise - nothing is done here, other
- *  than logging.
- */
-void WaitLastUpdate::on_new_updates(const UpdatesApplied&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "WaitLastUpdate::on_new_updates(const UpdatesApplied&)" << dendl;
-
-  if (scrbr->has_pg_marked_new_updates()) {
-    post_event(InternalAllUpdates{});
-  } else {
-    // will be requeued by op_applied
-    dout(10) << "wait for EC read/modify/writes to queue" << dendl;
-  }
-}
-
-/*
- *  request maps from the replicas in the acting set
- *  (telling them whether we are currently preemptable), then move to BuildMap
- */
-sc::result WaitLastUpdate::react(const InternalAllUpdates&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl;
-
-  scrbr->get_replicas_maps(scrbr->get_preemptor().is_preemptable());
-  return transit<BuildMap>();
-}
-
-// ----------------------- BuildMap -----------------------------------
-
-BuildMap::BuildMap(my_context ctx) : my_base(ctx)
-{
-  dout(10) << " -- state -->> Act/BuildMap" << dendl;
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-
-  // no need to check for an epoch change, as all possible flows that brought us here have
-  // a check_interval() verification of their final event.
-
-  if (scrbr->get_preemptor().was_preempted()) {
-
-    // we were preempted, either directly or by a replica
-    dout(10) << __func__ << " preempted!!!" << dendl;
-    scrbr->mark_local_map_ready();  // note: marked as 'ready' although no local map is built on this path
-    post_event(IntBmPreempted{});
-
-  } else {
-
-    auto ret = scrbr->build_primary_map_chunk();  // -EINPROGRESS: not done yet; other negatives: an error; otherwise: the map is ready
-
-    if (ret == -EINPROGRESS) {
-      // must wait for the backend to finish. No specific event provided.
-      // build_primary_map_chunk() has already requeued us.
-      dout(20) << "waiting for the backend..." << dendl;
-
-    } else if (ret < 0) {
-
-      dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl;
-      post_event(InternalError{});  // handled by ActiveScrubbing::react(const InternalError&)
-
-    } else {
-
-      // the local map was created
-      post_event(IntLocalMapDone{});
-    }
-  }
-}
-
-sc::result BuildMap::react(const IntLocalMapDone&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "BuildMap::react(const IntLocalMapDone&)" << dendl;
-
-  scrbr->mark_local_map_ready();  // our own map is done. Next: wait for the replicas' maps
-  return transit<WaitReplicas>();
-}
-
-// ----------------------- DrainReplMaps -----------------------------------
-
-DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx)
-{
-  dout(10) << "-- state -->> Act/DrainReplMaps" << dendl;
-  // we may have received all maps already. Send the event that will make us check.
-  post_event(GotReplicas{});  // handled by DrainReplMaps::react(const GotReplicas&)
-}
-
-sc::result DrainReplMaps::react(const GotReplicas&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "DrainReplMaps::react(const GotReplicas&)" << dendl;
-
-  if (scrbr->are_all_maps_available()) {
-    // NewChunk will handle the preemption that brought us to this state
-    return transit<PendingTimer>();  // (NewChunk is presumably reached via PendingTimer - that transition is not shown here)
-  }
-
-  dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: "
-          << scrbr->dump_awaited_maps() << dendl;
-  return discard_event();  // some maps are still awaited: remain in this state
-}
-
-// ----------------------- WaitReplicas -----------------------------------
-
-WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx)
-{
-  dout(10) << "-- state -->> Act/WaitReplicas" << dendl;
-  post_event(GotReplicas{});  // self-posted: all the maps might already be here, so check right on entry
-}
-
-/**
- * note: now that maps_compare_n_cleanup() is "futurized"(*), and we remain in this state
- *  for a while even after we got all our maps, we must prevent are_all_maps_available()
- *  (actually - the code after the if()) from being called more than once.
- * This is basically a separate state, but it's too transitory and artificial to justify
- *  the cost of a separate state.
- *
- * (*) "futurized" - in Crimson, the call to maps_compare_n_cleanup() returns immediately
- *  after initiating the process. The actual termination of the maps comparing etc. is
- *  signalled via an event. As we share the code with "classic" OSD, here too
- *  maps_compare_n_cleanup() is responsible for signalling the completion of the
- *  processing.
- */
-sc::result WaitReplicas::react(const GotReplicas&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "WaitReplicas::react(const GotReplicas&)" << dendl;
-
-  if (!all_maps_already_called && scrbr->are_all_maps_available()) {
-    dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl;
-
-    all_maps_already_called = true;  // the guard discussed above: this branch is taken at most once
-
-    // were we preempted?
-    if (scrbr->get_preemptor().disable_and_test()) {  // a test&set
-
-
-      dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" << dendl;
-      return transit<PendingTimer>();
-
-    } else {
-
-      // maps_compare_n_cleanup() will arrange for MapsCompared event to be sent:
-      scrbr->maps_compare_n_cleanup();
-      return discard_event();
-    }
-  } else {
-    return discard_event();  // either still waiting for some maps, or the maps were already handled
-  }
-}
-
-// ----------------------- WaitDigestUpdate -----------------------------------
-
-WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx)
-{
-  dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl;
-  // perform an initial check: maybe we already
-  // have all the updates we need:
-  // (note that DigestUpdate is usually an external event)
-  post_event(DigestUpdate{});  // handled by WaitDigestUpdate::react(const DigestUpdate&)
-}
-
-sc::result WaitDigestUpdate::react(const DigestUpdate&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "WaitDigestUpdate::react(const DigestUpdate&)" << dendl;
-
-  // on_digest_updates() will either:
-  // - do nothing - if we are still waiting for updates, or
-  // - finish the scrubbing of the current chunk, and:
-  //  - send NextChunk, or
-  //  - send ScrubFinished
-
-  scrbr->on_digest_updates();
-  return discard_event();  // this reaction itself never transitions; it relies on the events listed above
-}
-
-ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub)
-    : m_pg{pg}, m_pg_id{pg->pg_id}, m_scrbr{pg_scrub}  // note: 'pg' is dereferenced here, thus must not be null
-{
-  dout(15) << "ScrubMachine created " << m_pg_id << dendl;
-}
-
-ScrubMachine::~ScrubMachine() = default;  // defaulted, but defined here - out of line - in the .cc file
-
-// -------- for replicas -----------------------------------------------------
-
-// ----------------------- ReplicaWaitUpdates --------------------------------
-
-ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx)
-{
-  dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl;
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  scrbr->on_replica_init();  // note: called again by the ActiveReplica constructor
-}
-
-/*
- * Triggered externally, by the entity that had an update re pushes.
- * Moves on to ActiveReplica once no active pushes are pending; otherwise
- * the event is discarded, and we remain in this state.
- */
-sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): "
-          << scrbr->pending_active_pushes() << dendl;
-
-  if (scrbr->pending_active_pushes() == 0) {
-
-    // done waiting
-    return transit<ActiveReplica>();
-  }
-
-  return discard_event();
-}
-
-/**
- * the event poster is handling the scrubber reset
- * (this reaction only logs, and moves the FSM to NotActive)
- */
-sc::result ReplicaWaitUpdates::react(const FullReset&)
-{
-  dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl;
-  return transit<NotActive>();
-}
-
-// ----------------------- ActiveReplica -----------------------------------
-
-ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "-- state -->> ActiveReplica" << dendl;
-  scrbr->on_replica_init();  // as we might have skipped ReplicaWaitUpdates
-  post_event(SchedReplica{});  // self-posted: handled by ActiveReplica::react(const SchedReplica&)
-}
-
-sc::result ActiveReplica::react(const SchedReplica&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? "
-          << scrbr->get_preemptor().is_preemptable() << dendl;
-
-  if (scrbr->get_preemptor().was_preempted()) {
-    dout(10) << "replica scrub job preempted" << dendl;
-
-    scrbr->send_preempted_replica();
-    scrbr->replica_handling_done();
-    return transit<NotActive>();
-  }
-
-  // start or check progress of build_replica_map_chunk()
-  auto ret_init = scrbr->build_replica_map_chunk();
-  if (ret_init != -EINPROGRESS) {  // any other value - success and failure alike - ends our part as a replica
-    return transit<NotActive>();
-  }
-
-  return discard_event();  // still in progress: remain in ActiveReplica
-}
-
-/**
- * the event poster is handling the scrubber reset
- * (this reaction only logs, and moves the FSM to NotActive)
- */
-sc::result ActiveReplica::react(const FullReset&)
-{
-  dout(10) << "ActiveReplica::react(const FullReset&)" << dendl;
-  return transit<NotActive>();
-}
-
-}  // namespace Scrub
diff --git a/src/osd/scrub_machine.h b/src/osd/scrub_machine.h
deleted file mode 100644 (file)
index 998bc5f..0000000
+++ /dev/null
@@ -1,346 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#pragma once
-
-#include <string>
-
-#include <boost/statechart/custom_reaction.hpp>
-#include <boost/statechart/deferral.hpp>
-#include <boost/statechart/event.hpp>
-#include <boost/statechart/event_base.hpp>
-#include <boost/statechart/in_state_reaction.hpp>
-#include <boost/statechart/simple_state.hpp>
-#include <boost/statechart/state.hpp>
-#include <boost/statechart/state_machine.hpp>
-#include <boost/statechart/transition.hpp>
-
-#include "common/version.h"
-#include "include/Context.h"
-
-#include "scrub_machine_lstnr.h"
-#include "scrubber_common.h"
-
-using namespace std::string_literals;
-
-class PG;  // holding a pointer to that one - just for testing
-class PgScrubber;
-namespace Scrub {
-
-namespace sc = ::boost::statechart;
-namespace mpl = ::boost::mpl;
-
-//
-//  EVENTS
-//
-
-void on_event_creation(std::string_view nm);
-void on_event_discard(std::string_view nm);
-
-#define MEV(E)                                          \
-  struct E : sc::event<E> {                             \
-    inline static int actv{0};                          \
-    E()                                                 \
-    {                                                   \
-      if (!actv++)                                      \
-       on_event_creation(#E);                          \
-    }                                                   \
-    ~E()                                                \
-    {                                                   \
-      if (!--actv)                                      \
-       on_event_discard(#E);                           \
-    }                                                   \
-    void print(std::ostream* out) const { *out << #E; } \
-    std::string_view print() const { return #E; }       \
-  };
-
-MEV(RemotesReserved)  ///< all replicas have granted our reserve request
-
-MEV(ReservationFailure)         ///< a reservation request has failed
-
-MEV(StartScrub)         ///< initiate a new scrubbing session (relevant if we are a Primary)
-
-MEV(AfterRepairScrub)  ///< initiate a new scrubbing session. Only triggered at Recovery
-                      ///< completion.
-
-MEV(Unblocked) ///< triggered when the PG unblocked an object that was marked for
-               ///< scrubbing. Via the PGScrubUnblocked op
-
-MEV(InternalSchedScrub)
-
-MEV(SelectedChunkFree)
-
-MEV(ChunkIsBusy)
-
-MEV(ActivePushesUpd)  ///< Update to active_pushes. 'active_pushes' represents recovery
-                     ///< that is in-flight to the local ObjectStore
-
-MEV(UpdatesApplied)  ///< (Primary only) all updates are committed
-
-MEV(InternalAllUpdates)         ///< the internal counterpart of UpdatesApplied
-
-MEV(GotReplicas)  ///< got a map from a replica
-
-MEV(IntBmPreempted)  ///< internal - BuildMap preempted. Required, as detected within the
-                    ///< ctor
-
-MEV(InternalError)
-
-MEV(IntLocalMapDone)
-
-MEV(DigestUpdate)  ///< external. called upon success of a MODIFY op. See
-                  ///< scrub_snapshot_metadata()
-
-MEV(MapsCompared)  ///< (Crimson) maps_compare_n_cleanup() transactions are done
-
-MEV(StartReplica)  ///< initiating replica scrub.
-
-MEV(StartReplicaNoWait)         ///< 'start replica' when there are no pending updates
-
-MEV(SchedReplica)
-
-MEV(ReplicaPushesUpd)  ///< Update to active_pushes. 'active_pushes' represents recovery
-                      ///< that is in-flight to the local ObjectStore
-
-MEV(FullReset) ///< guarantee that the FSM is in the quiescent state (i.e. NotActive)
-
-MEV(NextChunk) ///< finished handling this chunk. Go get the next one
-
-MEV(ScrubFinished)  ///< all chunks handled
-
-
-struct NotActive;          ///< the quiescent state. No active scrubbing.
-struct ReservingReplicas;   ///< securing scrub resources from replicas' OSDs
-struct ActiveScrubbing;            ///< the active state for a Primary. A sub-machine.
-struct ReplicaWaitUpdates;  ///< an active state for a replica. Waiting for all active
-                           ///< operations to finish.
-struct ActiveReplica;      ///< an active state for a replica.
-
-
-class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
- public:
-  friend class PgScrubber;
-
- public:
-  explicit ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub);
-  ~ScrubMachine();
-
-  PG* m_pg;  // only used for dout messages
-  spg_t m_pg_id;
-  ScrubMachineListener* m_scrbr;
-
-  void my_states() const;
-  void assert_not_active() const;
-  [[nodiscard]] bool is_reserving() const;
-  [[nodiscard]] bool is_accepting_updates() const;
-};
-
-/**
- *  The Scrubber's base (quiescent) state.
- *  Scrubbing is triggered by one of the following events:
- *  - (standard scenario for a Primary): 'StartScrub'. Initiates the OSDs resources
- *    reservation process. Will be issued by PG::scrub(), following a
- *    queued "PGScrub" op.
- *  - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is
- *    not required to reserve resources.
- *  - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming
- *    MOSDRepScrub message.
- *
- *  note (20.8.21): originally, AfterRepairScrub was triggering a scrub without waiting
- *   for replica resources to be acquired. But once replicas started using the
- *   resource-request to identify and tag the scrub session, this bypass cannot be
- *   supported anymore.
- */
-struct NotActive : sc::state<NotActive, ScrubMachine> {
-  explicit NotActive(my_context ctx);
-
-  using reactions = mpl::list<sc::transition<StartScrub, ReservingReplicas>,
-                             // a scrubbing that was initiated at recovery completion.
-                             // It still goes through the replicas reservation state:
-                             sc::transition<AfterRepairScrub, ReservingReplicas>,
-                             sc::transition<StartReplica, ReplicaWaitUpdates>,
-                             sc::transition<StartReplicaNoWait, ActiveReplica>>;
-};
-
-struct ReservingReplicas : sc::state<ReservingReplicas, ScrubMachine> {
-
-  explicit ReservingReplicas(my_context ctx);
-  using reactions = mpl::list<sc::custom_reaction<FullReset>,
-                             // all replicas granted our resources request
-                             sc::transition<RemotesReserved, ActiveScrubbing>,
-                             sc::custom_reaction<ReservationFailure>>;
-
-  sc::result react(const FullReset&);
-
-  /// at least one replica denied us the scrub resources we've requested
-  sc::result react(const ReservationFailure&);
-};
-
-
-// the "active" sub-states
-
-struct RangeBlocked;  ///< the objects range is blocked
-struct PendingTimer;  ///< either delaying the scrub by some time and requeuing, or just
-                     ///< requeue
-struct NewChunk;      ///< select a chunk to scrub, and verify its availability
-struct WaitPushes;
-struct WaitLastUpdate;
-struct BuildMap;
-struct DrainReplMaps;  ///< a problem during BuildMap. Wait for all replicas to report,
-                      ///< then restart.
-struct WaitReplicas;   ///< wait for all replicas to report
-struct WaitDigestUpdate;
-
-struct ActiveScrubbing : sc::state<ActiveScrubbing, ScrubMachine, PendingTimer> {
-
-  explicit ActiveScrubbing(my_context ctx);
-  ~ActiveScrubbing();
-
-  using reactions = mpl::list<
-    sc::custom_reaction<InternalError>,
-    sc::custom_reaction<FullReset>>;
-
-  sc::result react(const FullReset&);
-  sc::result react(const InternalError&);
-};
-
-struct RangeBlocked : sc::state<RangeBlocked, ActiveScrubbing> {
-  explicit RangeBlocked(my_context ctx);
-  using reactions = mpl::list<sc::transition<Unblocked, PendingTimer>>;
-
-  Scrub::BlockedRangeWarning m_timeout;
-};
-
-struct PendingTimer : sc::state<PendingTimer, ActiveScrubbing> {
-
-  explicit PendingTimer(my_context ctx);
-
-  using reactions = mpl::list<sc::transition<InternalSchedScrub, NewChunk>>;
-};
-
-struct NewChunk : sc::state<NewChunk, ActiveScrubbing> {
-
-  explicit NewChunk(my_context ctx);
-
-  using reactions = mpl::list<sc::transition<ChunkIsBusy, RangeBlocked>,
-                             sc::custom_reaction<SelectedChunkFree>>;
-
-  sc::result react(const SelectedChunkFree&);
-};
-
-/**
- * initiate the update process for this chunk
- *
- * Wait for 'active_pushes' to clear.
- * 'active_pushes' represents recovery that is in-flight to the local Objectstore, hence
- * scrub waits until the correct data is readable (in-flight data to the Objectstore is
- * not readable until written to disk, termed 'applied' here)
- */
-struct WaitPushes : sc::state<WaitPushes, ActiveScrubbing> {
-
-  explicit WaitPushes(my_context ctx);
-
-  using reactions = mpl::list<sc::custom_reaction<ActivePushesUpd>>;
-
-  sc::result react(const ActivePushesUpd&);
-};
-
-struct WaitLastUpdate : sc::state<WaitLastUpdate, ActiveScrubbing> {
-
-  explicit WaitLastUpdate(my_context ctx);
-
-  void on_new_updates(const UpdatesApplied&);
-
-  using reactions = mpl::list<sc::custom_reaction<InternalAllUpdates>,
-                             sc::in_state_reaction<UpdatesApplied,
-                                                   WaitLastUpdate,
-                                                   &WaitLastUpdate::on_new_updates>>;
-
-  sc::result react(const InternalAllUpdates&);
-};
-
-struct BuildMap : sc::state<BuildMap, ActiveScrubbing> {
-  explicit BuildMap(my_context ctx);
-
-  // possible error scenarios:
-  // - an error reported by the backend will trigger an 'InternalError' event,
-  //   handled by our parent state;
-  // - if preempted, we switch to DrainReplMaps, where we will wait for all
-  //   replicas to send their maps before acknowledging the preemption;
-  // - an interval change will be handled by the relevant 'send-event' functions,
-  //   and will be translated into a 'FullReset' event.
-  using reactions =
-    mpl::list<sc::transition<IntBmPreempted, DrainReplMaps>,
-             sc::transition<InternalSchedScrub, BuildMap>,  // looping, waiting
-                                                            // for the backend to
-                                                            // finish
-             sc::custom_reaction<IntLocalMapDone>>;
-
-  sc::result react(const IntLocalMapDone&);
-};
-
-/*
- *  "drain" scrub-maps responses from replicas
- */
-struct DrainReplMaps : sc::state<DrainReplMaps, ActiveScrubbing> {
-  explicit DrainReplMaps(my_context ctx);
-
-  using reactions =
-    mpl::list<sc::custom_reaction<GotReplicas> // all replicas are accounted for
-             >;
-
-  sc::result react(const GotReplicas&);
-};
-
-struct WaitReplicas : sc::state<WaitReplicas, ActiveScrubbing> {
-  explicit WaitReplicas(my_context ctx);
-
-  using reactions =
-    mpl::list<sc::custom_reaction<GotReplicas>,         // all replicas are accounted for
-             sc::transition<MapsCompared, WaitDigestUpdate>,
-             sc::deferral<DigestUpdate>  // might arrive before we've reached WDU
-             >;
-
-  sc::result react(const GotReplicas&);
-
-  bool all_maps_already_called{false}; // see comment in react code
-};
-
-struct WaitDigestUpdate : sc::state<WaitDigestUpdate, ActiveScrubbing> {
-  explicit WaitDigestUpdate(my_context ctx);
-
-  using reactions = mpl::list<sc::custom_reaction<DigestUpdate>,
-                             sc::transition<NextChunk, PendingTimer>,
-                             sc::transition<ScrubFinished, NotActive>>;
-  sc::result react(const DigestUpdate&);
-};
-
-// ----------------------------- the "replica active" states -----------------------
-
-/*
- * Waiting for 'active_pushes' to complete
- *
- * When in this state:
- * - the details of the Primary's request were internalized by PgScrubber;
- * - 'active' scrubbing is set
- */
-struct ReplicaWaitUpdates : sc::state<ReplicaWaitUpdates, ScrubMachine> {
-  explicit ReplicaWaitUpdates(my_context ctx);
-  using reactions =
-    mpl::list<sc::custom_reaction<ReplicaPushesUpd>, sc::custom_reaction<FullReset>>;
-
-  sc::result react(const ReplicaPushesUpd&);
-  sc::result react(const FullReset&);
-};
-
-
-struct ActiveReplica : sc::state<ActiveReplica, ScrubMachine> {
-  explicit ActiveReplica(my_context ctx);
-  using reactions = mpl::list<sc::custom_reaction<SchedReplica>,
-                             sc::custom_reaction<FullReset>,
-                             sc::transition<ScrubFinished, NotActive>>;
-
-  sc::result react(const SchedReplica&);
-  sc::result react(const FullReset&);
-};
-
-}  // namespace Scrub
diff --git a/src/osd/scrub_machine_lstnr.h b/src/osd/scrub_machine_lstnr.h
deleted file mode 100644 (file)
index 91dee91..0000000
+++ /dev/null
@@ -1,164 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#pragma once
-/** \file
- * \brief the PgScrubber interface used by the scrub FSM
- */
-#include "common/version.h"
-#include "include/Context.h"
-
-#include "osd_types.h"
-
-namespace Scrub {
-
-enum class PreemptionNoted { no_preemption, preempted };
-
-/// the interface exposed by the PgScrubber into its internal
-/// preemption_data object
-struct preemption_t {
-
-  virtual ~preemption_t() = default;
-
-  [[nodiscard]] virtual bool is_preemptable() const = 0;
-
-  [[nodiscard]] virtual bool was_preempted() const = 0;
-
-  virtual void adjust_parameters() = 0;
-
-  /**
-   *  Try to preempt the scrub.
-   *  'true' (i.e. - preempted) if:
-   *   preemptable && not already preempted
-   */
-  virtual bool do_preempt() = 0;
-
-  /**
-   *  disables preemptions.
-   *  Returns 'true' if we were already preempted
-   */
-  virtual bool disable_and_test() = 0;
-};
-
-/// an aux used when blocking on a busy object.
-/// Issues a log warning if still blocked after 'waittime'.
-struct blocked_range_t {
-  blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id);
-  ~blocked_range_t();
-
-  OSDService* m_osds;
-  Context* m_callbk;
-};
-
-using BlockedRangeWarning = std::unique_ptr<blocked_range_t>;
-
-}  // namespace Scrub
-
-struct ScrubMachineListener {
-
-  struct MsgAndEpoch {
-    MessageRef m_msg;
-    epoch_t m_epoch;
-  };
-
-  virtual ~ScrubMachineListener() = default;
-
-  [[nodiscard]] virtual bool is_primary() const = 0;
-
-  virtual void select_range_n_notify() = 0;
-
-  virtual Scrub::BlockedRangeWarning acquire_blocked_alarm() = 0;
-
-  /// walk the log to find the latest update that affects our chunk
-  virtual eversion_t search_log_for_updates() const = 0;
-
-  virtual eversion_t get_last_update_applied() const = 0;
-
-  virtual int pending_active_pushes() const = 0;
-
-  virtual int build_primary_map_chunk() = 0;
-
-  virtual int build_replica_map_chunk() = 0;
-
-  virtual void on_init() = 0;
-
-  virtual void on_replica_init() = 0;
-
-  virtual void replica_handling_done() = 0;
-
-  /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
-  /// (thus can be called from FSM reactions)
-  virtual void clear_pgscrub_state() = 0;
-
-  /*
-   * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep'
-   * is asserted - after a configuration-dependent timeout.
-   */
-  virtual void add_delayed_scheduling() = 0;
-
-  /**
-   * Ask all replicas for their scrub maps for the current chunk.
-   */
-  virtual void get_replicas_maps(bool replica_can_preempt) = 0;
-
-  virtual void on_digest_updates() = 0;
-
-  /**
-   * Prepare a MOSDRepScrubMap message carrying the requested scrub map
-   * @param was_preempted - were we preempted?
-   * @return the message, and the current value of 'm_replica_min_epoch' (which is
-   *     used when sending the message, but will be overwritten before that).
-   */
-  [[nodiscard]] virtual MsgAndEpoch prep_replica_map_msg(
-    Scrub::PreemptionNoted was_preempted) = 0;
-
-  /**
-   * Send to the primary the pre-prepared message containing the requested map
-   */
-  virtual void send_replica_map(const MsgAndEpoch& preprepared) = 0;
-
-  /**
-   * Let the primary know that we were preempted while trying to build the
-   * requested map.
-   */
-  virtual void send_preempted_replica() = 0;
-
-  [[nodiscard]] virtual bool has_pg_marked_new_updates() const = 0;
-
-  virtual void set_subset_last_update(eversion_t e) = 0;
-
-  [[nodiscard]] virtual bool was_epoch_changed() const = 0;
-
-  virtual Scrub::preemption_t& get_preemptor() = 0;
-
-  /**
-   *  a "technical" collection of the steps performed once all
-   *  rep maps are available:
-   *  - the maps are compared
-   *  - the scrub region markers (start_ & end_) are advanced
-   *  - callbacks and ops that were pending are allowed to run
-   */
-  virtual void maps_compare_n_cleanup() = 0;
-
-  /**
-   * order the PgScrubber to initiate the process of reserving replicas' scrub
-   * resources.
-   */
-  virtual void reserve_replicas() = 0;
-
-  virtual void unreserve_replicas() = 0;
-
-  /**
-   * the FSM interface into the "are we waiting for maps, either our own or from
-   * replicas" state.
-   * The FSM can only:
-   * - mark the local map as available, and
-   * - query status
-   */
-  virtual void mark_local_map_ready() = 0;
-
-  [[nodiscard]] virtual bool are_all_maps_available() const = 0;
-
-  /// a log/debug interface
-  virtual std::string dump_awaited_maps() const = 0;
-};
diff --git a/src/osd/scrubber/PrimaryLogScrub.cc b/src/osd/scrubber/PrimaryLogScrub.cc
new file mode 100644 (file)
index 0000000..2be7b56
--- /dev/null
@@ -0,0 +1,589 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PrimaryLogScrub.h"
+
+#include "common/scrub_types.h"
+#include "osd/osd_types_fmt.h"
+
+#include "osd/PeeringState.h"
+#include "osd/PrimaryLogPG.h"
+#include "scrub_machine.h"
+
+#define dout_context (m_pg->get_cct())
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this->m_pg)
+
+using std::vector;
+
+template <class T> static ostream& _prefix(std::ostream* _dout, T* t)
+{
+  return t->gen_prefix(*_dout) << " PrimaryLog scrubber pg(" << t->pg_id << ") ";
+}
+
+using namespace Scrub;
+using Scrub::ScrubMachine;
+
+bool PrimaryLogScrub::get_store_errors(const scrub_ls_arg_t& arg,
+                                      scrub_ls_result_t& res_inout) const
+{
+  if (!m_store) {
+    return false;
+  }
+
+  if (arg.get_snapsets) {
+    res_inout.vals =
+      m_store->get_snap_errors(m_pg->get_pgid().pool(), arg.start_after, arg.max_return);
+  } else {
+    res_inout.vals = m_store->get_object_errors(m_pg->get_pgid().pool(), arg.start_after,
+                                               arg.max_return);
+  }
+  return true;
+}
+
+void PrimaryLogScrub::_scrub_finish()
+{
+  auto& info = m_pg->get_pg_info(ScrubberPasskey{});  ///< a temporary alias
+
+  dout(10) << __func__
+          << " info stats: " << (info.stats.stats_invalid ? "invalid" : "valid")
+          << dendl;
+
+  if (info.stats.stats_invalid) {
+    m_pl_pg->recovery_state.update_stats([=](auto& history, auto& stats) {
+      stats.stats = m_scrub_cstat;
+      stats.stats_invalid = false;
+      return false;
+    });
+
+    if (m_pl_pg->agent_state)
+      m_pl_pg->agent_choose_mode();
+  }
+
+  dout(10) << m_mode_desc << " got " << m_scrub_cstat.sum.num_objects << "/"
+          << info.stats.stats.sum.num_objects << " objects, "
+          << m_scrub_cstat.sum.num_object_clones << "/"
+          << info.stats.stats.sum.num_object_clones << " clones, "
+          << m_scrub_cstat.sum.num_objects_dirty << "/"
+          << info.stats.stats.sum.num_objects_dirty << " dirty, "
+          << m_scrub_cstat.sum.num_objects_omap << "/"
+          << info.stats.stats.sum.num_objects_omap << " omap, "
+          << m_scrub_cstat.sum.num_objects_pinned << "/"
+          << info.stats.stats.sum.num_objects_pinned << " pinned, "
+          << m_scrub_cstat.sum.num_objects_hit_set_archive << "/"
+          << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
+          << m_scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes
+          << " bytes, " << m_scrub_cstat.sum.num_objects_manifest << "/"
+          << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
+          << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/"
+          << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
+          << dendl;
+
+  if (m_scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
+      m_scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
+      (m_scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
+       !info.stats.dirty_stats_invalid) ||
+      (m_scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
+       !info.stats.omap_stats_invalid) ||
+      (m_scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
+       !info.stats.pin_stats_invalid) ||
+      (m_scrub_cstat.sum.num_objects_hit_set_archive !=
+        info.stats.stats.sum.num_objects_hit_set_archive &&
+       !info.stats.hitset_stats_invalid) ||
+      (m_scrub_cstat.sum.num_bytes_hit_set_archive !=
+        info.stats.stats.sum.num_bytes_hit_set_archive &&
+       !info.stats.hitset_bytes_stats_invalid) ||
+      (m_scrub_cstat.sum.num_objects_manifest !=
+        info.stats.stats.sum.num_objects_manifest &&
+       !info.stats.manifest_stats_invalid) ||
+      m_scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
+      m_scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
+    m_osds->clog->error() << info.pgid << " " << m_mode_desc << " : stat mismatch, got "
+                         << m_scrub_cstat.sum.num_objects << "/"
+                         << info.stats.stats.sum.num_objects << " objects, "
+                         << m_scrub_cstat.sum.num_object_clones << "/"
+                         << info.stats.stats.sum.num_object_clones << " clones, "
+                         << m_scrub_cstat.sum.num_objects_dirty << "/"
+                         << info.stats.stats.sum.num_objects_dirty << " dirty, "
+                         << m_scrub_cstat.sum.num_objects_omap << "/"
+                         << info.stats.stats.sum.num_objects_omap << " omap, "
+                         << m_scrub_cstat.sum.num_objects_pinned << "/"
+                         << info.stats.stats.sum.num_objects_pinned << " pinned, "
+                         << m_scrub_cstat.sum.num_objects_hit_set_archive << "/"
+                         << info.stats.stats.sum.num_objects_hit_set_archive
+                         << " hit_set_archive, " << m_scrub_cstat.sum.num_whiteouts
+                         << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
+                         << m_scrub_cstat.sum.num_bytes << "/"
+                         << info.stats.stats.sum.num_bytes << " bytes, "
+                         << m_scrub_cstat.sum.num_objects_manifest << "/"
+                         << info.stats.stats.sum.num_objects_manifest
+                         << " manifest objects, "
+                         << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/"
+                         << info.stats.stats.sum.num_bytes_hit_set_archive
+                         << " hit_set_archive bytes.";
+    ++m_shallow_errors;
+
+    if (m_is_repair) {
+      ++m_fixed_count;
+      m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) {
+       stats.stats = m_scrub_cstat;
+       stats.dirty_stats_invalid = false;
+       stats.omap_stats_invalid = false;
+       stats.hitset_stats_invalid = false;
+       stats.hitset_bytes_stats_invalid = false;
+       stats.pin_stats_invalid = false;
+       stats.manifest_stats_invalid = false;
+       return false;
+      });
+      m_pl_pg->publish_stats_to_osd();
+      m_pl_pg->recovery_state.share_pg_info();
+    }
+  }
+  // Clear object context cache to get repair information
+  if (m_is_repair)
+    m_pl_pg->object_contexts.clear();
+}
+
+static bool doing_clones(const std::optional<SnapSet>& snapset,
+                        const vector<snapid_t>::reverse_iterator& curclone)
+{
+  return snapset && curclone != snapset->clones.rend();
+}
+
+void PrimaryLogScrub::log_missing(int missing,
+                                 const std::optional<hobject_t>& head,
+                                 LogChannelRef clog,
+                                 const spg_t& pgid,
+                                 const char* func,
+                                 bool allow_incomplete_clones)
+{
+  ceph_assert(head);
+  if (allow_incomplete_clones) {
+    dout(20) << func << " " << m_mode_desc << " " << pgid << " " << *head << " skipped "
+            << missing << " clone(s) in cache tier" << dendl;
+  } else {
+    clog->info() << m_mode_desc << " " << pgid << " " << *head << " : " << missing
+                << " missing clone(s)";
+  }
+}
+
+int PrimaryLogScrub::process_clones_to(const std::optional<hobject_t>& head,
+                                      const std::optional<SnapSet>& snapset,
+                                      LogChannelRef clog,
+                                      const spg_t& pgid,
+                                      bool allow_incomplete_clones,
+                                      std::optional<snapid_t> target,
+                                      vector<snapid_t>::reverse_iterator* curclone,
+                                      inconsistent_snapset_wrapper& e)
+{
+  ceph_assert(head);
+  ceph_assert(snapset);
+  int missing_count = 0;
+
+  // NOTE: clones are in descending order, thus **curclone > target test here
+  hobject_t next_clone(*head);
+  while (doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
+
+    ++missing_count;
+    // it is okay to be missing one or more clones in a cache tier.
+    // skip higher-numbered clones in the list.
+    if (!allow_incomplete_clones) {
+      next_clone.snap = **curclone;
+      clog->error() << m_mode_desc << " " << pgid << " " << *head << " : expected clone "
+                   << next_clone << " " << m_missing << " missing";
+      ++m_shallow_errors;
+      e.set_clone_missing(next_clone.snap);
+    }
+    // Clones are descending
+    ++(*curclone);
+  }
+  return missing_count;
+}
+
+/*
+ * Validate consistency of the object info and snap sets.
+ *
+ * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
+ * the comparison of the objects is against multiple snapset.clones. There are
+ * multiple clone lists and in between lists we expect head.
+ *
+ * Example
+ *
+ * objects              expected
+ * =======              =======
+ * obj1 snap 1          head, unexpected obj1 snap 1
+ * obj2 head            head, match
+ *              [SnapSet clones 6 4 2 1]
+ * obj2 snap 7          obj2 snap 6, unexpected obj2 snap 7
+ * obj2 snap 6          obj2 snap 6, match
+ * obj2 snap 4          obj2 snap 4, match
+ * obj3 head            obj2 snap 2 (expected), obj2 snap 1 (expected), match
+ *              [Snapset clones 3 1]
+ * obj3 snap 3          obj3 snap 3 match
+ * obj3 snap 1          obj3 snap 1 match
+ * obj4 head            head, match
+ *              [Snapset clones 4]
+ * EOL                  obj4 snap 4, (expected)
+ */
+void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap,
+                                             const missing_map_t& missing_digest)
+{
+  dout(10) << __func__ << " num stat obj " << m_pl_pg->info.stats.stats.sum.num_objects
+          << dendl;
+
+  auto& info = m_pl_pg->info;
+  const PGPool& pool = m_pl_pg->pool;
+  bool allow_incomplete_clones = pool.info.allow_incomplete_clones();
+
+  std::optional<snapid_t> all_clones;  // Unspecified snapid_t or std::nullopt
+
+  // traverse in reverse order.
+  std::optional<hobject_t> head;
+  std::optional<SnapSet> snapset;              // If initialized so will head (above)
+  vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
+  int missing = 0;
+  inconsistent_snapset_wrapper soid_error, head_error;
+  int soid_error_count = 0;
+
+  for (auto p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
+
+    const hobject_t& soid = p->first;
+    ceph_assert(!soid.is_snapdir());
+    soid_error = inconsistent_snapset_wrapper{soid};
+    object_stat_sum_t stat;
+    std::optional<object_info_t> oi;
+
+    stat.num_objects++;
+
+    if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
+      stat.num_objects_hit_set_archive++;
+
+    if (soid.is_snap()) {
+      // it's a clone
+      stat.num_object_clones++;
+    }
+
+    // basic checks.
+    if (p->second.attrs.count(OI_ATTR) == 0) {
+      oi = std::nullopt;
+      m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
+                           << OI_ATTR << "' attr";
+      ++m_shallow_errors;
+      soid_error.set_info_missing();
+    } else {
+      bufferlist bv;
+      bv.push_back(p->second.attrs[OI_ATTR]);
+      try {
+       oi = object_info_t(bv);
+      } catch (ceph::buffer::error& e) {
+       oi = std::nullopt;
+       m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+                             << " : can't decode '" << OI_ATTR << "' attr " << e.what();
+       ++m_shallow_errors;
+       soid_error.set_info_corrupted();
+       soid_error.set_info_missing();  // Not available too
+      }
+    }
+
+    if (oi) {
+      if (m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
+       m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+                             << " : on disk size (" << p->second.size
+                             << ") does not match object info size (" << oi->size
+                             << ") adjusted for ondisk to ("
+                             << m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) << ")";
+       soid_error.set_size_mismatch();
+       ++m_shallow_errors;
+      }
+
+      dout(20) << m_mode_desc << "  " << soid << " " << *oi << dendl;
+
+      // A clone num_bytes will be added later when we have snapset
+      if (!soid.is_snap()) {
+       stat.num_bytes += oi->size;
+      }
+      if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
+       stat.num_bytes_hit_set_archive += oi->size;
+
+      if (oi->is_dirty())
+       ++stat.num_objects_dirty;
+      if (oi->is_whiteout())
+       ++stat.num_whiteouts;
+      if (oi->is_omap())
+       ++stat.num_objects_omap;
+      if (oi->is_cache_pinned())
+       ++stat.num_objects_pinned;
+      if (oi->has_manifest())
+       ++stat.num_objects_manifest;
+    }
+
+    // Check for any problems while processing clones
+    if (doing_clones(snapset, curclone)) {
+      std::optional<snapid_t> target;
+      // Expecting an object with snap for current head
+      if (soid.has_snapset() || soid.get_head() != head->get_head()) {
+
+       dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid << " new object " << soid
+                << " while processing " << *head << dendl;
+
+       target = all_clones;
+      } else {
+       ceph_assert(soid.is_snap());
+       target = soid.snap;
+      }
+
+      // Log any clones we were expecting to be there up to target
+      // This will set missing, but will be a no-op if soid.snap == *curclone.
+      missing +=
+       process_clones_to(head, snapset, m_osds->clog, info.pgid,
+                         allow_incomplete_clones, target, &curclone, head_error);
+    }
+
+    bool expected;
+    // Check doing_clones() again in case we ran process_clones_to()
+    if (doing_clones(snapset, curclone)) {
+      // A head would have processed all clones above
+      // or all greater than *curclone.
+      ceph_assert(soid.is_snap() && *curclone <= soid.snap);
+
+      // After processing above clone snap should match the expected curclone
+      expected = (*curclone == soid.snap);
+    } else {
+      // If we aren't doing clones any longer, then expecting head
+      expected = soid.has_snapset();
+    }
+    if (!expected) {
+      // If we couldn't read the head's snapset, just ignore clones
+      if (head && !snapset) {
+       m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+                             << " : clone ignored due to missing snapset";
+      } else {
+       m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+                             << " : is an unexpected clone";
+      }
+      ++m_shallow_errors;
+      soid_error.set_headless();
+      m_store->add_snap_error(pool.id, soid_error);
+      ++soid_error_count;
+      if (head && soid.get_head() == head->get_head())
+       head_error.set_clone(soid.snap);
+      continue;
+    }
+
+    // new snapset?
+    if (soid.has_snapset()) {
+
+      if (missing) {
+       log_missing(missing, head, m_osds->clog, info.pgid, __func__,
+                   pool.info.allow_incomplete_clones());
+      }
+
+      // Save previous head error information
+      if (head && (head_error.errors || soid_error_count))
+       m_store->add_snap_error(pool.id, head_error);
+      // Set this as a new head object
+      head = soid;
+      missing = 0;
+      head_error = soid_error;
+      soid_error_count = 0;
+
+      dout(20) << __func__ << " " << m_mode_desc << " new head " << head << dendl;
+
+      if (p->second.attrs.count(SS_ATTR) == 0) {
+       m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
+                             << SS_ATTR << "' attr";
+       ++m_shallow_errors;
+       snapset = std::nullopt;
+       head_error.set_snapset_missing();
+      } else {
+       bufferlist bl;
+       bl.push_back(p->second.attrs[SS_ATTR]);
+       auto blp = bl.cbegin();
+       try {
+         snapset = SnapSet();  // Initialize optional<> before decoding into it
+         decode(*snapset, blp);
+         head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
+       } catch (ceph::buffer::error& e) {
+         snapset = std::nullopt;
+         m_osds->clog->error()
+           << m_mode_desc << " " << info.pgid << " " << soid << " : can't decode '" << SS_ATTR
+           << "' attr " << e.what();
+         ++m_shallow_errors;
+         head_error.set_snapset_corrupted();
+       }
+      }
+
+      if (snapset) {
+       // what will be next?
+       curclone = snapset->clones.rbegin();
+
+       if (!snapset->clones.empty()) {
+         dout(20) << "  snapset " << *snapset << dendl;
+         if (snapset->seq == 0) {
+           m_osds->clog->error()
+             << m_mode_desc << " " << info.pgid << " " << soid << " : snaps.seq not set";
+           ++m_shallow_errors;
+           head_error.set_snapset_error();
+         }
+       }
+      }
+    } else {
+      ceph_assert(soid.is_snap());
+      ceph_assert(head);
+      ceph_assert(snapset);
+      ceph_assert(soid.snap == *curclone);
+
+      dout(20) << __func__ << " " << m_mode_desc << " matched clone " << soid << dendl;
+
+      if (snapset->clone_size.count(soid.snap) == 0) {
+       m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+                             << " : is missing in clone_size";
+       ++m_shallow_errors;
+       soid_error.set_size_mismatch();
+      } else {
+       if (oi && oi->size != snapset->clone_size[soid.snap]) {
+         m_osds->clog->error()
+           << m_mode_desc << " " << info.pgid << " " << soid << " : size " << oi->size
+           << " != clone_size " << snapset->clone_size[*curclone];
+         ++m_shallow_errors;
+         soid_error.set_size_mismatch();
+       }
+
+       if (snapset->clone_overlap.count(soid.snap) == 0) {
+         m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+                               << " : is missing in clone_overlap";
+         ++m_shallow_errors;
+         soid_error.set_size_mismatch();
+       } else {
+         // This checking is based on get_clone_bytes().  The first 2 asserts
+         // can't happen because we know we have a clone_size and
+         // a clone_overlap.  Now we check that the interval_set won't
+         // cause the last assert.
+         uint64_t size = snapset->clone_size.find(soid.snap)->second;
+         const interval_set<uint64_t>& overlap =
+           snapset->clone_overlap.find(soid.snap)->second;
+         bool bad_interval_set = false;
+         for (interval_set<uint64_t>::const_iterator i = overlap.begin();
+              i != overlap.end(); ++i) {
+           if (size < i.get_len()) {
+             bad_interval_set = true;
+             break;
+           }
+           size -= i.get_len();
+         }
+
+         if (bad_interval_set) {
+           m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+                                 << " : bad interval_set in clone_overlap";
+           ++m_shallow_errors;
+           soid_error.set_size_mismatch();
+         } else {
+           stat.num_bytes += snapset->get_clone_bytes(soid.snap);
+         }
+       }
+      }
+
+      // what's next?
+      ++curclone;
+      if (soid_error.errors) {
+       m_store->add_snap_error(pool.id, soid_error);
+       ++soid_error_count;
+      }
+    }
+    m_scrub_cstat.add(stat);
+  }
+
+  if (doing_clones(snapset, curclone)) {
+    dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid
+            << " No more objects while processing " << *head << dendl;
+
+    missing +=
+      process_clones_to(head, snapset, m_osds->clog, info.pgid,
+                       allow_incomplete_clones, all_clones, &curclone, head_error);
+  }
+
+  // There could be missing found by the test above or even
+  // before dropping out of the loop for the last head.
+  if (missing) {
+    log_missing(missing, head, m_osds->clog, info.pgid, __func__,
+               allow_incomplete_clones);
+  }
+  if (head && (head_error.errors || soid_error_count))
+    m_store->add_snap_error(pool.id, head_error);
+
+  dout(20) << __func__ << " - " << missing << " (" << missing_digest.size() << ") missing"
+          << dendl;
+  for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
+
+    ceph_assert(!p->first.is_snapdir());
+    dout(10) << __func__ << " recording digests for " << p->first << dendl;
+
+    ObjectContextRef obc = m_pl_pg->get_object_context(p->first, false);
+    if (!obc) {
+      m_osds->clog->error() << info.pgid << " " << m_mode_desc
+                           << " cannot get object context for object " << p->first;
+      continue;
+    }
+    if (obc->obs.oi.soid != p->first) {
+      m_osds->clog->error() << info.pgid << " " << m_mode_desc << " " << p->first
+                           << " : object has a valid oi attr with a mismatched name, "
+                           << " obc->obs.oi.soid: " << obc->obs.oi.soid;
+      continue;
+    }
+    PrimaryLogPG::OpContextUPtr ctx = m_pl_pg->simple_opc_create(obc);
+    ctx->at_version = m_pl_pg->get_next_version();
+    ctx->mtime = utime_t();  // do not update mtime
+    if (p->second.first) {
+      ctx->new_obs.oi.set_data_digest(*p->second.first);
+    } else {
+      ctx->new_obs.oi.clear_data_digest();
+    }
+    if (p->second.second) {
+      ctx->new_obs.oi.set_omap_digest(*p->second.second);
+    } else {
+      ctx->new_obs.oi.clear_omap_digest();
+    }
+    m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
+
+    ++num_digest_updates_pending;
+    ctx->register_on_success([this]() {
+      dout(20) << "updating scrub digest " << num_digest_updates_pending << dendl;
+      if (--num_digest_updates_pending <= 0) {
+       m_osds->queue_scrub_digest_update(m_pl_pg, m_pl_pg->is_scrub_blocking_ops());
+      }
+    });
+
+    m_pl_pg->simple_opc_submit(std::move(ctx));
+  }
+
+  dout(10) << __func__ << " (" << m_mode_desc << ") finish" << dendl;
+}
+
+// Builds the scrubber of a PrimaryLogPG: the PG pointer is handed to the PgScrubber base,
+// and is also kept (with its concrete type) in m_pl_pg.
+PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg)
+    : PgScrubber{pg}
+    , m_pl_pg{pg}
+{}
+
+// Discards the object statistics accumulated so far, by replacing them with a
+// freshly value-initialized collection.
+void PrimaryLogScrub::_scrub_clear_state()
+{
+  dout(15) << __func__ << dendl;
+  m_scrub_cstat = object_stat_collection_t{};
+}
+
+// Folds 'delta_stats' into the scrub's statistics, but only for an object that sorts
+// before m_start, and only while we are the Primary with an active scrub.
+void PrimaryLogScrub::stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+                                              const hobject_t& soid)
+{
+  if (!is_primary() || !is_scrub_active()) {
+    return;
+  }
+
+  // Objects are scrubbed in hobject_t order. Whatever sorts before m_start was already
+  // scrubbed and accounted for, so a change to it must be applied to our stats here.
+  // Anything at or after m_start will be counted when the scrubber reaches it.
+  const bool already_accounted_for = soid < m_start;
+  if (!already_accounted_for) {
+    dout(25) << fmt::format("{} {} >= [{},{})", __func__, soid, m_start, m_end) << dendl;
+    return;
+  }
+
+  dout(20) << fmt::format("{} {} < [{},{})", __func__, soid, m_start, m_end) << dendl;
+  m_scrub_cstat.add(delta_stats);
+}
diff --git a/src/osd/scrubber/PrimaryLogScrub.h b/src/osd/scrubber/PrimaryLogScrub.h
new file mode 100644 (file)
index 0000000..9ea889b
--- /dev/null
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+// the './' includes are marked this way to affect clang-format
+#include "./pg_scrubber.h"
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "debug.h"
+
+#include "common/errno.h"
+#include "common/scrub_types.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrubReserve.h"
+
+#include "osd/OSD.h"
+#include "scrub_machine.h"
+
+class PrimaryLogPG;
+
+/**
+ * PrimaryLogScrub: the PgScrubber specialization that serves a PrimaryLogPG.
+ */
+class PrimaryLogScrub : public PgScrubber {
+ public:
+  explicit PrimaryLogScrub(PrimaryLogPG* pg);
+
+  void _scrub_finish() final;
+
+  bool get_store_errors(const scrub_ls_arg_t& arg, scrub_ls_result_t& res_inout) const final;
+
+  // 'delta_stats' is added to the scrub's own stats only if 'soid' precedes the start of
+  // the chunk being scrubbed (and only on the Primary, while a scrub is active)
+  void stats_of_handled_objects(const object_stat_sum_t& delta_stats, const hobject_t& soid) final;
+
+ private:
+  // the PG we were created for, kept with its concrete (PrimaryLogPG) type
+  PrimaryLogPG* const m_pl_pg;
+
+  /**
+   * Checks that the object-info and snap-set data are consistent.
+   */
+  void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) final;
+
+  void log_missing(int missing, const std::optional<hobject_t>& head, LogChannelRef clog,
+                  const spg_t& pgid, const char* func, bool allow_incomplete_clones);
+
+  int process_clones_to(const std::optional<hobject_t>& head,
+                       const std::optional<SnapSet>& snapset, LogChannelRef clog,
+                       const spg_t& pgid, bool allow_incomplete_clones,
+                       std::optional<snapid_t> target,
+                       std::vector<snapid_t>::reverse_iterator* curclone,
+                       inconsistent_snapset_wrapper& snap_error);
+
+
+  // our share of the statistics collection:
+  object_stat_collection_t m_scrub_cstat;
+  void _scrub_clear_state() final;  // (only resets m_scrub_cstat)
+};
diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc
new file mode 100644 (file)
index 0000000..1787b3d
--- /dev/null
@@ -0,0 +1,198 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#include "ScrubStore.h"
+#include "osd/osd_types.h"
+#include "common/scrub_types.h"
+#include "include/rados/rados_types.hpp"
+
+using std::ostringstream;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+
+namespace {
+
+// The name of the (temporary) object holding the scrub results of the given PG.
+ghobject_t make_scrub_object(const spg_t& pgid)
+{
+  ostringstream name;
+  name << "scrub_" << pgid;
+  return pgid.make_temp_ghobject(name.str());
+}
+
+// Builds the hash cache of 'hoid', then returns its string form preceded by 'prefix'.
+string prefixed_key(const char* prefix, hobject_t hoid)
+{
+  hoid.build_hash_cache();
+  return prefix + hoid.to_str();
+}
+
+// A nameless object in 'pool' that carries nothing but the given hash value.
+// Used to create the lowest and the highest key of a pool.
+hobject_t boundary_object(int64_t pool, uint32_t hash)
+{
+  return hobject_t(object_t(), "", 0, hash, pool, "");
+}
+
+// The hobject_t matching a librados object ID, with the hash set by the caller.
+hobject_t object_with_hash(int64_t pool, const librados::object_id_t& oid, uint32_t hash)
+{
+  return hobject_t(object_t(oid.name),
+                  oid.locator,  // key
+                  oid.snap,
+                  hash,
+                  pool,
+                  oid.nspace);
+}
+
+string first_object_key(int64_t pool)
+{
+  return prefixed_key("SCRUB_OBJ_", boundary_object(pool, 0x00000000));
+}
+
+// the object_key should be unique across pools
+string to_object_key(int64_t pool, const librados::object_id_t& oid)
+{
+  return prefixed_key("SCRUB_OBJ_", object_with_hash(pool, oid, 0));
+}
+
+string last_object_key(int64_t pool)
+{
+  return prefixed_key("SCRUB_OBJ_", boundary_object(pool, 0xffffffff));
+}
+
+string first_snap_key(int64_t pool)
+{
+  // The scrub object is per spg_t, so we can misuse the hash (pg.seed) to represent
+  // the minimal and the maximal keys. This relies on the way hobject_t::to_str()
+  // works: hex(pool).hex(revhash).
+  return prefixed_key("SCRUB_SS_", boundary_object(pool, 0x00000000));
+}
+
+string to_snap_key(int64_t pool, const librados::object_id_t& oid)
+{
+  return prefixed_key("SCRUB_SS_", object_with_hash(pool, oid, 0x77777777));
+}
+
+string last_snap_key(int64_t pool)
+{
+  return prefixed_key("SCRUB_SS_", boundary_object(pool, 0xffffffff));
+}
+}
+
+namespace Scrub {
+
+// Creates the Store of the given PG. The object that will hold the results is
+// touch()-ed as part of transaction 't'. The caller receives a 'new'-allocated Store.
+Store* Store::create(ObjectStore* store,
+                    ObjectStore::Transaction* t,
+                    const spg_t& pgid,
+                    const coll_t& coll)
+{
+  ceph_assert(store);
+  ceph_assert(t);
+
+  const ghobject_t scrub_obj = make_scrub_object(pgid);
+  t->touch(coll, scrub_obj);
+  return new Store{coll, scrub_obj, store};
+}
+
+// note: 'backend' is handed the address of our own 'driver' member
+Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store)
+    : coll(coll)
+    , hoid(oid)
+    , driver(store, coll, hoid)
+    , backend(&driver)
+{}
+
+// All collected results must have been flush()-ed before the Store is destroyed.
+Store::~Store()
+{
+  ceph_assert(results.empty());
+}
+
+// Encodes the object error, and keeps it (in memory) under the key of that object.
+void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e)
+{
+  bufferlist encoded;
+  e.encode(encoded);
+  results[to_object_key(pool, e.object)] = encoded;
+}
+
+// Encodes the snapset error, and keeps it (in memory) under the snap key of the object.
+void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e)
+{
+  bufferlist encoded;
+  e.encode(encoded);
+  results[to_snap_key(pool, e.object)] = encoded;
+}
+
+// Is there nothing waiting to be flushed?
+bool Store::empty() const
+{
+  return results.empty();
+}
+
+// Hands the collected results to the backend, as part of transaction 't'. With a null
+// 't' the results are simply dropped. Either way - nothing is left in memory afterwards.
+void Store::flush(ObjectStore::Transaction* t)
+{
+  if (t != nullptr) {
+    auto txn = driver.get_transaction(t);
+    backend.set_keys(results, &txn);
+  }
+  results.clear();
+}
+
+// Adds the removal of the scrub object to transaction 't'.
+void Store::cleanup(ObjectStore::Transaction* t)
+{
+  t->remove(coll, hoid);
+}
+
+// Up to 'max_return' encoded snapset errors of the pool. The search begins at the key
+// derived from 'start', or at the first snap key of the pool if 'start' carries no name.
+std::vector<bufferlist> Store::get_snap_errors(int64_t pool,
+                                              const librados::object_id_t& start,
+                                              uint64_t max_return) const
+{
+  string from;
+  if (start.name.empty()) {
+    from = first_snap_key(pool);
+  } else {
+    from = to_snap_key(pool, start);
+  }
+  return get_errors(from, last_snap_key(pool), max_return);
+}
+
+// Up to 'max_return' encoded object errors of the pool. The search begins at the key
+// derived from 'start', or at the first object key of the pool if 'start' carries no name.
+std::vector<bufferlist> Store::get_object_errors(int64_t pool,
+                                                const librados::object_id_t& start,
+                                                uint64_t max_return) const
+{
+  string from;
+  if (start.name.empty()) {
+    from = first_object_key(pool);
+  } else {
+    from = to_object_key(pool, start);
+  }
+  return get_errors(from, last_object_key(pool), max_return);
+}
+
+// Walks the backend using get_next(), starting with 'begin'. The walk stops when
+// 'max_return' entries were collected, when get_next() returns a non-zero value, or
+// when the key reached is not smaller than 'end'.
+std::vector<bufferlist> Store::get_errors(const string& begin,
+                                         const string& end,
+                                         uint64_t max_return) const
+{
+  vector<bufferlist> found;
+  std::pair<string, bufferlist> cursor{begin, bufferlist{}};
+
+  for (uint64_t remaining = max_return; remaining > 0; --remaining) {
+    if (backend.get_next(cursor.first, &cursor) != 0) {
+      break;
+    }
+    if (cursor.first >= end) {
+      break;
+    }
+    found.push_back(cursor.second);
+  }
+  return found;
+}
+
+} // namespace Scrub
diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h
new file mode 100644 (file)
index 0000000..57cd0e8
--- /dev/null
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_SCRUB_RESULT_H
+#define CEPH_SCRUB_RESULT_H
+
+#include "osd/SnapMapper.h"            // for OSDriver
+#include "common/map_cacher.hpp"
+
+namespace librados {
+  struct object_id_t;
+}
+
+struct inconsistent_obj_wrapper;
+struct inconsistent_snapset_wrapper;
+
+namespace Scrub {
+
+/**
+ * Holds the inconsistencies found while scrubbing a PG.
+ *
+ * Errors are first accumulated in memory (add_object_error() / add_snap_error()), and
+ * are handed to the backing object by flush(). The destructor asserts that no
+ * un-flushed results remain.
+ */
+class Store {
+public:
+  ~Store();
+
+  // 'backend' is constructed from the address of this object's own 'driver' member.
+  // A member-wise copy would thus refer to the driver of the object it was copied
+  // from. Copying is explicitly disabled (instances are only created by create()).
+  Store(const Store&) = delete;
+  Store& operator=(const Store&) = delete;
+
+  /// touches the scrub object as part of 't', and returns a 'new'-allocated Store
+  static Store* create(ObjectStore* store,
+                      ObjectStore::Transaction* t,
+                      const spg_t& pgid,
+                      const coll_t& coll);
+
+  /// encode the error, and keep it in memory until the next flush()
+  void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e);
+  void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e);
+
+  /// true if there are no un-flushed results
+  bool empty() const;
+
+  /// hand the accumulated results to the backend (if the transaction is not null),
+  /// then clear them
+  void flush(ObjectStore::Transaction *);
+
+  /// add the removal of the scrub object to the transaction
+  void cleanup(ObjectStore::Transaction *);
+
+  /// at most 'max_return' encoded errors. If 'start.name' is empty - the search
+  /// begins at the first relevant key of the pool.
+  std::vector<ceph::buffer::list> get_snap_errors(int64_t pool,
+                                         const librados::object_id_t& start,
+                                         uint64_t max_return) const;
+  std::vector<ceph::buffer::list> get_object_errors(int64_t pool,
+                                           const librados::object_id_t& start,
+                                           uint64_t max_return) const;
+private:
+  Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store);
+  std::vector<ceph::buffer::list> get_errors(const std::string& start, const std::string& end,
+                                    uint64_t max_return) const;
+private:
+  const coll_t coll;
+  const ghobject_t hoid;
+  // a temp object holding mappings from seq-id to inconsistencies found in
+  // scrubbing
+  OSDriver driver;
+  mutable MapCacher::MapCacher<std::string, ceph::buffer::list> backend;
+  // the encoded errors collected since the last flush(), indexed by their key
+  std::map<std::string, ceph::buffer::list> results;
+};
+}
+
+#endif // CEPH_SCRUB_RESULT_H
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
new file mode 100644 (file)
index 0000000..a9405ad
--- /dev/null
@@ -0,0 +1,2392 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=2 sw=2 smarttab
+
+#include "./pg_scrubber.h"  // the '.' notation used to affect clang-format order
+
+#include <iostream>
+#include <vector>
+
+#include "debug.h"
+
+#include "common/errno.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrubReserve.h"
+
+#include "osd/OSD.h"
+#include "ScrubStore.h"
+#include "scrub_machine.h"
+
+using std::list;
+using std::map;
+using std::pair;
+using std::set;
+using std::stringstream;
+using std::vector;
+using namespace Scrub;
+using namespace std::chrono;
+using namespace std::chrono_literals;
+using namespace std::literals;
+
+#define dout_context (m_pg->get_cct())
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this->m_pg)
+
+// The prefix of the scrubber's log lines: whatever 't' generates as its own prefix,
+// followed by a scrubber-specific tag that includes t->pg_id.
+template <class T>
+static ostream& _prefix(std::ostream* _dout, T* t)
+{
+  ostream& out = t->gen_prefix(*_dout);
+  out << " scrubber pg(" << t->pg_id << ") ";
+  return out;
+}
+
+// Prints the names of those flags in 'sf' that are set. Each name is preceded by a space.
+ostream& operator<<(ostream& out, const scrub_flags_t& sf)
+{
+  auto print_if = [&out](bool is_set, const char* flag_name) {
+    if (is_set) {
+      out << flag_name;
+    }
+  };
+
+  print_if(sf.auto_repair, " AUTO_REPAIR");
+  print_if(sf.check_repair, " CHECK_REPAIR");
+  print_if(sf.deep_scrub_on_error, " DEEP_SCRUB_ON_ERROR");
+  print_if(sf.required, " REQ_SCRUB");
+
+  return out;
+}
+
+// Prints the names of those 'planned scrub' flags in 'sf' that are set. Each name is
+// preceded by a space.
+ostream& operator<<(ostream& out, const requested_scrub_t& sf)
+{
+  auto print_if = [&out](bool is_set, const char* flag_name) {
+    if (is_set) {
+      out << flag_name;
+    }
+  };
+
+  print_if(sf.must_repair, " MUST_REPAIR");
+  print_if(sf.auto_repair, " planned AUTO_REPAIR");
+  print_if(sf.check_repair, " planned CHECK_REPAIR");
+  print_if(sf.deep_scrub_on_error, " planned DEEP_SCRUB_ON_ERROR");
+  print_if(sf.must_deep_scrub, " MUST_DEEP_SCRUB");
+  print_if(sf.must_scrub, " MUST_SCRUB");
+  print_if(sf.time_for_deep, " TIME_FOR_DEEP");
+  print_if(sf.need_auto, " NEED_AUTO");
+  print_if(sf.req_scrub, " planned REQ_SCRUB");
+
+  return out;
+}
+
+/*
+ * A message that was queued in a previous interval must mean that
+ * PrimaryLogPG::on_change() was called when that interval ended. Such a stale
+ * message can be safely discarded.
+ *
+ * Returns 'true' if the epoch is not older than the start of the current interval.
+ */
+bool PgScrubber::check_interval(epoch_t epoch_to_verify)
+{
+  const auto interval_start = m_pg->get_same_interval_since();
+  return epoch_to_verify >= interval_start;
+}
+
+// Should an event that was queued at 'epoch_to_verify' still be handled?
+bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify)
+{
+  // The event is silently discarded if any of the following holds:
+  // - we are not scrubbing. We can assume that the scrub was already terminated;
+  // - the message is from before we started this scrub;
+  // - a new interval has started. If so, on_change() has already terminated that
+  //   old scrub.
+  const bool to_discard = !m_active || (epoch_to_verify < m_epoch_start) ||
+                         !check_interval(epoch_to_verify);
+  if (to_discard) {
+    return false;
+  }
+
+  ceph_assert(is_primary());
+
+  // the last thing to verify: were we instructed to abort?
+  return verify_against_abort(epoch_to_verify);
+}
+
+// Returns 'true' if the scrub should carry on. Returns 'false' if the scrub is to be
+// aborted; in that case the scrub state is cleared, unless the abort was already noted
+// at an epoch that is not older than 'epoch_to_verify'.
+bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify)
+{
+  if (should_abort()) {
+
+    dout(10) << __func__ << " aborting. incoming epoch: " << epoch_to_verify
+            << " vs last-aborted: " << m_last_aborted << dendl;
+
+    // kill the scrub, if this is the first time we notice this abort
+    const bool first_notice = epoch_to_verify > m_last_aborted;
+    if (first_notice) {
+      scrub_clear_state();
+      m_last_aborted = std::max(epoch_to_verify, m_epoch_start);
+    }
+    return false;
+  }
+
+  return true;
+}
+
+// Do the 'noscrub' / 'nodeep-scrub' flags (of the OSD map or of the pool) tell us to stop
+// the current scrub?
+bool PgScrubber::should_abort() const
+{
+  // 'required' scrubs are not stopped for configuration changes
+  if (m_flags.required) {
+    return false;
+  }
+
+  // the 'no deep scrub' flags are only relevant to deep scrubs
+  if (m_is_deep && (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+                   m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB))) {
+    dout(10) << "nodeep_scrub set, aborting" << dendl;
+    return true;
+  }
+
+  const bool no_scrub = get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+                       m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB);
+  if (no_scrub) {
+    dout(10) << "noscrub set, aborting" << dendl;
+  }
+  return no_scrub;
+}
+
+//   initiating state-machine events --------------------------------
+
+/*
+ * a note re the checks performed before sending scrub-initiating messages:
+ *
+ * For those ('StartScrub', 'AfterRepairScrub') scrub-initiation messages that
+ * possibly were in the queue while the PG changed state and became unavailable for
+ * scrubbing:
+ *
+ * The check_interval() catches all major changes to the PG. As for the other conditions
+ * we may check (and see is_message_relevant() above):
+ *
+ * - we are not 'active' yet, so must not check against is_active(), and:
+ *
+ * - the 'abort' flags were just verified (when the triggering message was queued). As
+ *   those are only modified at human speed, they need not be queried again.
+ *
+ * Some of the considerations above are also relevant to the replica-side initiation
+ * ('StartReplica' & 'StartReplicaNoWait').
+ */
+
+// Hands a StartScrub event to the state machine, unless the interval has changed since
+// the request was queued.
+void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
+{
+  dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
+
+  // our Primary status might have been lost while the message was waiting in the queue
+  if (!check_interval(epoch_queued)) {
+    return;
+  }
+
+  dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued << dendl;
+  reset_epoch(epoch_queued);
+  m_fsm->my_states();
+  m_fsm->process_event(StartScrub{});
+  dout(10) << "scrubber event --<< StartScrub" << dendl;
+}
+
+// Hands an AfterRepairScrub event to the state machine, unless the interval has changed
+// since the request was queued.
+void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued)
+{
+  dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
+
+  // our Primary status might have been lost while the message was waiting in the queue
+  if (!check_interval(epoch_queued)) {
+    return;
+  }
+
+  dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued << dendl;
+  reset_epoch(epoch_queued);
+  m_fsm->my_states();
+  m_fsm->process_event(AfterRepairScrub{});
+  dout(10) << "scrubber event --<< AfterRepairScrub" << dendl;
+}
+
+// Forwards an 'Unblocked' event to the state machine, if the message is still relevant.
+void PgScrubber::send_scrub_unblock(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool still_relevant = is_message_relevant(epoch_queued);
+  if (still_relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(Unblocked{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Forwards an 'InternalSchedScrub' event to the state machine, if the message is still
+// relevant.
+void PgScrubber::send_scrub_resched(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool still_relevant = is_message_relevant(epoch_queued);
+  if (still_relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(InternalSchedScrub{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Starts the replica-side handling of a scrub request. The request is ignored if we are
+// the Primary, if the interval has changed, or if the token is no longer current.
+void PgScrubber::send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
+          << " token: " << token << dendl;
+
+  if (is_primary()) {
+    // not expected to happen. The request is ignored.
+    dout(1) << "got a replica scrub request while Primary!" << dendl;
+    return;
+  }
+
+  const bool request_is_valid = check_interval(epoch_queued) && is_token_current(token);
+  if (request_is_valid) {
+    m_fsm->my_states();
+
+    // If there are no updates to wait for, we save some time by not waiting. This
+    // affects the transition from NotActive into either ReplicaWaitUpdates or
+    // ActiveReplica.
+    if (!pending_active_pushes()) {
+      m_fsm->process_event(StartReplicaNoWait{});
+    } else {
+      m_fsm->process_event(StartReplica{});
+    }
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Forwards a 'SchedReplica' event to the state machine, if we are still in the same
+// interval and the token is still current.
+void PgScrubber::send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
+          << " token: " << token << dendl;
+
+  const bool request_is_valid = check_interval(epoch_queued) && is_token_current(token);
+  if (request_is_valid) {
+    m_fsm->my_states();
+    // (the event causes a retest for map availability)
+    m_fsm->process_event(SchedReplica{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// (Primary only) Forwards an 'ActivePushesUpd' event to the state machine, if the
+// message is still relevant.
+void PgScrubber::active_pushes_notification(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool still_relevant = is_message_relevant(epoch_queued);
+  if (still_relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(ActivePushesUpd{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// (Primary only) Forwards an 'UpdatesApplied' event to the state machine, if the
+// message is still relevant.
+void PgScrubber::update_applied_notification(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool still_relevant = is_message_relevant(epoch_queued);
+  if (still_relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(UpdatesApplied{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// (Primary only) Forwards a 'DigestUpdate' event to the state machine, if the message
+// is still relevant.
+void PgScrubber::digest_update_notification(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool still_relevant = is_message_relevant(epoch_queued);
+  if (still_relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(DigestUpdate{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Forwards an 'IntLocalMapDone' event to the state machine, if the message is still
+// relevant.
+void PgScrubber::send_local_map_done(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool still_relevant = is_message_relevant(epoch_queued);
+  if (still_relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(Scrub::IntLocalMapDone{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Forwards a 'GotReplicas' event to the state machine, if the message is still relevant.
+void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool still_relevant = is_message_relevant(epoch_queued);
+  if (still_relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(GotReplicas{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Forwards a 'ReplicaPushesUpd' event to the state machine, if the interval has not
+// changed since the message was queued.
+void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool same_interval = check_interval(epoch_queued);
+  if (same_interval) {
+    m_fsm->my_states();
+    m_fsm->process_event(ReplicaPushesUpd{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Forwards a 'RemotesReserved' event to the state machine, if the interval has not
+// changed since the message was queued.
+void PgScrubber::send_remotes_reserved(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  // only the interval is checked, as the scrub is not active yet
+  const bool same_interval = check_interval(epoch_queued);
+  if (same_interval) {
+    m_fsm->my_states();
+    m_fsm->process_event(RemotesReserved{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Forwards a 'ReservationFailure' event to the state machine, if the interval has not
+// changed since the message was queued.
+void PgScrubber::send_reservation_failure(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  // note: 'active' must not be checked here!
+  const bool same_interval = check_interval(epoch_queued);
+  if (same_interval) {
+    m_fsm->my_states();
+    m_fsm->process_event(ReservationFailure{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Unconditionally forwards a 'FullReset' event to the state machine. 'epoch_queued' is
+// only used for logging.
+void PgScrubber::send_full_reset(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  auto& fsm = *m_fsm;
+  fsm.my_states();
+  fsm.process_event(Scrub::FullReset{});
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Forwards a 'SelectedChunkFree' event to the state machine, if the interval has not
+// changed since the message was queued.
+void PgScrubber::send_chunk_free(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool same_interval = check_interval(epoch_queued);
+  if (same_interval) {
+    m_fsm->my_states();
+    m_fsm->process_event(Scrub::SelectedChunkFree{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Forwards a 'ChunkIsBusy' event to the state machine, if the interval has not changed
+// since the message was queued.
+void PgScrubber::send_chunk_busy(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool same_interval = check_interval(epoch_queued);
+  if (same_interval) {
+    m_fsm->my_states();
+    m_fsm->process_event(Scrub::ChunkIsBusy{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Forwards a 'NextChunk' event to the state machine, if the message is still relevant.
+void PgScrubber::send_get_next_chunk(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool still_relevant = is_message_relevant(epoch_queued);
+  if (still_relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(Scrub::NextChunk{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Unconditionally forwards a 'ScrubFinished' event to the state machine. 'epoch_queued'
+// is only used for logging.
+void PgScrubber::send_scrub_is_finished(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  // note: we cannot check for "active" here
+  auto& fsm = *m_fsm;
+  fsm.my_states();
+  fsm.process_event(Scrub::ScrubFinished{});
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// Unconditionally forwards a 'MapsCompared' event to the state machine. 'epoch_queued'
+// is only used for logging.
+void PgScrubber::send_maps_compared(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  auto& fsm = *m_fsm;
+  fsm.my_states();
+  fsm.process_event(Scrub::MapsCompared{});
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// -----------------
+
+// Relays the state machine's answer to is_reserving().
+bool PgScrubber::is_reserving() const
+{
+  const bool reserving = m_fsm->is_reserving();
+  return reserving;
+}
+
+// Prepares for a new scrub that was queued at 'epoch_queued': the state machine is
+// asserted to be inactive, and the per-scrub data (starting epoch, sleep flag, scrub
+// depth and the matching description text) is set anew.
+void PgScrubber::reset_epoch(epoch_t epoch_queued)
+{
+  dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl;
+  m_fsm->assert_not_active();
+
+  m_needs_sleep = true;
+  m_epoch_start = epoch_queued;
+
+  // the scrub depth follows the PG state. The description text depends on it.
+  m_is_deep = state_test(PG_STATE_DEEP_SCRUB);
+  update_op_mode_text();
+}
+
+/**
+ * The priority to use when re-queuing a scrub operation.
+ *
+ * @param with_priority for 'high_priority' the result is never lower than
+ *        the configured osd_client_op_priority.
+ * @return the priority stored in m_flags, possibly raised as described above.
+ */
+unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
+{
+  if (with_priority != Scrub::scrub_prio_t::high_priority) {
+    return m_flags.priority;
+  }
+
+  const auto client_op_prio =
+    static_cast<unsigned int>(m_pg->get_cct()->_conf->osd_client_op_priority);
+  return std::max<unsigned int>(m_flags.priority, client_op_prio);
+}
+
+unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+                                               unsigned int suggested_priority) const
+{
+  if (with_priority == Scrub::scrub_prio_t::high_priority) {
+    suggested_priority = std::max(suggested_priority,
+                                 (unsigned int)m_pg->cct->_conf->osd_client_op_priority);
+  }
+  return suggested_priority;
+}
+
+// ///////////////////////////////////////////////////////////////////// //
+// scrub-op registration handling
+
+/// A scrub is considered registered whenever the registration stamp is non-zero.
+bool PgScrubber::is_scrub_registered() const
+{
+  const bool no_stamp = m_scrub_reg_stamp.is_zero();
+  return !no_stamp;
+}
+
+void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags)
+{
+  if (!is_primary()) {
+    // normal. No warning is required.
+    return;
+  }
+
+  dout(10) << __func__ << " planned: must? " << request_flags.must_scrub << " need-auto? "
+          << request_flags.need_auto << " stamp: " << m_pg->info.history.last_scrub_stamp
+          << dendl;
+
+  ceph_assert(!is_scrub_registered());
+
+  utime_t reg_stamp;
+  bool must = false;
+
+  if (request_flags.must_scrub || request_flags.need_auto) {
+    // Set the smallest time that isn't utime_t()
+    reg_stamp = PgScrubber::scrub_must_stamp();
+    must = true;
+  } else if (m_pg->info.stats.stats_invalid &&
+            m_pg->cct->_conf->osd_scrub_invalid_stats) {
+    reg_stamp = ceph_clock_now();
+    must = true;
+  } else {
+    reg_stamp = m_pg->info.history.last_scrub_stamp;
+  }
+
+  dout(15) << __func__ << " pg(" << m_pg_id << ") must: " << must
+          << " required:" << m_flags.required << " flags: " << request_flags
+          << " stamp: " << reg_stamp << dendl;
+
+  const double scrub_min_interval =
+    m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MIN_INTERVAL, 0.0);
+  const double scrub_max_interval =
+    m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0);
+
+  // note the sched_time, so we can locate this scrub, and remove it later
+  m_scrub_reg_stamp = m_osds->reg_pg_scrub(m_pg->info.pgid, reg_stamp, scrub_min_interval,
+                                          scrub_max_interval, must);
+  dout(15) << __func__ << " pg(" << m_pg_id << ") register next scrub, scrub time "
+          << m_scrub_reg_stamp << ", must = " << (int)must << dendl;
+}
+
+/**
+ * Removes this PG's scrub registration (if any) from the OSD, and clears the
+ * locally kept registration stamp.
+ */
+void PgScrubber::unreg_next_scrub()
+{
+  if (!is_scrub_registered()) {
+    return;
+  }
+
+  dout(15) << __func__ << " existing-" << m_scrub_reg_stamp << dendl;
+  m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp);
+  m_scrub_reg_stamp = utime_t{};
+}
+
+void PgScrubber::scrub_requested(scrub_level_t scrub_level,
+                                scrub_type_t scrub_type,
+                                requested_scrub_t& req_flags)
+{
+  dout(10) << __func__ << (scrub_level == scrub_level_t::deep ? " deep " : " shallow ")
+          << (scrub_type == scrub_type_t::do_repair ? " repair-scrub " : " not-repair ")
+          << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered()
+          << dendl;
+
+  unreg_next_scrub();
+
+  req_flags.must_scrub = true;
+  req_flags.must_deep_scrub =
+    (scrub_level == scrub_level_t::deep) || (scrub_type == scrub_type_t::do_repair);
+  req_flags.must_repair = (scrub_type == scrub_type_t::do_repair);
+  // User might intervene, so clear this
+  req_flags.need_auto = false;
+  req_flags.req_scrub = true;
+
+  dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl;
+
+  reg_next_scrub(req_flags);
+}
+
+/**
+ * Re-registers the PG for scrubbing with the 'need_auto' flag set.
+ *
+ * @param req_flags [in,out] the planned-scrub flags; 'need_auto' is turned on.
+ */
+void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags)
+{
+  dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? "
+           << is_scrub_registered() << dendl;
+
+  // drop the existing registration (if any) before creating the new one
+  unreg_next_scrub();
+
+  req_flags.need_auto = true;
+  reg_next_scrub(req_flags);
+}
+
+/**
+ * Tries to secure the local OSD's scrub resource.
+ *
+ * The reservation object is created in place (which translates into asking the
+ * OSD for the local scrub resource). If the reservation was not granted, the
+ * object is discarded immediately.
+ *
+ * @return true if the local resource is now reserved.
+ */
+bool PgScrubber::reserve_local()
+{
+  m_local_osd_resource.emplace(m_pg, m_osds);
+
+  if (m_local_osd_resource->is_reserved()) {
+    return true;
+  }
+
+  // failed: undo
+  m_local_osd_resource.reset();
+  return false;
+}
+
+// ----------------------------------------------------------------------------
+
+/**
+ * Checks whether the version last applied (as reported by the PG's recovery
+ * state) has reached the version recorded in m_subset_last_update.
+ */
+bool PgScrubber::has_pg_marked_new_updates() const
+{
+  const auto applied_so_far = m_pg->recovery_state.get_last_update_applied();
+
+  dout(10) << __func__ << " recovery last: " << applied_so_far
+           << " vs. scrub's: " << m_subset_last_update << dendl;
+
+  const bool caught_up = (applied_so_far >= m_subset_last_update);
+  return caught_up;
+}
+
+/**
+ * Records (and logs) the version that updates must reach, as later compared by
+ * has_pg_marked_new_updates() and on_applied_when_primary().
+ */
+void PgScrubber::set_subset_last_update(eversion_t e)
+{
+  m_subset_last_update = e;
+
+  dout(15) << __func__ << " last-update: " << e << dendl;
+}
+
+/**
+ * Called with the version of an update that was just applied.
+ * If the state machine is accepting updates, and the applied version has
+ * reached m_subset_last_update, an 'applied update' scrub event is queued.
+ */
+void PgScrubber::on_applied_when_primary(const eversion_t& applied_version)
+{
+  // we are only interested in updates if we are the Primary, and in state
+  // WaitLastUpdate
+  if (!m_fsm->is_accepting_updates()) {
+    return;
+  }
+
+  // still short of the version we are waiting for?
+  if (!(applied_version >= m_subset_last_update)) {
+    return;
+  }
+
+  m_osds->queue_scrub_applied_update(m_pg, m_pg->is_scrub_blocking_ops());
+  dout(15) << __func__ << " update: " << applied_version
+           << " vs. required: " << m_subset_last_update << dendl;
+}
+
+/*
+ * The selected range is set directly into 'm_start' and 'm_end'
+ * setting:
+ * - m_subset_last_update
+ * - m_max_end
+ * - end
+ * - start
+ */
+bool PgScrubber::select_range()
+{
+  m_primary_scrubmap = ScrubMap{};
+  m_received_maps.clear();
+
+  /* get the start and end of our scrub chunk
+   *
+   * Our scrub chunk has an important restriction we're going to need to
+   * respect. We can't let head be start or end.
+   * Using a half-open interval means that if end == head,
+   * we'd scrub/lock head and the clone right next to head in different
+   * chunks which would allow us to miss clones created between
+   * scrubbing that chunk and scrubbing the chunk including head.
+   * This isn't true for any of the other clones since clones can
+   * only be created "just to the left of" head.  There is one exception
+   * to this: promotion of clones which always happens to the left of the
+   * left-most clone, but promote_object checks the scrubber in that
+   * case, so it should be ok.  Also, it's ok to "miss" clones at the
+   * left end of the range if we are a tier because they may legitimately
+   * not exist (see _scrub).
+   */
+  int min_idx = std::max<int64_t>(
+    3, m_pg->get_cct()->_conf->osd_scrub_chunk_min / preemption_data.chunk_divisor());
+
+  int max_idx = std::max<int64_t>(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max /
+                                            preemption_data.chunk_divisor());
+
+  dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx
+          << " Div: " << preemption_data.chunk_divisor() << dendl;
+
+  hobject_t start = m_start;
+  hobject_t candidate_end;
+  std::vector<hobject_t> objects;
+  int ret = m_pg->get_pgbackend()->objects_list_partial(start, min_idx, max_idx, &objects,
+                                                       &candidate_end);
+  ceph_assert(ret >= 0);
+
+  if (!objects.empty()) {
+
+    hobject_t back = objects.back();
+    while (candidate_end.is_head() && candidate_end == back.get_head()) {
+      candidate_end = back;
+      objects.pop_back();
+      if (objects.empty()) {
+       ceph_assert(0 ==
+                   "Somehow we got more than 2 objects which"
+                   "have the same head but are not clones");
+      }
+      back = objects.back();
+    }
+
+    if (candidate_end.is_head()) {
+      ceph_assert(candidate_end != back.get_head());
+      candidate_end = candidate_end.get_object_boundary();
+    }
+
+  } else {
+    ceph_assert(candidate_end.is_max());
+  }
+
+  // is that range free for us? if not - we will be rescheduled later by whoever
+  // triggered us this time
+
+  if (!m_pg->_range_available_for_scrub(m_start, candidate_end)) {
+    // we'll be requeued by whatever made us unavailable for scrub
+    dout(10) << __func__ << ": scrub blocked somewhere in range "
+            << "[" << m_start << ", " << candidate_end << ")" << dendl;
+    return false;
+  }
+
+  m_end = candidate_end;
+  if (m_end > m_max_end)
+    m_max_end = m_end;
+
+  dout(15) << __func__ << " range selected: " << m_start << " //// " << m_end << " //// "
+          << m_max_end << dendl;
+
+  // debug: be 'blocked' if told so by the 'pg scrub_debug block' asok command
+  if (m_debug_blockrange > 0) {
+    m_debug_blockrange--;
+    return false;
+  }
+  return true;
+}
+
+/**
+ * Selects the next chunk (see select_range()), then queues a low-priority
+ * 'chunk free' or 'chunk busy' scrub event, matching the outcome.
+ */
+void PgScrubber::select_range_n_notify()
+{
+  const bool chunk_is_free = select_range();
+
+  if (!chunk_is_free) {
+    // we will wait for the objects range to become available for scrubbing
+    dout(10) << __func__ << ": selected chunk is busy" << dendl;
+    m_osds->queue_scrub_chunk_busy(m_pg, Scrub::scrub_prio_t::low_priority);
+    return;
+  }
+
+  // the next chunk to handle is not blocked
+  dout(20) << __func__ << ": selection OK" << dendl;
+  m_osds->queue_scrub_chunk_free(m_pg, Scrub::scrub_prio_t::low_priority);
+}
+
+bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid)
+{
+  if (soid < m_start || soid >= m_end) {
+    return false;
+  }
+
+  dout(20) << __func__ << " " << soid << " can preempt? "
+          << preemption_data.is_preemptable() << " already preempted? "
+          << preemption_data.was_preempted() << dendl;
+
+  if (preemption_data.was_preempted()) {
+    // otherwise - write requests arriving while 'already preempted' is set
+    // but 'preemptable' is not - will not be allowed to continue, and will
+    // not be requeued on time.
+    return false;
+  }
+
+  if (preemption_data.is_preemptable()) {
+
+    dout(10) << __func__ << " " << soid << " preempted" << dendl;
+
+    // signal the preemption
+    preemption_data.do_preempt();
+    m_end = m_start;  // free the range we were scrubbing
+
+    return false;
+  }
+  return true;
+}
+
+/**
+ * Does [start, end] intersect [m_start, m_max_end)?
+ */
+bool PgScrubber::range_intersects_scrub(const hobject_t& start, const hobject_t& end)
+{
+  // the queried range starts at or beyond the furthest point scrubbed
+  if (!(start < m_max_end)) {
+    return false;
+  }
+
+  return end >= m_start;
+}
+
+/**
+ * Creates a 'blocked range' warning object for this PG, handing it a
+ * 300 seconds time span.
+ */
+Scrub::BlockedRangeWarning PgScrubber::acquire_blocked_alarm()
+{
+  const ceph::timespan warning_span{300s};
+  return std::make_unique<blocked_range_t>(m_osds, warning_span, m_pg_id);
+}
+
+/**
+ *  Re-queues the scrub, optionally after a delay.
+ *
+ *  The range currently held by the scrubber is always released first
+ *  (m_end = m_start).
+ *
+ *  if we are required to sleep (m_needs_sleep is set, and the sleep time
+ *  reported by the OSD is non-zero):
+ *     arrange a timer callback some time later. The callback looks the PG up
+ *     by its ID (giving up, with a log message, if the PG cannot be found),
+ *     re-arms m_needs_sleep, and re-queues the scrub with low priority.
+ *     be sure to be able to identify a stale callback.
+ *  Otherwise: perform a requeue (i.e. - rescheduling thru the OSD queue)
+ *    anyway. This immediate requeue uses high priority.
+ */
+void PgScrubber::add_delayed_scheduling()
+{
+  m_end = m_start;  // not blocking any range now
+
+  milliseconds sleep_time{0ms};
+  if (m_needs_sleep) {
+    double scrub_sleep = 1000.0 * m_osds->osd->scrub_sleep_time(m_flags.required);  // scaled by 1000 and used as ms: presumably configured in seconds
+    sleep_time = milliseconds{long(scrub_sleep)};  // fractions of a millisecond are dropped
+  }
+  dout(15) << __func__ << " sleep: " << sleep_time.count() << "ms. needed? "
+          << m_needs_sleep << dendl;
+
+  if (sleep_time.count()) {
+    // schedule a transition for some 'sleep_time' ms in the future
+
+    m_needs_sleep = false;  // set again by the timer callback below
+    m_sleep_started_at = ceph_clock_now();
+
+    // the following log line is used by osd-scrub-test.sh
+    dout(20) << __func__ << " scrub state is PendingTimer, sleeping" << dendl;
+
+    // the 'delayer' for crimson is different. Will be factored out.
+
+    spg_t pgid = m_pg->get_pgid();
+    auto callbk = new LambdaContext([osds = m_osds, pgid,
+                                    scrbr = this]([[maybe_unused]] int r) mutable {
+      PGRef pg = osds->osd->lookup_lock_pg(pgid);  // NOTE(review): presumably returns the PG locked (see the unlock() below) - confirm
+      if (!pg) {
+       lgeneric_subdout(g_ceph_context, osd, 10)
+         << "scrub_requeue_callback: Could not find "
+         << "PG " << pgid << " can't complete scrub requeue after sleep" << dendl;
+       return;
+      }
+      scrbr->m_needs_sleep = true;  // 'scrbr' is only dereferenced after the PG was found
+      lgeneric_dout(scrbr->get_pg_cct(), 7)
+       << "scrub_requeue_callback: slept for "
+       << ceph_clock_now() - scrbr->m_sleep_started_at << ", re-queuing scrub" << dendl;
+
+      scrbr->m_sleep_started_at = utime_t{};
+      osds->queue_for_scrub_resched(&(*pg), Scrub::scrub_prio_t::low_priority);
+      pg->unlock();
+    });
+
+    std::lock_guard l(m_osds->sleep_lock);  // held while the event is added to the timer
+    m_osds->sleep_timer.add_event_after(sleep_time.count() / 1000.0f, callbk);  // ms converted back to seconds
+
+  } else {
+    // just a requeue
+    m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority);
+  }
+}
+
+eversion_t PgScrubber::search_log_for_updates() const
+{
+  auto& projected = m_pg->projected_log.log;
+  auto pi = find_if(
+    projected.crbegin(), projected.crend(),
+    [this](const auto& e) -> bool { return e.soid >= m_start && e.soid < m_end; });
+
+  if (pi != projected.crend())
+    return pi->version;
+
+  // there was no relevant update entry in the log
+
+  auto& log = m_pg->recovery_state.get_pg_log().get_log().log;
+  auto p = find_if(log.crbegin(), log.crend(), [this](const auto& e) -> bool {
+    return e.soid >= m_start && e.soid < m_end;
+  });
+
+  if (p == log.crend())
+    return eversion_t{};
+  else
+    return p->version;
+}
+
+void PgScrubber::get_replicas_maps(bool replica_can_preempt)
+{
+  dout(10) << __func__ << " started in epoch/interval: " << m_epoch_start << "/"
+          << m_interval_start
+          << " pg same_interval_since: " << m_pg->info.history.same_interval_since
+          << dendl;
+
+  m_primary_scrubmap_pos.reset();
+
+  // ask replicas to scan and send maps
+  for (const auto& i : m_pg->get_acting_recovery_backfill()) {
+
+    if (i == m_pg_whoami)
+      continue;
+
+    m_maps_status.mark_replica_map_request(i);
+    _request_scrub_map(i, m_subset_last_update, m_start, m_end, m_is_deep,
+                      replica_can_preempt);
+  }
+
+  dout(10) << __func__ << " awaiting" << m_maps_status << dendl;
+}
+
+/**
+ * Has the PG moved to a new interval since the scrub noted its own?
+ *
+ * @return true if m_interval_start precedes the PG's same_interval_since.
+ */
+bool PgScrubber::was_epoch_changed() const
+{
+  // for crimson we have m_pg->get_info().history.same_interval_since
+  const auto pg_interval = m_pg->get_history().same_interval_since;
+
+  dout(10) << __func__ << " epoch_start: " << m_interval_start
+           << " from pg: " << pg_interval << dendl;
+
+  return m_interval_start < pg_interval;
+}
+
+/// Notes, in the maps-status tracker, that the local scrub map is ready.
+void PgScrubber::mark_local_map_ready()
+{
+  auto& maps_tracker = m_maps_status;
+  maps_tracker.mark_local_map_ready();
+}
+
+/// Relays the maps-status tracker's answer: are all the maps available?
+bool PgScrubber::are_all_maps_available() const
+{
+  const bool all_here = m_maps_status.are_all_maps_available();
+  return all_here;
+}
+
+/// Returns the textual dump produced by the maps-status tracker.
+std::string PgScrubber::dump_awaited_maps() const
+{
+  std::string awaited = m_maps_status.dump();
+  return awaited;
+}
+
+void PgScrubber::update_op_mode_text()
+{
+  auto visible_repair = state_test(PG_STATE_REPAIR);
+  m_mode_desc = (visible_repair ? "repair" : (m_is_deep ? "deep-scrub" : "scrub"));
+
+  dout(10) << __func__ << ": repair: visible: " << (visible_repair ? "true" : "false")
+          << ", internal: " << (m_is_repair ? "true" : "false")
+          << ". Displayed: " << m_mode_desc << dendl;
+}
+
+void PgScrubber::_request_scrub_map(pg_shard_t replica,
+                                   eversion_t version,
+                                   hobject_t start,
+                                   hobject_t end,
+                                   bool deep,
+                                   bool allow_preemption)
+{
+  ceph_assert(replica != m_pg_whoami);
+  dout(10) << __func__ << " scrubmap from osd." << replica
+          << (deep ? " deep" : " shallow") << dendl;
+
+  auto repscrubop =
+    new MOSDRepScrub(spg_t(m_pg->info.pgid.pgid, replica.shard), version,
+                    get_osdmap_epoch(), m_pg->get_last_peering_reset(), start, end, deep,
+                    allow_preemption, m_flags.priority, m_pg->ops_blocked_by_scrub());
+
+  // default priority. We want the replica-scrub processed prior to any recovery
+  // or client io messages (we are holding a lock!)
+  m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch());
+}
+
+/**
+ * Disposes of the scrub store, if one exists.
+ *
+ * The store's cleanup is added to transaction 't'. Ownership of the store
+ * object is then handed to a completion context registered on that
+ * transaction, leaving m_store empty.
+ */
+void PgScrubber::cleanup_store(ObjectStore::Transaction* t)
+{
+  if (!m_store) {
+    return;
+  }
+
+  // a do-nothing completion, serving only as the owner of the store object
+  struct StoreHolder : Context {
+    std::unique_ptr<Scrub::Store> store;
+    explicit StoreHolder(std::unique_ptr<Scrub::Store>&& store) : store(std::move(store))
+    {}
+    void finish(int) override {}
+  };
+
+  m_store->cleanup(t);
+
+  auto* holder = new StoreHolder(std::move(m_store));
+  t->register_on_complete(holder);
+  ceph_assert(!m_store);
+}
+
+/**
+ * Primary-side initialization of a new scrub session.
+ *
+ * Resets the preemption data, publishes the PG stats, notes the current
+ * interval, replaces the scrub store with a newly created one, positions
+ * m_start at the beginning of the PG, and marks the scrubber as active.
+ * The scrubber must not be active when called (asserted).
+ */
+void PgScrubber::on_init()
+{
+  // going upwards from 'inactive'
+  ceph_assert(!is_scrub_active());
+
+  preemption_data.reset();
+  m_pg->publish_stats_to_osd();
+  m_interval_start = m_pg->get_history().same_interval_since;
+
+  dout(10) << __func__ << " start same_interval:" << m_interval_start << dendl;
+
+  // create a new store. The disposal of the previous store (if any) and the
+  // creation of the new one share a single transaction.
+  {
+    ObjectStore::Transaction txn;
+    cleanup_store(&txn);
+
+    auto* fresh_store =
+      Scrub::Store::create(m_pg->osd->store, &txn, m_pg->info.pgid, m_pg->coll);
+    m_store.reset(fresh_store);
+
+    m_pg->osd->store->queue_transaction(m_pg->ch, std::move(txn), nullptr);
+  }
+
+  m_start = m_pg->info.pgid.pgid.get_hobj_start();
+  m_active = true;
+}
+
+/// Replica-side initialization: only marks the scrubber as active.
+void PgScrubber::on_replica_init()
+{
+  m_active = true;
+}
+
+void PgScrubber::_scan_snaps(ScrubMap& smap)
+{
+  hobject_t head;
+  SnapSet snapset;
+
+  // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings
+  // in this function
+  dout(15) << "_scan_snaps starts" << dendl;
+
+  for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) {
+
+    const hobject_t& hoid = i->first;
+    ScrubMap::object& o = i->second;
+
+    dout(20) << __func__ << " " << hoid << dendl;
+
+    ceph_assert(!hoid.is_snapdir());
+    if (hoid.is_head()) {
+      // parse the SnapSet
+      bufferlist bl;
+      if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
+       continue;
+      }
+      bl.push_back(o.attrs[SS_ATTR]);
+      auto p = bl.cbegin();
+      try {
+       decode(snapset, p);
+      } catch (...) {
+       continue;
+      }
+      head = hoid.get_head();
+      continue;
+    }
+
+    if (hoid.snap < CEPH_MAXSNAP) {
+      // check and if necessary fix snap_mapper
+      if (hoid.get_head() != head) {
+       derr << __func__ << " no head for " << hoid << " (have " << head << ")" << dendl;
+       continue;
+      }
+      set<snapid_t> obj_snaps;
+      auto p = snapset.clone_snaps.find(hoid.snap);
+      if (p == snapset.clone_snaps.end()) {
+       derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset << dendl;
+       continue;
+      }
+      obj_snaps.insert(p->second.begin(), p->second.end());
+      set<snapid_t> cur_snaps;
+      int r = m_pg->snap_mapper.get_snaps(hoid, &cur_snaps);
+      if (r != 0 && r != -ENOENT) {
+       derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
+       ceph_abort();
+      }
+      if (r == -ENOENT || cur_snaps != obj_snaps) {
+       ObjectStore::Transaction t;
+       OSDriver::OSTransaction _t(m_pg->osdriver.get_transaction(&t));
+       if (r == 0) {
+         r = m_pg->snap_mapper.remove_oid(hoid, &_t);
+         if (r != 0) {
+           derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
+           ceph_abort();
+         }
+         m_pg->osd->clog->error()
+           << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
+           << m_pg->info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps
+           << ", oi: " << obj_snaps << "...repaired";
+       } else {
+         m_pg->osd->clog->error()
+           << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
+           << m_pg->info.pgid << " oid " << hoid << " snaps missing in mapper"
+           << ", should be: " << obj_snaps << " was " << cur_snaps << " r " << r
+           << "...repaired";
+       }
+       m_pg->snap_mapper.add_oid(hoid, obj_snaps, &_t);
+
+       // wait for repair to apply to avoid confusing other bits of the system.
+       {
+         dout(15) << __func__ << " wait on repair!" << dendl;
+
+         ceph::condition_variable my_cond;
+         ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock");
+         int e = 0;
+         bool done;
+
+         t.register_on_applied_sync(new C_SafeCond(my_lock, my_cond, &done, &e));
+
+         e = m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t));
+         if (e != 0) {
+           derr << __func__ << ": queue_transaction got " << cpp_strerror(e) << dendl;
+         } else {
+           std::unique_lock l{my_lock};
+           my_cond.wait(l, [&done] { return done; });
+         }
+       }
+      }
+    }
+  }
+}
+
+int PgScrubber::build_primary_map_chunk()
+{
+  epoch_t map_building_since = m_pg->get_osdmap_epoch();
+  dout(20) << __func__ << ": initiated at epoch " << map_building_since << dendl;
+
+  auto ret = build_scrub_map_chunk(m_primary_scrubmap, m_primary_scrubmap_pos, m_start,
+                                  m_end, m_is_deep);
+
+  if (ret == -EINPROGRESS) {
+    // reschedule another round of asking the backend to collect the scrub data
+    m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::low_priority);
+  }
+  return ret;
+}
+
+int PgScrubber::build_replica_map_chunk()
+{
+  dout(10) << __func__ << " interval start: " << m_interval_start
+          << " current token: " << m_current_token << " epoch: " << m_epoch_start
+          << " deep: " << m_is_deep << dendl;
+
+  auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end,
+                                  m_is_deep);
+
+  switch (ret) {
+
+    case -EINPROGRESS:
+      // must wait for the backend to finish. No external event source.
+      // (note: previous version used low priority here. Now switched to using the
+      // priority of the original message)
+      m_osds->queue_for_rep_scrub_resched(m_pg, m_replica_request_priority,
+                                         m_flags.priority, m_current_token);
+      break;
+
+    case 0: {
+      // finished!
+      m_cleaned_meta_map.clear_from(m_start);
+      m_cleaned_meta_map.insert(replica_scrubmap);
+      auto for_meta_scrub = clean_meta_map();
+      _scan_snaps(for_meta_scrub);
+
+      // the local map has been created. Send it to the primary.
+      // Note: once the message reaches the Primary, it may ask us for another
+      // chunk - and we better be done with the current scrub. Thus - the preparation of
+      // the reply message is separate, and we clear the scrub state before actually
+      // sending it.
+
+      auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption);
+      replica_handling_done();
+      dout(15) << __func__ << " chunk map sent " << dendl;
+      send_replica_map(reply);
+    } break;
+
+    default:
+      // negative retval: build_scrub_map_chunk() signalled an error
+      // Pre-Pacific code ignored this option, treating it as a success.
+      // \todo Add an error flag in the returning message.
+      dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret
+             << dendl;
+      replica_handling_done();
+      // only in debug mode for now:
+      assert(false && "backend error");
+      break;
+  };
+
+  return ret;
+}
+
+int PgScrubber::build_scrub_map_chunk(
+  ScrubMap& map, ScrubMapBuilder& pos, hobject_t start, hobject_t end, bool deep)
+{
+  dout(10) << __func__ << " [" << start << "," << end << ") "
+          << " pos " << pos << " Deep: " << deep << dendl;
+
+  // start
+  while (pos.empty()) {
+
+    pos.deep = deep;
+    map.valid_through = m_pg->info.last_update;
+
+    // objects
+    vector<ghobject_t> rollback_obs;
+    pos.ret =
+      m_pg->get_pgbackend()->objects_list_range(start, end, &pos.ls, &rollback_obs);
+    dout(10) << __func__ << " while pos empty " << pos.ret << dendl;
+    if (pos.ret < 0) {
+      dout(5) << "objects_list_range error: " << pos.ret << dendl;
+      return pos.ret;
+    }
+    dout(10) << __func__ << " pos.ls.empty()? " << (pos.ls.empty() ? "+" : "-") << dendl;
+    if (pos.ls.empty()) {
+      break;
+    }
+    m_pg->_scan_rollback_obs(rollback_obs);
+    pos.pos = 0;
+    return -EINPROGRESS;
+  }
+
+  // scan objects
+  while (!pos.done()) {
+
+    int r = m_pg->get_pgbackend()->be_scan_list(map, pos);
+    dout(30) << __func__ << " BE returned " << r << dendl;
+    if (r == -EINPROGRESS) {
+      dout(20) << __func__ << " in progress" << dendl;
+      return r;
+    }
+  }
+
+  // finish
+  dout(20) << __func__ << " finishing" << dendl;
+  ceph_assert(pos.done());
+  m_pg->_repair_oinfo_oid(map);
+
+  dout(20) << __func__ << " done, got " << map.objects.size() << " items" << dendl;
+  return 0;
+}
+
+/*
+ * Process:
+ * Building a map of objects suitable for snapshot validation.
+ * The data in m_cleaned_meta_map is the left over partial items that need to
+ * be completed before they can be processed.
+ *
+ * Snapshots in maps precede the head object, which is why we are scanning backwards.
+ *
+ * What is moved out of m_cleaned_meta_map, into the returned map:
+ * - everything, if m_end is 'max' or if m_cleaned_meta_map holds no objects;
+ * - everything, if the last object in the map has_snapset();
+ * - otherwise: all entries preceding the trailing run of entries that share
+ *   the last entry's head. That trailing run remains in m_cleaned_meta_map.
+ *   If all entries share the same head - nothing is moved.
+ */
+ScrubMap PgScrubber::clean_meta_map()
+{
+  ScrubMap for_meta_scrub;
+
+  if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) {
+    m_cleaned_meta_map.swap(for_meta_scrub);
+  } else {
+    auto iter = m_cleaned_meta_map.objects.end();
+    --iter;  // not empty, see 'if' clause
+    auto begin = m_cleaned_meta_map.objects.begin();
+    if (iter->first.has_snapset()) {
+      ++iter;  // back to end(): the whole map is to be moved
+    } else {
+      while (iter != begin) {
+       auto next = iter--;  // 'next' is the later entry; 'iter' now precedes it
+       if (next->first.get_head() != iter->first.get_head()) {
+         ++iter;  // 'iter' is again the first entry of the trailing same-head run
+         break;
+       }
+      }
+    }
+    for_meta_scrub.objects.insert(begin, iter);  // [begin, iter) is handed to the caller...
+    m_cleaned_meta_map.objects.erase(begin, iter);  // ...and removed from the left-overs
+  }
+
+  return for_meta_scrub;
+}
+
+/**
+ * Completes (with a result of 0) all the callbacks registered in m_callbacks.
+ * The list is detached first, so m_callbacks is already empty while the
+ * callbacks are executing.
+ */
+void PgScrubber::run_callbacks()
+{
+  std::list<Context*> detached;
+  m_callbacks.swap(detached);
+
+  for (Context* cb : detached) {
+    cb->complete(0);
+  }
+}
+
+/**
+ * Compares the collected scrub maps, then wraps up the chunk: advances m_start
+ * to the end of the chunk just handled, runs the registered callbacks,
+ * re-queues whatever was waiting, and queues a low-priority 'maps compared'
+ * scrub event.
+ */
+void PgScrubber::maps_compare_n_cleanup()
+{
+  scrub_compare_maps();
+
+  // the chunk is done
+  m_start = m_end;
+
+  run_callbacks();
+  requeue_waiting();
+
+  constexpr auto notification_prio = Scrub::scrub_prio_t::low_priority;
+  m_osds->queue_scrub_maps_compared(m_pg, notification_prio);
+}
+
+/// Exposes the scrubber's preemption data through the Scrub::preemption_t interface.
+Scrub::preemption_t& PgScrubber::get_preemptor()
+{
+  Scrub::preemption_t& preemptor = preemption_data;
+  return preemptor;
+}
+
+/*
+ * Process note: called for the arriving "give me your map, replica!" request. Unlike
+ * the original implementation, we do not requeue the Op waiting for
+ * updates. Instead - we trigger the FSM.
+ *
+ * A request carrying a map epoch older than the PG's same_interval_since is
+ * logged and dropped. Otherwise, the replica-side scrub state (scrub map and
+ * its builder, chunk boundaries, scrub depth, priorities, preemption data) is
+ * re-initialized from the message, and a replica-scrub event is queued via the
+ * OSD. The state machine is asserted to be inactive at that point.
+ */
+void PgScrubber::replica_scrub_op(OpRequestRef op)
+{
+  op->mark_started();
+  auto msg = op->get_req<MOSDRepScrub>();
+  dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch
+          << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl;
+
+  // are we still processing a previous scrub-map request without noticing that the
+  // interval changed? won't see it here, but rather at the reservation stage.
+
+  if (msg->map_epoch < m_pg->info.history.same_interval_since) {
+    dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch
+            << " < " << m_pg->info.history.same_interval_since << dendl;
+
+    // is there a general sync issue? are we holding a stale reservation?
+    // not checking now - assuming we will actively react to interval change.
+
+    return;
+  }
+
+  replica_scrubmap = ScrubMap{};  // discard whatever was collected for a previous request
+  replica_scrubmap_pos = ScrubMapBuilder{};
+
+  m_replica_min_epoch = msg->min_epoch;  // later used when replying to the primary
+  m_start = msg->start;
+  m_end = msg->end;
+  m_max_end = msg->end;
+  m_is_deep = msg->deep;
+  m_interval_start = m_pg->info.history.same_interval_since;
+  m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority
+                                                 : Scrub::scrub_prio_t::low_priority;
+  m_flags.priority = msg->priority ? msg->priority : m_pg->get_scrub_priority();  // a zero priority in the message selects the PG's own scrub priority
+
+  preemption_data.reset();
+  preemption_data.force_preemptability(msg->allow_preemption);
+
+  replica_scrubmap_pos.reset();
+
+  // make sure the FSM is at NotActive
+  m_fsm->assert_not_active();
+
+  m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority,
+                             m_current_token);
+}
+
+void PgScrubber::set_op_parameters(requested_scrub_t& request)
+{
+  dout(10) << __func__ << " input: " << request << dendl;
+
+  // write down the epoch of starting a new scrub. Will be used
+  // to discard stale messages from previous aborted scrubs.
+  m_epoch_start = m_pg->get_osdmap_epoch();
+
+  m_flags.check_repair = request.check_repair;
+  m_flags.auto_repair = request.auto_repair || request.need_auto;
+  m_flags.required = request.req_scrub || request.must_scrub;
+
+  m_flags.priority = (request.must_scrub || request.need_auto)
+                      ? get_pg_cct()->_conf->osd_requested_scrub_priority
+                      : m_pg->get_scrub_priority();
+
+  state_set(PG_STATE_SCRUBBING);
+
+  // will we be deep-scrubbing?
+  if (request.must_deep_scrub || request.need_auto || request.time_for_deep) {
+    state_set(PG_STATE_DEEP_SCRUB);
+  }
+
+  // m_is_repair is set for either 'must_repair' or 'repair-on-the-go' (i.e.
+  // deep-scrub with the auto_repair configuration flag set). m_is_repair value
+  // determines the scrubber behavior.
+  // PG_STATE_REPAIR, on the other hand, is only used for status reports (inc. the
+  // PG status as appearing in the logs).
+  m_is_repair = request.must_repair || m_flags.auto_repair;
+  if (request.must_repair) {
+    state_set(PG_STATE_REPAIR);
+    // not calling update_op_mode_text() yet, as m_is_deep not set yet
+  }
+
+  // the publishing here seems to be required for tests synchronization
+  m_pg->publish_stats_to_osd();
+  m_flags.deep_scrub_on_error = request.deep_scrub_on_error;
+}
+
+/*
+ * Analyze the scrub maps collected for the current chunk: the map created
+ * locally (m_primary_scrubmap) and the maps received from the other shards
+ * (m_received_maps).
+ *
+ * The steps performed:
+ *  - OMAP checks over all the maps (warnings are sent to the cluster log);
+ *  - if there is more than one shard: comparing the maps, noting the missing and
+ *    the inconsistent objects, and the shards holding an authoritative copy of
+ *    each such object;
+ *  - the "type specific" checks (scrub_snapshot_metadata() & _scan_snaps());
+ *  - flushing the scrub store, if it is not empty.
+ */
+void PgScrubber::scrub_compare_maps()
+{
+  dout(10) << __func__ << " has maps, analyzing" << dendl;
+
+  // construct authoritative scrub map for type-specific scrubbing
+  m_cleaned_meta_map.insert(m_primary_scrubmap);
+  map<hobject_t, pair<std::optional<uint32_t>, std::optional<uint32_t>>> missing_digest;
+
+  // shard -> the scrub map collected from that shard. Our own map goes in first.
+  map<pg_shard_t, ScrubMap*> maps;
+  maps[m_pg_whoami] = &m_primary_scrubmap;
+
+  for (const auto& i : m_pg->get_acting_recovery_backfill()) {
+    if (i == m_pg_whoami)
+      continue;
+    // note: operator[] creates an empty map entry for a shard we have no map from
+    dout(2) << __func__ << " replica " << i << " has "
+           << m_received_maps[i].objects.size() << " items" << dendl;
+    maps[i] = &m_received_maps[i];
+  }
+
+  // the union of all object names appearing in any of the maps
+  set<hobject_t> master_set;
+
+  // Construct master set
+  for (const auto& map : maps) {
+    for (const auto& i : map.second->objects) {
+      master_set.insert(i.first);
+    }
+  }
+
+  stringstream ss;
+  m_pg->get_pgbackend()->be_omap_checks(maps, master_set, m_omap_stats, ss);
+
+  // any text produced by the OMAP checks is reported as a cluster-log warning
+  if (!ss.str().empty()) {
+    m_osds->clog->warn(ss);
+  }
+
+  // comparing the maps is only meaningful if there is more than one shard
+  if (m_pg->recovery_state.get_acting_recovery_backfill().size() > 1) {
+
+    dout(10) << __func__ << "  comparing replica scrub maps" << dendl;
+
+    // Map from object with errors to good peer
+    map<hobject_t, list<pg_shard_t>> authoritative;
+
+    dout(2) << __func__ << ": primary (" << m_pg->get_primary() << ") has "
+           << m_primary_scrubmap.objects.size() << " items" << dendl;
+
+    // reuse the stream: discard the previous contents and clear its state flags
+    ss.str("");
+    ss.clear();
+
+    m_pg->get_pgbackend()->be_compare_scrubmaps(
+      maps, master_set, m_is_repair, m_missing, m_inconsistent,
+      authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(),
+      m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss);
+
+    // any text produced by the comparison is reported as a cluster-log error
+    if (!ss.str().empty()) {
+      m_osds->clog->error(ss);
+    }
+
+    // for each problematic object: remember the (object entry, shard) pairs of
+    // the shards holding a good copy
+    for (auto& i : authoritative) {
+      list<pair<ScrubMap::object, pg_shard_t>> good_peers;
+      for (list<pg_shard_t>::const_iterator j = i.second.begin(); j != i.second.end();
+          ++j) {
+       good_peers.emplace_back(maps[*j]->objects[i.first], *j);
+      }
+      m_authoritative.emplace(i.first, good_peers);
+    }
+
+    // in the 'cleaned' map: replace the entry of each problematic object with the
+    // entry taken from the last shard in that object's list of good peers
+    for (auto i = authoritative.begin(); i != authoritative.end(); ++i) {
+      m_cleaned_meta_map.objects.erase(i->first);
+      m_cleaned_meta_map.objects.insert(
+       *(maps[i->second.back()]->objects.find(i->first)));
+    }
+  }
+
+  auto for_meta_scrub = clean_meta_map();
+
+  // ok, do the pg-type specific scrubbing
+
+  // (Validates consistency of the object info and snap sets)
+  scrub_snapshot_metadata(for_meta_scrub, missing_digest);
+
+  // Performed here, on the primary. Note that the map handed over is presumably
+  // derived from m_cleaned_meta_map, where entries of problematic objects were
+  // replaced (above) with the entries collected from an authoritative shard.
+  _scan_snaps(for_meta_scrub);
+
+  if (!m_store->empty()) {
+
+    if (m_is_repair) {
+      // no transaction is handed to flush() when repairing
+      dout(10) << __func__ << ": discarding scrub results" << dendl;
+      m_store->flush(nullptr);
+    } else {
+      dout(10) << __func__ << ": updating scrub object" << dendl;
+      ObjectStore::Transaction t;
+      m_store->flush(&t);
+      m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+    }
+  }
+}
+
+/*
+ * Build (but do not send) the message carrying the replica's scrub map back to
+ * the primary. The 'preempted' flag of the message reflects 'was_preempted'.
+ * Returns the message together with the epoch it should be sent with.
+ */
+ScrubMachineListener::MsgAndEpoch PgScrubber::prep_replica_map_msg(
+  PreemptionNoted was_preempted)
+{
+  dout(10) << __func__ << " min epoch:" << m_replica_min_epoch << dendl;
+
+  // the message is addressed to the primary's shard of this PG
+  const spg_t primary_spg(m_pg->info.pgid.pgid, m_pg->get_primary().shard);
+  auto map_msg =
+    make_message<MOSDRepScrubMap>(primary_spg, m_replica_min_epoch, m_pg_whoami);
+
+  const bool noted_preemption = (was_preempted == PreemptionNoted::preempted);
+  map_msg->preempted = noted_preemption;
+
+  // the scrub map itself travels in the data section of the message
+  ::encode(replica_scrubmap, map_msg->get_data());
+
+  return ScrubMachineListener::MsgAndEpoch{map_msg, m_replica_min_epoch};
+}
+
+/// send a previously prepared scrub-map message (and its epoch) to the primary's OSD
+void PgScrubber::send_replica_map(const MsgAndEpoch& preprepared)
+{
+  const auto primary_osd = m_pg->get_primary().osd;
+  m_pg->send_cluster_message(primary_osd, preprepared.m_msg, preprepared.m_epoch,
+                            false);
+}
+
+/*
+ * Notify the primary that this replica was preempted. The message is built and
+ * sent in one step, and always carries the 'preempted' flag.
+ */
+void PgScrubber::send_preempted_replica()
+{
+  const spg_t primary_spg{m_pg->info.pgid.pgid, m_pg->get_primary().shard};
+  auto preempt_msg =
+    make_message<MOSDRepScrubMap>(primary_spg, m_replica_min_epoch, m_pg_whoami);
+
+  preempt_msg->preempted = true;
+
+  // the map must be encoded into the message even in this case (must not skip this)
+  ::encode(replica_scrubmap, preempt_msg->get_data());
+
+  const auto primary_osd = m_pg->get_primary().osd;
+  m_pg->send_cluster_message(primary_osd, preempt_msg, m_replica_min_epoch, false);
+}
+
+/*
+ *  - if the replica lets us know it was interrupted, we mark the chunk as interrupted.
+ *    The state-machine will react to that when all replica maps are received.
+ *  - when all maps are received, we signal the FSM with the GotReplicas event (see
+ *    scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the
+ *    FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to
+ *    handle.
+ */
+void PgScrubber::map_from_replica(OpRequestRef op)
+{
+  auto m = op->get_req<MOSDRepScrubMap>();
+  dout(15) << __func__ << " " << *m << dendl;
+
+  if (m->map_epoch < m_pg->info.history.same_interval_since) {
+    dout(10) << __func__ << " discarding old from " << m->map_epoch << " < "
+            << m_pg->info.history.same_interval_since << dendl;
+    return;
+  }
+
+  auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
+
+  m_received_maps[m->from].decode(p, m_pg->info.pgid.pool());
+  dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl;
+
+  auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from);
+  if (!is_ok) {
+    // previously an unexpected map was triggering an assert. Now, as scrubs can be
+    // aborted at any time, the chances of this happening have increased, and aborting is
+    // not justified
+    dout(1) << __func__ << err_txt << " from OSD " << m->from << dendl;
+    return;
+  }
+
+  if (m->preempted) {
+    dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
+    preemption_data.do_preempt();
+  }
+
+  if (m_maps_status.are_all_maps_available()) {
+    dout(15) << __func__ << " all repl-maps available" << dendl;
+    m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops());
+  }
+}
+
+void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
+{
+  dout(10) << __func__ << " " << *op->get_req() << dendl;
+  op->mark_started();
+  auto request_ep = op->get_req<MOSDScrubReserve>()->get_map_epoch();
+
+  /*
+   *  if we are currently holding a reservation, then:
+   *  either (1) we, the scrubber, did not yet notice an interval change. The remembered
+   *  reservation epoch is from before our interval, and we can silently discard the
+   *  reservation (no message is required).
+   *  or:
+   *  (2) the interval hasn't changed, but the same Primary that (we think) holds the
+   *  lock just sent us a new request. Note that we know it's the same Primary, as
+   *  otherwise the interval would have changed.
+   *  Ostensibly we can discard & redo the reservation. But then we
+   *  will be temporarily releasing the OSD resource - and might not be able to grab it
+   *  again. Thus, we simply treat this as a successful new request
+   *  (but mark the fact that if there is a previous request from the primary to
+   *  scrub a specific chunk - that request is now defunct).
+   */
+
+  if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) {
+    // we are holding a stale reservation from a past epoch
+    m_remote_osd_resource.reset();
+    dout(10) << __func__ << " stale reservation request" << dendl;
+  }
+
+  if (request_ep < m_pg->get_same_interval_since()) {
+    // will not ack stale requests
+    return;
+  }
+
+  bool granted{false};
+  if (m_remote_osd_resource.has_value()) {
+
+    dout(10) << __func__ << " already reserved." << dendl;
+
+    /*
+     * it might well be that we did not yet finish handling the latest scrub-op from
+     * our primary. This happens, for example, if 'noscrub' was set via a command, then
+     * reset. The primary in this scenario will remain in the same interval, but we do need
+     * to reset our internal state (otherwise - the first renewed 'give me your scrub map'
+     * from the primary will see us in active state, crashing the OSD).
+     */
+    advance_token();
+    granted = true;
+
+  } else if (m_pg->cct->_conf->osd_scrub_during_recovery ||
+            !m_osds->is_recovery_active()) {
+    m_remote_osd_resource.emplace(m_pg, m_osds, request_ep);
+    // OSD resources allocated?
+    granted = m_remote_osd_resource->is_reserved();
+    if (!granted) {
+      // just forget it
+      m_remote_osd_resource.reset();
+      dout(20) << __func__ << ": failed to reserve remotely" << dendl;
+    }
+  }
+
+  dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
+
+  Message* reply = new MOSDScrubReserve(
+    spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), request_ep,
+    granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami);
+
+  m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
+}
+
+/*
+ * (primary side) A replica has granted our reservation request. The grant is
+ * forwarded to the reservations object, if one exists. Otherwise - the grant
+ * was not asked for, and is only logged.
+ */
+void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
+{
+  dout(10) << __func__ << " " << *op->get_req() << dendl;
+  op->mark_started();
+
+  if (!m_reservations.has_value()) {
+    derr << __func__ << ": received unsolicited reservation grant from osd " << from
+        << " (" << op << ")" << dendl;
+    return;
+  }
+
+  m_reservations->handle_reserve_grant(op, from);
+}
+
+/*
+ * (primary side) A replica has rejected our reservation request. Only relevant
+ * if a reservation process is in progress; ignored otherwise.
+ */
+void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
+{
+  dout(10) << __func__ << " " << *op->get_req() << dendl;
+  op->mark_started();
+
+  if (!m_reservations.has_value()) {
+    // no active reservation process: nothing to do
+    return;
+  }
+
+  m_reservations->handle_reserve_reject(op, from);
+}
+
+/*
+ * (replica side) The primary has released its reservation of our scrub
+ * resources: the token is advanced and the reservation object is dropped.
+ */
+void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
+{
+  dout(10) << __func__ << " " << *op->get_req() << dendl;
+  op->mark_started();
+
+  /*
+   * this specific scrub session has terminated. All incoming events carrying the old
+   * tag will be discarded.
+   */
+  advance_token();
+  // destroying the reservation object (if we hold one)
+  m_remote_osd_resource.reset();
+}
+
+/// forget all replica reservations (if a reservations object exists)
+void PgScrubber::discard_replica_reservations()
+{
+  dout(10) << __func__ << dendl;
+
+  if (!m_reservations.has_value()) {
+    return;
+  }
+  m_reservations->discard_all();
+}
+
+/// drop all three reservation-related objects held by the scrubber
+void PgScrubber::clear_scrub_reservations()
+{
+  dout(10) << __func__ << dendl;
+  m_reservations.reset();        // the remote reservations
+  m_local_osd_resource.reset();          // the local reservation
+  m_remote_osd_resource.reset();  // we as replica reserved for a Primary
+}
+
+void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text)
+{
+  ceph_assert(m_pg->recovery_state.get_backfill_targets().empty());
+
+  std::vector<std::pair<int, Message*>> messages;
+  messages.reserve(m_pg->get_actingset().size());
+
+  epoch_t epch = get_osdmap_epoch();
+
+  for (auto& p : m_pg->get_actingset()) {
+
+    if (p == m_pg_whoami)
+      continue;
+
+    dout(10) << "scrub requesting " << op_text << " from osd." << p << " Epoch: " << epch
+            << dendl;
+    Message* m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epch, opcode,
+                                     m_pg_whoami);
+    messages.push_back(std::make_pair(p.osd, m));
+  }
+
+  if (!messages.empty()) {
+    m_osds->send_message_osd_cluster(messages, epch);
+  }
+}
+
+/// drop the replica-reservations object (destroying it, if one exists)
+void PgScrubber::unreserve_replicas()
+{
+  dout(10) << __func__ << dendl;
+  m_reservations.reset();
+}
+
+[[nodiscard]] bool PgScrubber::scrub_process_inconsistent()
+{
+  dout(10) << __func__ << ": checking authoritative (mode="
+          << m_mode_desc << ", auth remaining #: " << m_authoritative.size()
+          << ")" << dendl;
+
+  // authoritative only store objects which are missing or inconsistent.
+  if (!m_authoritative.empty()) {
+
+    stringstream ss;
+    ss << m_pg->info.pgid << " " << m_mode_desc << " " << m_missing.size() << " missing, "
+       << m_inconsistent.size() << " inconsistent objects";
+    dout(2) << ss.str() << dendl;
+    m_osds->clog->error(ss);
+
+    if (m_is_repair) {
+      state_clear(PG_STATE_CLEAN);
+      // we know we have a problem, so it's OK to set the user-visible flag
+      // even if we only reached here via auto-repair
+      state_set(PG_STATE_REPAIR);
+      update_op_mode_text();
+
+      for (const auto& [hobj, shrd_list] : m_authoritative) {
+
+       auto missing_entry = m_missing.find(hobj);
+
+       if (missing_entry != m_missing.end()) {
+         m_pg->repair_object(hobj, shrd_list, missing_entry->second);
+         m_fixed_count += missing_entry->second.size();
+       }
+
+       if (m_inconsistent.count(hobj)) {
+         m_pg->repair_object(hobj, shrd_list, m_inconsistent[hobj]);
+         m_fixed_count += m_inconsistent[hobj].size();
+       }
+      }
+    }
+  }
+  return (!m_authoritative.empty() && m_is_repair);
+}
+
+/*
+ * note: only called for the Primary.
+ *
+ * Wraps up a scrub session:
+ *  - cancels an auto-repair if too many objects are problematic;
+ *  - decides whether a follow-up scrub is to be requested (see do_auto_scrub);
+ *  - performs the type-specific finish, and handles the inconsistent objects;
+ *  - reports the outcome to the cluster log;
+ *  - updates the PG history & statistics (queued as an object-store transaction);
+ *  - queues a DoRecovery peering event if a repair is needed;
+ *  - cleans up the scrubber state.
+ *
+ * The PG must be locked (asserted).
+ */
+void PgScrubber::scrub_finish()
+{
+  dout(10) << __func__ << " before flags: " << m_flags
+          << ". repair state: " << (state_test(PG_STATE_REPAIR) ? "repair" : "no-repair")
+          << ". deep_scrub_on_error: " << m_flags.deep_scrub_on_error << dendl;
+
+  ceph_assert(m_pg->is_locked());
+
+  // the 'planned scrub' flags of the PG are reset to their defaults
+  m_pg->m_planned_scrub = requested_scrub_t{};
+
+  // if the repair request comes from auto-repair and large number of errors,
+  // we would like to cancel auto-repair
+  if (m_is_repair && m_flags.auto_repair &&
+      m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
+
+    dout(10) << __func__ << " undoing the repair" << dendl;
+    state_clear(PG_STATE_REPAIR); // not expected to be set, anyway
+    m_is_repair = false;
+    update_op_mode_text();
+  }
+
+  // if set: request_rescrubbing() will be called once the cleanup is done
+  bool do_auto_scrub = false;
+
+  // if a regular scrub had errors within the limit, do a deep scrub to auto repair
+  if (m_flags.deep_scrub_on_error && !m_authoritative.empty() &&
+      m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
+    ceph_assert(!m_is_deep);
+    do_auto_scrub = true;
+    dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl;
+  }
+
+  m_flags.deep_scrub_on_error = false;
+
+  // type-specific finish (can tally more errors)
+  _scrub_finish();
+
+  // 'true' if there are problematic objects, and we are repairing
+  bool has_error = scrub_process_inconsistent();
+
+  {
+    // the cluster-log summary: an 'error' entry if any error was found, a 'debug'
+    // entry otherwise
+    stringstream oss;
+    oss << m_pg->info.pgid.pgid << " " << m_mode_desc << " ";
+    int total_errors = m_shallow_errors + m_deep_errors;
+    if (total_errors)
+      oss << total_errors << " errors";
+    else
+      oss << "ok";
+    if (!m_is_deep && m_pg->info.stats.stats.sum.num_deep_scrub_errors)
+      oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors
+         << " remaining deep scrub error details lost)";
+    if (m_is_repair)
+      oss << ", " << m_fixed_count << " fixed";
+    if (total_errors)
+      m_osds->clog->error(oss);
+    else
+      m_osds->clog->debug(oss);
+  }
+
+  // Since we don't know which errors were fixed, we can only clear them
+  // when every one has been fixed.
+  if (m_is_repair) {
+    if (m_fixed_count == m_shallow_errors + m_deep_errors) {
+
+      ceph_assert(m_is_deep);
+      m_shallow_errors = 0;
+      m_deep_errors = 0;
+      dout(20) << __func__ << " All may be fixed" << dendl;
+
+    } else if (has_error) {
+
+      // Deep scrub in order to get corrected error counts
+      m_pg->scrub_after_recovery = true;
+      m_pg->m_planned_scrub.req_scrub =
+       m_pg->m_planned_scrub.req_scrub || m_flags.required;
+
+      dout(20) << __func__ << " Current 'required': " << m_flags.required
+              << " Planned 'req_scrub': " << m_pg->m_planned_scrub.req_scrub << dendl;
+
+    } else if (m_shallow_errors || m_deep_errors) {
+
+      // We have errors but nothing can be fixed, so there is no repair
+      // possible.
+      state_set(PG_STATE_FAILED_REPAIR);
+      dout(10) << __func__ << " " << (m_shallow_errors + m_deep_errors)
+              << " error(s) present with no repair possible" << dendl;
+    }
+  }
+
+  {
+    // finish up
+    ObjectStore::Transaction t;
+    m_pg->recovery_state.update_stats(
+      [this](auto& history, auto& stats) {
+       dout(10) << "m_pg->recovery_state.update_stats()" << dendl;
+       utime_t now = ceph_clock_now();
+       // the 'last scrub' data is always updated. The 'last deep scrub' data - only
+       // following a deep scrub.
+       history.last_scrub = m_pg->recovery_state.get_info().last_update;
+       history.last_scrub_stamp = now;
+       if (m_is_deep) {
+         history.last_deep_scrub = m_pg->recovery_state.get_info().last_update;
+         history.last_deep_scrub_stamp = now;
+       }
+
+       if (m_is_deep) {
+         // a deep scrub updates both error counters and the OMAP statistics
+         if ((m_shallow_errors == 0) && (m_deep_errors == 0))
+           history.last_clean_scrub_stamp = now;
+         stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
+         stats.stats.sum.num_deep_scrub_errors = m_deep_errors;
+         stats.stats.sum.num_large_omap_objects = m_omap_stats.large_omap_objects;
+         stats.stats.sum.num_omap_bytes = m_omap_stats.omap_bytes;
+         stats.stats.sum.num_omap_keys = m_omap_stats.omap_keys;
+         dout(25) << "scrub_finish shard " << m_pg_whoami
+                  << " num_omap_bytes = " << stats.stats.sum.num_omap_bytes
+                  << " num_omap_keys = " << stats.stats.sum.num_omap_keys << dendl;
+       } else {
+         // a shallow scrub only updates the shallow-errors counter
+         stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
+         // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
+         // because of deep-scrub errors
+         if (m_shallow_errors == 0)
+           history.last_clean_scrub_stamp = now;
+       }
+       stats.stats.sum.num_scrub_errors = stats.stats.sum.num_shallow_scrub_errors +
+                                          stats.stats.sum.num_deep_scrub_errors;
+       if (m_flags.check_repair) {
+         m_flags.check_repair = false;
+         // NOTE(review): the test below reads m_pg->info.stats, not the 'stats'
+         // argument updated just above - confirm that both refer to the same data
+         if (m_pg->info.stats.stats.sum.num_scrub_errors) {
+           state_set(PG_STATE_FAILED_REPAIR);
+           dout(10) << "scrub_finish " << m_pg->info.stats.stats.sum.num_scrub_errors
+                    << " error(s) still present after re-scrub" << dendl;
+         }
+       }
+       return true;
+      },
+      &t);
+    int tr = m_osds->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+    ceph_assert(tr == 0);
+
+    if (!m_pg->snap_trimq.empty()) {
+      dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
+      m_pg->snap_trimmer_scrub_complete();
+    }
+  }
+
+  if (has_error) {
+    // objects are to be repaired: trigger recovery
+    m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared<PGPeeringEvent>(
+      get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery())));
+  } else {
+    m_is_repair = false;
+    state_clear(PG_STATE_REPAIR);
+    update_op_mode_text();
+  }
+
+  cleanup_on_finish();
+  if (do_auto_scrub) {
+    request_rescrubbing(m_pg->m_planned_scrub);
+  }
+
+  if (m_pg->is_active() && m_pg->is_primary()) {
+    m_pg->recovery_state.share_pg_info();
+  }
+}
+
+void PgScrubber::on_digest_updates()
+{
+  dout(10) << __func__ << " #pending: " << num_digest_updates_pending << " pending? "
+          << num_digest_updates_pending
+          << (m_end.is_max() ? " <last chunk> " : " <mid chunk> ") << dendl;
+
+  if (num_digest_updates_pending > 0) {
+    // do nothing for now. We will be called again when new updates arrive
+    return;
+  }
+
+  // got all updates, and finished with this chunk. Any more?
+  if (m_end.is_max()) {
+
+    scrub_finish();
+    m_osds->queue_scrub_is_finished(m_pg);
+
+  } else {
+    // go get a new chunk (via "requeue")
+    preemption_data.reset();
+    m_osds->queue_scrub_next_chunk(m_pg, m_pg->is_scrub_blocking_ops());
+  }
+}
+
+
+/*
+ * note that the flags-set fetched from the PG (m_pg->m_planned_scrub)
+ * is cleared once scrubbing starts; Some of the values dumped here are
+ * thus transitory.
+ */
+void PgScrubber::dump(ceph::Formatter* f) const
+{
+  f->open_object_section("scrubber");
+  f->dump_stream("epoch_start") << m_interval_start;
+  f->dump_bool("active", m_active);
+  if (m_active) {
+    f->dump_stream("start") << m_start;
+    f->dump_stream("end") << m_end;
+    f->dump_stream("m_max_end") << m_max_end;
+    f->dump_stream("subset_last_update") << m_subset_last_update;
+    f->dump_bool("deep", m_is_deep);
+    f->dump_bool("must_scrub", (m_pg->m_planned_scrub.must_scrub || m_flags.required));
+    f->dump_bool("must_deep_scrub", m_pg->m_planned_scrub.must_deep_scrub);
+    f->dump_bool("must_repair", m_pg->m_planned_scrub.must_repair);
+    f->dump_bool("need_auto", m_pg->m_planned_scrub.need_auto);
+    f->dump_bool("req_scrub", m_flags.required);
+    f->dump_bool("time_for_deep", m_pg->m_planned_scrub.time_for_deep);
+    f->dump_bool("auto_repair", m_flags.auto_repair);
+    f->dump_bool("check_repair", m_flags.check_repair);
+    f->dump_bool("deep_scrub_on_error", m_flags.deep_scrub_on_error);
+    f->dump_stream("scrub_reg_stamp") << m_scrub_reg_stamp;  // utime_t
+    f->dump_unsigned("priority", m_flags.priority);
+    f->dump_int("shallow_errors", m_shallow_errors);
+    f->dump_int("deep_errors", m_deep_errors);
+    f->dump_int("fixed", m_fixed_count);
+    {
+      f->open_array_section("waiting_on_whom");
+      for (const auto& p : m_maps_status.get_awaited()) {
+       f->dump_stream("shard") << p;
+      }
+      f->close_section();
+    }
+  }
+  f->close_section();
+}
+
+
+void PgScrubber::handle_query_state(ceph::Formatter* f)
+{
+  dout(10) << __func__ << dendl;
+
+  f->open_object_section("scrub");
+  f->dump_stream("scrubber.epoch_start") << m_interval_start;
+  f->dump_bool("scrubber.active", m_active);
+  f->dump_stream("scrubber.start") << m_start;
+  f->dump_stream("scrubber.end") << m_end;
+  f->dump_stream("scrubber.m_max_end") << m_max_end;
+  f->dump_stream("scrubber.m_subset_last_update") << m_subset_last_update;
+  f->dump_bool("scrubber.deep", m_is_deep);
+  {
+    f->open_array_section("scrubber.waiting_on_whom");
+    for (const auto& p : m_maps_status.get_awaited()) {
+      f->dump_stream("shard") << p;
+    }
+    f->close_section();
+  }
+
+  f->dump_string("comment", "DEPRECATED - may be removed in the next release");
+
+  f->close_section();
+}
+
+// The destructor is defaulted here, in the implementation file (presumably so that
+// the types of the owned members, e.g. the state-machine, are complete at this point).
+PgScrubber::~PgScrubber() = default;
+
+/*
+ * Creates the scrubber for the specified PG. The scrub state-machine is created
+ * and initiated here, too.
+ */
+PgScrubber::PgScrubber(PG* pg)
+    : m_pg{pg}
+    , m_pg_id{pg->pg_id}
+    , m_osds{pg->osd}
+    , m_pg_whoami{pg->pg_whoami}
+    , preemption_data{pg}
+{
+  m_fsm = std::make_unique<ScrubMachine>(pg, this);
+  m_fsm->initiate();
+}
+
+/// create the replica-reservations object (replacing any existing one)
+void PgScrubber::reserve_replicas()
+{
+  dout(10) << __func__ << dendl;
+  m_reservations.emplace(m_pg, m_pg_whoami);
+}
+
+/*
+ * The cleanup performed when a scrub session ends: the 'scrubbing' PG state flags
+ * are cleared, all reservations are dropped, and the scrubber's internal state
+ * and flags are reset.
+ * Note that, unlike clear_pgscrub_state(), PG_STATE_REPAIR is not cleared here.
+ * The PG must be locked (asserted).
+ */
+void PgScrubber::cleanup_on_finish()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(m_pg->is_locked());
+
+  state_clear(PG_STATE_SCRUBBING);
+  state_clear(PG_STATE_DEEP_SCRUB);
+  m_pg->publish_stats_to_osd();
+
+  clear_scrub_reservations();
+  // NOTE(review): the stats are published a second time here - confirm that this
+  // is indeed required following the release of the reservations
+  m_pg->publish_stats_to_osd();
+
+  requeue_waiting();
+
+  reset_internal_state();
+  m_flags = scrub_flags_t{};
+
+  // type-specific state clear
+  _scrub_clear_state();
+}
+
+// Clears the scrubber's state (see clear_pgscrub_state()), then resets the
+// state-machine by sending it a FullReset event.
+// uses process_event(), so must be invoked externally
+void PgScrubber::scrub_clear_state()
+{
+  dout(10) << __func__ << dendl;
+
+  clear_pgscrub_state();
+  m_fsm->process_event(FullReset{});
+}
+
+/*
+ * Clears the scrub-related PG state flags (including PG_STATE_REPAIR), drops all
+ * reservations, and resets the scrubber's internal state and flags.
+ * The PG must be locked (asserted).
+ *
+ * note: does not access the state-machine
+ */
+void PgScrubber::clear_pgscrub_state()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(m_pg->is_locked());
+
+  state_clear(PG_STATE_SCRUBBING);
+  state_clear(PG_STATE_DEEP_SCRUB);
+
+  state_clear(PG_STATE_REPAIR);
+
+  clear_scrub_reservations();
+  m_pg->publish_stats_to_osd();
+
+  requeue_waiting();
+
+  reset_internal_state();
+  m_flags = scrub_flags_t{};
+
+  // type-specific state clear
+  _scrub_clear_state();
+}
+
+/*
+ * Clears the 'scrubbing' PG state flags and resets the scrubber's internal state.
+ * The reservations and m_flags are not touched here.
+ */
+void PgScrubber::replica_handling_done()
+{
+  dout(10) << __func__ << dendl;
+
+  state_clear(PG_STATE_SCRUBBING);
+  state_clear(PG_STATE_DEEP_SCRUB);
+
+  reset_internal_state();
+
+  m_pg->publish_stats_to_osd();
+}
+
+/*
+ * note: performs run_callbacks()
+ * note: reservations-related variables are not reset here
+ */
+void PgScrubber::reset_internal_state()
+{
+  dout(10) << __func__ << dendl;
+
+  preemption_data.reset();
+  m_maps_status.reset();
+  m_received_maps.clear();
+
+  m_start = hobject_t{};
+  m_end = hobject_t{};
+  m_max_end = hobject_t{};
+  m_subset_last_update = eversion_t{};
+  m_shallow_errors = 0;
+  m_deep_errors = 0;
+  m_fixed_count = 0;
+  m_omap_stats = (const struct omap_stat_t){0};
+
+  run_callbacks();
+
+  m_inconsistent.clear();
+  m_missing.clear();
+  m_authoritative.clear();
+  num_digest_updates_pending = 0;
+  m_primary_scrubmap = ScrubMap{};
+  m_primary_scrubmap_pos.reset();
+  replica_scrubmap = ScrubMap{};
+  replica_scrubmap_pos.reset();
+  m_cleaned_meta_map = ScrubMap{};
+  m_needs_sleep = true;
+  m_sleep_started_at = utime_t{};
+
+  m_active = false;
+}
+
+/*
+ * Moves on to the next scrub-session token. Only applicable to the Replica.
+ *
+ * No scrubbing is expected to be in progress when this function is called. That
+ * is not taken for granted, though: in case we are still handling a stale
+ * request, both our internal state and the state of the FSM are cleared.
+ */
+void PgScrubber::advance_token()
+{
+  dout(10) << __func__ << " was: " << m_current_token << dendl;
+  ++m_current_token;
+
+  replica_handling_done();
+  m_fsm->process_event(FullReset{});
+}
+
+/*
+ * Is the received token acceptable? It is - if it equals the current token, or
+ * if it is 0 (a zero token is always accepted).
+ * An obsolete token is logged.
+ */
+bool PgScrubber::is_token_current(Scrub::act_token_t received_token)
+{
+  if (received_token == 0 || received_token == m_current_token) {
+    return true;
+  }
+  // (the closing parenthesis was missing from this log message)
+  dout(5) << __func__ << " obsolete token (" << received_token
+          << " vs current " << m_current_token << ")" << dendl;
+
+  return false;
+}
+
+/// the OSD map, as reported by our PG
+const OSDMapRef& PgScrubber::get_osdmap() const
+{
+  return m_pg->get_osdmap();
+}
+
+/// streaming a scrubber only outputs its flags (m_flags)
+ostream& operator<<(ostream& out, const PgScrubber& scrubber)
+{
+  return out << scrubber.m_flags;
+}
+
+/// outputs the PG id and the scrub flags, formatted as " [ <pg id>: <flags> ] "
+ostream& PgScrubber::show(ostream& out) const
+{
+  return out << " [ " << m_pg_id << ": " << m_flags << " ] ";
+}
+
+int PgScrubber::asok_debug(std::string_view cmd,
+                          std::string param,
+                          Formatter* f,
+                          stringstream& ss)
+{
+  dout(10) << __func__ << " cmd: " << cmd << " param: " << param << dendl;
+
+  if (cmd == "block") {
+    // set a flag that will cause the next 'select_range' to report a blocked object
+    m_debug_blockrange = 1;
+  } else if (cmd == "unblock") {
+    // send an 'unblock' event, as if a blocked range was freed
+    m_debug_blockrange = 0;
+    m_fsm->process_event(Unblocked{});
+  }
+  return 0;
+}
+// ///////////////////// preemption_data_t //////////////////////////////////
+
+/// the number of preemptions allowed is fetched from "osd_scrub_max_preemptions"
+PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg}
+{
+  const auto max_preemptions =
+    m_pg->get_cct()->_conf.get_val<uint64_t>("osd_scrub_max_preemptions");
+  m_left = static_cast<int>(max_preemptions);
+}
+
+/*
+ * Back to the initial state: not preemptable, not preempted, with the number of
+ * allowed preemptions re-fetched from the configuration.
+ * Performed under m_preemption_lock.
+ */
+void PgScrubber::preemption_data_t::reset()
+{
+  std::lock_guard<std::mutex> lk{m_preemption_lock};
+
+  m_preemptable = false;
+  m_preempted = false;
+
+  const auto max_preemptions =
+    m_pg->cct->_conf.get_val<uint64_t>("osd_scrub_max_preemptions");
+  m_left = static_cast<int>(max_preemptions);
+
+  m_size_divisor = 1;
+}
+
+
+// ///////////////////// ReplicaReservations //////////////////////////////////
+namespace Scrub {
+
+/// send a 'release' message to the named peer
+void ReplicaReservations::release_replica(pg_shard_t peer, epoch_t epoch)
+{
+  const spg_t peer_spg(m_pg_info.pgid.pgid, peer.shard);
+  auto release_msg =
+    new MOSDScrubReserve(peer_spg, epoch, MOSDScrubReserve::RELEASE, m_pg->pg_whoami);
+  m_osds->send_message_osd_cluster(peer.osd, release_msg, epoch);
+}
+
+ReplicaReservations::ReplicaReservations(PG* pg, pg_shard_t whoami)
+    : m_pg{pg}
+    , m_acting_set{pg->get_actingset()}
+    , m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
+    , m_pending{static_cast<int>(m_acting_set.size()) - 1}
+    , m_pg_info{m_pg->get_pg_info(ScrubberPasskey())}
+{
+  epoch_t epoch = m_pg->get_osdmap_epoch();
+
+  // handle the special case of no replicas
+  if (m_pending <= 0) {
+    // just signal the scrub state-machine to continue
+    send_all_done();
+
+  } else {
+
+    for (auto p : m_acting_set) {
+      if (p == whoami)
+       continue;
+      auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, p.shard), epoch,
+                                   MOSDScrubReserve::REQUEST, m_pg->pg_whoami);
+      m_osds->send_message_osd_cluster(p.osd, m, epoch);
+      m_waited_for_peers.push_back(p);
+      dout(10) << __func__ << " <ReplicaReservations> reserve<-> " << p.osd << dendl;
+    }
+  }
+}
+
+/// queue (at low priority) the "reservations granted" event for our PG
+void ReplicaReservations::send_all_done()
+{
+  m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
+}
+
+/// queue (at low priority) the "reservations denied" event for our PG
+void ReplicaReservations::send_reject()
+{
+  m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
+}
+
+/*
+ * Forget all the reserved and the waited-for peers. No message is sent by this
+ * function. As both lists are emptied, the destructor will have no peer to send
+ * a release message to.
+ */
+void ReplicaReservations::discard_all()
+{
+  dout(10) << __func__ << " " << m_reserved_peers << dendl;
+
+  m_had_rejections = true;  // preventing late-coming responses from triggering events
+  m_reserved_peers.clear();
+  m_waited_for_peers.clear();
+}
+
+/*
+ * Releases all the replicas: both those that have granted our request, and those
+ * that have not answered yet.
+ */
+ReplicaReservations::~ReplicaReservations()
+{
+  // from now on, late-coming responses should not trigger any event
+  m_had_rejections = true;
+
+  epoch_t epoch = m_pg->get_osdmap_epoch();
+
+  // Un-reserve messages are sent to all reserved replicas. No answer is waited
+  // for (there wouldn't be one). Other incoming messages will be discarded on the
+  // way, by our owner.
+  for (auto& granted_peer : m_reserved_peers) {
+    release_replica(granted_peer, epoch);
+  }
+  m_reserved_peers.clear();
+
+  // The peers still waited for are released, too: the release will follow on the
+  // heels of the request. When tried otherwise, grants that followed a reject
+  // arrived after the whole scrub machine-state was reset, causing leaked
+  // reservations.
+  for (auto& awaited_peer : m_waited_for_peers) {
+    release_replica(awaited_peer, epoch);
+  }
+  m_waited_for_peers.clear();
+}
+
+/**
+ *  @ATTN we would not reach here if the ReplicaReservation object managed by the
+ * scrubber was reset.
+ */
+void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
+{
+  dout(10) << __func__ << " <ReplicaReservations> granted-> " << from << dendl;
+  op->mark_started();
+
+  {
+    // reduce the amount of extra release messages. Not a must, but the log is cleaner
+    auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from);
+    if (w != m_waited_for_peers.end())
+      m_waited_for_peers.erase(w);
+  }
+
+  // are we forced to reject the reservation?
+  if (m_had_rejections) {
+
+    dout(10) << " rejecting late-coming reservation from " << from << dendl;
+    release_replica(from, m_pg->get_osdmap_epoch());
+
+  } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
+            m_reserved_peers.end()) {
+
+    dout(10) << " already had osd." << from << " reserved" << dendl;
+
+  } else {
+
+    dout(10) << " osd." << from << " scrub reserve = success" << dendl;
+    m_reserved_peers.push_back(from);
+    if (--m_pending == 0) {
+      send_all_done();
+    }
+  }
+}
+
+void ReplicaReservations::handle_reserve_reject(OpRequestRef op, pg_shard_t from)
+{
+  dout(10) << __func__ << " <ReplicaReservations> rejected-> " << from << dendl;
+  dout(10) << __func__ << " " << *op->get_req() << dendl;
+  op->mark_started();
+
+  {
+    // reduce the amount of extra release messages. Not a must, but the log is cleaner
+    auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from);
+    if (w != m_waited_for_peers.end())
+      m_waited_for_peers.erase(w);
+  }
+
+  if (m_had_rejections) {
+
+    // our failure was already handled when the first rejection arrived
+    dout(15) << " ignoring late-coming rejection from " << from << dendl;
+
+  } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
+            m_reserved_peers.end()) {
+
+    dout(10) << " already had osd." << from << " reserved" << dendl;
+
+  } else {
+
+    dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
+    m_had_rejections = true;  // preventing any additional notifications
+    send_reject();
+  }
+}
+
+
+// ///////////////////// LocalReservation //////////////////////////////////
+
+/*
+ * Tries to increase the OSD's count of local scrubs.
+ * A failure to do so is signalled by m_holding_local_reservation not being set.
+ */
+LocalReservation::LocalReservation(PG* pg, OSDService* osds)
+    : m_pg{pg} // holding the "whole PG" for dout() sake
+    , m_osds{osds}
+{
+  if (m_osds->inc_scrubs_local()) {
+    dout(20) << __func__ << ": local OSD scrub resources reserved" << dendl;
+    m_holding_local_reservation = true;
+  } else {
+    dout(10) << __func__ << ": failed to reserve locally " << dendl;
+  }
+}
+
+/// the OSD's count of local scrubs is decreased - but only if we had increased it
+LocalReservation::~LocalReservation()
+{
+  if (!m_holding_local_reservation) {
+    return;
+  }
+
+  m_holding_local_reservation = false;
+  m_osds->dec_scrubs_local();
+}
+
+
+// ///////////////////// ReservedByRemotePrimary ///////////////////////////////
+
+/*
+ * Tries to increase the OSD's count of remote scrubs, noting the epoch of the
+ * request.
+ * A failure to do so is signalled by m_reserved_by_remote_primary not being set.
+ */
+ReservedByRemotePrimary::ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch)
+    : m_pg{pg}, m_osds{osds}, m_reserved_at{epoch}
+{
+  if (m_osds->inc_scrubs_remote()) {
+    dout(20) << __func__ << ": scrub resources reserved at Primary request" << dendl;
+    m_reserved_by_remote_primary = true;
+  } else {
+    dout(10) << __func__ << ": failed to reserve at Primary request" << dendl;
+  }
+}
+
+/// a reservation is stale if it was made before the start of the PG's current interval
+bool ReservedByRemotePrimary::is_stale() const
+{
+  return m_reserved_at < m_pg->get_same_interval_since();
+}
+
+/// the OSD's count of remote scrubs is decreased - but only if we had increased it
+ReservedByRemotePrimary::~ReservedByRemotePrimary()
+{
+  if (!m_reserved_by_remote_primary) {
+    return;
+  }
+
+  m_reserved_by_remote_primary = false;
+  m_osds->dec_scrubs_remote();
+}
+
+// ///////////////////// MapsCollectionStatus ////////////////////////////////
+
+/*
+ * Note the arrival of a scrub map from the named shard.
+ * Returns {true, ""} if we were indeed waiting for a map from that shard (which
+ * is then removed from the list of awaited shards). Otherwise - {false, <a short
+ * description of the problem>}.
+ */
+auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from)
+  -> std::tuple<bool, std::string_view>
+{
+  auto awaited = std::find(m_maps_awaited_for.begin(), m_maps_awaited_for.end(), from);
+  if (awaited == m_maps_awaited_for.end()) {
+    // no map was expected from this shard
+    return std::tuple{false, " unsolicited scrub-map"sv};
+  }
+
+  m_maps_awaited_for.erase(awaited);
+  return std::tuple{true, ""sv};
+}
+
+/// back to the default-constructed state
+void MapsCollectionStatus::reset()
+{
+  *this = MapsCollectionStatus{};
+}
+
+/// the OSDs we are still waiting for, as a string (each followed by a space)
+std::string MapsCollectionStatus::dump() const
+{
+  std::string awaited_osds;
+  for (const auto& shard : m_maps_awaited_for) {
+    const auto osd_txt = shard.get_osd() + " "s;
+    awaited_osds.append(osd_txt);
+  }
+  return awaited_osds;
+}
+
+/*
+ * Outputs the OSDs we are still waiting for, within square brackets. " local " is
+ * added if the local map is not ready yet.
+ */
+ostream& operator<<(ostream& out, const MapsCollectionStatus& sf)
+{
+  out << " [ ";
+
+  for (const auto& shard : sf.m_maps_awaited_for) {
+    out << shard.get_osd() << " ";
+  }
+
+  const bool local_pending = !sf.m_local_map_ready;
+  if (local_pending) {
+    out << " local ";
+  }
+
+  return out << " ] ";
+}
+
+// ///////////////////// blocked_range_t ///////////////////////////////
+
+blocked_range_t::blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id)
+    : m_osds{osds}
+{
+  auto now_is = std::chrono::system_clock::now();
+  m_callbk = new LambdaContext([now_is, pg_id, osds]([[maybe_unused]] int r) {
+    std::time_t now_c = std::chrono::system_clock::to_time_t(now_is);
+    char buf[50];
+    strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", std::localtime(&now_c));
+    lgeneric_subdout(g_ceph_context, osd, 10)
+      << "PgScrubber: " << pg_id << " blocked on an object for too long (since " << buf
+      << ")" << dendl;
+    osds->clog->warn() << "osd." << osds->whoami << " PgScrubber: " << pg_id << " blocked on an object for too long (since " << buf << ")";
+    return;
+  });
+
+  std::lock_guard l(m_osds->sleep_lock);
+  m_osds->sleep_timer.add_event_after(waittime, m_callbk);
+}
+
+/// cancel the timer event set by the constructor (under the OSD's sleep_lock)
+blocked_range_t::~blocked_range_t()
+{
+  std::lock_guard l(m_osds->sleep_lock);
+  m_osds->sleep_timer.cancel_event(m_callbk);
+}
+
+}  // namespace Scrub
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
new file mode 100644 (file)
index 0000000..c08279e
--- /dev/null
@@ -0,0 +1,800 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <chrono>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "osd/PG.h"
+#include "ScrubStore.h"
+#include "scrub_machine_lstnr.h"
+#include "osd/scrubber_common.h"
+
+class Callback;
+
+namespace Scrub {
+class ScrubMachine;
+struct BuildMap;
+
+/**
+ * Reserving/freeing scrub resources at the replicas.
+ *
+ *  When constructed - sends reservation requests to the acting_set.
+ *  A rejection triggers a "couldn't acquire the replicas' scrub resources" event.
+ *  All previous requests, whether already granted or not, are explicitly released.
+ *
+ *  A note re performance: I've measured a few container alternatives for
+ *  m_reserved_peers, with its specific usage pattern. std::set is extremely slow, as
+ *  expected. flat_set is only slightly better. Surprisingly - std::vector (with no
+ *  sorting) is better than boost::small_vec. And for std::vector: no need to pre-reserve.
+ */
+class ReplicaReservations {
+  /// the type returned by PG::get_actingset()
+  using OrigSet = decltype(std::declval<PG>().get_actingset());
+
+  PG* m_pg;
+  OrigSet m_acting_set;
+  OSDService* m_osds;
+  std::vector<pg_shard_t> m_waited_for_peers;
+  std::vector<pg_shard_t> m_reserved_peers;
+  bool m_had_rejections{false};
+  int m_pending{-1};
+  const pg_info_t& m_pg_info;
+
+  void release_replica(pg_shard_t peer, epoch_t epoch);
+
+  void send_all_done();         ///< all reservations are granted
+
+  /// notify the scrubber that we have failed to reserve replicas' resources
+  void send_reject();
+
+ public:
+  /**
+   *  quietly discard all knowledge about existing reservations. No messages
+   *  are sent to peers.
+   *  To be used upon interval change, as we know that the running scrub is no longer
+   *  relevant, and that the replicas had reset the reservations on their side.
+   */
+  void discard_all();
+
+  ReplicaReservations(PG* pg, pg_shard_t whoami);
+
+  ~ReplicaReservations();
+
+  /// presumably: handles a reservation-grant message received from replica 'from'
+  void handle_reserve_grant(OpRequestRef op, pg_shard_t from);
+
+  /// presumably: handles a reservation-reject message received from replica 'from'
+  void handle_reserve_reject(OpRequestRef op, pg_shard_t from);
+};
+
+/**
+ *  wraps the local OSD scrub resource reservation in an RAII wrapper
+ */
+class LocalReservation {
+  PG* m_pg;
+  OSDService* m_osds;
+  bool m_holding_local_reservation{false};
+
+ public:
+  LocalReservation(PG* pg, OSDService* osds);
+  ~LocalReservation();
+
+  /// the value of m_holding_local_reservation ('false' unless set by the
+  /// implementation in the .cc file)
+  bool is_reserved() const { return m_holding_local_reservation; }
+};
+
+/**
+ *  wraps the OSD resource we are using when reserved as a replica by a scrubbing
+ *  Primary.
+ */
+class ReservedByRemotePrimary {
+  PG* m_pg;
+  OSDService* m_osds;
+  bool m_reserved_by_remote_primary{false};
+  const epoch_t m_reserved_at;  ///< fixed at construction (const member)
+
+ public:
+  ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch);
+  ~ReservedByRemotePrimary();
+
+  /// the value of m_reserved_by_remote_primary
+  [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; }
+
+  /// compare the remembered reserved-at epoch to the current interval
+  [[nodiscard]] bool is_stale() const;
+};
+
+/**
+ * Once all replicas' scrub maps are received, we go on to compare the maps. That is -
+ * unless we have not yet completed building our own scrub map. MapsCollectionStatus
+ * combines the status of waiting for both the local map and the replicas, without
+ * resorting to adding dummy entries into a list.
+ */
+class MapsCollectionStatus {
+
+  bool m_local_map_ready{false};
+  std::vector<pg_shard_t> m_maps_awaited_for;
+
+ public:
+  /// true once the local map was marked as ready and no replica map is still awaited
+  [[nodiscard]] bool are_all_maps_available() const
+  {
+    return m_local_map_ready && m_maps_awaited_for.empty();
+  }
+
+  void mark_local_map_ready() { m_local_map_ready = true; }
+
+  /// add a replica to the list of those we are waiting for (duplicates are not
+  /// checked for)
+  void mark_replica_map_request(pg_shard_t from_whom)
+  {
+    m_maps_awaited_for.push_back(from_whom);
+  }
+
+  /// @returns {true, ""} if we were indeed waiting for a map from this replica
+  ///  (the replica is then removed from the awaited list).
+  ///  Otherwise: {false, <an error string>}
+  auto mark_arriving_map(pg_shard_t from) -> std::tuple<bool, std::string_view>;
+
+  /// @returns a copy of the list of replicas still awaited
+  std::vector<pg_shard_t> get_awaited() const { return m_maps_awaited_for; }
+
+  /// back to the default-constructed state
+  void reset();
+
+  /// the awaited replicas, as a space-separated list
+  std::string dump() const;
+
+  friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf);
+};
+
+
+}  // namespace Scrub
+
+
+/**
+ * the scrub operation flags. Primary only.
+ * Set at scrub start. Checked in multiple locations - mostly
+ * at finish.
+ * All flags default to 'false' (and the priority - to 0).
+ */
+struct scrub_flags_t {
+
+  unsigned int priority{0};
+
+  /**
+   * set by queue_scrub() if either planned_scrub.auto_repair or
+   * need_auto were set.
+   * Tested at scrub end.
+   */
+  bool auto_repair{false};
+
+  /// this flag indicates that we are scrubbing post repair to verify everything is fixed
+  bool check_repair{false};
+
+  /// checked at the end of the scrub, to possibly initiate a deep-scrub
+  bool deep_scrub_on_error{false};
+
+  /**
+   * scrub must not be aborted.
+   * Set for explicitly requested scrubs, and for scrubs originated by the pairing
+   * process with the 'repair' flag set (in the RequestScrub event).
+   * NOTE(review): 'pairing' above presumably means 'peering' - confirm.
+   */
+  bool required{false};
+};
+
+ostream& operator<<(ostream& out, const scrub_flags_t& sf);
+
+
+/**
+ * The part of PG-scrubbing code that isn't state-machine wiring.
+ *
+ * Why the separation? I wish to move to a different FSM implementation. Thus I
+ * am forced to strongly decouple the state-machine implementation details from
+ * the actual scrubbing code.
+ */
+class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
+
+ public:
+  explicit PgScrubber(PG* pg);
+
+  //  ------------------  the I/F exposed to the PG (ScrubPgIF) -------------
+
+  /// are we waiting for resource reservation grants from our replicas?
+  [[nodiscard]] bool is_reserving() const final;
+
+  void initiate_regular_scrub(epoch_t epoch_queued) final;
+
+  void initiate_scrub_after_repair(epoch_t epoch_queued) final;
+
+  void send_scrub_resched(epoch_t epoch_queued) final;
+
+  void active_pushes_notification(epoch_t epoch_queued) final;
+
+  void update_applied_notification(epoch_t epoch_queued) final;
+
+  void send_scrub_unblock(epoch_t epoch_queued) final;
+
+  void digest_update_notification(epoch_t epoch_queued) final;
+
+  void send_replica_maps_ready(epoch_t epoch_queued) final;
+
+  void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
+
+  void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
+
+  void send_replica_pushes_upd(epoch_t epoch_queued) final;
+  /**
+   *  The PG has updated its 'applied version'. It might be that we are waiting for this
+   *  information: after selecting a range of objects to scrub, we've marked the latest
+   *  version of these objects in m_subset_last_update. We will not start the map building
+   *  before we know that the PG has reached this version.
+   */
+  void on_applied_when_primary(const eversion_t& applied_version) final;
+
+  void send_full_reset(epoch_t epoch_queued) final;
+
+  void send_chunk_free(epoch_t epoch_queued) final;
+
+  void send_chunk_busy(epoch_t epoch_queued) final;
+
+  void send_local_map_done(epoch_t epoch_queued) final;
+
+  void send_maps_compared(epoch_t epoch_queued) final;
+
+  void send_get_next_chunk(epoch_t epoch_queued) final;
+
+  void send_scrub_is_finished(epoch_t epoch_queued) final;
+
+  /**
+   *  we allow some number of preemptions of the scrub, which mean we do
+   *  not block.  Then we start to block.  Once we start blocking, we do
+   *  not stop until the scrub range is completed.
+   */
+  bool write_blocked_by_scrub(const hobject_t& soid) final;
+
+  /// true if the given range intersects the scrub interval in any way
+  bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;
+
+  /**
+   *  we are a replica being asked by the Primary to reserve OSD resources for
+   *  scrubbing
+   */
+  void handle_scrub_reserve_request(OpRequestRef op) final;
+
+  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
+  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
+  void handle_scrub_reserve_release(OpRequestRef op) final;
+  void discard_replica_reservations() final;
+  void clear_scrub_reservations() final;  // PG::clear... fwds to here
+  void unreserve_replicas() final;
+
+  // managing scrub op registration
+
+  void reg_next_scrub(const requested_scrub_t& request_flags) final;
+
+  void unreg_next_scrub() final;
+
+  void scrub_requested(scrub_level_t scrub_level,
+                      scrub_type_t scrub_type,
+                      requested_scrub_t& req_flags) final;
+
+  /**
+   * Reserve local scrub resources (managed by the OSD)
+   *
+   * Fails if OSD's local-scrubs budget was exhausted
+   * \returns were local resources reserved?
+   */
+  bool reserve_local() final;
+
+  void handle_query_state(ceph::Formatter* f) final;
+
+  void dump(ceph::Formatter* f) const override;
+
+  // used if we are a replica
+
+  void replica_scrub_op(OpRequestRef op) final;
+
+  /// the op priority, taken from the primary's request message
+  /// (returns the stored m_replica_request_priority)
+  Scrub::scrub_prio_t replica_op_priority() const final
+  {
+    return m_replica_request_priority;
+  }
+
+  unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+                                     unsigned int suggested_priority) const final;
+  /// the version that refers to m_flags.priority
+  unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final;
+
+  void add_callback(Context* context) final { m_callbacks.push_back(context); }
+
+  [[nodiscard]] bool are_callbacks_pending() const final  // used for an assert in PG.cc
+  {
+    return !m_callbacks.empty();
+  }
+
+  /// handle a message carrying a replica map
+  void map_from_replica(OpRequestRef op) final;
+
+  void scrub_clear_state() final;
+
+  /**
+   *  add to scrub statistics, but only if the soid is below the scrub start
+   *
+   *  Note: the above describes the intended contract. This implementation does
+   *  none of that: it unconditionally asserts, i.e. it must never be reached.
+   *  Presumably the derived scrubber classes provide the real implementation.
+   */
+  virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+                                       const hobject_t& soid) override
+  {
+    ceph_assert(false);
+  }
+
+  /**
+   * finalize the parameters of the initiated scrubbing session:
+   *
+   * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set;
+   * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set.
+   */
+  void set_op_parameters(requested_scrub_t& request) final;
+
+  void cleanup_store(ObjectStore::Transaction* t) final;
+
+  /// This implementation ignores both parameters ('res_inout' is left untouched)
+  /// and always returns 'false'. Not marked 'final' - presumably overridden by
+  /// derived scrubbers.
+  bool get_store_errors(const scrub_ls_arg_t& arg,
+                       scrub_ls_result_t& res_inout) const override
+  {
+    return false;
+  }
+
+  int asok_debug(std::string_view cmd,
+                std::string param,
+                Formatter* f,
+                std::stringstream& ss) override;
+  int m_debug_blockrange{0};
+
+  // -------------------------------------------------------------------------------------------
+  // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener)
+
+  [[nodiscard]] bool is_primary() const final { return m_pg->recovery_state.is_primary(); }
+
+  void select_range_n_notify() final;
+
+  Scrub::BlockedRangeWarning acquire_blocked_alarm() final;
+
+  /// walk the log to find the latest update that affects our chunk
+  eversion_t search_log_for_updates() const final;
+
+  eversion_t get_last_update_applied() const final
+  {
+    return m_pg->recovery_state.get_last_update_applied();
+  }
+
+  int pending_active_pushes() const final { return m_pg->active_pushes; }
+
+  void on_init() final;
+  void on_replica_init() final;
+  void replica_handling_done() final;
+
+  /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
+  /// (thus can be called from FSM reactions)
+  void clear_pgscrub_state() final;
+
+  /*
+   * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_needs_sleep'
+   * is asserted - after a configuration-dependent timeout.
+   */
+  void add_delayed_scheduling() final;
+
+  void get_replicas_maps(bool replica_can_preempt) final;
+
+  void on_digest_updates() final;
+
+  ScrubMachineListener::MsgAndEpoch
+  prep_replica_map_msg(Scrub::PreemptionNoted was_preempted) final;
+
+  void send_replica_map(const ScrubMachineListener::MsgAndEpoch& preprepared) final;
+
+  void send_preempted_replica() final;
+
+  void send_remotes_reserved(epoch_t epoch_queued) final;
+  void send_reservation_failure(epoch_t epoch_queued) final;
+
+  /**
+   *  does the PG have newer updates than what we (the scrubber) know?
+   */
+  [[nodiscard]] bool has_pg_marked_new_updates() const final;
+
+  void set_subset_last_update(eversion_t e) final;
+
+  void maps_compare_n_cleanup() final;
+
+  Scrub::preemption_t& get_preemptor() final;
+
+  int build_primary_map_chunk() final;
+
+  int build_replica_map_chunk() final;
+
+  void reserve_replicas() final;
+
+  [[nodiscard]] bool was_epoch_changed() const final;
+
+  void mark_local_map_ready() final;
+
+  [[nodiscard]] bool are_all_maps_available() const final;
+
+  std::string dump_awaited_maps() const final;
+
+ protected:
+  bool state_test(uint64_t m) const { return m_pg->state_test(m); }
+  void state_set(uint64_t m) { m_pg->state_set(m); }
+  void state_clear(uint64_t m) { m_pg->state_clear(m); }
+
+  [[nodiscard]] bool is_scrub_registered() const;
+
+  virtual void _scrub_clear_state() {}
+
+  utime_t m_scrub_reg_stamp;  ///< stamp we registered for
+
+  ostream& show(ostream& out) const override;
+
+ public:
+  // -------------------------------------------------------------------------------------------
+
+  friend ostream& operator<<(ostream& out, const PgScrubber& scrubber);
+
+  static utime_t scrub_must_stamp() { return utime_t(1, 1); }
+
+  virtual ~PgScrubber();  // must be defined separately, in the .cc file
+
+  [[nodiscard]] bool is_scrub_active() const final { return m_active; }
+
+ private:
+  void reset_internal_state();
+
+  /**
+   *  the current scrubbing operation is done. We should mark that fact, so that
+   *  all events related to the previous operation can be discarded.
+   */
+  void advance_token();
+
+  bool is_token_current(Scrub::act_token_t received_token);
+
+  void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
+
+  void _scan_snaps(ScrubMap& smap);
+
+  ScrubMap clean_meta_map();
+
+  /**
+   *  mark down some parameters of the initiated scrub:
+   *  - the epoch when started;
+   *  - the depth of the scrub requested (from the PG_STATE variable)
+   */
+  void reset_epoch(epoch_t epoch_queued);
+
+  void run_callbacks();
+
+  // -----     methods used to verify the relevance of incoming events:
+
+  /**
+   *  is the incoming event still relevant, and should be processed?
+   *
+   *  It isn't if:
+   *  - (1) we are no longer 'actively scrubbing'; or
+   *  - (2) the message is from an epoch prior to when we started the current scrub
+   * session; or
+   *  - (3) the message epoch is from a previous interval; or
+   *  - (4) the 'abort' configuration flags were set.
+   *
+   *  For (1) & (2) - the incoming message is discarded, w/o further action.
+   *
+   *  For (3): (see check_interval() for a full description) if we have not reacted yet
+   *  to this specific new interval, we do now:
+   *  - replica reservations are silently discarded (we count on the replicas to notice
+   *        the interval change and un-reserve themselves);
+   *  - the scrubbing is halted.
+   *
+   *  For (4): the message will be discarded, but also:
+   *    if this is the first time we've noticed the 'abort' request, we perform the abort.
+   *
+   *  \returns should the incoming event be processed?
+   */
+  bool is_message_relevant(epoch_t epoch_to_verify);
+
+  /**
+   * check the 'no scrub' configuration options.
+   */
+  [[nodiscard]] bool should_abort() const;
+
+  /**
+   * Check the 'no scrub' configuration flags.
+   *
+   * Reset everything if the abort was not handled before.
+   * @returns false if the message was discarded due to abort flag.
+   */
+  [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify);
+
+  [[nodiscard]] bool check_interval(epoch_t epoch_to_verify);
+
+  epoch_t m_last_aborted{};  // last time we've noticed a request to abort
+
+  /**
+   * return true if any inconsistency/missing is repaired, false otherwise
+   */
+  [[nodiscard]] bool scrub_process_inconsistent();
+
+  void scrub_compare_maps();
+
+  bool m_needs_sleep{true};  ///< should we sleep before being rescheduled? always
+                            ///< 'true', unless we just got out of a sleep period
+
+  utime_t m_sleep_started_at;
+
+
+  // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed'
+  // to guarantee un-reserving when deleted.
+  std::optional<Scrub::ReplicaReservations> m_reservations;
+  std::optional<Scrub::LocalReservation> m_local_osd_resource;
+
+  /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing
+  std::optional<Scrub::ReservedByRemotePrimary> m_remote_osd_resource;
+
+  void cleanup_on_finish();  // scrub_clear_state() as called for a Primary when
+                            // Active->NotActive
+
+  /// the part that actually finalizes a scrub
+  void scrub_finish();
+
+ protected:
+  PG* const m_pg;
+
+  /**
+   * the derivative-specific scrub-finishing touches:
+   */
+  virtual void _scrub_finish() {}
+
+  /**
+   * Validate consistency of the object info and snap sets.
+   */
+  virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest)
+  {}
+
+  // common code used by build_primary_map_chunk() and build_replica_map_chunk():
+  int build_scrub_map_chunk(ScrubMap& map,  // primary or replica?
+                           ScrubMapBuilder& pos,
+                           hobject_t start,
+                           hobject_t end,
+                           bool deep);
+
+  std::unique_ptr<Scrub::ScrubMachine> m_fsm;
+  const spg_t m_pg_id; ///< a local copy of m_pg->pg_id
+  OSDService* const m_osds;
+  const pg_shard_t m_pg_whoami;         ///< a local copy of m_pg->pg_whoami;
+
+  epoch_t m_interval_start{0};  ///< interval's 'from' of when scrubbing was first scheduled
+  /*
+   * the exact epoch when the scrubbing actually started (started here - cleared checks
+   *  for no-scrub conf). Incoming events are verified against this, with stale events
+   *  discarded.
+   */
+  epoch_t m_epoch_start{0};  ///< the actual epoch when scrubbing started
+
+  /**
+   *  (replica) a tag identifying a specific scrub "session". Incremented whenever the
+   *  Primary releases the replica scrub resources.
+   *  When the scrub session is terminated (even if the interval remains unchanged, as
+   *  might happen following an asok no-scrub command), stale scrub-resched messages
+   *  triggered by the backend will be discarded.
+   */
+  Scrub::act_token_t m_current_token{1};
+
+  scrub_flags_t m_flags;
+
+  bool m_active{false};
+
+  eversion_t m_subset_last_update{};
+
+  std::unique_ptr<Scrub::Store> m_store;
+
+  int num_digest_updates_pending{0};
+  hobject_t m_start, m_end;  ///< note: half-closed: [start,end)
+
+  /// Returns reference to current osdmap
+  const OSDMapRef& get_osdmap() const;
+
+  /// Returns epoch of current osdmap
+  epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); }
+
+  CephContext* get_pg_cct() const { return m_pg->cct; }
+
+  // collected statistics
+  int m_shallow_errors{0};
+  int m_deep_errors{0};
+  int m_fixed_count{0};
+
+  /// Maps from objects with errors to missing peers
+  HobjToShardSetMapping m_missing;
+
+ protected:
+  /**
+   * 'm_is_deep' - is the running scrub a deep one?
+   *
+   * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is
+   * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is
+   * meaningful both for the primary and the replicas, and is used as a parameter when
+   * building the scrub maps.
+   */
+  bool m_is_deep{false};
+
+  /**
+   * If set: affects the backend & scrubber-backend functions called after all
+   * scrub maps are available.
+   *
+   * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be
+   * a "user facing" status display only).
+   */
+  bool m_is_repair{false};
+
+  /**
+   * User-readable summary of the scrubber's current mode of operation. Used for
+   * both osd.*.log and the cluster log.
+   * One of:
+   *    "repair"
+   *    "deep-scrub",
+   *    "scrub"
+   *
+   * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. for
+   * auto_repair will show as "deep-scrub" and not as "repair" (until the first error
+   * is detected).
+   */
+  std::string_view m_mode_desc;
+
+  void update_op_mode_text();
+
+private:
+
+  /**
+   * initiate a deep-scrub after the current scrub ended with errors.
+   */
+  void request_rescrubbing(requested_scrub_t& req_flags);
+
+  /*
+   * Select a range of objects to scrub.
+   *
+   * By:
+   * - setting tentative range based on conf and divisor
+   * - requesting a partial list of elements from the backend;
+   * - handling some head/clones issues
+   *
+   * The selected range is set directly into 'm_start' and 'm_end'
+   */
+  bool select_range();
+
+  std::list<Context*> m_callbacks;
+
+  /**
+   * send a replica (un)reservation request to the acting set
+   *
+   * @param opcode - one of MOSDScrubReserve::REQUEST
+   *                  or MOSDScrubReserve::RELEASE
+   */
+  void message_all_replicas(int32_t opcode, std::string_view op_text);
+
+  hobject_t m_max_end; ///< Largest end that may have been sent to replicas
+  ScrubMap m_primary_scrubmap;
+  ScrubMapBuilder m_primary_scrubmap_pos;
+
+  std::map<pg_shard_t, ScrubMap> m_received_maps;
+
+  /// Cleaned map pending snap metadata scrub
+  ScrubMap m_cleaned_meta_map;
+
+  void _request_scrub_map(pg_shard_t replica,
+                         eversion_t version,
+                         hobject_t start,
+                         hobject_t end,
+                         bool deep,
+                         bool allow_preemption);
+
+
+  Scrub::MapsCollectionStatus m_maps_status;
+
+  omap_stat_t m_omap_stats = (const struct omap_stat_t){0};
+
+  /// Maps from objects with errors to inconsistent peers
+  HobjToShardSetMapping m_inconsistent;
+
+  /// Maps from object with errors to good peers
+  std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t>>> m_authoritative;
+
+  // ------------ members used if we are a replica
+
+  epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message
+
+  ScrubMapBuilder replica_scrubmap_pos;
+  ScrubMap replica_scrubmap;
+
+  /**
+   * we mark the request priority as it arrived. It influences the queuing priority
+   * when we wait for local updates
+   */
+  Scrub::scrub_prio_t m_replica_request_priority;
+
+  /**
+   * the 'preemption' "state-machine".
+   * Note: I was considering an orthogonal sub-machine implementation, but as
+   * the state diagram is extremely simple, the added complexity wasn't justified.
+   *
+   * The state is made of two flags - 'm_preemptable' (is a preemption allowed
+   * right now?) and 'm_preempted' (was a preemption noted?) - plus a count of
+   * the preemptions still allowed (m_left) and the chunk-size divisor.
+   */
+  class preemption_data_t : public Scrub::preemption_t {
+   public:
+    preemption_data_t(PG* pg); // the PG access is used for conf access (and logs)
+
+    [[nodiscard]] bool is_preemptable() const final { return m_preemptable; }
+
+    /// Try to mark the scrub as preempted.
+    /// @returns true only if this call has set the 'preempted' flag. 'false' is
+    ///  returned if already preempted, or if preemption is not allowed now.
+    bool do_preempt() final
+    {
+      // a first check, performed without taking the lock.
+      // NOTE(review): an unlocked read of non-atomic flags that are written
+      // under m_preemption_lock elsewhere - confirm this is acceptable.
+      if (m_preempted || !m_preemptable)
+       return false;
+
+      // 'm_preemptable' is re-checked now that we hold the lock
+      std::lock_guard<std::mutex> lk{m_preemption_lock};
+      if (!m_preemptable)
+       return false;
+
+      m_preempted = true;
+      return true;
+    }
+
+    /// same as 'do_preempt()' but w/o checks (as once a replica
+    /// was preempted, we cannot continue)
+    void replica_preempted() { m_preempted = true; }
+
+    /// allow preemption - but only if there are preemptions left, and no
+    /// preemption was already noted
+    void enable_preemption()
+    {
+      std::lock_guard<std::mutex> lk{m_preemption_lock};
+      if (are_preemptions_left() && !m_preempted) {
+       m_preemptable = true;
+      }
+    }
+
+    /// used by a replica to set preemptability state according to the Primary's request
+    void force_preemptability(bool is_allowed)
+    {
+      // note: no need to lock for a replica
+      m_preempted = false;
+      m_preemptable = is_allowed;
+    }
+
+    /// disallow any further preemption, and report whether one was already noted
+    bool disable_and_test() final
+    {
+      std::lock_guard<std::mutex> lk{m_preemption_lock};
+      m_preemptable = false;
+      return m_preempted;
+    }
+
+    [[nodiscard]] bool was_preempted() const { return m_preempted; }
+
+    [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; }
+
+    void reset();
+
+    /// Re-arm the flags:
+    /// if a preemption was noted - clear it, and "pay" for it via adjust_left()
+    /// (one less preemption allowed, divisor doubled). In both cases preemption
+    /// is allowed from now on only if there are preemptions left.
+    void adjust_parameters() final
+    {
+      std::lock_guard<std::mutex> lk{m_preemption_lock};
+
+      if (m_preempted) {
+       m_preempted = false;
+       m_preemptable = adjust_left();
+      } else {
+       m_preemptable = are_preemptions_left();
+      }
+    }
+
+   private:
+    PG* m_pg;
+    mutable std::mutex m_preemption_lock;
+    bool m_preemptable{false};
+    bool m_preempted{false};
+    // NOTE(review): no in-class initializer for m_left; presumably set by the
+    // constructor and/or reset() (both defined outside of this header) - confirm.
+    int m_left;
+    size_t m_size_divisor{1};
+    bool are_preemptions_left() const { return m_left > 0; }
+
+    /// if any preemptions are left: use one up, and double the chunk-size divisor.
+    /// @returns are there preemptions left after the adjustment?
+    bool adjust_left()
+    {
+      if (m_left > 0) {
+       --m_left;
+       m_size_divisor *= 2;
+      }
+      return m_left > 0;
+    }
+  };
+
+  preemption_data_t preemption_data;
+};
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc
new file mode 100644 (file)
index 0000000..41e3cd1
--- /dev/null
@@ -0,0 +1,521 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "scrub_machine.h"
+
+#include <chrono>
+#include <typeinfo>
+
+#include <boost/core/demangle.hpp>
+
+#include "osd/OSD.h"
+#include "osd/OpRequest.h"
+#include "ScrubStore.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout << " scrubberFSM "
+
+using namespace std::chrono;
+using namespace std::chrono_literals;
+namespace sc = boost::statechart;
+
+// Declares two local aliases, both fetched from the enclosing ScrubMachine context:
+// 'scrbr' (the ScrubMachineListener interface) and 'pg_id'.
+// The assignments to std::ignore are presumably there to silence 'unused variable'
+// warnings wherever only one of the two (or neither) is used.
+#define DECLARE_LOCALS                                           \
+  ScrubMachineListener* scrbr = context<ScrubMachine>().m_scrbr; \
+  std::ignore = scrbr;                                           \
+  auto pg_id = context<ScrubMachine>().m_pg_id;                  \
+  std::ignore = pg_id;
+
+namespace Scrub {
+
+// --------- trace/debug auxiliaries -------------------------------
+
+/// trace aid: logs (at debug level 20) the name of the event, following a
+/// "--vvvv----" marker
+void on_event_creation(std::string_view nm)
+{
+  dout(20) << " event: --vvvv---- " << nm << dendl;
+}
+
+/// trace aid: logs (at debug level 20) the name of the event, following a
+/// "--^^^^----" marker
+void on_event_discard(std::string_view nm)
+{
+  dout(20) << " event: --^^^^---- " << nm << dendl;
+}
+
+/// Logs (at debug level 20) the demangled type name of each of the states in
+/// the [state_begin(), state_end()) range.
+void ScrubMachine::my_states() const
+{
+  auto state_it = state_begin();
+  while (state_it != state_end()) {
+    // naming the reference first prevents a warning re side-effects in typeid()
+    const auto& current_state{*state_it};
+    dout(20) << " state: " << boost::core::demangle(typeid(current_state).name()) << dendl;
+    ++state_it;
+  }
+}
+
+/// Asserts that state_cast<> to 'NotActive' yields a non-null pointer, i.e. that
+/// the machine is in the NotActive state.
+void ScrubMachine::assert_not_active() const
+{
+  ceph_assert(state_cast<const NotActive*>());
+}
+
+/// Is the machine in the ReservingReplicas state? (i.e. does the state_cast<>
+/// to that state yield a non-null pointer?)
+bool ScrubMachine::is_reserving() const
+{
+  const auto* reserving_state = state_cast<const ReservingReplicas*>();
+  return reserving_state != nullptr;
+}
+
+/// Is the machine in the WaitLastUpdate state?
+/// Asserts that the scrubber reports itself to be the Primary.
+bool ScrubMachine::is_accepting_updates() const
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  ceph_assert(scrbr->is_primary());
+
+  const auto* waiting_state = state_cast<const WaitLastUpdate*>();
+  return waiting_state != nullptr;
+}
+
+// for the rest of the code in this file - we know what PG we are dealing with:
+// from here on, dout_prefix is built from the 'm_pg' member of the ScrubMachine
+// context.
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this->context<ScrubMachine>().m_pg)
+/// writes the log-line prefix: whatever t->gen_prefix() emits, followed by
+/// " scrubberFSM pg(<t->pg_id>) "
+template <class T> static ostream& _prefix(std::ostream* _dout, T* t)
+{
+  return t->gen_prefix(*_dout) << " scrubberFSM pg(" << t->pg_id << ") ";
+}
+
+// ////////////// the actual actions
+
+// ----------------------- NotActive -----------------------------------------
+
+/// entering NotActive: only logs the transition
+NotActive::NotActive(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> NotActive" << dendl;
+}
+
+// ----------------------- ReservingReplicas ---------------------------------
+
+/// entering ReservingReplicas: logs the transition, then asks the scrubber to
+/// reserve the replicas
+ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> ReservingReplicas" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  scrbr->reserve_replicas();
+}
+
+/// a reservation has failed: the scrubber state is cleared, and we transition
+/// to NotActive
+sc::result ReservingReplicas::react(const ReservationFailure&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl;
+
+  // the Scrubber must release all resources and abort the scrubbing
+  scrbr->clear_pgscrub_state();
+  return transit<NotActive>();
+}
+
+/**
+ * note: the event poster is handling the scrubber reset.
+ * All that is done here is logging, and transitioning to NotActive.
+ */
+sc::result ReservingReplicas::react(const FullReset&)
+{
+  dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl;
+  return transit<NotActive>();
+}
+
+// ----------------------- ActiveScrubbing -----------------------------------
+
+/// entering ActiveScrubbing: logs the transition, then calls the scrubber's
+/// on_init()
+ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> ActiveScrubbing" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  scrbr->on_init();
+}
+
+/**
+ *  upon exiting the Active state:
+ *  the scrubber is asked to un-reserve the replicas.
+ */
+ActiveScrubbing::~ActiveScrubbing()
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(15) << __func__ << dendl;
+  scrbr->unreserve_replicas();
+}
+
+/*
+ * The only source of an InternalError event as of now is the BuildMap state,
+ * when encountering a backend error.
+ * We kill the scrub and reset the FSM: the scrubber state is cleared, and we
+ * transition to NotActive.
+ */
+sc::result ActiveScrubbing::react(const InternalError&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << __func__ << dendl;
+  scrbr->clear_pgscrub_state();
+  return transit<NotActive>();
+}
+
+/// a full reset was requested: nothing to do here but to log the event and
+/// transition to NotActive
+sc::result ActiveScrubbing::react(const FullReset&)
+{
+  dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl;
+  // caller takes care of clearing the scrubber & FSM states
+  return transit<NotActive>();
+}
+
+// ----------------------- RangeBlocked -----------------------------------
+
+/*
+ * Blocked. Will be released by kick_object_context_blocked() (or upon
+ * an abort)
+ *
+ * Note: we are never expected to be waiting for long for a blocked object.
+ * Unfortunately we know from experience that a bug elsewhere might result
+ * in an indefinite wait in this state, for an object that is never released.
+ * If that happens, all we can do is to issue a warning message to help
+ * with the debugging.
+ *
+ * The alarm object obtained from the scrubber is kept in m_timeout.
+ */
+RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> Act/RangeBlocked" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  // arrange to have a warning message issued if we are stuck in this
+  // state for longer than some reasonable number of minutes.
+  m_timeout = scrbr->acquire_blocked_alarm();
+}
+
+// ----------------------- PendingTimer -----------------------------------
+
+/**
+ *  Sleeping till timer reactivation - or just requeuing.
+ *  Implemented by the scrubber's add_delayed_scheduling().
+ */
+PendingTimer::PendingTimer(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> Act/PendingTimer" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  scrbr->add_delayed_scheduling();
+}
+
+// ----------------------- NewChunk -----------------------------------
+
+/**
+ *  Preconditions:
+ *  - preemption data was set
+ *  - epoch start was updated
+ *
+ *  On entry: the preemption parameters are adjusted, and then a range of
+ *  objects to scrub is selected.
+ */
+NewChunk::NewChunk(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> Act/NewChunk" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  scrbr->get_preemptor().adjust_parameters();
+
+  //  choose range to work on
+  //  select_range_n_notify() will signal either SelectedChunkFree or
+  //  ChunkIsBusy. If 'busy', we transition to Blocked, and wait for the
+  //  range to become available.
+  scrbr->select_range_n_notify();
+}
+
+/// The selected chunk is free: the result of the scrubber's log search is
+/// recorded as the 'subset last update', and we move on to WaitPushes.
+sc::result NewChunk::react(const SelectedChunkFree&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "NewChunk::react(const SelectedChunkFree&)" << dendl;
+
+  const auto latest_chunk_update = scrbr->search_log_for_updates();
+  scrbr->set_subset_last_update(latest_chunk_update);
+
+  return transit<WaitPushes>();
+}
+
+// ----------------------- WaitPushes -----------------------------------
+
+WaitPushes::WaitPushes(my_context ctx) : my_base(ctx)
+{
+  dout(10) << " -- state -->> Act/WaitPushes" << dendl;
+  post_event(ActivePushesUpd{});
+}
+
+/*
+ * Triggered by the WaitPushes ctor (on entry), and externally - on any update re pushes
+ */
+sc::result WaitPushes::react(const ActivePushesUpd&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "WaitPushes::react(const ActivePushesUpd&) pending_active_pushes: "
+          << scrbr->pending_active_pushes() << dendl;
+
+  if (!scrbr->pending_active_pushes()) {
+    // done waiting
+    return transit<WaitLastUpdate>();
+  }
+
+  return discard_event();  // pushes are still pending: stay in WaitPushes
+}
+
+// ----------------------- WaitLastUpdate -----------------------------------
+
+WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx)
+{
+  dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl;
+  post_event(UpdatesApplied{});
+}
+
+/**
+ *  Note:
+ *  Updates are locally readable immediately. Thus, on the replicas we do need
+ *  to wait for the update notifications before scrubbing. For the Primary it's
+ *  a bit different: on EC (and only there) rmw operations have an additional
+ *  read roundtrip. That means that on the Primary we need to wait for
+ *  last_update_applied (the replica side, even on EC, is still safe
+ *  since the actual transaction will already be readable by commit time).
+ */
+void WaitLastUpdate::on_new_updates(const UpdatesApplied&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "WaitLastUpdate::on_new_updates(const UpdatesApplied&)" << dendl;
+
+  if (scrbr->has_pg_marked_new_updates()) {
+    post_event(InternalAllUpdates{});
+  } else {
+    // will be requeued by op_applied
+    dout(10) << "wait for EC read/modify/writes to queue" << dendl;
+  }
+}
+
+/*
+ *  request maps from the replicas in the acting set
+ */
+sc::result WaitLastUpdate::react(const InternalAllUpdates&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl;
+
+  scrbr->get_replicas_maps(scrbr->get_preemptor().is_preemptable());
+  return transit<BuildMap>();
+}
+
+// ----------------------- BuildMap -----------------------------------
+
+BuildMap::BuildMap(my_context ctx) : my_base(ctx)
+{
+  dout(10) << " -- state -->> Act/BuildMap" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  // no need to check for an epoch change, as all possible flows that brought us here have
+  // a check_interval() verification of their final event.
+
+  if (scrbr->get_preemptor().was_preempted()) {
+
+    // we were preempted, either directly or by a replica
+    dout(10) << __func__ << " preempted!!!" << dendl;
+    scrbr->mark_local_map_ready();
+    post_event(IntBmPreempted{});  // moves us to DrainReplMaps
+
+  } else {
+
+    auto ret = scrbr->build_primary_map_chunk();
+
+    if (ret == -EINPROGRESS) {
+      // must wait for the backend to finish. No specific event provided.
+      // build_primary_map_chunk() has already requeued us.
+      dout(20) << "waiting for the backend..." << dendl;
+
+    } else if (ret < 0) {
+
+      dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl;
+      post_event(InternalError{});  // reacted to by our parent state, ActiveScrubbing
+
+    } else {
+
+      // the local map was created
+      post_event(IntLocalMapDone{});
+    }
+  }
+}
+
+sc::result BuildMap::react(const IntLocalMapDone&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "BuildMap::react(const IntLocalMapDone&)" << dendl;
+
+  scrbr->mark_local_map_ready();
+  return transit<WaitReplicas>();
+}
+
+// ----------------------- DrainReplMaps -----------------------------------
+
+DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> Act/DrainReplMaps" << dendl;
+  // we may have received all maps already. Send the event that will make us check.
+  post_event(GotReplicas{});
+}
+
+sc::result DrainReplMaps::react(const GotReplicas&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "DrainReplMaps::react(const GotReplicas&)" << dendl;
+
+  if (scrbr->are_all_maps_available()) {
+    // NewChunk (entered via PendingTimer) will handle the preemption that brought us here
+    return transit<PendingTimer>();
+  }
+
+  dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: "
+          << scrbr->dump_awaited_maps() << dendl;
+  return discard_event();
+}
+
+// ----------------------- WaitReplicas -----------------------------------
+
+WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> Act/WaitReplicas" << dendl;
+  post_event(GotReplicas{});
+}
+
+/**
+ * note: now that maps_compare_n_cleanup() is "futurized"(*), and we remain in this state
+ *  for a while even after we got all our maps, we must prevent are_all_maps_available()
+ *  (actually - the code after the if()) from being called more than once.
+ * This is basically a separate state, but it's too transitory and artificial to justify
+ *  the cost of a separate state.
+ *
+ * (*) "futurized" - in Crimson, the call to maps_compare_n_cleanup() returns immediately
+ *  after initiating the process. The actual termination of the maps comparing etc. is
+ *  signalled via an event. As we share the code with "classic" OSD, here too
+ *  maps_compare_n_cleanup() is responsible for signalling the completion of the
+ *  processing.
+ */
+sc::result WaitReplicas::react(const GotReplicas&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "WaitReplicas::react(const GotReplicas&)" << dendl;
+
+  if (!all_maps_already_called && scrbr->are_all_maps_available()) {
+    dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl;
+
+    all_maps_already_called = true;
+
+    // were we preempted?
+    if (scrbr->get_preemptor().disable_and_test()) {  // a test&set
+
+
+      dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" << dendl;
+      return transit<PendingTimer>();
+
+    } else {
+
+      // maps_compare_n_cleanup() will arrange for MapsCompared event to be sent:
+      scrbr->maps_compare_n_cleanup();
+      return discard_event();
+    }
+  } else {
+    return discard_event();  // maps still missing, or the comparison was already started
+  }
+}
+
+// ----------------------- WaitDigestUpdate -----------------------------------
+
+WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl;
+  // perform an initial check: maybe we already
+  // have all the updates we need:
+  // (note that DigestUpdate is usually an external event)
+  post_event(DigestUpdate{});
+}
+
+sc::result WaitDigestUpdate::react(const DigestUpdate&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "WaitDigestUpdate::react(const DigestUpdate&)" << dendl;
+
+  // on_digest_updates() will either:
+  // - do nothing - if we are still waiting for updates, or
+  // - finish the scrubbing of the current chunk, and:
+  //  - send NextChunk, or
+  //  - send ScrubFinished
+
+  scrbr->on_digest_updates();
+  return discard_event();
+}
+
+ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub)
+    : m_pg{pg}, m_pg_id{pg->pg_id}, m_scrbr{pg_scrub}
+{
+  dout(15) << "ScrubMachine created " << m_pg_id << dendl;
+}
+
+ScrubMachine::~ScrubMachine() = default;
+
+// -------- for replicas -----------------------------------------------------
+
+// ----------------------- ReplicaWaitUpdates --------------------------------
+
+ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl;
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  scrbr->on_replica_init();
+}
+
+/*
+ * Triggered externally, by the entity that had an update re pushes
+ */
+sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): "
+          << scrbr->pending_active_pushes() << dendl;
+
+  if (scrbr->pending_active_pushes() == 0) {
+
+    // done waiting
+    return transit<ActiveReplica>();
+  }
+
+  return discard_event();
+}
+
+/**
+ * the event poster is handling the scrubber reset
+ */
+sc::result ReplicaWaitUpdates::react(const FullReset&)
+{
+  dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl;
+  return transit<NotActive>();
+}
+
+// ----------------------- ActiveReplica -----------------------------------
+
+ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "-- state -->> ActiveReplica" << dendl;
+  scrbr->on_replica_init();  // as we might have skipped ReplicaWaitUpdates
+  post_event(SchedReplica{});
+}
+
+sc::result ActiveReplica::react(const SchedReplica&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? "
+          << scrbr->get_preemptor().is_preemptable() << dendl;
+
+  if (scrbr->get_preemptor().was_preempted()) {
+    dout(10) << "replica scrub job preempted" << dendl;
+
+    scrbr->send_preempted_replica();
+    scrbr->replica_handling_done();
+    return transit<NotActive>();
+  }
+
+  // start or check progress of build_replica_map_chunk()
+  auto ret_init = scrbr->build_replica_map_chunk();
+  if (ret_init != -EINPROGRESS) {  // any other result returns the FSM to NotActive
+    return transit<NotActive>();
+  }
+
+  return discard_event();  // map building is in progress: remain in ActiveReplica
+}
+
+/**
+ * the event poster is handling the scrubber reset
+ */
+sc::result ActiveReplica::react(const FullReset&)
+{
+  dout(10) << "ActiveReplica::react(const FullReset&)" << dendl;
+  return transit<NotActive>();
+}
+
+}  // namespace Scrub
diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h
new file mode 100644 (file)
index 0000000..7f18700
--- /dev/null
@@ -0,0 +1,346 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <string>
+
+#include <boost/statechart/custom_reaction.hpp>
+#include <boost/statechart/deferral.hpp>
+#include <boost/statechart/event.hpp>
+#include <boost/statechart/event_base.hpp>
+#include <boost/statechart/in_state_reaction.hpp>
+#include <boost/statechart/simple_state.hpp>
+#include <boost/statechart/state.hpp>
+#include <boost/statechart/state_machine.hpp>
+#include <boost/statechart/transition.hpp>
+
+#include "common/version.h"
+#include "include/Context.h"
+
+#include "scrub_machine_lstnr.h"
+#include "osd/scrubber_common.h"
+
+using namespace std::string_literals;
+
+class PG;  // holding a pointer to that one - just for testing
+class PgScrubber;
+namespace Scrub {
+
+namespace sc = ::boost::statechart;
+namespace mpl = ::boost::mpl;
+
+//
+//  EVENTS
+//
+
+void on_event_creation(std::string_view nm);
+void on_event_discard(std::string_view nm);
+
+#define MEV(E)                                          \
+  struct E : sc::event<E> {                             \
+    inline static int actv{0};                          \
+    E()                                                 \
+    {                                                   \
+      if (!actv++)                                      \
+       on_event_creation(#E);                          \
+    }                                                   \
+    ~E()                                                \
+    {                                                   \
+      if (!--actv)                                      \
+       on_event_discard(#E);                           \
+    }                                                   \
+    void print(std::ostream* out) const { *out << #E; } \
+    std::string_view print() const { return #E; }       \
+  };
+
+MEV(RemotesReserved)  ///< all replicas have granted our reserve request
+
+MEV(ReservationFailure)         ///< a reservation request has failed
+
+MEV(StartScrub)         ///< initiate a new scrubbing session (relevant if we are a Primary)
+
+MEV(AfterRepairScrub)  ///< initiate a new scrubbing session. Only triggered at Recovery
+                      ///< completion.
+
+MEV(Unblocked) ///< triggered when the PG unblocked an object that was marked for
+               ///< scrubbing. Via the PGScrubUnblocked op
+
+MEV(InternalSchedScrub)
+
+MEV(SelectedChunkFree)
+
+MEV(ChunkIsBusy)
+
+MEV(ActivePushesUpd)  ///< Update to active_pushes. 'active_pushes' represents recovery
+                     ///< that is in-flight to the local ObjectStore
+
+MEV(UpdatesApplied)  ///< (Primary only) all updates are committed
+
+MEV(InternalAllUpdates)         ///< the internal counterpart of UpdatesApplied
+
+MEV(GotReplicas)  ///< got a map from a replica
+
+MEV(IntBmPreempted)  ///< internal - BuildMap preempted. Required, as detected within the
+                    ///< ctor
+
+MEV(InternalError)
+
+MEV(IntLocalMapDone)
+
+MEV(DigestUpdate)  ///< external. called upon success of a MODIFY op. See
+                  ///< scrub_snapshot_metadata()
+
+MEV(MapsCompared)  ///< (Crimson) maps_compare_n_cleanup() transactions are done
+
+MEV(StartReplica)  ///< initiating replica scrub.
+
+MEV(StartReplicaNoWait)         ///< 'start replica' when there are no pending updates
+
+MEV(SchedReplica)
+
+MEV(ReplicaPushesUpd)  ///< Update to active_pushes. 'active_pushes' represents recovery
+                      ///< that is in-flight to the local ObjectStore
+
+MEV(FullReset) ///< guarantee that the FSM is in the quiescent state (i.e. NotActive)
+
+MEV(NextChunk) ///< finished handling this chunk. Go get the next one
+
+MEV(ScrubFinished)  ///< all chunks handled
+
+
+struct NotActive;          ///< the quiescent state. No active scrubbing.
+struct ReservingReplicas;   ///< securing scrub resources from replicas' OSDs
+struct ActiveScrubbing;            ///< the active state for a Primary. A sub-machine.
+struct ReplicaWaitUpdates;  ///< an active state for a replica. Waiting for all active
+                           ///< operations to finish.
+struct ActiveReplica;      ///< an active state for a replica.
+
+
+class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
+ public:
+  friend class PgScrubber;
+
+ public:
+  explicit ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub);
+  ~ScrubMachine();
+
+  PG* m_pg;  // only used for dout messages
+  spg_t m_pg_id;
+  ScrubMachineListener* m_scrbr;
+
+  void my_states() const;
+  void assert_not_active() const;
+  [[nodiscard]] bool is_reserving() const;
+  [[nodiscard]] bool is_accepting_updates() const;
+};
+
+/**
+ *  The Scrubber's base (quiescent) state.
+ *  Scrubbing is triggered by one of the following events:
+ *  - (standard scenario for a Primary): 'StartScrub'. Initiates the OSDs resources
+ *    reservation process. Will be issued by PG::scrub(), following a
+ *    queued "PGScrub" op.
+ *  - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is
+ *    not required to reserve resources.
+ *  - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming
+ *    MOSDRepScrub message.
+ *
+ *  note (20.8.21): originally, AfterRepairScrub was triggering a scrub without waiting
+ *   for replica resources to be acquired. But once replicas started using the
+ *   resource-request to identify and tag the scrub session, this bypass cannot be
+ *   supported anymore.
+ */
+struct NotActive : sc::state<NotActive, ScrubMachine> {
+  explicit NotActive(my_context ctx);
+
+  using reactions = mpl::list<sc::transition<StartScrub, ReservingReplicas>,
+                             // a scrubbing that was initiated at recovery completion.
+                             // It, too, now goes through ReservingReplicas (see note above):
+                             sc::transition<AfterRepairScrub, ReservingReplicas>,
+                             sc::transition<StartReplica, ReplicaWaitUpdates>,
+                             sc::transition<StartReplicaNoWait, ActiveReplica>>;
+};
+
+struct ReservingReplicas : sc::state<ReservingReplicas, ScrubMachine> {
+
+  explicit ReservingReplicas(my_context ctx);
+  using reactions = mpl::list<sc::custom_reaction<FullReset>,
+                             // all replicas granted our resources request
+                             sc::transition<RemotesReserved, ActiveScrubbing>,
+                             sc::custom_reaction<ReservationFailure>>;
+
+  sc::result react(const FullReset&);
+
+  /// at least one replica denied us the scrub resources we've requested
+  sc::result react(const ReservationFailure&);
+};
+
+
+// the "active" sub-states
+
+struct RangeBlocked;  ///< the objects range is blocked
+struct PendingTimer;  ///< either delaying the scrub by some time and requeuing, or just
+                     ///< requeue
+struct NewChunk;      ///< select a chunk to scrub, and verify its availability
+struct WaitPushes;
+struct WaitLastUpdate;
+struct BuildMap;
+struct DrainReplMaps;  ///< a problem during BuildMap. Wait for all replicas to report,
+                      ///< then restart.
+struct WaitReplicas;   ///< wait for all replicas to report
+struct WaitDigestUpdate;
+
+struct ActiveScrubbing : sc::state<ActiveScrubbing, ScrubMachine, PendingTimer> {
+
+  explicit ActiveScrubbing(my_context ctx);
+  ~ActiveScrubbing();
+
+  using reactions = mpl::list<
+    sc::custom_reaction<InternalError>,
+    sc::custom_reaction<FullReset>>;
+
+  sc::result react(const FullReset&);
+  sc::result react(const InternalError&);
+};
+
+struct RangeBlocked : sc::state<RangeBlocked, ActiveScrubbing> {
+  explicit RangeBlocked(my_context ctx);
+  using reactions = mpl::list<sc::transition<Unblocked, PendingTimer>>;
+
+  Scrub::BlockedRangeWarning m_timeout;
+};
+
+struct PendingTimer : sc::state<PendingTimer, ActiveScrubbing> {
+
+  explicit PendingTimer(my_context ctx);
+
+  using reactions = mpl::list<sc::transition<InternalSchedScrub, NewChunk>>;
+};
+
+struct NewChunk : sc::state<NewChunk, ActiveScrubbing> {
+
+  explicit NewChunk(my_context ctx);
+
+  using reactions = mpl::list<sc::transition<ChunkIsBusy, RangeBlocked>,
+                             sc::custom_reaction<SelectedChunkFree>>;
+
+  sc::result react(const SelectedChunkFree&);
+};
+
+/**
+ * initiate the update process for this chunk
+ *
+ * Wait for 'active_pushes' to clear.
+ * 'active_pushes' represents recovery that is in-flight to the local Objectstore, hence
+ * scrub waits until the correct data is readable (in-flight data to the Objectstore is
+ * not readable until written to disk, termed 'applied' here)
+ */
+struct WaitPushes : sc::state<WaitPushes, ActiveScrubbing> {
+
+  explicit WaitPushes(my_context ctx);
+
+  using reactions = mpl::list<sc::custom_reaction<ActivePushesUpd>>;
+
+  sc::result react(const ActivePushesUpd&);
+};
+
+struct WaitLastUpdate : sc::state<WaitLastUpdate, ActiveScrubbing> {
+
+  explicit WaitLastUpdate(my_context ctx);
+
+  void on_new_updates(const UpdatesApplied&);
+
+  using reactions = mpl::list<sc::custom_reaction<InternalAllUpdates>,
+                             sc::in_state_reaction<UpdatesApplied,
+                                                   WaitLastUpdate,
+                                                   &WaitLastUpdate::on_new_updates>>;
+
+  sc::result react(const InternalAllUpdates&);
+};
+
+struct BuildMap : sc::state<BuildMap, ActiveScrubbing> {
+  explicit BuildMap(my_context ctx);
+
+  // possible error scenarios:
+  // - an error reported by the backend will trigger an 'InternalError' event,
+  //   handled by our parent state;
+  // - if preempted, we switch to DrainReplMaps, where we will wait for all
+  //   replicas to send their maps before acknowledging the preemption;
+  // - an interval change will be handled by the relevant 'send-event' functions,
+  //   and will be translated into a 'FullReset' event.
+  using reactions =
+    mpl::list<sc::transition<IntBmPreempted, DrainReplMaps>,
+             sc::transition<InternalSchedScrub, BuildMap>,  // looping, waiting
+                                                            // for the backend to
+                                                            // finish
+             sc::custom_reaction<IntLocalMapDone>>;
+
+  sc::result react(const IntLocalMapDone&);
+};
+
+/*
+ *  "drain" scrub-maps responses from replicas
+ */
+struct DrainReplMaps : sc::state<DrainReplMaps, ActiveScrubbing> {
+  explicit DrainReplMaps(my_context ctx);
+
+  using reactions =
+    mpl::list<sc::custom_reaction<GotReplicas> // all replicas are accounted for
+             >;
+
+  sc::result react(const GotReplicas&);
+};
+
+struct WaitReplicas : sc::state<WaitReplicas, ActiveScrubbing> {
+  explicit WaitReplicas(my_context ctx);
+
+  using reactions =
+    mpl::list<sc::custom_reaction<GotReplicas>,         // all replicas are accounted for
+             sc::transition<MapsCompared, WaitDigestUpdate>,
+             sc::deferral<DigestUpdate>  // might arrive before we've reached WDU
+             >;
+
+  sc::result react(const GotReplicas&);
+
+  bool all_maps_already_called{false}; // see comment in react code
+};
+
+struct WaitDigestUpdate : sc::state<WaitDigestUpdate, ActiveScrubbing> {
+  explicit WaitDigestUpdate(my_context ctx);
+
+  using reactions = mpl::list<sc::custom_reaction<DigestUpdate>,
+                             sc::transition<NextChunk, PendingTimer>,
+                             sc::transition<ScrubFinished, NotActive>>;
+  sc::result react(const DigestUpdate&);
+};
+
+// ----------------------------- the "replica active" states -----------------------
+
+/*
+ * Waiting for 'active_pushes' to complete
+ *
+ * When in this state:
+ * - the details of the Primary's request were internalized by PgScrubber;
+ * - 'active' scrubbing is set
+ */
+struct ReplicaWaitUpdates : sc::state<ReplicaWaitUpdates, ScrubMachine> {
+  explicit ReplicaWaitUpdates(my_context ctx);
+  using reactions =
+    mpl::list<sc::custom_reaction<ReplicaPushesUpd>, sc::custom_reaction<FullReset>>;
+
+  sc::result react(const ReplicaPushesUpd&);
+  sc::result react(const FullReset&);
+};
+
+
+struct ActiveReplica : sc::state<ActiveReplica, ScrubMachine> {
+  explicit ActiveReplica(my_context ctx);
+  using reactions = mpl::list<sc::custom_reaction<SchedReplica>,
+                             sc::custom_reaction<FullReset>,
+                             sc::transition<ScrubFinished, NotActive>>;
+
+  sc::result react(const SchedReplica&);
+  sc::result react(const FullReset&);
+};
+
+}  // namespace Scrub
diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h
new file mode 100644 (file)
index 0000000..25bd080
--- /dev/null
@@ -0,0 +1,164 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+/**
+ * \file the PgScrubber interface used by the scrub FSM
+ */
+#include "common/version.h"
+#include "include/Context.h"
+
+#include "osd/osd_types.h"
+
+namespace Scrub {
+
+enum class PreemptionNoted { no_preemption, preempted };
+
+/// the interface exposed by the PgScrubber into its internal
+/// preemption_data object
+struct preemption_t {
+
+  virtual ~preemption_t() = default;
+
+  [[nodiscard]] virtual bool is_preemptable() const = 0;
+
+  [[nodiscard]] virtual bool was_preempted() const = 0;
+
+  virtual void adjust_parameters() = 0;
+
+  /**
+   *  Try to preempt the scrub.
+   *  'true' (i.e. - preempted) if:
+   *   preemptable && not already preempted
+   */
+  virtual bool do_preempt() = 0;
+
+  /**
+   *  disables preemptions.
+   *  Returns 'true' if we were already preempted
+   */
+  virtual bool disable_and_test() = 0;
+};
+
+/// an aux used when blocking on a busy object.
+/// Issues a log warning if still blocked after 'waittime'.
+struct blocked_range_t {
+  blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id);
+  ~blocked_range_t();  // NOTE(review): copy ops are not deleted - confirm never copied
+
+  OSDService* m_osds;
+  Context* m_callbk;  // presumably the pending warning callback - verify in the .cc
+};
+
+using BlockedRangeWarning = std::unique_ptr<blocked_range_t>;
+
+}  // namespace Scrub
+
+struct ScrubMachineListener {
+
+  struct MsgAndEpoch {
+    MessageRef m_msg;
+    epoch_t m_epoch;
+  };
+
+  virtual ~ScrubMachineListener() = default;
+
+  [[nodiscard]] virtual bool is_primary() const = 0;
+
+  virtual void select_range_n_notify() = 0;
+
+  virtual Scrub::BlockedRangeWarning acquire_blocked_alarm() = 0;
+
+  /// walk the log to find the latest update that affects our chunk
+  virtual eversion_t search_log_for_updates() const = 0;
+
+  virtual eversion_t get_last_update_applied() const = 0;
+
+  virtual int pending_active_pushes() const = 0;
+
+  virtual int build_primary_map_chunk() = 0;
+
+  virtual int build_replica_map_chunk() = 0;
+
+  virtual void on_init() = 0;
+
+  virtual void on_replica_init() = 0;
+
+  virtual void replica_handling_done() = 0;
+
+  /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
+  /// (thus can be called from FSM reactions)
+  virtual void clear_pgscrub_state() = 0;
+
+  /*
+   * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep'
+   * is asserted - after a configuration-dependent timeout.
+   */
+  virtual void add_delayed_scheduling() = 0;
+
+  /**
+   * Ask all replicas for their scrub maps for the current chunk.
+   */
+  virtual void get_replicas_maps(bool replica_can_preempt) = 0;
+
+  virtual void on_digest_updates() = 0;
+
+  /**
+   * Prepare a MOSDRepScrubMap message carrying the requested scrub map
+   * @param was_preempted - were we preempted?
+   * @return the message, and the current value of 'm_replica_min_epoch' (which is
+   *     used when sending the message, but will be overwritten before that).
+   */
+  [[nodiscard]] virtual MsgAndEpoch prep_replica_map_msg(
+    Scrub::PreemptionNoted was_preempted) = 0;
+
+  /**
+   * Send to the primary the pre-prepared message containing the requested map
+   */
+  virtual void send_replica_map(const MsgAndEpoch& preprepared) = 0;
+
+  /**
+   * Let the primary know that we were preempted while trying to build the
+   * requested map.
+   */
+  virtual void send_preempted_replica() = 0;
+
+  [[nodiscard]] virtual bool has_pg_marked_new_updates() const = 0;
+
+  virtual void set_subset_last_update(eversion_t e) = 0;
+
+  [[nodiscard]] virtual bool was_epoch_changed() const = 0;
+
+  virtual Scrub::preemption_t& get_preemptor() = 0;
+
+  /**
+   *  a "technical" collection of the steps performed once all
+   *  rep maps are available:
+   *  - the maps are compared
+   *  - the scrub region markers (start_ & end_) are advanced
+   *  - callbacks and ops that were pending are allowed to run
+   */
+  virtual void maps_compare_n_cleanup() = 0;
+
+  /**
+   * order the PgScrubber to initiate the process of reserving replicas' scrub
+   * resources.
+   */
+  virtual void reserve_replicas() = 0;
+
+  virtual void unreserve_replicas() = 0;
+
+  /**
+   * the FSM interface into the "are we waiting for maps, either our own or from
+   * replicas" state.
+   * The FSM can only:
+   * - mark the local map as available, and
+   * - query status
+   */
+  virtual void mark_local_map_ready() = 0;
+
+  [[nodiscard]] virtual bool are_all_maps_available() const = 0;
+
+  /// a log/debug interface
+  virtual std::string dump_awaited_maps() const = 0;
+};