osd: extracting scrubbing functionality from 'PG'
author     Ronen Friedman <rfriedma@redhat.com>
           Sun, 15 Nov 2020 16:39:33 +0000 (18:39 +0200)
committer  Ronen Friedman <rfriedma@redhat.com>
           Thu, 10 Dec 2020 13:21:53 +0000 (15:21 +0200)
into a new PgScrubber object. Note that for PrimaryLogPG, a PG derivative,
the change will only be completed in the following commits.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
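
The shape of the extraction, in rough outline: the PG owns the new scrubber object and keeps only thin forwarding methods, while the "planned scrub" flags live in a plain struct on the PG. A minimal sketch of that pattern (the interface name ScrubberLike and its method set are illustrative stand-ins, not the actual pg_scrubber.h interface; m_scrubber, m_planned_scrub and requested_scrub_t are names from this commit):

    // Sketch only: PG delegates scrub work to an owned scrubber object.
    // 'ScrubberLike' is a hypothetical stand-in for the real interface.
    struct requested_scrub_t;  // the 'planned scrub' flag-set (see PG.cc below)

    class ScrubberLike {
    public:
      virtual ~ScrubberLike() = default;
      virtual void reg_next_scrub(requested_scrub_t& planned) = 0;
      virtual void unreg_next_scrub() = 0;
    };

    class PG {
      std::unique_ptr<ScrubberLike> m_scrubber;  // the extracted machinery
      requested_scrub_t m_planned_scrub;         // flags for the next scrub

    public:
      void on_info_history_change() {
        m_scrubber->unreg_next_scrub();
        m_scrubber->reg_next_scrub(m_planned_scrub);
      }
    };
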
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc
src/osd/PG.h
src/osd/PeeringState.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/osd/pg_scrubber.cc
src/osd/pg_scrubber.h
src/osd/scheduler/OpSchedulerItem.cc
src/osd/scheduler/OpSchedulerItem.h

index bcc2a9de03b551c84f13ae98778c020df18436f1..db0d9eb23f8bc3bd2ecaf67f37b13b965570cdf6 100644 (file)
@@ -1767,21 +1767,29 @@ void OSDService::queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority
     pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
 }
 
-void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
+void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
 {
-  unsigned scrub_queue_priority = pg->scrubber.priority;
-  if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
-    scrub_queue_priority = cct->_conf->osd_client_op_priority;
-  }
-  const auto epoch = pg->get_osdmap_epoch();
-  enqueue_back(
-    OpSchedulerItem(
-      unique_ptr<OpSchedulerItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
-      cct->_conf->osd_scrub_cost,
-      scrub_queue_priority,
-      ceph_clock_now(),
-      0,
-      epoch));
+  queue_scrub_event_msg<PGScrub>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+  queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
+}
+
+void OSDService::queue_for_rep_scrub(PG* pg,
+                                    Scrub::scrub_prio_t with_priority,
+                                    unsigned int qu_priority)
+{
+  queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority);
+}
+
+void OSDService::queue_for_rep_scrub_resched(PG* pg,
+                                            Scrub::scrub_prio_t with_priority,
+                                            unsigned int qu_priority)
+{
+  // Resulting scrub event: 'SchedReplica'
+  queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority);
 }
 
 void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
@@ -1796,6 +1804,46 @@ void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priorit
   queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
 }
 
+void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+  queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+  // Resulting scrub event: 'ActivePushesUpd'
+  queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+  queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+  // Resulting scrub event: 'Unblocked'
+  queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+  // Resulting scrub event: 'DigestUpdate'
+  queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+  // Resulting scrub event: 'GotReplicas'
+  queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
+{
+  // Resulting scrub event: 'ReplicaPushesUpd'
+  queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
+}
+
 void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
 {
   dout(10) << __func__ << " on " << pgid << " e " << e  << dendl;
@@ -7386,6 +7434,45 @@ bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs)
   return pgid < rhs.pgid;
 }
 
+// this one is only moved here (from the header) temporarily, for debugging:
+void OSDService::unreg_pg_scrub(spg_t pgid, utime_t t)
+{
+  std::lock_guard l{OSDService::sched_scrub_lock};
+  size_t removed = sched_scrub_pg.erase(ScrubJob{cct, pgid, t});
+  ceph_assert(removed);
+  dout(10) << __func__ << " scrub-set removed: " << pgid << " T(" << t << ")" << dendl;
+}
+
+// this one is only moved here (from the header) temporarily, for debugging:
+utime_t OSDService::reg_pg_scrub(spg_t pgid, utime_t t, double pool_scrub_min_interval,
+                     double pool_scrub_max_interval, bool must)
+{
+  ScrubJob scrub_job(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval,
+                     must);
+  std::lock_guard l(OSDService::sched_scrub_lock);
+  auto [x, inserted] = sched_scrub_pg.insert(scrub_job);
+  dout(10) << __func__ << " scrub-set inserted: " << pgid << " T(" << t << ")" << " must: " << must << " inserted "
+    << inserted << dendl;
+  return scrub_job.sched_time;
+}
+
+void OSDService::dumps_scrub(ceph::Formatter *f)
+{
+  ceph_assert(f != nullptr);
+  std::lock_guard l(sched_scrub_lock);
+
+  f->open_array_section("scrubs");
+  for (const auto &i: sched_scrub_pg) {
+    f->open_object_section("scrub");
+    f->dump_stream("pgid") << i.pgid;
+    f->dump_stream("sched_time") << i.sched_time;
+    f->dump_stream("deadline") << i.deadline;
+    f->dump_bool("forced", i.sched_time == PgScrubber::scrub_must_stamp());
+    f->close_section();
+  }
+  f->close_section();
+}
+
 double OSD::scrub_sleep_time(bool must_scrub)
 {
   if (must_scrub) {
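
For reference, dumps_scrub() above emits one object per registered scrub job; with a JSON formatter the output would look roughly like this (values are illustrative):

    {
      "scrubs": [
        { "pgid": "1.0",
          "sched_time": "2020-11-15 16:39:33.000000",
          "deadline": "2020-11-16 16:39:33.000000",
          "forced": false }
      ]
    }
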
@@ -7483,14 +7570,17 @@ bool OSD::scrub_load_below_threshold()
 
 void OSD::sched_scrub()
 {
+  dout(20) << __func__ << " sched_scrub starts" << dendl;
+
   // if not permitted, fail fast
   if (!service.can_inc_scrubs()) {
+    dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
     return;
   }
   bool allow_requested_repair_only = false;
   if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
     if (!cct->_conf->osd_repair_during_recovery) {
-      dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
+      dout(15) << __func__ << ": not scheduling scrubs due to active recovery" << dendl;
       return;
     }
     dout(10) << __func__
@@ -7504,57 +7594,62 @@ void OSD::sched_scrub()
   bool load_is_low = scrub_load_below_threshold();
   dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
 
-  OSDService::ScrubJob scrub;
-  if (service.first_scrub_stamp(&scrub)) {
+  OSDService::ScrubJob scrub_job;
+  if (service.first_scrub_stamp(&scrub_job)) {
     do {
       dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
 
-      if (scrub.sched_time > now) {
+      if (scrub_job.sched_time > now) {
        // save ourselves some effort
-       dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
+       dout(20) << "sched_scrub " << scrub_job.pgid << " scheduled at " << scrub_job.sched_time
                 << " > " << now << dendl;
        break;
       }
 
-      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
-        dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
+      if ((scrub_job.deadline.is_zero() || scrub_job.deadline >= now) && !(time_permit && load_is_low)) {
+        dout(15) << __func__ << " not scheduling scrub for " << scrub_job.pgid << " due to "
                  << (!time_permit ? "time not permit" : "high load") << dendl;
         continue;
       }
 
-      PGRef pg = _lookup_lock_pg(scrub.pgid);
-      if (!pg)
+      PGRef pg = _lookup_lock_pg(scrub_job.pgid);
+      if (!pg) {
+       dout(20) << __func__ << " pg  " << scrub_job.pgid << " not found" << dendl;
        continue;
+      }
+
       // This has already started, so go on to the next scrub job
-      if (pg->scrubber.active) {
+      if (pg->is_scrub_active()) {
        pg->unlock();
-       dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
+       dout(20) << __func__ << ": already in progress pgid " << scrub_job.pgid << dendl;
        continue;
       }
-      // Skip other kinds of scrubing if only explicitly requested repairing is allowed
-      if (allow_requested_repair_only && !pg->scrubber.must_repair) {
+      // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
+      if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
         pg->unlock();
-        dout(10) << __func__ << " skip " << scrub.pgid
+        dout(10) << __func__ << " skip " << scrub_job.pgid
                  << " because repairing is not explicitly requested on it"
                  << dendl;
         continue;
       }
+
       // If it is reserving, let it resolve before going to the next scrub job
-      if (pg->scrubber.local_reserved && !pg->scrubber.active) {
+      if (pg->m_scrubber->is_reserving()) {
        pg->unlock();
-       dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
+       dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl;
        break;
       }
-      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
+      dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time
               << (pg->get_must_scrub() ? ", explicitly requested" :
                   (load_is_low ? ", load_is_low" : " deadline < now"))
               << dendl;
       if (pg->sched_scrub()) {
        pg->unlock();
+        dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl;
        break;
       }
       pg->unlock();
-    } while (service.next_scrub_stamp(scrub, &scrub));
+    } while (service.next_scrub_stamp(scrub_job, &scrub_job));
   }
   dout(20) << "sched_scrub done" << dendl;
 }
@@ -7562,20 +7657,20 @@ void OSD::sched_scrub()
 void OSD::resched_all_scrubs()
 {
   dout(10) << __func__ << ": start" << dendl;
-  OSDService::ScrubJob scrub;
-  if (service.first_scrub_stamp(&scrub)) {
+  OSDService::ScrubJob scrub_job;
+  if (service.first_scrub_stamp(&scrub_job)) {
     do {
-      dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
+      dout(20) << __func__ << ": examine " << scrub_job.pgid << dendl;
 
-      PGRef pg = _lookup_lock_pg(scrub.pgid);
+      PGRef pg = _lookup_lock_pg(scrub_job.pgid);
       if (!pg)
        continue;
-      if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
-        dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
+      if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
+        dout(15) << __func__ << ": reschedule " << scrub_job.pgid << dendl;
         pg->on_info_history_change();
       }
       pg->unlock();
-    } while (service.next_scrub_stamp(scrub, &scrub));
+    } while (service.next_scrub_stamp(scrub_job, &scrub_job));
   }
   dout(10) << __func__ << ": done" << dendl;
 }
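
Both loops above walk the jobs in schedule order via first_scrub_stamp()/next_scrub_stamp(), which relies on the ScrubJob ordering whose tail appears in the operator< context line earlier in this file. The full comparator presumably reads as follows (the sched_time comparison is inferred from context; only the final pgid tie-break is shown verbatim above):

    bool OSDService::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const
    {
      if (sched_time < rhs.sched_time)
        return true;   // earlier-scheduled jobs sort (and get picked) first
      if (rhs.sched_time < sched_time)
        return false;
      return pgid < rhs.pgid;  // tie-break on pg id (shown in the hunk above)
    }
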
index 92bb331365b0b17600255541598e5e81b12e9c3b..aab81286fdb94e7030812c78d318eff4e4a9f4c1 100644 (file)
@@ -291,20 +291,10 @@ public:
   };
   std::set<ScrubJob> sched_scrub_pg;
 
-  /// @returns the scrub_reg_stamp used for unregister the scrub job
+  /// @returns the scrub_reg_stamp used for unregistering the scrub job
   utime_t reg_pg_scrub(spg_t pgid, utime_t t, double pool_scrub_min_interval,
-                      double pool_scrub_max_interval, bool must) {
-    ScrubJob scrub(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval,
-                  must);
-    std::lock_guard l(sched_scrub_lock);
-    sched_scrub_pg.insert(scrub);
-    return scrub.sched_time;
-  }
-  void unreg_pg_scrub(spg_t pgid, utime_t t) {
-    std::lock_guard l(sched_scrub_lock);
-    size_t removed = sched_scrub_pg.erase(ScrubJob(cct, pgid, t));
-    ceph_assert(removed);
-  }
+                      double pool_scrub_max_interval, bool must);
+  void unreg_pg_scrub(spg_t pgid, utime_t t);
   bool first_scrub_stamp(ScrubJob *out) {
     std::lock_guard l(sched_scrub_lock);
     if (sched_scrub_pg.empty())
@@ -328,21 +318,7 @@ public:
     return true;
   }
 
-  void dumps_scrub(ceph::Formatter *f) {
-    ceph_assert(f != nullptr);
-    std::lock_guard l(sched_scrub_lock);
-
-    f->open_array_section("scrubs");
-    for (const auto &i: sched_scrub_pg) {
-      f->open_object_section("scrub");
-      f->dump_stream("pgid") << i.pgid;
-      f->dump_stream("sched_time") << i.sched_time;
-      f->dump_stream("deadline") << i.deadline;
-      f->dump_bool("forced", i.sched_time == PG::Scrubber::scrub_must_stamp());
-      f->close_section();
-    }
-    f->close_section();
-  }
+  void dumps_scrub(ceph::Formatter* f);
 
   bool can_inc_scrubs();
   bool inc_scrubs_local();
@@ -602,7 +578,8 @@ public:
   AsyncReserver<spg_t, Finisher> snap_reserver;
   void queue_recovery_context(PG *pg, GenContext<ThreadPool::TPHandle&> *c);
   void queue_for_snap_trim(PG *pg);
-  void queue_for_scrub(PG *pg, bool with_high_priority);
+  void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);
+  void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority);
 
   /// queue the message (-> event) that all replicas reserved scrub resources for us
   void queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority);
@@ -610,6 +587,36 @@ public:
   /// queue the message (-> event) that some replicas denied our scrub resources request
   void queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority);
 
+  /// Signals either (a) the end of a sleep period, or (b) a recheck of the availability
+  /// of the primary map being created by the backend.
+  void queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority);
+
+  /// Signals a change in the number of in-flight recovery writes
+  void queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority);
+
+  /// Signals that all pending updates were applied
+  void queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority);
+
+  /// The block-range that was locked and prevented the scrubbing - is freed
+  void queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority);
+
+  /// Signals that all write OPs are done
+  void queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority);
+
+  /// Signals that we (the Primary) got all waited-for scrub-maps from our replicas
+  void queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority);
+
+  void queue_for_rep_scrub(PG* pg,
+                          Scrub::scrub_prio_t with_high_priority,
+                          unsigned int qu_priority);
+
+  /// Signals a change in the number of in-flight recovery writes
+  void queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority);
+
+  void queue_for_rep_scrub_resched(PG* pg,
+                                  Scrub::scrub_prio_t with_high_priority,
+                                  unsigned int qu_priority);
+
   void queue_for_pg_delete(spg_t pgid, epoch_t e);
   bool try_finish_pg_delete(PG *pg, unsigned old_pg_num);
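
These hooks give PG/PgScrubber one uniform path for bouncing scrub FSM events through the OSD's op queue instead of calling into the scrub state machine directly. A sketch of an expected call site (the caller name is hypothetical; the real call sites land with PgScrubber in the following commits):

    // Sketch: a PG-side notification re-queues the PG so the resulting
    // 'ActivePushesUpd' event runs under the op queue's scheduling rules.
    void PG::on_pushes_changed()  // hypothetical name
    {
      osd->queue_scrub_pushes_update(this, Scrub::scrub_prio_t::high_priority);
    }
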
 
@@ -619,12 +626,14 @@ private:
   std::list<std::pair<epoch_t, PGRef> > awaiting_throttle;
 
   /// queue a scrub-related message for a PG
-  template<class MSG_TYPE>
-  void queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority, unsigned int qu_priority);
+  template <class MSG_TYPE>
+  void queue_scrub_event_msg(PG* pg,
+                            Scrub::scrub_prio_t with_priority,
+                            unsigned int qu_priority);
 
   /// An alternative version of queue_scrub_event_msg(), in which the queuing priority is
   /// provided by the executing scrub (i.e. taken from PgScrubber::m_flags)
-  template<class MSG_TYPE>
+  template <class MSG_TYPE>
   void queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority);
 
   utime_t defer_recovery_until;
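
The two overloads differ only in where the queueing priority originates: from the running scrub's own flags, or from an explicit qu_priority carried with the event. Judging from the queue_for_scrub() body deleted in OSD.cc, the policy presumably reduces to something like this (a sketch; the real implementation arrives with PgScrubber):

    // Sketch of the inferred policy: high-priority scrub events may be
    // promoted to client-op priority, mirroring the deleted OSD.cc logic.
    unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
                                            unsigned int suggested_priority) const
    {
      if (with_priority == Scrub::scrub_prio_t::high_priority) {
        suggested_priority = std::max<unsigned int>(
          suggested_priority, cct->_conf->osd_client_op_priority);
      }
      return suggested_priority;
    }
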
@@ -1682,6 +1691,7 @@ protected:
   friend class PG;
   friend struct OSDShard;
   friend class PrimaryLogPG;
+  friend class PgScrubber;
 
 
  protected:
index 7090d14bc20caf65eba510179454a1a2e2a13cbd..724d896b727b979f43ef98812428c8c256bef42c 100644 (file)
@@ -214,7 +214,6 @@ PG::PG(OSDService *o, OSDMapRef curmap,
   pg_stats_publish_valid(false),
   finish_sync_event(NULL),
   scrub_after_recovery(false),
-  save_req_scrub(false),
   active_pushes(0),
   recovery_state(
     o->cct,
@@ -310,7 +309,7 @@ void PG::log_state_exit(
   osd->pg_recovery_stats.log_exit(
     state_name, ceph_clock_now() - enter_time, events, event_dur);
 }
-  
+
 /********* PG **********/
 
 void PG::remove_snap_mapped_object(
@@ -365,29 +364,12 @@ void PG::clear_primary_state()
   finish_sync_event = 0;  // so that _finish_recovery doesn't go off in another thread
   release_pg_backoffs();
 
-  scrubber.reserved_peers.clear();
+  m_scrubber->unreserve_replicas();
   scrub_after_recovery = false;
-  save_req_scrub = false;
 
   agent_clear();
 }
 
-PG::Scrubber::Scrubber()
- : local_reserved(false), remote_reserved(false), reserve_failed(false),
-   epoch_start(0),
-   active(false),
-   shallow_errors(0), deep_errors(0), fixed(0),
-   must_scrub(false), must_deep_scrub(false), must_repair(false),
-   need_auto(false), req_scrub(false), time_for_deep(false),
-   auto_repair(false),
-   check_repair(false),
-   deep_scrub_on_error(false),
-   num_digest_updates_pending(0),
-   state(INACTIVE),
-   deep(false)
-{}
-
-PG::Scrubber::~Scrubber() {}
 
 bool PG::op_has_sufficient_caps(OpRequestRef& op)
 {
@@ -431,20 +413,6 @@ bool PG::op_has_sufficient_caps(OpRequestRef& op)
   return cap;
 }
 
-bool PG::requeue_scrub(bool high_priority)
-{
-  ceph_assert(ceph_mutex_is_locked(_lock));
-  if (scrub_queued) {
-    dout(10) << __func__ << ": already queued" << dendl;
-    return false;
-  } else {
-    dout(10) << __func__ << ": queueing" << dendl;
-    scrub_queued = true;
-    osd->queue_for_scrub(this, high_priority);
-    return true;
-  }
-}
-
 void PG::queue_recovery()
 {
   if (!is_primary() || !is_peered()) {
@@ -459,55 +427,36 @@ void PG::queue_recovery()
   }
 }
 
-bool PG::queue_scrub()
+void PG::queue_scrub_after_repair()
 {
+  dout(10) << __func__ << dendl;
   ceph_assert(ceph_mutex_is_locked(_lock));
+
+  m_planned_scrub.must_deep_scrub = true;
+  m_planned_scrub.check_repair = true;
+  m_planned_scrub.must_scrub = true;
+
   if (is_scrubbing()) {
-    return false;
-  }
-  // An interrupted recovery repair could leave this set.
-  state_clear(PG_STATE_REPAIR);
-  if (scrubber.need_auto) {
-    scrubber.must_scrub = true;
-    scrubber.must_deep_scrub = true;
-    scrubber.auto_repair = true;
-    scrubber.need_auto = false;
-  }
-  scrubber.priority = scrubber.must_scrub ?
-         cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
-  scrubber.must_scrub = false;
-  state_set(PG_STATE_SCRUBBING);
-  if (scrubber.must_deep_scrub) {
-    state_set(PG_STATE_DEEP_SCRUB);
-    scrubber.must_deep_scrub = false;
+    dout(10) << __func__ << ": scrubbing already" << dendl;
+    return;
   }
-  if (scrubber.must_repair || scrubber.auto_repair) {
-    state_set(PG_STATE_REPAIR);
-    scrubber.must_repair = false;
+  if (scrub_queued) {
+    dout(10) << __func__ << ": already queued" << dendl;
+    return;
   }
-  requeue_scrub();
-  return true;
-}
 
-void PG::scrub_send_resources_granted(epoch_t epoch_queued,
-                                     [[maybe_unused]] ThreadPool::TPHandle& handle)
-{
-  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
-  //m_scrubber->send_remotes_reserved();
-}
+  m_scrubber->set_op_parameters(m_planned_scrub);
+  dout(15) << __func__ << ": queueing" << dendl;
 
-void PG::scrub_send_resources_denied(epoch_t epoch_queued,
-                                    [[maybe_unused]] ThreadPool::TPHandle& handle)
-{
-  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
-  //m_scrubber->send_reservation_failure();
+  scrub_queued = true;
+  osd->queue_scrub_after_repair(this, Scrub::scrub_prio_t::high_priority);
 }
 
 unsigned PG::get_scrub_priority()
 {
   // a higher value -> a higher priority
-  int64_t pool_scrub_priority = 0;
-  pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
+  int64_t pool_scrub_priority =
+    pool.info.opts.value_or(pool_opts_t::SCRUB_PRIORITY, (int64_t)0);
   return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
 }
 
@@ -525,8 +474,11 @@ Context *PG::finish_recovery()
   return finish_sync_event;
 }
 
-void PG::_finish_recovery(Context *c)
+void PG::_finish_recovery(Context* c)
 {
+  dout(15) << __func__ << " finish_sync_event? " << finish_sync_event << " clean? "
+                << is_clean() << dendl;
+
   std::scoped_lock locker{*this};
   if (recovery_state.is_deleting() || !is_clean()) {
     dout(10) << __func__ << " raced with delete or repair" << dendl;
@@ -535,7 +487,7 @@ void PG::_finish_recovery(Context *c)
   // When recovery is initiated by a repair, that flag is left on
   state_clear(PG_STATE_REPAIR);
   if (c == finish_sync_event) {
-    dout(10) << "_finish_recovery" << dendl;
+    dout(15) << __func__ << " scrub_after_recovery? " << scrub_after_recovery << dendl;
     finish_sync_event = 0;
     recovery_state.purge_strays();
 
@@ -544,11 +496,7 @@ void PG::_finish_recovery(Context *c)
     if (scrub_after_recovery) {
       dout(10) << "_finish_recovery requeueing for scrub" << dendl;
       scrub_after_recovery = false;
-      scrubber.must_deep_scrub = true;
-      scrubber.check_repair = true;
-      // We remember whether req_scrub was set when scrub_after_recovery set to true
-      scrubber.req_scrub = save_req_scrub;
-      queue_scrub();
+      queue_scrub_after_repair();
     }
   } else {
     dout(10) << "_finish_recovery -- stale" << dendl;
@@ -1359,243 +1307,247 @@ void PG::requeue_map_waiters()
   }
 }
 
+bool PG::get_must_scrub() const
+{
+  dout(20) << __func__ << " must_scrub? " << (m_planned_scrub.must_scrub ? "true" : "false") << dendl;
+  return m_planned_scrub.must_scrub;
+}
 
 unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
 {
-  return 0; // next commit: m_scrubber->scrub_requeue_priority(with_priority);
+  return m_scrubber->scrub_requeue_priority(with_priority);
 }
 
 unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const
 {
-  return 0; // next commit: m_scrubber->scrub_requeue_priority(with_priority, suggested_priority);
+  return m_scrubber->scrub_requeue_priority(with_priority, suggested_priority);
 }
 
 // ==========================================================================================
 // SCRUB
 
 /*
- * when holding pg and sched_scrub_lock, then the states are:
- *   scheduling:
- *     scrubber.local_reserved = true
- *     scrubber.active = false
- *     scrubber.reserved_peers includes whoami
- *     osd->scrubs_local++
- *   scheduling, replica declined:
- *     scrubber.local_reserved = true
- *     scrubber.reserved_peers includes -1
- *     osd->scrub_local++
- *   pending:
- *     scrubber.local_reserved = true
- *     scrubber.active = false
- *     scrubber.reserved_peers.size() == acting.size();
- *     pg on scrub_wq
- *     osd->scrub_local++
- *   scrubbing:
- *     scrubber.local_reserved = true;
- *     scrubber.active = true
- *     scrubber.reserved_peers empty
+ *  implementation note:
+ *  PG::sched_scrub() is called only once per scrub session.
+ *  That call commits us to whatever choices are made (deep/shallow, etc.).
+ *  Unless we fail to start scrubbing, the 'planned scrub' flag-set is 'frozen' into
+ *  PgScrubber's m_flags, then cleared.
  */
-
-// returns true if a scrub has been newly kicked off
 bool PG::sched_scrub()
 {
+  dout(15) << __func__ << " pg(" << info.pgid
+         << (is_active() ? ") <active>" : ") <not-active>")
+         << (is_clean() ? " <clean>" : " <not-clean>") << dendl;
   ceph_assert(ceph_mutex_is_locked(_lock));
   ceph_assert(!is_scrubbing());
-  if (!(is_primary() && is_active() && is_clean())) {
+
+  if (!is_primary() || !is_active() || !is_clean()) {
     return false;
   }
 
-  // All processing the first time through commits us to whatever
-  // choices are made.
-  if (!scrubber.local_reserved) {
-    dout(20) << __func__ << ": Start processing pg " << info.pgid << dendl;
-
-    bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
-                      pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB));
-    bool allow_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
-                 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB));
-    bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0);
-    bool try_to_auto_repair = (cct->_conf->osd_scrub_auto_repair
-                               && get_pgbackend()->auto_repair_supported());
-
-    scrubber.time_for_deep = false;
-    // Clear these in case user issues the scrub/repair command during
-    // the scheduling of the scrub/repair (e.g. request reservation)
-    scrubber.deep_scrub_on_error = false;
-    scrubber.auto_repair = false;
+  if (scrub_queued) {
+    // only applicable to the very first time a scrub event is queued
+    // (until handled and posted to the scrub FSM)
+    dout(10) << __func__ << ": already queued" << dendl;
+    return false;
+  }
 
-    // All periodic scrub handling goes here because must_scrub is
-    // always set for must_deep_scrub and must_repair.
-    if (!scrubber.must_scrub) {
-      ceph_assert(!scrubber.must_deep_scrub && !scrubber.must_repair);
-      // Handle deep scrub determination only if allowed
-      if (allow_deep_scrub) {
-        // Initial entry and scheduled scrubs without nodeep_scrub set get here
-        if (scrubber.need_auto) {
-         dout(20) << __func__ << ": need repair after scrub errors" << dendl;
-          scrubber.time_for_deep = true;
-        } else {
-          double deep_scrub_interval = 0;
-          pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
-          if (deep_scrub_interval <= 0) {
-           deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
-          }
-          scrubber.time_for_deep = ceph_clock_now() >=
-                 info.history.last_deep_scrub_stamp + deep_scrub_interval;
-
-          bool deep_coin_flip = false;
-         // If we randomize when !allow_scrub && allow_deep_scrub, then it guarantees
-         // we will deep scrub because this function is called often.
-         if (!scrubber.time_for_deep && allow_scrub)
-           deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
-          dout(20) << __func__ << ": time_for_deep=" << scrubber.time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
-
-          scrubber.time_for_deep = (scrubber.time_for_deep || deep_coin_flip);
-        }
+  // analyse the combination of the requested scrub flags, the osd/pool configuration
+  // and the PG status to determine whether we should scrub now, and what type of
+  // scrub that should be.
+  auto updated_flags = verify_scrub_mode();
+  if (!updated_flags) {
+    // the stars do not align for starting a scrub for this PG at this time
+    // (due to configuration or priority issues)
+    // The reason was already reported by the callee.
+    dout(10) << __func__ << ": failed to initiate a scrub" << dendl;
+    return false;
+  }
 
-        if (!scrubber.time_for_deep && has_deep_errors) {
-         osd->clog->info() << "osd." << osd->whoami
-                           << " pg " << info.pgid
-                           << " Deep scrub errors, upgrading scrub to deep-scrub";
-         scrubber.time_for_deep = true;
-        }
+  // try to reserve the local OSD resources. If failing: no harm. We will
+  // be retried by the OSD later on.
+  if (!m_scrubber->reserve_local()) {
+    dout(10) << __func__ << ": failed to reserve locally" << dendl;
+    return false;
+  }
 
-        if (try_to_auto_repair) {
-          if (scrubber.time_for_deep) {
-            dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
-            scrubber.auto_repair = true;
-          } else if (allow_scrub) {
-            dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" << dendl;
-            scrubber.deep_scrub_on_error = true;
-          }
-        }
-      } else { // !allow_deep_scrub
-        dout(20) << __func__ << ": nodeep_scrub set" << dendl;
-        if (has_deep_errors) {
-          osd->clog->error() << "osd." << osd->whoami
-                            << " pg " << info.pgid
-                            << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
-          return false;
-        }
-      }
+  // can commit to the updated flags now, as nothing will stop the scrub
+  m_planned_scrub = *updated_flags;
 
-      //NOSCRUB so skip regular scrubs
-      if (!allow_scrub && !scrubber.time_for_deep) {
-        return false;
-      }
-    // scrubber.must_scrub
-    } else if (!scrubber.must_deep_scrub && has_deep_errors) {
-       osd->clog->error() << "osd." << osd->whoami
-                          << " pg " << info.pgid
-                          << " Regular scrub request, deep-scrub details will be lost";
-    }
-    // Unless precluded this was handle above
-    scrubber.need_auto = false;
-
-    ceph_assert(scrubber.reserved_peers.empty());
-    bool allow_scrubing = cct->_conf->osd_scrub_during_recovery ||
-                          (cct->_conf->osd_repair_during_recovery && scrubber.must_repair) ||
-                          !osd->is_recovery_active();
-    if (allow_scrubing &&
-         osd->inc_scrubs_local()) {
-      dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
-      scrubber.local_reserved = true;
-      scrubber.reserved_peers.insert(pg_whoami);
-      scrub_reserve_replicas();
-    } else {
-      dout(20) << __func__ << ": failed to reserve locally" << dendl;
-      return false;
-    }
+  // An interrupted recovery repair could leave this set.
+  state_clear(PG_STATE_REPAIR);
+
+  // Pass control to the scrubber. It is the scrubber that handles the replicas'
+  // resources reservations.
+  m_scrubber->set_op_parameters(m_planned_scrub);
+
+  dout(10) << __func__ << ": queueing" << dendl;
+
+  scrub_queued = true;
+  osd->queue_for_scrub(this, Scrub::scrub_prio_t::low_priority);
+  return true;
+}
+
+double PG::next_deepscrub_interval() const
+{
+  double deep_scrub_interval =
+    pool.info.opts.value_or(pool_opts_t::DEEP_SCRUB_INTERVAL, 0.0);
+  if (deep_scrub_interval <= 0.0)
+    deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
+  return info.history.last_deep_scrub_stamp + deep_scrub_interval;
+}
+
+bool PG::is_time_for_deep(bool allow_deep_scrub,
+                         bool allow_scrub,
+                         bool has_deep_errors,
+                         const requested_scrub_t& planned) const
+{
+  dout(10) << __func__ << ": need_auto?" << planned.need_auto << " allow_deep_scrub? " << allow_deep_scrub << dendl;
+
+  if (!allow_deep_scrub)
+    return false;
+
+  if (planned.need_auto) {
+    dout(10) << __func__ << ": need repair after scrub errors" << dendl;
+    return true;
   }
 
-  if (scrubber.local_reserved) {
-    if (scrubber.reserve_failed) {
-      dout(20) << __func__ << ": failed, a peer declined" << dendl;
-      clear_scrub_reserved();
-      scrub_unreserve_replicas();
+  if (ceph_clock_now() >= next_deepscrub_interval())
+    return true;
+
+  if (has_deep_errors) {
+    osd->clog->info() << "osd." << osd->whoami << " pg " << info.pgid
+                     << " Deep scrub errors, upgrading scrub to deep-scrub";
+    return true;
+  }
+
+  // we only flip coins if 'allow_scrub' is asserted. Otherwise, as this function is
+  // called often, we would end up deep-scrubbing most of the time.
+  if (allow_scrub) {
+    bool deep_coin_flip =
+      (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
+
+    dout(15) << __func__ << ": time_for_deep=" << planned.time_for_deep
+            << " deep_coin_flip=" << deep_coin_flip << dendl;
+
+    if (deep_coin_flip)
+      return true;
+  }
+
+  return false;
+}
+
+bool PG::verify_periodic_scrub_mode(bool allow_deep_scrub,
+                             bool try_to_auto_repair,
+                             bool allow_regular_scrub,
+                             bool has_deep_errors,
+                             requested_scrub_t& planned) const
+
+{
+  ceph_assert(!planned.must_deep_scrub && !planned.must_repair);
+
+  if (!allow_deep_scrub && has_deep_errors) {
+      osd->clog->error()
+       << "osd." << osd->whoami << " pg " << info.pgid
+       << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
       return false;
-    } else if (scrubber.reserved_peers.size() == get_actingset().size()) {
-      dout(20) << __func__ << ": success, reserved self and replicas" << dendl;
-      if (scrubber.time_for_deep) {
-       dout(10) << __func__ << ": scrub will be deep" << dendl;
-       state_set(PG_STATE_DEEP_SCRUB);
-       scrubber.time_for_deep = false;
+  }
+
+  if (allow_deep_scrub) {
+    // Initial entry and scheduled scrubs without nodeep_scrub set get here
+
+    planned.time_for_deep =
+      is_time_for_deep(allow_deep_scrub, allow_regular_scrub, has_deep_errors, planned);
+
+    if (try_to_auto_repair) {
+      if (planned.time_for_deep) {
+       dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
+       planned.auto_repair = true;
+      } else if (allow_regular_scrub) {
+       dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found"
+                << dendl;
+       planned.deep_scrub_on_error = true;
       }
-      queue_scrub();
-    } else {
-      // none declined, since scrubber.reserved is set
-      dout(20) << __func__ << ": reserved " << scrubber.reserved_peers
-              << ", waiting for replicas" << dendl;
     }
   }
+
+  dout(20) << __func__ << " updated flags: " << planned
+          << " allow_regular_scrub: " << allow_regular_scrub << dendl;
+
+  // NOSCRUB so skip regular scrubs
+  if (!allow_regular_scrub && !planned.time_for_deep) {
+    return false;
+  }
+
   return true;
 }
 
-bool PG::is_scrub_registered()
+std::optional<requested_scrub_t> PG::verify_scrub_mode() const
 {
-  return !scrubber.scrub_reg_stamp.is_zero();
-}
+  dout(10) << __func__ << " processing pg " << info.pgid << dendl;
 
-void PG::reg_next_scrub()
-{
-  if (!is_primary())
-    return;
+  bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+                           pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB));
+  bool allow_regular_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+                              pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB));
+  bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0);
+  bool try_to_auto_repair =
+    (cct->_conf->osd_scrub_auto_repair && get_pgbackend()->auto_repair_supported());
 
-  utime_t reg_stamp;
-  bool must = false;
-  if (scrubber.must_scrub || scrubber.need_auto) {
-    // Set the smallest time that isn't utime_t()
-    reg_stamp = Scrubber::scrub_must_stamp();
-    must = true;
-  } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) {
-    reg_stamp = ceph_clock_now();
-    must = true;
-  } else {
-    reg_stamp = info.history.last_scrub_stamp;
+  auto upd_flags = m_planned_scrub;
+
+  upd_flags.time_for_deep = false;
+  // Clear these in case user issues the scrub/repair command during
+  // the scheduling of the scrub/repair (e.g. request reservation)
+  upd_flags.deep_scrub_on_error = false;
+  upd_flags.auto_repair = false;
+
+  if (upd_flags.must_scrub && !upd_flags.must_deep_scrub && has_deep_errors) {
+    osd->clog->error() << "osd." << osd->whoami << " pg " << info.pgid
+                      << " Regular scrub request, deep-scrub details will be lost";
+  }
+
+  if (!upd_flags.must_scrub) {
+    // All periodic scrub handling goes here because must_scrub is
+    // always set for must_deep_scrub and must_repair.
+
+    bool can_start_periodic =
+      verify_periodic_scrub_mode(allow_deep_scrub, try_to_auto_repair,
+                                allow_regular_scrub, has_deep_errors, upd_flags);
+    if (!can_start_periodic) {
+      return std::nullopt;
+    }
   }
-  // note down the sched_time, so we can locate this scrub, and remove it
-  // later on.
-  double scrub_min_interval = 0, scrub_max_interval = 0;
-  pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
-  pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
-  ceph_assert(!is_scrub_registered());
-  scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
-                                              reg_stamp,
-                                              scrub_min_interval,
-                                              scrub_max_interval,
-                                              must);
-  dout(10) << __func__ << " pg " << pg_id << " register next scrub, scrub time "
-      << scrubber.scrub_reg_stamp << ", must = " << (int)must << dendl;
-}
-
-void PG::unreg_next_scrub()
-{
-  if (is_scrub_registered()) {
-    osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
-    scrubber.scrub_reg_stamp = utime_t();
+
+  //  scrubbing while recovering?
+
+  bool prevented_by_recovery =
+    osd->is_recovery_active() && !cct->_conf->osd_scrub_during_recovery &&
+    (!cct->_conf->osd_repair_during_recovery || !upd_flags.must_repair);
+
+  if (prevented_by_recovery) {
+    dout(20) << __func__ << ": scrubbing prevented during recovery" << dendl;
+    return std::nullopt;
   }
+
+  upd_flags.need_auto = false;
+  return upd_flags;
+}
+
+void PG::reg_next_scrub()
+{
+  m_scrubber->reg_next_scrub(m_planned_scrub);
 }
 
 void PG::on_info_history_change()
 {
-  unreg_next_scrub();
-  reg_next_scrub();
+  m_scrubber->unreg_next_scrub();
+  m_scrubber->reg_next_scrub(m_planned_scrub);
 }
 
-void PG::scrub_requested(bool deep, bool repair, bool need_auto)
+void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type)
 {
-  unreg_next_scrub();
-  if (need_auto) {
-    scrubber.need_auto = true;
-  } else {
-    scrubber.must_scrub = true;
-    scrubber.must_deep_scrub = deep || repair;
-    scrubber.must_repair = repair;
-    // User might intervene, so clear this
-    scrubber.need_auto = false;
-    scrubber.req_scrub = true;
-  }
-  reg_next_scrub();
+  m_scrubber->scrub_requested(scrub_level, scrub_type, m_planned_scrub);
 }
 
 void PG::clear_ready_to_merge() {
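
sched_scrub(), verify_scrub_mode() and is_time_for_deep() above all trade in requested_scrub_t, the "planned scrub" flag-set that replaces the loose scrubber.* booleans. Reconstructed from the flags these functions touch, its shape is roughly (a sketch, not the authoritative definition):

    // Field set inferred from this hunk; defaults are assumptions.
    struct requested_scrub_t {
      bool must_scrub{false};           // operator/error-forced scrub
      bool must_deep_scrub{false};      // forced deep scrub
      bool must_repair{false};          // forced repair
      bool need_auto{false};            // re-scrub after scrub errors
      bool auto_repair{false};          // repair while (deep-)scrubbing
      bool check_repair{false};         // verify a preceding repair
      bool deep_scrub_on_error{false};  // escalate shallow->deep on errors
      bool time_for_deep{false};        // periodic deep scrub is due
    };
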
@@ -1616,6 +1568,7 @@ void PG::on_role_change() {
 }
 
 void PG::on_new_interval() {
+  dout(20) << __func__ << " scrub_queued was " << scrub_queued << " flags: " << m_planned_scrub << dendl;
   scrub_queued = false;
   projected_last_update = eversion_t();
   cancel_recovery();
@@ -1698,6 +1651,15 @@ void PG::schedule_event_on_commit(
   t.register_on_commit(new QueuePeeringEvt(this, on_commit));
 }
 
+void PG::on_activate(interval_set<snapid_t> snaps)
+{
+  ceph_assert(!m_scrubber->are_callbacks_pending());
+  ceph_assert(callbacks_for_degraded_object.empty());
+  snap_trimq = snaps;
+  release_pg_backoffs();
+  projected_last_update = info.last_update;
+}
+
 void PG::on_active_exit()
 {
   backfill_reserving = false;
@@ -1903,133 +1865,6 @@ void PG::on_activate_committed()
   }
 }
 
-void PG::do_replica_scrub_map(OpRequestRef op)
-{
-  auto m = op->get_req<MOSDRepScrubMap>();
-  dout(7) << __func__ << " " << *m << dendl;
-  if (m->map_epoch < info.history.same_interval_since) {
-    dout(10) << __func__ << " discarding old from "
-            << m->map_epoch << " < " << info.history.same_interval_since
-            << dendl;
-    return;
-  }
-  if (!scrubber.is_chunky_scrub_active()) {
-    dout(10) << __func__ << " scrub isn't active" << dendl;
-    return;
-  }
-
-  op->mark_started();
-
-  auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
-  scrubber.received_maps[m->from].decode(p, info.pgid.pool());
-  dout(10) << "map version is "
-          << scrubber.received_maps[m->from].valid_through
-          << dendl;
-
-  dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
-          << dendl;
-  ceph_assert(scrubber.waiting_on_whom.count(m->from));
-  scrubber.waiting_on_whom.erase(m->from);
-  if (m->preempted) {
-    dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
-    scrub_preempted = true;
-  }
-  if (scrubber.waiting_on_whom.empty()) {
-    requeue_scrub(ops_blocked_by_scrub());
-  }
-}
-
-// send scrub v3 messages (chunky scrub)
-void PG::_request_scrub_map(
-  pg_shard_t replica, eversion_t version,
-  hobject_t start, hobject_t end,
-  bool deep,
-  bool allow_preemption)
-{
-  ceph_assert(replica != pg_whoami);
-  dout(10) << "scrub  requesting scrubmap from osd." << replica
-          << " deep " << (int)deep << dendl;
-  MOSDRepScrub *repscrubop = new MOSDRepScrub(
-    spg_t(info.pgid.pgid, replica.shard), version,
-    get_osdmap_epoch(),
-    get_last_peering_reset(),
-    start, end, deep,
-    allow_preemption,
-    scrubber.priority,
-    ops_blocked_by_scrub());
-  // default priority, we want the rep scrub processed prior to any recovery
-  // or client io messages (we are holding a lock!)
-  osd->send_message_osd_cluster(
-    replica.osd, repscrubop, get_osdmap_epoch());
-}
-
-void PG::handle_scrub_reserve_request(OpRequestRef op)
-{
-  dout(7) << __func__ << " " << *op->get_req() << dendl;
-  op->mark_started();
-  if (scrubber.remote_reserved) {
-    dout(10) << __func__ << " ignoring reserve request: Already reserved"
-            << dendl;
-    return;
-  }
-  if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
-      osd->inc_scrubs_remote()) {
-    scrubber.remote_reserved = true;
-  } else {
-    dout(20) << __func__ << ": failed to reserve remotely" << dendl;
-    scrubber.remote_reserved = false;
-  }
-  auto m = op->get_req<MOSDScrubReserve>();
-  Message *reply = new MOSDScrubReserve(
-    spg_t(info.pgid.pgid, get_primary().shard),
-    m->map_epoch,
-    scrubber.remote_reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
-    pg_whoami);
-  osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
-}
-
-void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
-{
-  dout(7) << __func__ << " " << *op->get_req() << dendl;
-  op->mark_started();
-  if (!scrubber.local_reserved) {
-    dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
-    return;
-  }
-  if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
-    dout(10) << " already had osd." << from << " reserved" << dendl;
-  } else {
-    dout(10) << " osd." << from << " scrub reserve = success" << dendl;
-    scrubber.reserved_peers.insert(from);
-    sched_scrub();
-  }
-}
-
-void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
-{
-  dout(7) << __func__ << " " << *op->get_req() << dendl;
-  op->mark_started();
-  if (!scrubber.local_reserved) {
-    dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
-    return;
-  }
-  if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
-    dout(10) << " already had osd." << from << " reserved" << dendl;
-  } else {
-    /* One decline stops this pg from being scheduled for scrubbing. */
-    dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
-    scrubber.reserve_failed = true;
-    sched_scrub();
-  }
-}
-
-void PG::handle_scrub_reserve_release(OpRequestRef op)
-{
-  dout(7) << __func__ << " " << *op->get_req() << dendl;
-  op->mark_started();
-  clear_scrub_reserved();
-}
-
 // Compute pending backfill data
 static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
 {
@@ -2117,62 +1952,6 @@ bool PG::try_reserve_recovery_space(
 void PG::unreserve_recovery_space() {
   primary_num_bytes.store(0);
   local_num_bytes.store(0);
-  return;
-}
-
-void PG::clear_scrub_reserved()
-{
-  scrubber.reserved_peers.clear();
-  scrubber.reserve_failed = false;
-
-  if (scrubber.local_reserved) {
-    scrubber.local_reserved = false;
-    osd->dec_scrubs_local();
-  }
-  if (scrubber.remote_reserved) {
-    scrubber.remote_reserved = false;
-    osd->dec_scrubs_remote();
-  }
-}
-
-void PG::scrub_reserve_replicas()
-{
-  ceph_assert(recovery_state.get_backfill_targets().empty());
-  std::vector<std::pair<int, Message*>> messages;
-  messages.reserve(get_actingset().size());
-  epoch_t  e = get_osdmap_epoch();
-  for (set<pg_shard_t>::iterator i = get_actingset().begin();
-      i != get_actingset().end();
-      ++i) {
-    if (*i == pg_whoami) continue;
-    dout(10) << "scrub requesting reserve from osd." << *i << dendl;
-    Message* m =  new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard), e,
-                                       MOSDScrubReserve::REQUEST, pg_whoami);
-    messages.push_back(std::make_pair(i->osd, m));
-  }
-  if (!messages.empty()) {
-    osd->send_message_osd_cluster(messages, e);
-  }
-}
-
-void PG::scrub_unreserve_replicas()
-{
-  ceph_assert(recovery_state.get_backfill_targets().empty());
-  std::vector<std::pair<int, Message*>> messages;
-  messages.reserve(get_actingset().size());
-  epoch_t e = get_osdmap_epoch();
-  for (set<pg_shard_t>::iterator i = get_actingset().begin();
-       i != get_actingset().end();
-       ++i) {
-    if (*i == pg_whoami) continue;
-    dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
-    Message* m =  new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard), e,
-                                       MOSDScrubReserve::RELEASE, pg_whoami);
-    messages.push_back(std::make_pair(i->osd, m));
-  }
-  if (!messages.empty()) {
-    osd->send_message_osd_cluster(messages, e);
-  }
 }
 
 void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs)
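
The reservation handlers removed above implement a small request/response protocol over MOSDScrubReserve; per the commit message, this machinery moves into PgScrubber (pg_scrubber.cc in the file list). The round-trip they encode, for reference:

    // primary  -> replica : MOSDScrubReserve::REQUEST
    // replica  -> primary : MOSDScrubReserve::GRANT   (inc_scrubs_remote succeeded)
    //                       MOSDScrubReserve::REJECT  (otherwise)
    // primary  -> replica : MOSDScrubReserve::RELEASE (on completion or abort)
    // A single REJECT sets reserve_failed and cancels the scheduling attempt.
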
@@ -2199,111 +1978,6 @@ void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs)
   }
 }
 
-void PG::_scan_snaps(ScrubMap &smap) 
-{
-  hobject_t head;
-  SnapSet snapset;
-
-  // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify 
-  // caller using clean_meta_map(), and it works properly.
-  dout(20) << __func__ << " start" << dendl;
-
-  for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
-       i != smap.objects.rend();
-       ++i) {
-    const hobject_t &hoid = i->first;
-    ScrubMap::object &o = i->second;
-
-    dout(20) << __func__ << " " << hoid << dendl;
-
-    ceph_assert(!hoid.is_snapdir());
-    if (hoid.is_head()) {
-      // parse the SnapSet
-      bufferlist bl;
-      if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
-       continue;
-      }
-      bl.push_back(o.attrs[SS_ATTR]);
-      auto p = bl.cbegin();
-      try {
-       decode(snapset, p);
-      } catch(...) {
-       continue;
-      }
-      head = hoid.get_head();
-      continue;
-    }
-    if (hoid.snap < CEPH_MAXSNAP) {
-      // check and if necessary fix snap_mapper
-      if (hoid.get_head() != head) {
-       derr << __func__ << " no head for " << hoid << " (have " << head << ")"
-            << dendl;
-       continue;
-      }
-      set<snapid_t> obj_snaps;
-      auto p = snapset.clone_snaps.find(hoid.snap);
-      if (p == snapset.clone_snaps.end()) {
-       derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
-            << dendl;
-       continue;
-      }
-      obj_snaps.insert(p->second.begin(), p->second.end());
-      set<snapid_t> cur_snaps;
-      int r = snap_mapper.get_snaps(hoid, &cur_snaps);
-      if (r != 0 && r != -ENOENT) {
-       derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
-       ceph_abort();
-      }
-      if (r == -ENOENT || cur_snaps != obj_snaps) {
-       ObjectStore::Transaction t;
-       OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
-       if (r == 0) {
-         r = snap_mapper.remove_oid(hoid, &_t);
-         if (r != 0) {
-           derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
-                << dendl;
-           ceph_abort();
-         }
-         osd->clog->error() << "osd." << osd->whoami
-                           << " found snap mapper error on pg "
-                           << info.pgid
-                           << " oid " << hoid << " snaps in mapper: "
-                           << cur_snaps << ", oi: "
-                           << obj_snaps
-                           << "...repaired";
-       } else {
-         osd->clog->error() << "osd." << osd->whoami
-                           << " found snap mapper error on pg "
-                           << info.pgid
-                           << " oid " << hoid << " snaps missing in mapper"
-                           << ", should be: "
-                           << obj_snaps
-                            << " was " << cur_snaps << " r " << r
-                           << "...repaired";
-       }
-       snap_mapper.add_oid(hoid, obj_snaps, &_t);
-
-       // wait for repair to apply to avoid confusing other bits of the system.
-       {
-         ceph::condition_variable my_cond;
-         ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock");
-         int r = 0;
-         bool done;
-         t.register_on_applied_sync(
-           new C_SafeCond(my_lock, my_cond, &done, &r));
-         r = osd->store->queue_transaction(ch, std::move(t));
-         if (r != 0) {
-           derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
-                << dendl;
-         } else {
-           std::unique_lock l{my_lock};
-           my_cond.wait(l, [&done] { return done;});
-         }
-       }
-      }
-    }
-  }
-}
 
 void PG::_repair_oinfo_oid(ScrubMap &smap)
 {
@@ -2350,82 +2024,6 @@ void PG::_repair_oinfo_oid(ScrubMap &smap)
     }
   }
 }
-int PG::build_scrub_map_chunk(
-  ScrubMap &map,
-  ScrubMapBuilder &pos,
-  hobject_t start,
-  hobject_t end,
-  bool deep,
-  ThreadPool::TPHandle &handle)
-{
-  dout(10) << __func__ << " [" << start << "," << end << ") "
-          << " pos " << pos
-          << dendl;
-
-  // start
-  while (pos.empty()) {
-    pos.deep = deep;
-    map.valid_through = info.last_update;
-
-    // objects
-    vector<ghobject_t> rollback_obs;
-    pos.ret = get_pgbackend()->objects_list_range(
-      start,
-      end,
-      &pos.ls,
-      &rollback_obs);
-    if (pos.ret < 0) {
-      dout(5) << "objects_list_range error: " << pos.ret << dendl;
-      return pos.ret;
-    }
-    if (pos.ls.empty()) {
-      break;
-    }
-    _scan_rollback_obs(rollback_obs);
-    pos.pos = 0;
-    return -EINPROGRESS;
-  }
-
-  // scan objects
-  while (!pos.done()) {
-    int r = get_pgbackend()->be_scan_list(map, pos);
-    if (r == -EINPROGRESS) {
-      return r;
-    }
-  }
-
-  // finish
-  dout(20) << __func__ << " finishing" << dendl;
-  ceph_assert(pos.done());
-  _repair_oinfo_oid(map);
-  if (!is_primary()) {
-    ScrubMap for_meta_scrub;
-    // In case we restarted smaller chunk, clear old data
-    scrubber.cleaned_meta_map.clear_from(scrubber.start);
-    scrubber.cleaned_meta_map.insert(map);
-    scrubber.clean_meta_map(for_meta_scrub);
-    _scan_snaps(for_meta_scrub);
-  }
-
-  dout(20) << __func__ << " done, got " << map.objects.size() << " items"
-          << dendl;
-  return 0;
-}
-
-void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
-  if (!store)
-    return;
-  struct OnComplete : Context {
-    std::unique_ptr<Scrub::Store> store;
-    explicit OnComplete(
-      std::unique_ptr<Scrub::Store> &&store)
-      : store(std::move(store)) {}
-    void finish(int) override {}
-  };
-  store->cleanup(t);
-  t->register_on_complete(new OnComplete(std::move(store)));
-  ceph_assert(!store);
-}
 
 void PG::repair_object(
   const hobject_t &soid,
@@ -2466,950 +2064,170 @@ void PG::repair_object(
   recovery_state.force_object_missing(bad_peers, soid, oi.version);
 }
 
-/* replica_scrub
- *
- * Wait for last_update_applied to match msg->scrub_to as above. Wait
- * for pushes to complete in case of recent recovery. Build a single
- * scrubmap of objects that are in the range [msg->start, msg->end).
- */
-void PG::replica_scrub(
-  OpRequestRef op,
-  ThreadPool::TPHandle &handle)
+void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle)
 {
-  auto msg = op->get_req<MOSDRepScrub>();
-  ceph_assert(!scrubber.active_rep_scrub);
-  dout(7) << "replica_scrub" << dendl;
+  dout(10) << __func__ << " (op)" << dendl;
+  m_scrubber->replica_scrub_op(op);
+}
 
-  if (msg->map_epoch < info.history.same_interval_since) {
-    dout(10) << "replica_scrub discarding old replica_scrub from "
-            << msg->map_epoch << " < " << info.history.same_interval_since 
-            << dendl;
-    return;
-  }
+void PG::scrub(epoch_t queued, ThreadPool::TPHandle& handle)
+{
+  dout(10) << __func__ << (is_primary() ? " (primary)" : " (replica)") << dendl;
 
-  ceph_assert(msg->chunky);
-  if (active_pushes > 0) {
-    dout(10) << "waiting for active pushes to finish" << dendl;
-    scrubber.active_rep_scrub = op;
-    return;
-  }
+  scrub_queued = false;
 
-  scrubber.state = Scrubber::BUILD_MAP_REPLICA;
-  scrubber.replica_scrub_start = msg->min_epoch;
-  scrubber.start = msg->start;
-  scrubber.end = msg->end;
-  scrubber.max_end = msg->end;
-  scrubber.deep = msg->deep;
-  scrubber.epoch_start = info.history.same_interval_since;
-  if (msg->priority) {
-    scrubber.priority = msg->priority;
-  } else {
-    scrubber.priority = get_scrub_priority();
+  if (pg_has_reset_since(queued)) {
+    dout(10) << " pg::scrub reset_since " << __func__ << " " << queued << dendl;
+    dout(10) << " pg::scrub reset_since " << __func__ << " "
+           << recovery_state.get_last_peering_reset() << dendl;
+    m_scrubber->scrub_clear_state(false);
+    return;
   }
 
-  scrub_can_preempt = msg->allow_preemption;
-  scrub_preempted = false;
-  scrubber.replica_scrubmap_pos.reset();
+  ceph_assert(
+    is_primary());  // as the replica request should have reached PG::replica_scrub()
 
-  requeue_scrub(msg->high_priority);
+  ceph_assert(!m_scrubber->is_scrub_active());
+  // a new scrub
+  m_scrubber->reset_epoch(queued);
+  m_scrubber->send_start_scrub();
 }
 
-/* Scrub:
- * PG_STATE_SCRUBBING is set when the scrub is queued
- * 
- * scrub will be chunky if all OSDs in PG support chunky scrub
- * scrub will fail if OSDs are too old.
- */
-void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
-{
-  OSDService *osds = osd;
-  double scrub_sleep = osds->osd->scrub_sleep_time(scrubber.must_scrub);
-  if (scrub_sleep > 0 &&
-      (scrubber.state == PG::Scrubber::NEW_CHUNK ||
-       scrubber.state == PG::Scrubber::INACTIVE) &&
-       scrubber.needs_sleep) {
-    ceph_assert(!scrubber.sleeping);
-    dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
-
-    // Do an async sleep so we don't block the op queue
-    spg_t pgid = get_pgid();
-    int state = scrubber.state;
-    auto scrub_requeue_callback =
-        new LambdaContext([osds, pgid, state](int r) {
-          PGRef pg = osds->osd->lookup_lock_pg(pgid);
-          if (pg == nullptr) {
-            lgeneric_dout(osds->osd->cct, 20)
-                << "scrub_requeue_callback: Could not find "
-                << "PG " << pgid << " can't complete scrub requeue after sleep"
-                << dendl;
-            return;
-          }
-          pg->scrubber.sleeping = false;
-          pg->scrubber.needs_sleep = false;
-          lgeneric_dout(pg->cct, 20)
-              << "scrub_requeue_callback: slept for "
-              << ceph_clock_now() - pg->scrubber.sleep_start
-              << ", re-queuing scrub with state " << state << dendl;
-          pg->scrub_queued = false;
-          pg->requeue_scrub();
-          pg->scrubber.sleep_start = utime_t();
-          pg->unlock();
-        });
-    std::lock_guard l(osd->sleep_lock);
-    osd->sleep_timer.add_event_after(scrub_sleep,
-                                           scrub_requeue_callback);
-    scrubber.sleeping = true;
-    scrubber.sleep_start = ceph_clock_now();
-    return;
-  }
-  if (pg_has_reset_since(queued)) {
-    return;
-  }
-  ceph_assert(scrub_queued);
-  scrub_queued = false;
-  scrubber.needs_sleep = true;
+// note: no need to secure OSD resources for a recovery scrub
+void PG::recovery_scrub(epoch_t epoch_queued, ThreadPool::TPHandle& handle)
+{
+  dout(10) << "pg::" << __func__ << " queued at: " << epoch_queued << dendl;
 
-  // for the replica
-  if (!is_primary() &&
-      scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
-    chunky_scrub(handle);
-    return;
-  }
+  scrub_queued = false;
 
-  if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
-    dout(10) << "scrub -- not primary or active or not clean" << dendl;
-    state_clear(PG_STATE_SCRUBBING);
-    state_clear(PG_STATE_REPAIR);
-    state_clear(PG_STATE_DEEP_SCRUB);
-    publish_stats_to_osd();
+  if (pg_has_reset_since(epoch_queued)) {
+    dout(10) << " reset_since " << __func__ << " " << epoch_queued << dendl;
+    dout(10) << " reset_since " << __func__ << " "
+           << recovery_state.get_last_peering_reset() << dendl;
     return;
   }
 
-  if (!scrubber.active) {
-    ceph_assert(recovery_state.get_backfill_targets().empty());
-
-    scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
+  ceph_assert(is_primary());
+  ceph_assert(!m_scrubber->is_scrub_active());
 
-    dout(10) << "starting a new chunky scrub" << dendl;
-  }
-
-  chunky_scrub(handle);
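+  // same hand-off as in PG::scrub(), but through the after-repair entry
+  // point, which skips the OSD scrub-resources reservation step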
+  // a new scrub
+  m_scrubber->reset_epoch(epoch_queued);
+  m_scrubber->send_start_after_repair();
 }
 
-void PG::abort_scrub()
+void PG::replica_scrub(epoch_t epoch_queued,
+                      [[maybe_unused]] ThreadPool::TPHandle& handle)
 {
-  scrub_clear_state();
-  scrub_unreserve_replicas();
+  dout(10) << "pg::" << __func__ << " queued at: " << epoch_queued
+          << (is_primary() ? " (primary)" : " (replica)") << dendl;
+  scrub_queued = false;
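+  // replica-side map building is delegated in full to the scrubber object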
+  m_scrubber->replica_scrub(epoch_queued);
 }
 
-/*
- * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
- * chunk.
- *
- * The object store is partitioned into chunks which end on hash boundaries. For
- * each chunk, the following logic is performed:
- *
- *  (1) Block writes on the chunk
- *  (2) Request maps from replicas
- *  (3) Wait for pushes to be applied (after recovery)
- *  (4) Wait for writes to flush on the chunk
- *  (5) Wait for maps from replicas
- *  (6) Compare / repair all scrub maps
- *  (7) Wait for digest updates to apply
- *
- * This logic is encoded in the mostly linear state machine:
- *
- *           +------------------+
- *  _________v__________        |
- * |                    |       |
- * |      INACTIVE      |       |
- * |____________________|       |
- *           |                  |
- *           |   +----------+   |
- *  _________v___v______    |   |
- * |                    |   |   |
- * |      NEW_CHUNK     |   |   |
- * |____________________|   |   |
- *           |              |   |
- *  _________v__________    |   |
- * |                    |   |   |
- * |     WAIT_PUSHES    |   |   |
- * |____________________|   |   |
- *           |              |   |
- *  _________v__________    |   |
- * |                    |   |   |
- * |  WAIT_LAST_UPDATE  |   |   |
- * |____________________|   |   |
- *           |              |   |
- *  _________v__________    |   |
- * |                    |   |   |
- * |      BUILD_MAP     |   |   |
- * |____________________|   |   |
- *           |              |   |
- *  _________v__________    |   |
- * |                    |   |   |
- * |    WAIT_REPLICAS   |   |   |
- * |____________________|   |   |
- *           |              |   |
- *  _________v__________    |   |
- * |                    |   |   |
- * |    COMPARE_MAPS    |   |   |
- * |____________________|   |   |
- *           |              |   |
- *           |              |   |
- *  _________v__________    |   |
- * |                    |   |   |
- * |WAIT_DIGEST_UPDATES |   |   |
- * |____________________|   |   |
- *           |   |          |   |
- *           |   +----------+   |
- *  _________v__________        |
- * |                    |       |
- * |       FINISH       |       |
- * |____________________|       |
- *           |                  |
- *           +------------------+
- *
- * The primary determines the last update from the subset by walking the log. If
- * it sees a log entry pertaining to a file in the chunk, it tells the replicas
- * to wait until that update is applied before building a scrub map. Both the
- * primary and replicas will wait for any active pushes to be applied.
- *
- * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
- *
- * scrubber.state encodes the current state of the scrub (refer to state diagram
- * for details).
- */
-void PG::chunky_scrub(ThreadPool::TPHandle &handle)
-{
-  // Since repair is only by request and we need to scrub afterward
-  // treat the same as req_scrub.
-  if (!scrubber.req_scrub) {
-    if (state_test(PG_STATE_DEEP_SCRUB)) {
-      if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
-         pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
-           dout(10) << "nodeep_scrub set, aborting" << dendl;
-       abort_scrub();
-        return;
-      }
-    } else if (state_test(PG_STATE_SCRUBBING)) {
-      if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) {
-         dout(10) << "noscrub set, aborting" << dendl;
-        abort_scrub();
-         return;
-      }
-    }
-  }
-  // check for map changes
-  if (scrubber.is_chunky_scrub_active()) {
-    if (scrubber.epoch_start != info.history.same_interval_since) {
-      dout(10) << "scrub pg changed, aborting" << dendl;
-      abort_scrub();
-      return;
-    }
-  }
-
-  bool done = false;
-  int ret;
-
-  while (!done) {
-    dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
-            << " [" << scrubber.start << "," << scrubber.end << ")"
-            << " max_end " << scrubber.max_end << dendl;
-
-    switch (scrubber.state) {
-      case PG::Scrubber::INACTIVE:
-        dout(10) << "scrub start" << dendl;
-       ceph_assert(is_primary());
-
-        publish_stats_to_osd();
-        scrubber.epoch_start = info.history.same_interval_since;
-        scrubber.active = true;
-
-       {
-         ObjectStore::Transaction t;
-         scrubber.cleanup_store(&t);
-         scrubber.store.reset(Scrub::Store::create(osd->store, &t,
-                                                   info.pgid, coll));
-         osd->store->queue_transaction(ch, std::move(t), nullptr);
-       }
-
-        // Don't include temporary objects when scrubbing
-        scrubber.start = info.pgid.pgid.get_hobj_start();
-        scrubber.state = PG::Scrubber::NEW_CHUNK;
-
-       {
-         bool repair = state_test(PG_STATE_REPAIR);
-         bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
-         const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
-         stringstream oss;
-         oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
-         osd->clog->debug(oss);
-       }
-
-       scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
-         "osd_scrub_max_preemptions");
-       scrubber.preempt_divisor = 1;
-        break;
-
-      case PG::Scrubber::NEW_CHUNK:
-        scrubber.primary_scrubmap = ScrubMap();
-        scrubber.received_maps.clear();
-
-       // begin (possible) preemption window
-       if (scrub_preempted) {
-         scrubber.preempt_left--;
-         scrubber.preempt_divisor *= 2;
-         dout(10) << __func__ << " preempted, " << scrubber.preempt_left
-                  << " left" << dendl;
-         scrub_preempted = false;
-       }
-       scrub_can_preempt = scrubber.preempt_left > 0;
-
-        {
-          /* get the start and end of our scrub chunk
-          *
-          * Our scrub chunk has an important restriction we're going to need to
-          * respect. We can't let head be start or end.
-          * Using a half-open interval means that if end == head,
-          * we'd scrub/lock head and the clone right next to head in different
-          * chunks which would allow us to miss clones created between
-          * scrubbing that chunk and scrubbing the chunk including head.
-          * This isn't true for any of the other clones since clones can
-          * only be created "just to the left of" head.  There is one exception
-          * to this: promotion of clones which always happens to the left of the
-          * left-most clone, but promote_object checks the scrubber in that
-          * case, so it should be ok.  Also, it's ok to "miss" clones at the
-          * left end of the range if we are a tier because they may legitimately
-          * not exist (see _scrub).
-          */
-          ceph_assert(scrubber.preempt_divisor > 0);
-         int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
-                                     scrubber.preempt_divisor);
-         int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
-                                      scrubber.preempt_divisor);
-          hobject_t start = scrubber.start;
-         hobject_t candidate_end;
-         vector<hobject_t> objects;
-         ret = get_pgbackend()->objects_list_partial(
-           start,
-           min,
-           max,
-           &objects,
-           &candidate_end);
-         ceph_assert(ret >= 0);
-
-         if (!objects.empty()) {
-           hobject_t back = objects.back();
-           while (candidate_end.is_head() &&
-                  candidate_end == back.get_head()) {
-             candidate_end = back;
-             objects.pop_back();
-             if (objects.empty()) {
-               ceph_assert(0 ==
-                      "Somehow we got more than 2 objects which"
-                      "have the same head but are not clones");
-             }
-             back = objects.back();
-           }
-           if (candidate_end.is_head()) {
-             ceph_assert(candidate_end != back.get_head());
-             candidate_end = candidate_end.get_object_boundary();
-           }
-         } else {
-           ceph_assert(candidate_end.is_max());
-         }
-
-         if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
-           // we'll be requeued by whatever made us unavailable for scrub
-           dout(10) << __func__ << ": scrub blocked somewhere in range "
-                    << "[" << scrubber.start << ", " << candidate_end << ")"
-                    << dendl;
-           done = true;
-           break;
-         }
-         scrubber.end = candidate_end;
-         if (scrubber.end > scrubber.max_end)
-           scrubber.max_end = scrubber.end;
-        }
-
-        // walk the log to find the latest update that affects our chunk
-        scrubber.subset_last_update = eversion_t();
-       for (auto p = projected_log.log.rbegin();
-            p != projected_log.log.rend();
-            ++p) {
-          if (p->soid >= scrubber.start &&
-             p->soid < scrubber.end) {
-            scrubber.subset_last_update = p->version;
-            break;
-         }
-       }
-       if (scrubber.subset_last_update == eversion_t()) {
-         for (list<pg_log_entry_t>::const_reverse_iterator p =
-                recovery_state.get_pg_log().get_log().log.rbegin();
-              p != recovery_state.get_pg_log().get_log().log.rend();
-              ++p) {
-           if (p->soid >= scrubber.start &&
-               p->soid < scrubber.end) {
-             scrubber.subset_last_update = p->version;
-             break;
-           }
-         }
-       }
-
-        scrubber.state = PG::Scrubber::WAIT_PUSHES;
-        break;
-
-      case PG::Scrubber::WAIT_PUSHES:
-        if (active_pushes == 0) {
-          scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
-        } else {
-          dout(15) << "wait for pushes to apply" << dendl;
-          done = true;
-        }
-        break;
-
-      case PG::Scrubber::WAIT_LAST_UPDATE:
-        if (recovery_state.get_last_update_applied() <
-         scrubber.subset_last_update) {
-          // will be requeued by op_applied
-          dout(15) << "wait for EC read/modify/writes to queue" << dendl;
-          done = true;
-         break;
-       }
-
-        // ask replicas to scan
-        scrubber.waiting_on_whom.insert(pg_whoami);
-
-        // request maps from replicas
-       for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
-            i != get_acting_recovery_backfill().end();
-            ++i) {
-         if (*i == pg_whoami) continue;
-          _request_scrub_map(*i, scrubber.subset_last_update,
-                             scrubber.start, scrubber.end, scrubber.deep,
-                            scrubber.preempt_left > 0);
-          scrubber.waiting_on_whom.insert(*i);
-        }
-       dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
-                << dendl;
-
-       scrubber.state = PG::Scrubber::BUILD_MAP;
-       scrubber.primary_scrubmap_pos.reset();
-        break;
-
-      case PG::Scrubber::BUILD_MAP:
-        ceph_assert(recovery_state.get_last_update_applied() >=
-         scrubber.subset_last_update);
-
-        // build my own scrub map
-       if (scrub_preempted) {
-         dout(10) << __func__ << " preempted" << dendl;
-         scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
-         break;
-       }
-       ret = build_scrub_map_chunk(
-         scrubber.primary_scrubmap,
-         scrubber.primary_scrubmap_pos,
-         scrubber.start, scrubber.end,
-         scrubber.deep,
-         handle);
-       if (ret == -EINPROGRESS) {
-         requeue_scrub();
-         done = true;
-         break;
-       }
-       scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
-       break;
-
-      case PG::Scrubber::BUILD_MAP_DONE:
-       if (scrubber.primary_scrubmap_pos.ret < 0) {
-         dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
-                 << ", aborting" << dendl;
-          scrub_clear_state();
-          scrub_unreserve_replicas();
-          return;
-        }
-       dout(10) << __func__ << " waiting_on_whom was "
-                << scrubber.waiting_on_whom << dendl;
-       ceph_assert(scrubber.waiting_on_whom.count(pg_whoami));
-        scrubber.waiting_on_whom.erase(pg_whoami);
-
-        scrubber.state = PG::Scrubber::WAIT_REPLICAS;
-        break;
-
-      case PG::Scrubber::WAIT_REPLICAS:
-        if (!scrubber.waiting_on_whom.empty()) {
-          // will be requeued by do_replica_scrub_map
-          dout(10) << "wait for replicas to build scrub map" << dendl;
-          done = true;
-         break;
-       }
-       // end (possible) preemption window
-       scrub_can_preempt = false;
-       if (scrub_preempted) {
-         dout(10) << __func__ << " preempted, restarting chunk" << dendl;
-         scrubber.state = PG::Scrubber::NEW_CHUNK;
-       } else {
-          scrubber.state = PG::Scrubber::COMPARE_MAPS;
-        }
-        break;
-
-      case PG::Scrubber::COMPARE_MAPS:
-        ceph_assert(recovery_state.get_last_update_applied() >=
-         scrubber.subset_last_update);
-        ceph_assert(scrubber.waiting_on_whom.empty());
-
-        scrub_compare_maps();
-       scrubber.start = scrubber.end;
-       scrubber.run_callbacks();
-
-        // requeue the writes from the chunk that just finished
-        requeue_ops(waiting_for_scrub);
-
-       scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
-
-       // fall-thru
-
-      case PG::Scrubber::WAIT_DIGEST_UPDATES:
-       if (scrubber.num_digest_updates_pending) {
-         dout(10) << __func__ << " waiting on "
-                  << scrubber.num_digest_updates_pending
-                  << " digest updates" << dendl;
-         done = true;
-         break;
-       }
-
-       scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
-         "osd_scrub_max_preemptions");
-       scrubber.preempt_divisor = 1;
-
-       if (!(scrubber.end.is_max())) {
-         scrubber.state = PG::Scrubber::NEW_CHUNK;
-         requeue_scrub();
-          done = true;
-        } else {
-          scrubber.state = PG::Scrubber::FINISH;
-        }
-
-       break;
-
-      case PG::Scrubber::FINISH:
-        scrub_finish();
-        scrubber.state = PG::Scrubber::INACTIVE;
-        done = true;
-
-       if (!snap_trimq.empty()) {
-         dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
-         snap_trimmer_scrub_complete();
-       }
-
-        break;
-
-      case PG::Scrubber::BUILD_MAP_REPLICA:
-        // build my own scrub map
-       if (scrub_preempted) {
-         dout(10) << __func__ << " preempted" << dendl;
-         ret = 0;
-       } else {
-         ret = build_scrub_map_chunk(
-           scrubber.replica_scrubmap,
-           scrubber.replica_scrubmap_pos,
-           scrubber.start, scrubber.end,
-           scrubber.deep,
-           handle);
-       }
-       if (ret == -EINPROGRESS) {
-         requeue_scrub();
-         done = true;
-         break;
-       }
-       // reply
-       {
-         MOSDRepScrubMap *reply = new MOSDRepScrubMap(
-           spg_t(info.pgid.pgid, get_primary().shard),
-           scrubber.replica_scrub_start,
-           pg_whoami);
-         reply->preempted = scrub_preempted;
-         ::encode(scrubber.replica_scrubmap, reply->get_data());
-         osd->send_message_osd_cluster(
-           get_primary().osd, reply,
-           scrubber.replica_scrub_start);
-       }
-       scrub_preempted = false;
-       scrub_can_preempt = false;
-       scrubber.state = PG::Scrubber::INACTIVE;
-       scrubber.replica_scrubmap = ScrubMap();
-       scrubber.replica_scrubmap_pos = ScrubMapBuilder();
-       scrubber.start = hobject_t();
-       scrubber.end = hobject_t();
-       scrubber.max_end = hobject_t();
-       done = true;
-       break;
-
-      default:
-        ceph_abort();
-    }
-  }
-  dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
-          << " [" << scrubber.start << "," << scrubber.end << ")"
-          << " max_end " << scrubber.max_end << dendl;
+void PG::scrub_send_scrub_resched(epoch_t epoch_queued,
+                                 [[maybe_unused]] ThreadPool::TPHandle& handle)
+{
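+  // each of the scrub_send_*() handlers below runs in PG context once its
+  // message is dequeued from the OSD op queue, and translates that message
+  // into the corresponding scrubber FSM event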
+  dout(10) << __func__ << (is_primary() ? " (primary)" : " (replica)") << dendl;
+  scrub_queued = false;
+  m_scrubber->send_scrub_resched();
 }
 
-bool PG::write_blocked_by_scrub(const hobject_t& soid)
+void PG::scrub_send_resources_granted(epoch_t epoch_queued,
+                                     [[maybe_unused]] ThreadPool::TPHandle& handle)
 {
-  if (soid < scrubber.start || soid >= scrubber.end) {
-    return false;
-  }
-  if (scrub_can_preempt) {
-    if (!scrub_preempted) {
-      dout(10) << __func__ << " " << soid << " preempted" << dendl;
-      scrub_preempted = true;
-    } else {
-      dout(10) << __func__ << " " << soid << " already preempted" << dendl;
-    }
-    return false;
-  }
-  return true;
+  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+  m_scrubber->send_remotes_reserved();
 }
 
-bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
+void PG::scrub_send_resources_denied(epoch_t epoch_queued,
+                                    [[maybe_unused]] ThreadPool::TPHandle& handle)
 {
-  // does [start, end] intersect [scrubber.start, scrubber.max_end)
-  return (start < scrubber.max_end &&
-         end >= scrubber.start);
+  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+  m_scrubber->send_reservation_failure();
 }
 
-void PG::scrub_clear_state(bool has_error)
+void PG::replica_scrub_resched(epoch_t epoch_queued,
+                              [[maybe_unused]] ThreadPool::TPHandle& handle)
 {
-  ceph_assert(is_locked());
-  state_clear(PG_STATE_SCRUBBING);
-  if (!has_error)
-    state_clear(PG_STATE_REPAIR);
-  state_clear(PG_STATE_DEEP_SCRUB);
-  publish_stats_to_osd();
-
-  scrubber.req_scrub = false;
-  // local -> nothing.
-  if (scrubber.local_reserved) {
-    osd->dec_scrubs_local();
-    scrubber.local_reserved = false;
-    scrubber.reserved_peers.clear();
-  }
-
-  requeue_ops(waiting_for_scrub);
-
-  scrubber.reset();
-
-  // type-specific state clear
-  _scrub_clear_state();
+  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+  scrub_queued = false;
+  m_scrubber->replica_scrub_resched(epoch_queued);
 }
 
-void PG::scrub_compare_maps() 
+void PG::scrub_send_pushes_update(epoch_t epoch_queued,
+                                 [[maybe_unused]] ThreadPool::TPHandle& handle)
 {
-  dout(10) << __func__ << " has maps, analyzing" << dendl;
-
-  // construct authoritative scrub map for type specific scrubbing
-  scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
-  map<hobject_t,
-      pair<std::optional<uint32_t>,
-           std::optional<uint32_t>>> missing_digest;
-
-  map<pg_shard_t, ScrubMap *> maps;
-  maps[pg_whoami] = &scrubber.primary_scrubmap;
-
-  for (const auto& i : get_acting_recovery_backfill()) {
-    if (i == pg_whoami) continue;
-    dout(2) << __func__ << " replica " << i << " has "
-            << scrubber.received_maps[i].objects.size()
-            << " items" << dendl;
-    maps[i] = &scrubber.received_maps[i];
-  }
-
-  set<hobject_t> master_set;
-
-  // Construct master set
-  for (const auto& map : maps) {
-    for (const auto& i : map.second->objects) {
-      master_set.insert(i.first);
-    }
+  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+  if (pg_has_reset_since(epoch_queued)) {
+    dout(10) << __func__ << " been reset at "
+           << recovery_state.get_last_peering_reset() << dendl;
+    return;
   }
+  m_scrubber->active_pushes_notification();
+}
 
-  stringstream ss;
-  get_pgbackend()->be_omap_checks(maps, master_set,
-                                  scrubber.omap_stats, ss);
+void PG::scrub_send_replica_pushes(epoch_t epoch_queued,
+                                  [[maybe_unused]] ThreadPool::TPHandle& handle)
+{
+  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+  m_scrubber->send_replica_pushes_upd();
+}
 
-  if (!ss.str().empty()) {
-    osd->clog->warn(ss);
+void PG::scrub_send_applied_update(epoch_t epoch_queued,
+                                  [[maybe_unused]] ThreadPool::TPHandle& handle)
+{
+  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+  if (pg_has_reset_since(epoch_queued)) {
+    dout(10) << __func__ << " been reset at "
+           << recovery_state.get_last_peering_reset() << dendl;
+    return;
   }
+  m_scrubber->update_applied_notification(epoch_queued);
+}
 
-  if (recovery_state.get_acting().size() > 1) {
-    dout(10) << __func__ << "  comparing replica scrub maps" << dendl;
-
-    // Map from object with errors to good peer
-    map<hobject_t, list<pg_shard_t>> authoritative;
-
-    dout(2) << __func__ << get_primary() << " has "
-           << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
-
-    ss.str("");
-    ss.clear();
-
-    get_pgbackend()->be_compare_scrubmaps(
-      maps,
-      master_set,
-      state_test(PG_STATE_REPAIR),
-      scrubber.missing,
-      scrubber.inconsistent,
-      authoritative,
-      missing_digest,
-      scrubber.shallow_errors,
-      scrubber.deep_errors,
-      scrubber.store.get(),
-      info.pgid, recovery_state.get_acting(),
-      ss);
-    dout(2) << ss.str() << dendl;
-
-    if (!ss.str().empty()) {
-      osd->clog->error(ss);
-    }
-
-    for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
-        i != authoritative.end();
-        ++i) {
-      list<pair<ScrubMap::object, pg_shard_t> > good_peers;
-      for (list<pg_shard_t>::const_iterator j = i->second.begin();
-          j != i->second.end();
-          ++j) {
-       good_peers.emplace_back(maps[*j]->objects[i->first], *j);
-      }
-      scrubber.authoritative.emplace(i->first, good_peers);
-    }
-
-    for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
-        i != authoritative.end();
-        ++i) {
-      scrubber.cleaned_meta_map.objects.erase(i->first);
-      scrubber.cleaned_meta_map.objects.insert(
-       *(maps[i->second.back()]->objects.find(i->first))
-       );
-    }
+void PG::scrub_send_unblocking(epoch_t epoch_queued,
+                              [[maybe_unused]] ThreadPool::TPHandle& handle)
+{
+  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+  if (pg_has_reset_since(epoch_queued)) {
+    dout(10) << __func__ << " been reset at "
+           << recovery_state.get_last_peering_reset() << dendl;
+    return;
   }
+  m_scrubber->send_scrub_unblock();
+}
 
-  ScrubMap for_meta_scrub;
-  scrubber.clean_meta_map(for_meta_scrub);
-
-  // ok, do the pg-type specific scrubbing
-  scrub_snapshot_metadata(for_meta_scrub, missing_digest);
-  // Called here on the primary can use an authoritative map if it isn't the primary
-  _scan_snaps(for_meta_scrub);
-  if (!scrubber.store->empty()) {
-    if (state_test(PG_STATE_REPAIR)) {
-      dout(10) << __func__ << ": discarding scrub results" << dendl;
-      scrubber.store->flush(nullptr);
-    } else {
-      dout(10) << __func__ << ": updating scrub object" << dendl;
-      ObjectStore::Transaction t;
-      scrubber.store->flush(&t);
-      osd->store->queue_transaction(ch, std::move(t), nullptr);
-    }
-  }
+void PG::scrub_send_digest_update(epoch_t epoch_queued,
+                                 [[maybe_unused]] ThreadPool::TPHandle& handle)
+{
+  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+  m_scrubber->digest_update_notification();
 }
 
-bool PG::scrub_process_inconsistent()
+void PG::scrub_send_replmaps_ready(epoch_t epoch_queued,
+                                  [[maybe_unused]] ThreadPool::TPHandle& handle)
 {
-  dout(10) << __func__ << ": checking authoritative" << dendl;
-  bool repair = state_test(PG_STATE_REPAIR);
-  bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
-  const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
-  
-  // authoriative only store objects which missing or inconsistent.
-  if (!scrubber.authoritative.empty()) {
-    stringstream ss;
-    ss << info.pgid << " " << mode << " "
-       << scrubber.missing.size() << " missing, "
-       << scrubber.inconsistent.size() << " inconsistent objects";
-    dout(2) << ss.str() << dendl;
-    osd->clog->error(ss);
-    if (repair) {
-      state_clear(PG_STATE_CLEAN);
-      for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
-            scrubber.authoritative.begin();
-          i != scrubber.authoritative.end();
-          ++i) {
-       auto missing_entry = scrubber.missing.find(i->first);
-       if (missing_entry != scrubber.missing.end()) {
-          repair_object(
-            i->first,
-            i->second,
-           missing_entry->second);
-         scrubber.fixed += missing_entry->second.size();
-       }
-       if (scrubber.inconsistent.count(i->first)) {
-          repair_object(
-            i->first,
-            i->second,
-           scrubber.inconsistent[i->first]);
-         scrubber.fixed += missing_entry->second.size();
-       }
-      }
-    }
-  }
-  return (!scrubber.authoritative.empty() && repair);
+  dout(10) << __func__ << " queued at: " << epoch_queued << dendl;
+  m_scrubber->send_replica_maps_ready();
 }
 
-bool PG::ops_blocked_by_scrub() const {
+bool PG::ops_blocked_by_scrub() const
+{
   return (waiting_for_scrub.size() != 0);
 }
 
-// the part that actually finalizes a scrub
-void PG::scrub_finish() 
+Scrub::scrub_prio_t PG::is_scrub_blocking_ops() const
 {
-  dout(20) << __func__ << dendl;
-  bool repair = state_test(PG_STATE_REPAIR);
-  bool do_auto_scrub = false;
-  // if the repair request comes from auto-repair and large number of errors,
-  // we would like to cancel auto-repair
-  if (repair && scrubber.auto_repair
-      && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
-    state_clear(PG_STATE_REPAIR);
-    repair = false;
-  }
-  bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
-  const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
-
-  // if a regular scrub had errors within the limit, do a deep scrub to auto repair.
-  if (scrubber.deep_scrub_on_error
-      && scrubber.authoritative.size()
-      && scrubber.authoritative.size() <= cct->_conf->osd_scrub_auto_repair_num_errors) {
-    ceph_assert(!deep_scrub);
-    do_auto_scrub = true;
-    dout(20) << __func__ << " Try to auto repair after scrub errors" << dendl;
-  }
-  scrubber.deep_scrub_on_error = false;
-
-  // type-specific finish (can tally more errors)
-  _scrub_finish();
-
-  bool has_error = scrub_process_inconsistent();
-
-  {
-    stringstream oss;
-    oss << info.pgid.pgid << " " << mode << " ";
-    int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
-    if (total_errors)
-      oss << total_errors << " errors";
-    else
-      oss << "ok";
-    if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
-      oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
-          << " remaining deep scrub error details lost)";
-    if (repair)
-      oss << ", " << scrubber.fixed << " fixed";
-    if (total_errors)
-      osd->clog->error(oss);
-    else
-      osd->clog->debug(oss);
-  }
-
-  // Since we don't know which errors were fixed, we can only clear them
-  // when every one has been fixed.
-  if (repair) {
-    if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
-      ceph_assert(deep_scrub);
-      scrubber.shallow_errors = scrubber.deep_errors = 0;
-      dout(20) << __func__ << " All may be fixed" << dendl;
-    } else if (has_error) {
-      // Deep scrub in order to get corrected error counts
-      scrub_after_recovery = true;
-      save_req_scrub = scrubber.req_scrub;
-      dout(20) << __func__ << " Set scrub_after_recovery, req_scrub=" << save_req_scrub << dendl;
-    } else if (scrubber.shallow_errors || scrubber.deep_errors) {
-      // We have errors but nothing can be fixed, so there is no repair
-      // possible.
-      state_set(PG_STATE_FAILED_REPAIR);
-      dout(10) << __func__ << " " << (scrubber.shallow_errors + scrubber.deep_errors)
-              << " error(s) present with no repair possible" << dendl;
-    }
-  }
-
-  {
-    // finish up
-    ObjectStore::Transaction t;
-    recovery_state.update_stats(
-      [this, deep_scrub](auto &history, auto &stats) {
-       utime_t now = ceph_clock_now();
-       history.last_scrub = recovery_state.get_info().last_update;
-       history.last_scrub_stamp = now;
-       if (scrubber.deep) {
-         history.last_deep_scrub = recovery_state.get_info().last_update;
-         history.last_deep_scrub_stamp = now;
-       }
-
-       if (deep_scrub) {
-         if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
-           history.last_clean_scrub_stamp = now;
-         stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
-         stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
-         stats.stats.sum.num_large_omap_objects = scrubber.omap_stats.large_omap_objects;
-         stats.stats.sum.num_omap_bytes = scrubber.omap_stats.omap_bytes;
-         stats.stats.sum.num_omap_keys = scrubber.omap_stats.omap_keys;
-         dout(25) << "scrub_finish shard " << pg_whoami << " num_omap_bytes = "
-                  << stats.stats.sum.num_omap_bytes << " num_omap_keys = "
-                  << stats.stats.sum.num_omap_keys << dendl;
-       } else {
-         stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
-         // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
-         // because of deep-scrub errors
-         if (scrubber.shallow_errors == 0)
-           history.last_clean_scrub_stamp = now;
-       }
-       stats.stats.sum.num_scrub_errors =
-         stats.stats.sum.num_shallow_scrub_errors +
-         stats.stats.sum.num_deep_scrub_errors;
-       if (scrubber.check_repair) {
-         scrubber.check_repair = false;
-         if (info.stats.stats.sum.num_scrub_errors) {
-           state_set(PG_STATE_FAILED_REPAIR);
-           dout(10) << "scrub_finish " << info.stats.stats.sum.num_scrub_errors
-                    << " error(s) still present after re-scrub" << dendl;
-         }
-       }
-       return true;
-      },
-      &t);
-    int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
-    ceph_assert(tr == 0);
-  }
-
-  if (has_error) {
-    queue_peering_event(
-      PGPeeringEventRef(
-       std::make_shared<PGPeeringEvent>(
-         get_osdmap_epoch(),
-         get_osdmap_epoch(),
-         PeeringState::DoRecovery())));
-  }
-
-  scrub_clear_state(has_error);
-  scrub_unreserve_replicas();
-
-  if (do_auto_scrub) {
-    scrub_requested(false, false, true);
-  }
-
-  if (is_active() && is_primary()) {
-    recovery_state.share_pg_info();
-  }
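+  // a scrub that is already blocking client ops should be requeued at
+  // high priority (see scrub_requeue_priority())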
+  return waiting_for_scrub.size() ? Scrub::scrub_prio_t::high_priority
+                                 : Scrub::scrub_prio_t::low_priority;
 }
 
 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
 {
-  if (get_last_peering_reset() > reply_epoch ||
-      get_last_peering_reset() > query_epoch) {
-    dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
-            << " last_peering_reset " << get_last_peering_reset()
-            << dendl;
+  if (auto last_reset = get_last_peering_reset();
+      last_reset > reply_epoch || last_reset > query_epoch) {
+    dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch "
+            << query_epoch << " last_peering_reset " << last_reset << dendl;
     return true;
   }
   return false;
@@ -3453,24 +2271,12 @@ bool PG::try_flush_or_schedule_async()
 ostream& operator<<(ostream& out, const PG& pg)
 {
   out << pg.recovery_state;
-  if (pg.scrubber.must_repair)
-    out << " MUST_REPAIR";
-  if (pg.scrubber.auto_repair)
-    out << " AUTO_REPAIR";
-  if (pg.scrubber.check_repair)
-    out << " CHECK_REPAIR";
-  if (pg.scrubber.deep_scrub_on_error)
-    out << " DEEP_SCRUB_ON_ERROR";
-  if (pg.scrubber.must_deep_scrub)
-    out << " MUST_DEEP_SCRUB";
-  if (pg.scrubber.must_scrub)
-    out << " MUST_SCRUB";
-  if (pg.scrubber.time_for_deep)
-    out << " TIME_FOR_DEEP";
-  if (pg.scrubber.need_auto)
-    out << " NEED_AUTO";
-  if (pg.scrubber.req_scrub)
-    out << " REQ_SCRUB";
+
+  // listing all scrub-related flags - both current and "planned next scrub"
+  if (pg.is_scrubbing()) {
+    out << *pg.m_scrubber;
+  }
+  out << pg.m_planned_scrub;
 
   if (pg.recovery_ops_active)
     out << " rops=" << pg.recovery_ops_active;
@@ -3596,15 +2402,19 @@ bool PG::can_discard_replica_op(OpRequestRef& op)
   // resets the messenger session when the replica reconnects. To avoid the
   // out-of-order replies, the messages from that replica should be discarded.
   OSDMapRef next_map = osd->get_next_osdmap();
-  if (next_map->is_down(from))
+  if (next_map->is_down(from)) {
+    dout(20) << " " << __func__ << " dead for nextmap is down " << from << dendl;
     return true;
+  }
   /* Mostly, this overlaps with the old_peering_msg
    * condition.  An important exception is pushes
    * sent by replicas not in the acting set, since
    * if such a replica goes down it does not cause
    * a new interval. */
-  if (next_map->get_down_at(from) >= m->map_epoch)
+  if (next_map->get_down_at(from) >= m->map_epoch) {
+    dout(20) << " " << __func__ << " dead for 'get_down_at' " << from << dendl;
     return true;
+  }
 
   // same pg?
   //  if pg changes _at all_, we reset and repeer!
@@ -3798,45 +2608,6 @@ void PG::handle_initialize(PeeringCtx &rctx)
   recovery_state.handle_event(evt, &rctx);
 }
 
-void PG::Scrubber::dump(Formatter *f)
-{
-  f->open_object_section("scrubber");
-  f->dump_stream("epoch_start") << epoch_start;
-  f->dump_bool("active", active);
-  if (active) {
-    f->dump_string("state", state_string(state));
-    f->dump_stream("start") << start;
-    f->dump_stream("end") << end;
-    f->dump_stream("max_end") << max_end;
-    f->dump_stream("subset_last_update") << subset_last_update;
-    f->dump_bool("deep", deep);
-    f->dump_bool("must_scrub", must_scrub);
-    f->dump_bool("must_deep_scrub", must_deep_scrub);
-    f->dump_bool("must_repair", must_repair);
-    f->dump_bool("need_auto", need_auto);
-    f->dump_bool("req_scrub", req_scrub);
-    f->dump_bool("time_for_deep", time_for_deep);
-    f->dump_bool("auto_repair", auto_repair);
-    f->dump_bool("check_repair", check_repair);
-    f->dump_bool("deep_scrub_on_error", deep_scrub_on_error);
-    f->dump_stream("scrub_reg_stamp") << scrub_reg_stamp; //utime_t
-    f->dump_stream("waiting_on_whom") << waiting_on_whom; //set<pg_shard_t>
-    f->dump_unsigned("priority", priority);
-    f->dump_int("shallow_errors", shallow_errors);
-    f->dump_int("deep_errors", deep_errors);
-    f->dump_int("fixed", fixed);
-    {
-      f->open_array_section("waiting_on_whom");
-      for (set<pg_shard_t>::iterator p = waiting_on_whom.begin();
-          p != waiting_on_whom.end();
-          ++p) {
-       f->dump_stream("shard") << *p;
-      }
-      f->close_section();
-    }
-  }
-  f->close_section();
-}
 
 void PG::handle_query_state(Formatter *f)
 {
@@ -3846,27 +2617,8 @@ void PG::handle_query_state(Formatter *f)
 
   // This code has moved to after the close of recovery_state array.
   // I don't think that scrub is a recovery state
-  if (is_primary() && is_active()) {
-    f->open_object_section("scrub");
-    f->dump_stream("scrubber.epoch_start") << scrubber.epoch_start;
-    f->dump_bool("scrubber.active", scrubber.active);
-    f->dump_string("scrubber.state", PG::Scrubber::state_string(scrubber.state));
-    f->dump_stream("scrubber.start") << scrubber.start;
-    f->dump_stream("scrubber.end") << scrubber.end;
-    f->dump_stream("scrubber.max_end") << scrubber.max_end;
-    f->dump_stream("scrubber.subset_last_update") << scrubber.subset_last_update;
-    f->dump_bool("scrubber.deep", scrubber.deep);
-    {
-      f->open_array_section("scrubber.waiting_on_whom");
-      for (set<pg_shard_t>::iterator p = scrubber.waiting_on_whom.begin();
-          p != scrubber.waiting_on_whom.end();
-          ++p) {
-       f->dump_stream("shard") << *p;
-      }
-      f->close_section();
-    }
-    f->dump_string("comment", "DEPRECATED - may be removed in the next release");
-    f->close_section();
+  if (is_primary() && is_active() && m_scrubber->is_scrub_active()) {
+    m_scrubber->handle_query_state(f);
   }
 }
 
index 69f631394ef98895a2cb152752991925349e9fc6..9119b2979380e8403e3c61056c93abb58a03cffd 100644 (file)
@@ -177,8 +177,13 @@ class PG : public DoutPrefixProvider, public PeeringState::PeeringListener {
 public:
   const pg_shard_t pg_whoami;
   const spg_t pg_id;
+
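+  /// the object handling the scrub operation; instantiated by the concrete
+  /// PG type (see PrimaryLogPG's constructor)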
+  std::unique_ptr<ScrubPgIF> m_scrubber;
+
   /// flags detailing scheduling/operation characteristics of the next scrub 
   requested_scrub_t m_planned_scrub;
+  /// scrubbing state for both Primary & replicas
+  bool is_scrub_active() const { return m_scrubber->is_scrub_active(); }
 
 public:
   // -- members --
@@ -375,14 +380,27 @@ public:
                          ObjectStore::Transaction &t);
 
   void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
+  /**
+   *  a special version of PG::scrub(), which:
+   *  - is initiated after repair, and
+   *  - is not required to allocate local/remote OSD scrub resources
+   */
+  void recovery_scrub(epoch_t queued, ThreadPool::TPHandle &handle);
+  void replica_scrub(epoch_t queued, ThreadPool::TPHandle &handle);
+  void replica_scrub_resched(epoch_t queued, ThreadPool::TPHandle &handle);
 
   /// Queues a PGScrubResourcesOK message. Will translate into 'RemotesReserved' FSM event
   void scrub_send_resources_granted(epoch_t queued, ThreadPool::TPHandle &handle);
   void scrub_send_resources_denied(epoch_t queued, ThreadPool::TPHandle &handle);
+  void scrub_send_scrub_resched(epoch_t queued, ThreadPool::TPHandle &handle);
+  void scrub_send_pushes_update(epoch_t queued, ThreadPool::TPHandle &handle);
+  void scrub_send_applied_update(epoch_t queued, ThreadPool::TPHandle &handle);
+  void scrub_send_unblocking(epoch_t epoch_queued, ThreadPool::TPHandle &handle);
+  void scrub_send_digest_update(epoch_t epoch_queued, ThreadPool::TPHandle &handle);
+  void scrub_send_replmaps_ready(epoch_t epoch_queued, ThreadPool::TPHandle &handle);
+  void scrub_send_replica_pushes(epoch_t queued, ThreadPool::TPHandle &handle);
 
-  bool is_scrub_registered();
   void reg_next_scrub();
-  void unreg_next_scrub();
 
   void queue_want_pg_temp(const std::vector<int> &wanted) override;
   void clear_want_pg_temp() override;
@@ -398,7 +416,7 @@ public:
 
   void on_info_history_change() override;
 
-  void scrub_requested(bool deep, bool repair, bool need_auto = false) override;
+  void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) override;
 
   uint64_t get_snap_trimq_size() const override {
     return snap_trimq.size();
@@ -444,13 +462,7 @@ public:
     return finish_recovery();
   }
 
-  void on_activate(interval_set<snapid_t> snaps) override {
-    ceph_assert(scrubber.callbacks.empty());
-    ceph_assert(callbacks_for_degraded_object.empty());
-    snap_trimq = snaps;
-    release_pg_backoffs();
-    projected_last_update = info.last_update;
-  }
+  void on_activate(interval_set<snapid_t> snaps) override;
 
   void on_activate_committed() override;
 
@@ -526,14 +538,37 @@ public:
   void shutdown();
   virtual void on_shutdown() = 0;
 
-  bool get_must_scrub() const {
-    return scrubber.must_scrub;
-  }
+  bool get_must_scrub() const;
   bool sched_scrub();
 
   unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const;
   /// the version that refers to flags_.priority
   unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const;
+private:
+  // auxiliaries used by sched_scrub():
+  double next_deepscrub_interval() const;
+
+  /// should we perform deep scrub?
+  bool is_time_for_deep(bool allow_deep_scrub,
+                       bool allow_scrub,
+                       bool has_deep_errors,
+                       const requested_scrub_t& planned) const;
+
+  /**
+   * Verify the various 'next scrub' flags in m_planned_scrub against configuration
+   * and scrub-related timestamps.
+   *
+   * @returns an updated copy of the m_planned_scrub flags (or std::nullopt if no scrubbing is due)
+   */
+  std::optional<requested_scrub_t> verify_scrub_mode() const;
+
+  bool verify_periodic_scrub_mode(bool allow_deep_scrub,
+                                 bool try_to_auto_repair,
+                                 bool allow_regular_scrub,
+                                 bool has_deep_errors,
+                                 requested_scrub_t& planned) const;
+
+public:
   virtual void do_request(
     OpRequestRef& op,
     ThreadPool::TPHandle &handle
@@ -946,7 +981,7 @@ protected:
       pg->get_pgbackend()->trim(entry, t);
     }
   };
-  
+
   void update_object_snap_mapping(
     ObjectStore::Transaction *t, const hobject_t &soid,
     const std::set<snapid_t> &snaps);
@@ -1013,248 +1048,23 @@ public:
     hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
     release_backoffs(begin, end);
   }
-protected:
 
   // -- scrub --
-public:
-  struct Scrubber {
-    Scrubber();
-    ~Scrubber();
-
-    // metadata
-    std::set<pg_shard_t> reserved_peers;
-    bool local_reserved, remote_reserved, reserve_failed;
-    epoch_t epoch_start;
-
-    // common to both scrubs
-    bool active;
-    std::set<pg_shard_t> waiting_on_whom;
-    int shallow_errors;
-    int deep_errors;
-    int fixed;
-    ScrubMap primary_scrubmap;
-    ScrubMapBuilder primary_scrubmap_pos;
-    epoch_t replica_scrub_start = 0;
-    ScrubMap replica_scrubmap;
-    ScrubMapBuilder replica_scrubmap_pos;
-    std::map<pg_shard_t, ScrubMap> received_maps;
-    OpRequestRef active_rep_scrub;
-    utime_t scrub_reg_stamp;  // stamp we registered for
-
-    static utime_t scrub_must_stamp() { return utime_t(0,1); }
-
-    omap_stat_t omap_stats  = (const struct omap_stat_t){ 0 };
-
-    // For async sleep
-    bool sleeping = false;
-    bool needs_sleep = true;
-    utime_t sleep_start;
-
-    // flags to indicate explicitly requested scrubs (by admin)
-    bool must_scrub, must_deep_scrub, must_repair, need_auto, req_scrub;
-
-    // Priority to use for scrub scheduling
-    unsigned priority = 0;
-
-    bool time_for_deep;
-    // this flag indicates whether we would like to do auto-repair of the PG or not
-    bool auto_repair;
-    // this flag indicates that we are scrubbing post repair to verify everything is fixed
-    bool check_repair;
-    // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
-    // we should deep scrub in order to auto repair
-    bool deep_scrub_on_error;
-
-    // Maps from objects with errors to missing/inconsistent peers
-    std::map<hobject_t, std::set<pg_shard_t>> missing;
-    std::map<hobject_t, std::set<pg_shard_t>> inconsistent;
-
-    // Std::map from object with errors to good peers
-    std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t> >> authoritative;
-
-    // Cleaned std::map pending snap metadata scrub
-    ScrubMap cleaned_meta_map;
-
-    void clean_meta_map(ScrubMap &for_meta_scrub) {
-      if (end.is_max() ||
-          cleaned_meta_map.objects.empty()) {
-         cleaned_meta_map.swap(for_meta_scrub);
-      } else {
-        auto iter = cleaned_meta_map.objects.end();
-        --iter; // not empty, see if clause
-        auto begin = cleaned_meta_map.objects.begin();
-        if (iter->first.has_snapset()) {
-          ++iter;
-        } else {
-          while (iter != begin) {
-            auto next = iter--;
-            if (next->first.get_head() != iter->first.get_head()) {
-             ++iter;
-             break;
-            }
-          }
-        }
-        for_meta_scrub.objects.insert(begin, iter);
-        cleaned_meta_map.objects.erase(begin, iter);
-      }
-    }
-
-    // digest updates which we are waiting on
-    int num_digest_updates_pending;
-
-    // chunky scrub
-    hobject_t start, end;    // [start,end)
-    hobject_t max_end;       // Largest end that may have been sent to replicas
-    eversion_t subset_last_update;
-
-    // chunky scrub state
-    enum State {
-      INACTIVE,
-      NEW_CHUNK,
-      WAIT_PUSHES,
-      WAIT_LAST_UPDATE,
-      BUILD_MAP,
-      BUILD_MAP_DONE,
-      WAIT_REPLICAS,
-      COMPARE_MAPS,
-      WAIT_DIGEST_UPDATES,
-      FINISH,
-      BUILD_MAP_REPLICA,
-    } state;
-
-    std::unique_ptr<Scrub::Store> store;
-    // deep scrub
-    bool deep;
-    int preempt_left;
-    int preempt_divisor;
-
-    std::list<Context*> callbacks;
-    void add_callback(Context *context) {
-      callbacks.push_back(context);
-    }
-    void run_callbacks() {
-      std::list<Context*> to_run;
-      to_run.swap(callbacks);
-      for (std::list<Context*>::iterator i = to_run.begin();
-          i != to_run.end();
-          ++i) {
-       (*i)->complete(0);
-      }
-    }
-
-    static const char *state_string(const PG::Scrubber::State& state) {
-      const char *ret = NULL;
-      switch( state )
-      {
-        case INACTIVE: ret = "INACTIVE"; break;
-        case NEW_CHUNK: ret = "NEW_CHUNK"; break;
-        case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
-        case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
-        case BUILD_MAP: ret = "BUILD_MAP"; break;
-        case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
-        case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
-        case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
-        case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
-        case FINISH: ret = "FINISH"; break;
-        case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
-      }
-      return ret;
-    }
-
-    bool is_chunky_scrub_active() const { return state != INACTIVE; }
-
-    // clear all state
-    void reset() {
-      active = false;
-      waiting_on_whom.clear();
-      if (active_rep_scrub) {
-        active_rep_scrub = OpRequestRef();
-      }
-      received_maps.clear();
-
-      must_scrub = false;
-      must_deep_scrub = false;
-      must_repair = false;
-      need_auto = false;
-      req_scrub = false;
-      time_for_deep = false;
-      auto_repair = false;
-      check_repair = false;
-      deep_scrub_on_error = false;
-
-      state = PG::Scrubber::INACTIVE;
-      start = hobject_t();
-      end = hobject_t();
-      max_end = hobject_t();
-      subset_last_update = eversion_t();
-      shallow_errors = 0;
-      deep_errors = 0;
-      fixed = 0;
-      omap_stats = (const struct omap_stat_t){ 0 };
-      deep = false;
-      run_callbacks();
-      inconsistent.clear();
-      missing.clear();
-      authoritative.clear();
-      num_digest_updates_pending = 0;
-      primary_scrubmap = ScrubMap();
-      primary_scrubmap_pos.reset();
-      replica_scrubmap = ScrubMap();
-      replica_scrubmap_pos.reset();
-      cleaned_meta_map = ScrubMap();
-      sleeping = false;
-      needs_sleep = true;
-      sleep_start = utime_t();
-    }
-
-    void create_results(const hobject_t& obj);
-    void cleanup_store(ObjectStore::Transaction *t);
-    void dump(ceph::Formatter *f);
-  } scrubber;
-
 protected:
   bool scrub_after_recovery;
-  bool save_req_scrub; // Saved for scrub_after_recovery
 
   int active_pushes;
 
-  bool scrub_can_preempt = false;
-  bool scrub_preempted = false;
-
-  // we allow some number of preemptions of the scrub, which mean we do
-  // not block.  then we start to block.  once we start blocking, we do
-  // not stop until the scrub range is completed.
-  bool write_blocked_by_scrub(const hobject_t &soid);
-
-  /// true if the given range intersects the scrub interval in any way
-  bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);
-
   void repair_object(
     const hobject_t &soid,
     const std::list<std::pair<ScrubMap::object, pg_shard_t> > &ok_peers,
     const std::set<pg_shard_t> &bad_peers);
 
-  void abort_scrub();
-  void chunky_scrub(ThreadPool::TPHandle &handle);
-  void scrub_compare_maps();
-  /**
-   * return true if any inconsistency/missing is repaired, false otherwise
-   */
-  bool scrub_process_inconsistent();
-  bool ops_blocked_by_scrub() const;
-  void scrub_finish();
-  void scrub_clear_state(bool keep_repair = false);
-  void _scan_snaps(ScrubMap &map);
+  [[nodiscard]] bool ops_blocked_by_scrub() const;
+  [[nodiscard]] Scrub::scrub_prio_t is_scrub_blocking_ops() const;
+
   void _repair_oinfo_oid(ScrubMap &map);
   void _scan_rollback_obs(const std::vector<ghobject_t> &rollback_obs);
-  void _request_scrub_map(pg_shard_t replica, eversion_t version,
-                          hobject_t start, hobject_t end, bool deep,
-                         bool allow_preemption);
-  int build_scrub_map_chunk(
-    ScrubMap &map,
-    ScrubMapBuilder &pos,
-    hobject_t start, hobject_t end, bool deep,
-    ThreadPool::TPHandle &handle);
   /**
    * returns true if [begin, end) is good to scrub at this time
    * a false return value obliges the implementer to requeue scrub when the
@@ -1262,27 +1072,12 @@ protected:
    */
   virtual bool _range_available_for_scrub(
     const hobject_t &begin, const hobject_t &end) = 0;
-  virtual void scrub_snapshot_metadata(
-    ScrubMap &map,
-    const std::map<hobject_t,
-                   std::pair<std::optional<uint32_t>,
-                        std::optional<uint32_t>>> &missing_digest) { }
-  virtual void _scrub_clear_state() { }
-  virtual void _scrub_finish() { }
-  void clear_scrub_reserved();
-  void scrub_reserve_replicas();
-  void scrub_unreserve_replicas();
-  bool scrub_all_replicas_reserved() const;
-
-  void replica_scrub(
-    OpRequestRef op,
-    ThreadPool::TPHandle &handle);
-  void do_replica_scrub_map(OpRequestRef op);
-
-  void handle_scrub_reserve_request(OpRequestRef op);
-  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
-  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
-  void handle_scrub_reserve_release(OpRequestRef op);
+
+  /**
+   * Initiate the process that will create our scrub map for the Primary.
+   * (triggered by MSG_OSD_REP_SCRUB)
+   */
+  void replica_scrub(OpRequestRef op, ThreadPool::TPHandle &handle);
 
   // -- recovery state --
 
@@ -1332,7 +1127,7 @@ protected:
   bool is_clean() const { return recovery_state.is_clean(); }
   bool is_degraded() const { return recovery_state.is_degraded(); }
   bool is_undersized() const { return recovery_state.is_undersized(); }
-  bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
+  bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); } // Primary only
   bool is_remapped() const { return recovery_state.is_remapped(); }
   bool is_peered() const { return recovery_state.is_peered(); }
   bool is_recovering() const { return recovery_state.is_recovering(); }
@@ -1395,10 +1190,10 @@ protected:
 
   virtual void kick_snap_trim() = 0;
   virtual void snap_trimmer_scrub_complete() = 0;
-  bool requeue_scrub(bool high_priority = false);
+
   void queue_recovery();
-  bool queue_scrub();
-  unsigned get_scrub_priority();
+  void queue_scrub_after_repair();
+  unsigned int get_scrub_priority();
 
   bool try_flush_or_schedule_async() override;
   void start_flush_on_transaction(
index d036c1d00a6ea5b7041d4a302eb68198d9a00175..98b8fc561c9ab342d18182d8749cb8cfff2e8157 100644 (file)
@@ -263,7 +263,7 @@ public:
     /// Notify that info/history changed (generally to update scrub registration)
     virtual void on_info_history_change() = 0;
     /// Notify that a scrub has been requested
-    virtual void scrub_requested(bool deep, bool repair, bool need_auto = false) = 0;
+    virtual void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) = 0;
 
     /// Return current snap_trimq size
     virtual uint64_t get_snap_trimq_size() const = 0;
@@ -502,12 +502,12 @@ public:
   };
 
   struct RequestScrub : boost::statechart::event<RequestScrub> {
-    bool deep;
-    bool repair;
-    explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {}
+    scrub_level_t deep;
+    scrub_type_t repair;
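+    // note: still constructed from booleans - presumably to leave existing
+    // call-sites unchanged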
+    explicit RequestScrub(bool d, bool r) : deep(scrub_level_t(d)), repair(scrub_type_t(r)) {}
     void print(std::ostream *out) const {
-      *out << "RequestScrub(" << (deep ? "deep" : "shallow")
-          << (repair ? " repair" : "");
+      *out << "RequestScrub(" << ((deep==scrub_level_t::deep) ? "deep" : "shallow")
+          << ((repair==scrub_type_t::do_repair) ? " repair)" : ")");
     }
   };
 
index e06001401b76e9c1715f25c46d64450b952af602..edba0886707d5dc1d6326f850b09558fd4d2af48 100644 (file)
@@ -18,6 +18,7 @@
 #include "boost/tuple/tuple.hpp"
 #include "boost/intrusive_ptr.hpp"
 #include "PG.h"
+#include "pg_scrubber.h"
 #include "PrimaryLogPG.h"
 #include "OSD.h"
 #include "OpRequest.h"
@@ -937,7 +938,7 @@ PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter)
     if (r != 0) {
       derr << "Error opening class '" << class_name << "': "
            << cpp_strerror(r) << dendl;
-      if (r != -EPERM) // propogate permission error
+      if (r != -EPERM) // propagate permission error
         r = -EINVAL;
       return { r, nullptr };
     } else {
@@ -1010,7 +1011,7 @@ void PrimaryLogPG::do_command(
     f->close_section();
 
     if (is_primary() && is_active()) {
-      scrubber.dump(f.get());
+      m_scrubber->dump(f.get());
     }
 
     f->open_object_section("agent_state");
@@ -1591,24 +1592,24 @@ int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
     dout(10) << " corrupted scrub_ls_arg_t" << dendl;
     return -EINVAL;
   }
+
   int r = 0;
   scrub_ls_result_t result = {.interval = info.history.same_interval_since};
+
   if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
     r = -EAGAIN;
-  } else if (!scrubber.store) {
-    r = -ENOENT;
-  } else if (arg.get_snapsets) {
-    result.vals = scrubber.store->get_snap_errors(get_pgid().pool(),
-                                                 arg.start_after,
-                                                 arg.max_return);
   } else {
-    result.vals = scrubber.store->get_object_errors(get_pgid().pool(),
-                                                   arg.start_after,
-                                                   arg.max_return);
+    bool store_queried = m_scrubber->get_store_errors(arg, result);
+    if (!store_queried) {
+      // the scrubber's store is not initialized
+      r = -ENOENT;
+    }
   }
-  encode(result, osd_op->outdata);
+  encode(result, osd_op->outdata);  // RRR really? even if no store?
+
   return r;
 }
 
 PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool,
@@ -1621,11 +1622,14 @@ PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
   new_backfill(false),
   temp_seq(0),
   snap_trimmer_machine(this)
-{ 
+{
   recovery_state.set_backend_predicates(
     pgbackend->get_is_readable_predicate(),
     pgbackend->get_is_recoverable_predicate());
   snap_trimmer_machine.initiate();
+
+  m_scrubber = make_unique<PgScrubber>(this); // *not* the final code
+  // next commit: m_scrubber = make_unique<PrimaryLogScrub>(this);
 }
 
 void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
@@ -1790,16 +1794,16 @@ void PrimaryLogPG::do_request(
       auto m = op->get_req<MOSDScrubReserve>();
       switch (m->type) {
       case MOSDScrubReserve::REQUEST:
-       handle_scrub_reserve_request(op);
+       m_scrubber->handle_scrub_reserve_request(op);
        break;
       case MOSDScrubReserve::GRANT:
-       handle_scrub_reserve_grant(op, m->from);
+       m_scrubber->handle_scrub_reserve_grant(op, m->from);
        break;
       case MOSDScrubReserve::REJECT:
-       handle_scrub_reserve_reject(op, m->from);
+       m_scrubber->handle_scrub_reserve_reject(op, m->from);
        break;
       case MOSDScrubReserve::RELEASE:
-       handle_scrub_reserve_release(op);
+       m_scrubber->handle_scrub_reserve_release(op);
        break;
       }
     }
@@ -2051,7 +2055,7 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
       return;
     }
 
-    if (scrubber.is_chunky_scrub_active() && write_blocked_by_scrub(head)) {
+    if (m_scrubber->is_scrub_active() && m_scrubber->write_blocked_by_scrub(head)) {
       dout(20) << __func__ << ": waiting for scrub" << dendl;
       waiting_for_scrub.push_back(op);
       op->mark_delayed("waiting for scrub");
@@ -2416,7 +2420,7 @@ PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
        return cache_result_t::BLOCKED_RECOVERY;
       }
 
-      if (write_blocked_by_scrub(head)) {
+      if (m_scrubber->write_blocked_by_scrub(head)) {
        dout(20) << __func__ << ": waiting for scrub" << dendl;
        waiting_for_scrub.push_back(op);
        op->mark_delayed("waiting for scrub");
@@ -3750,7 +3754,7 @@ void PrimaryLogPG::promote_object(ObjectContextRef obc,
 {
   hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
   ceph_assert(hoid != hobject_t());
-  if (write_blocked_by_scrub(hoid)) {
+  if (m_scrubber->write_blocked_by_scrub(hoid)) {
     dout(10) << __func__ << " " << hoid
             << " blocked by scrub" << dendl;
     if (op) {
@@ -8668,16 +8672,7 @@ void PrimaryLogPG::apply_stats(
     }
   }
 
-  if (is_primary() && scrubber.active) {
-    if (soid < scrubber.start) {
-      dout(20) << __func__ << " " << soid << " < [" << scrubber.start
-              << "," << scrubber.end << ")" << dendl;
-      scrub_cstat.add(delta_stats);
-    } else {
-      dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
-              << "," << scrubber.end << ")" << dendl;
-    }
-  }
+  m_scrubber->stats_of_handled_objects(delta_stats, soid);
 }
 
 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
@@ -10579,7 +10574,7 @@ int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
   }
 
   if (!fop->blocking &&
-      write_blocked_by_scrub(oid)) {
+      m_scrubber->write_blocked_by_scrub(oid)) {
     if (fop->op) {
       dout(10) << __func__ << " blocked by scrub" << dendl;
       requeue_op(fop->op);
@@ -10796,15 +10791,9 @@ void PrimaryLogPG::op_applied(const eversion_t &applied_version)
   ceph_assert(applied_version != eversion_t());
   ceph_assert(applied_version <= info.last_update);
   recovery_state.local_write_applied(applied_version);
-  if (is_primary()) {
-    if (scrubber.active) {
-      if (recovery_state.get_last_update_applied() >=
-       scrubber.subset_last_update) {
-       requeue_scrub(ops_blocked_by_scrub());
-      }
-    } else {
-      ceph_assert(scrubber.start == scrubber.end);
-    }
+
+  if (is_primary() && m_scrubber->should_requeue_blocked_ops(recovery_state.get_last_update_applied())) {
+    osd->queue_scrub_applied_update(this, is_scrub_blocking_ops());
   }
 }
 
@@ -11231,11 +11220,11 @@ void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
     return;
   }
 
-  if (write_blocked_by_scrub(obc->obs.oi.soid)) {
+  if (m_scrubber->write_blocked_by_scrub(obc->obs.oi.soid)) {
     dout(10) << "handle_watch_timeout waiting for scrub on obj "
             << obc->obs.oi.soid
             << dendl;
-    scrubber.add_callback(
+    m_scrubber->add_callback(
       watch->get_delayed_cb() // This callback!
       );
     return;
@@ -11693,11 +11682,15 @@ void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
   }
 
   if (obc->requeue_scrub_on_unblock) {
+
     obc->requeue_scrub_on_unblock = false;
+
+    dout(20) << __func__ << " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl;
+
     // only requeue if we are still active: we may be unblocking
     // because we are resetting for a new peering interval
     if (is_active()) {
-      requeue_scrub();
+      osd->queue_scrub_unblocking(this, is_scrub_blocking_ops());
     }
   }
 }
@@ -11932,9 +11925,10 @@ void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
   --active_pushes;
 
   // requeue an active chunky scrub waiting on recovery ops
-  if (!recovery_state.is_deleting() && active_pushes == 0
-      && scrubber.is_chunky_scrub_active()) {
-    requeue_scrub(ops_blocked_by_scrub());
+  if (!recovery_state.is_deleting() && active_pushes == 0 &&
+      m_scrubber->is_scrub_active()) {
+
+    osd->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
   }
 }
 
@@ -11944,20 +11938,11 @@ void PrimaryLogPG::_applied_recovered_object_replica()
   ceph_assert(active_pushes >= 1);
   --active_pushes;
 
-  // requeue an active chunky scrub waiting on recovery ops
+  // requeue an active scrub waiting on recovery ops
   if (!recovery_state.is_deleting() && active_pushes == 0 &&
-      scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
-       scrubber.active_rep_scrub->get_req())->chunky) {
-    auto& op = scrubber.active_rep_scrub;
-    osd->enqueue_back(
-      OpSchedulerItem(
-        unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, op)),
-       op->get_req()->get_cost(),
-       op->get_req()->get_priority(),
-       op->get_req()->get_recv_stamp(),
-       op->get_req()->get_source().num(),
-       get_osdmap_epoch()));
-    scrubber.active_rep_scrub.reset();
+      m_scrubber->is_scrub_active()) {
+
+    osd->queue_scrub_replica_pushes(this, m_scrubber->replica_op_priority());
   }
 }
 
@@ -12366,10 +12351,9 @@ void PrimaryLogPG::on_shutdown()
     osd->clear_queued_recovery(this);
   }
 
-  clear_scrub_reserved();
-  scrub_clear_state();
+  m_scrubber->scrub_clear_state();
 
-  unreg_next_scrub();
+  m_scrubber->unreg_next_scrub();
 
   vector<ceph_tid_t> tids;
   cancel_copy_ops(false, &tids);
@@ -12488,7 +12472,7 @@ void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
   requeue_ops(waiting_for_active);
   requeue_ops(waiting_for_readable);
 
-  clear_scrub_reserved();
+  m_scrubber->clear_scrub_reservations();
 
   vector<ceph_tid_t> tids;
   cancel_copy_ops(is_primary(), &tids);
@@ -12518,7 +12502,7 @@ void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
   }
 
   // requeues waiting_for_scrub
-  scrub_clear_state();
+  m_scrubber->scrub_clear_state();
 
   for (auto p = waiting_for_blocked_object.begin();
        p != waiting_for_blocked_object.end();
@@ -12561,7 +12545,7 @@ void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
   context_registry_on_change();
 
   pgbackend->on_change_cleanup(&t);
-  scrubber.cleanup_store(&t);
+  m_scrubber->cleanup_store(&t);
   pgbackend->on_change();
 
   // clear snap_trimmer state
@@ -12613,6 +12597,8 @@ void PrimaryLogPG::_clear_recovery_state()
 #ifdef DEBUG_RECOVERY_OIDS
   recovering_oids.clear();
 #endif
+  dout(15) << __func__ << " flags: " << m_planned_scrub << dendl;
+
   last_backfill_started = hobject_t();
   set<hobject_t>::iterator i = backfills_in_flight.begin();
   while (i != backfills_in_flight.end()) {
@@ -13884,7 +13870,7 @@ void PrimaryLogPG::hit_set_remove_all()
     // Once we hit a degraded object just skip
     if (is_degraded_or_backfilling_object(aoid))
       return;
-    if (write_blocked_by_scrub(aoid))
+    if (m_scrubber->write_blocked_by_scrub(aoid))
       return;
   }
 
@@ -14004,7 +13990,7 @@ void PrimaryLogPG::hit_set_persist()
     // Once we hit a degraded object just skip further trim
     if (is_degraded_or_backfilling_object(aoid))
       return;
-    if (write_blocked_by_scrub(aoid))
+    if (m_scrubber->write_blocked_by_scrub(aoid))
       return;
   }
 
@@ -14037,7 +14023,7 @@ void PrimaryLogPG::hit_set_persist()
     new_hset.using_gmt);
 
   // If the current object is degraded we skip this persist request
-  if (write_blocked_by_scrub(oid))
+  if (m_scrubber->write_blocked_by_scrub(oid))
     return;
 
   hit_set->seal();
@@ -14284,7 +14270,7 @@ bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
       osd->logger->inc(l_osd_agent_skip);
       continue;
     }
-    if (range_intersects_scrub(obc->obs.oi.soid,
+    if (m_scrubber->range_intersects_scrub(obc->obs.oi.soid,
                               obc->obs.oi.soid.get_head())) {
       dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
       osd->logger->inc(l_osd_agent_skip);
@@ -14487,7 +14473,7 @@ bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
     return false;
   }
   // This is already checked by agent_work() which passes after_flush = false
-  if (after_flush && range_intersects_scrub(soid, soid.get_head())) {
+  if (after_flush && m_scrubber->range_intersects_scrub(soid, soid.get_head())) {
       dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
       return false;
   }
@@ -14911,9 +14897,20 @@ bool PrimaryLogPG::already_complete(eversion_t v)
 // ==========================================================================================
 // SCRUB
 
+void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op)
+{
+  dout(15) << __func__ << " is scrub active? " << m_scrubber->is_scrub_active() << dendl;
+  op->mark_started();
+
+  if (!m_scrubber->is_scrub_active()) {
+    dout(10) << __func__ << " scrub isn't active" << dendl;
+    return;
+  }
+  m_scrubber->map_from_replica(op);
+}
 
-bool PrimaryLogPG::_range_available_for_scrub(
-  const hobject_t &begin, const hobject_t &end)
+bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin,
+                                             const hobject_t& end)
 {
   pair<hobject_t, ObjectContextRef> next;
   next.second = object_contexts.lookup(begin);
@@ -15526,7 +15523,7 @@ boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
     ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
     return discard_event();
   }
-  if (pg->scrubber.active) {
+  if (pg->m_scrubber->is_scrub_active()) {
     ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
     return transit< WaitScrub >();
   } else {
@@ -15741,6 +15738,10 @@ bool PrimaryLogPG::check_failsafe_full() {
     return osd->check_failsafe_full(get_dpp());
 }
 
+bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid)
+{
+  return m_scrubber->write_blocked_by_scrub(oid);
+}
 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
 
index bc682332dba058a1f72932f720851194c81e21a6..a85c4f85ed475d685c51307971564ad9a7ef970a 100644 (file)
@@ -574,6 +574,11 @@ public:
   OstreamTemp clog_error() override { return osd->clog->error(); }
   OstreamTemp clog_warn() override { return osd->clog->warn(); }
 
+  /**
+   * a scrub-map arrived from a replica
+   */
+  void do_replica_scrub_map(OpRequestRef op);
+
   struct watch_disconnect_t {
     uint64_t cookie;
     entity_name_t name;
@@ -912,49 +917,10 @@ protected:
    * Releases locks
    *
    * @param manager [in] manager with locks to release
+   * 
+   * (moved to .cc due to scrubber access)
    */
-  void release_object_locks(
-    ObcLockManager &lock_manager) {
-    std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > to_req;
-    bool requeue_recovery = false;
-    bool requeue_snaptrim = false;
-    lock_manager.put_locks(
-      &to_req,
-      &requeue_recovery,
-      &requeue_snaptrim);
-    if (requeue_recovery)
-      queue_recovery();
-    if (requeue_snaptrim)
-      snap_trimmer_machine.process_event(TrimWriteUnblocked());
-
-    if (!to_req.empty()) {
-      // requeue at front of scrub blocking queue if we are blocked by scrub
-      for (auto &&p: to_req) {
-       if (write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) {
-          for (auto& op : p.second) {
-            op->mark_delayed("waiting for scrub");
-          }
-
-         waiting_for_scrub.splice(
-           waiting_for_scrub.begin(),
-           p.second,
-           p.second.begin(),
-           p.second.end());
-       } else if (is_laggy()) {
-          for (auto& op : p.second) {
-            op->mark_delayed("waiting for readable");
-          }
-         waiting_for_readable.splice(
-           waiting_for_readable.begin(),
-           p.second,
-           p.second.begin(),
-           p.second.end());
-       } else {
-         requeue_ops(p.second);
-       }
-      }
-    }
-  }
+  void release_object_locks(ObcLockManager &lock_manager);
 
   // replica ops
   // [primary|tail]
@@ -1964,9 +1930,7 @@ public:
   void on_removal(ObjectStore::Transaction &t) override;
   void on_shutdown() override;
   bool check_failsafe_full() override;
-  bool maybe_preempt_replica_scrub(const hobject_t& oid) override {
-    return write_blocked_by_scrub(oid);
-  }
+  bool maybe_preempt_replica_scrub(const hobject_t& oid) override;
   int rep_repair_primary_object(const hobject_t& soid, OpContext *ctx);
 
   // attr cache handling
index b6af7e07fc71c2d9eae2710478a695f742eac0f5..bcabad41f73e34a0c1870ffcdf410cb7ac1287a8 100644 (file)
@@ -34,6 +34,20 @@ template <class T> static ostream& _prefix(std::ostream* _dout, T* t)
   return t->gen_prefix(*_dout) << " scrubber pg(" << t->pg_id << ") ";
 }
 
+ostream& operator<<(ostream& out, const scrub_flags_t& sf)
+{
+  if (sf.auto_repair)
+    out << " AUTO_REPAIR";
+  if (sf.check_repair)
+    out << " CHECK_REPAIR";
+  if (sf.deep_scrub_on_error)
+    out << " DEEP_SCRUB_ON_ERROR";
+  if (sf.required)
+    out << " REQ_SCRUB";
+
+  return out;
+}
+
 ostream& operator<<(ostream& out, const requested_scrub_t& sf)
 {
   if (sf.must_repair)
@@ -58,6 +72,1822 @@ ostream& operator<<(ostream& out, const requested_scrub_t& sf)
   return out;
 }
 
+bool PgScrubber::is_event_relevant(epoch_t queued) const
+{
+  return is_primary() && m_pg->is_active() && m_pg->is_clean() && is_scrub_active() &&
+        !was_epoch_changed() && (!queued || !m_pg->pg_has_reset_since(queued));
+}
+
+bool PgScrubber::should_abort_scrub(epoch_t queued) const
+{
+  dout(10) << __func__ << "(): queued:" << queued << " required: " << m_flags.required
+          << " noscrub: " << get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) << " / "
+          << m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB) << dendl;
+
+  if (!is_primary() || !m_pg->is_active() ||
+      (queued && m_pg->pg_has_reset_since(queued))) {
+    return true;
+  }
+
+  if (m_flags.required) {
+    return false;  // not stopping 'required' scrubs for configuration changes
+  }
+
+  if (state_test(PG_STATE_DEEP_SCRUB)) {
+    if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+       m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
+      dout(10) << "nodeep_scrub set, aborting" << dendl;
+      return true;
+    }
+  } else if (state_test(PG_STATE_SCRUBBING)) {
+    if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+       m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) {
+      dout(10) << "noscrub set, aborting" << dendl;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+void PgScrubber::send_start_scrub()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  if (should_abort_scrub(epoch_t(0))) {
+    dout(10) << __func__ << " aborting!" << dendl;
+    scrub_clear_state(false);
+  } else {
+    m_fsm->my_states();
+    m_fsm->process_event(StartScrub{});
+  }
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_start_after_repair()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  m_fsm->my_states();
+  m_fsm->process_event(AfterRepairScrub{});
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_scrub_unblock()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  if (should_abort_scrub(epoch_t(0))) {
+
+    dout(10) << __func__ << " aborting!" << dendl;
+    scrub_clear_state(false);
+
+  } else if (is_scrub_active()) {
+
+    m_fsm->my_states();
+    m_fsm->process_event(Unblocked{});
+
+  } else {
+    dout(10) << __func__ << " ignored as scrub not active" << dendl;
+  }
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_scrub_resched()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  if (should_abort_scrub(epoch_t(0))) {
+    dout(10) << __func__ << " aborting!" << dendl;
+    scrub_clear_state(false);
+  } else if (is_scrub_active()) {
+    m_fsm->my_states();
+    m_fsm->process_event(InternalSchedScrub{});
+  } else {
+    // no need to send anything
+    dout(10) << __func__ << " event no longer relevant" << dendl;
+  }
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_start_replica()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  m_fsm->my_states();
+  m_fsm->process_event(StartReplica{});
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_sched_replica()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  m_fsm->my_states();
+  m_fsm->process_event(SchedReplica{});         // retest for map availability
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::active_pushes_notification()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  if (should_abort_scrub(epoch_t(0))) {
+    dout(10) << __func__ << " aborting!" << dendl;
+    scrub_clear_state(false);
+  } else {
+    m_fsm->my_states();
+    m_fsm->process_event(ActivePushesUpd{});
+  }
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::update_applied_notification(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << "() epoch: " << epoch_queued << dendl;
+  if (should_abort_scrub(epoch_queued)) {
+    dout(10) << __func__ << " aborting!" << dendl;
+    scrub_clear_state(false);
+  } else {
+    m_fsm->my_states();
+    m_fsm->process_event(UpdatesApplied{});
+  }
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::digest_update_notification()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  m_fsm->my_states();
+  if (is_event_relevant(epoch_t(0))) {
+    m_fsm->process_event(DigestUpdate{});
+  } else {
+    // no need to send anything
+    dout(10) << __func__ << " event no longer relevant" << dendl;
+  }
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_epoch_changed()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  if (is_scrub_active()) {
+    m_fsm->my_states();
+    m_fsm->process_event(EpochChanged{});
+  }
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_replica_maps_ready()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  m_fsm->my_states();
+  if (is_scrub_active()) {
+    m_fsm->process_event(GotReplicas{});
+  }
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_replica_pushes_upd()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  m_fsm->my_states();
+  if (is_scrub_active()) {
+    m_fsm->process_event(ReplicaPushesUpd{});
+  }
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_remotes_reserved()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  m_fsm->my_states();
+  m_fsm->process_event(RemotesReserved{});  // note: too early to check for 'active'!
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_reservation_failure()
+{
+  dout(10) << "scrubber event -->> " << __func__ << dendl;
+  m_fsm->my_states();
+  m_fsm->process_event(ReservationFailure{});  // do not check for 'active'!
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+bool PgScrubber::is_scrub_active() const
+{
+  dout(10) << " " << __func__ << " actv? " << m_active << "pg:" << m_pg->pg_id << dendl;
+  return m_active;
+}
+
+bool PgScrubber::is_reserving() const
+{
+  return m_fsm->is_reserving();
+}
+
+void PgScrubber::reset_epoch(epoch_t epoch_queued)
+{
+  dout(10) << __func__ << " PG( " << m_pg->pg_id
+          << (m_pg->is_primary() ? ") prm" : ") rpl") << " epoch: " << epoch_queued
+          << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl;
+
+  dout(10) << __func__ << " STATE_SCRUBBING? " << state_test(PG_STATE_SCRUBBING) << dendl;
+  m_epoch_queued = epoch_queued;
+  m_needs_sleep = true;
+
+  m_fsm->assert_not_active();
+
+  m_is_deep = state_test(PG_STATE_DEEP_SCRUB);
+}
+
+unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
+{
+  unsigned int qu_priority = m_flags.priority;
+
+  if (with_priority == Scrub::scrub_prio_t::high_priority) {
+    qu_priority =
+      std::max(qu_priority, (unsigned int)m_pg->cct->_conf->osd_client_op_priority);
+  }
+  return qu_priority;
+}
+
+unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+                                               unsigned int suggested_priority) const
+{
+  if (with_priority == Scrub::scrub_prio_t::high_priority) {
+    suggested_priority = std::max(suggested_priority,
+                                 (unsigned int)m_pg->cct->_conf->osd_client_op_priority);
+  }
+  return suggested_priority;
+}
+
+// ///////////////////////////////////////////////////////////////////// //
+// scrub op registration handling
+
+bool PgScrubber::is_scrub_registered() const
+{
+  return !m_scrub_reg_stamp.is_zero();
+}
+
+void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags)
+{
+  if (!is_primary()) {
+    dout(20) << __func__ << ": not a primary!" << dendl;
+    return;
+  }
+
+  dout(10) << __func__ << " planned.m.s: " << request_flags.must_scrub
+          << ": planned.n.a.: " << request_flags.need_auto
+          << " stamp: " << m_pg->info.history.last_scrub_stamp << dendl;
+
+  ceph_assert(!is_scrub_registered());
+
+  utime_t reg_stamp;
+  bool must = false;
+
+  if (request_flags.must_scrub || request_flags.need_auto) {
+    // Set the smallest time that isn't utime_t()
+    reg_stamp = PgScrubber::scrub_must_stamp();
+    must = true;
+  } else if (m_pg->info.stats.stats_invalid &&
+            m_pg->cct->_conf->osd_scrub_invalid_stats) {
+    reg_stamp = ceph_clock_now();
+    must = true;
+  } else {
+    reg_stamp = m_pg->info.history.last_scrub_stamp;
+  }
+
+  dout(9) << __func__ << " pg(" << m_pg_id << ") must: " << must
+         << " required:" << m_flags.required << " flags: " << request_flags
+         << " stamp: " << reg_stamp << dendl;
+
+  // note down the sched_time, so we can locate this scrub, and remove it
+  // later on.
+  double scrub_min_interval = 0;
+  double scrub_max_interval = 0;
+  m_pg->pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
+  m_pg->pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
+
+  m_scrub_reg_stamp = m_osds->reg_pg_scrub(m_pg->info.pgid, reg_stamp, scrub_min_interval,
+                                          scrub_max_interval, must);
+  dout(15) << __func__ << " pg(" << m_pg_id << ") register next scrub, scrub time "
+          << m_scrub_reg_stamp << ", must = " << (int)must << dendl;
+}
+
+void PgScrubber::unreg_next_scrub()
+{
+  if (is_scrub_registered()) {
+    m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp);
+    m_scrub_reg_stamp = utime_t{};
+  }
+}
+
+/// debug/development temporary code:
+void PgScrubber::debug_dump_reservations(std::string_view header_txt) const
+{
+  std::string format;
+  auto f = Formatter::create(format, "json-pretty", "json-pretty");
+  m_osds->dump_scrub_reservations(f);
+  std::stringstream o;
+  f->flush(o);
+  dout(20) << header_txt << o.str() << dendl;
+  delete f;
+}
+
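
An aside on the Formatter handling above: Formatter::create() returns an owning raw pointer, hence the explicit delete at the end. For a debug-only helper this is fine, but the same dump could be written leak-safe with a smart pointer — a sketch using the same calls as the function above, nothing more:

    std::unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
    m_osds->dump_scrub_reservations(f.get());
    std::stringstream o;
    f->flush(o);
    dout(20) << header_txt << o.str() << dendl;
    // no explicit delete: the Formatter is released on scope exit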
+void PgScrubber::scrub_requested(scrub_level_t scrub_level,
+                                scrub_type_t scrub_type,
+                                requested_scrub_t& req_flags)
+{
+  dout(10) << __func__ << (scrub_level == scrub_level_t::deep ? " deep " : " shallow ")
+          << (scrub_type == scrub_type_t::do_repair ? " repair-scrub " : " not-repair ")
+          << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered()
+          << dendl;
+
+  debug_dump_reservations(" before_unreg ");
+
+  unreg_next_scrub();
+
+  req_flags.must_scrub = true;
+  req_flags.must_deep_scrub =
+    (scrub_level == scrub_level_t::deep) || (scrub_type == scrub_type_t::do_repair);
+  req_flags.must_repair = (scrub_type == scrub_type_t::do_repair);
+  // User might intervene, so clear this
+  req_flags.need_auto = false;
+  req_flags.req_scrub = true;
+
+  dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl;
+  debug_dump_reservations(" before_reg ");
+
+  reg_next_scrub(req_flags);
+
+  debug_dump_reservations(" after_reg ");
+}
+
+void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags)
+{
+  dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << " ## "
+          << is_scrub_registered() << dendl;
+  debug_dump_reservations(" auto-scrub before ");
+
+  unreg_next_scrub();
+  req_flags.need_auto = true;
+  reg_next_scrub(req_flags);
+
+  debug_dump_reservations(" auto-scrub after ");
+}
+
+bool PgScrubber::reserve_local()
+{
+  // try to create the reservation object (which translates into asking the
+  // OSD for the local scrub resource). If failing - undo it immediately
+
+  m_local_osd_resource.emplace(m_pg, m_osds);
+  if (!m_local_osd_resource->is_reserved()) {
+    m_local_osd_resource.reset();
+    return false;
+  }
+
+  return true;
+}
+
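
The emplace()/reset() pair in reserve_local() (and again in handle_scrub_reserve_request() further down) is an RAII-style reservation: constructing the optional's payload asks the OSD for the slot, destroying it returns the slot, so a failed attempt never holds anything. A toy model of the idiom — ToyReservation is hypothetical, not a Ceph type:

    #include <iostream>
    #include <optional>

    // acquires in the constructor, releases in the destructor -
    // the same shape as the scrubber's reservation objects
    struct ToyReservation {
      bool granted;
      explicit ToyReservation(bool available) : granted(available) {
        if (granted) std::cout << "slot taken\n";
      }
      ~ToyReservation() {
        if (granted) std::cout << "slot returned\n";
      }
      bool is_reserved() const { return granted; }
    };

    int main() {
      std::optional<ToyReservation> slot;
      slot.emplace(false);     // ask for the slot; assume the OSD refuses
      if (!slot->is_reserved())
        slot.reset();          // undo immediately - nothing stays held
      std::cout << "reserved? " << slot.has_value() << '\n';  // prints 0
    }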
+// ----------------------------------------------------------------------------
+
+bool PgScrubber::has_pg_marked_new_updates() const
+{
+  auto last_applied = m_pg->recovery_state.get_last_update_applied();
+  dout(10) << __func__ << " recovery last: " << last_applied
+          << " vs. scrub's: " << m_subset_last_update << dendl;
+
+  return last_applied >= m_subset_last_update;
+}
+
+void PgScrubber::set_subset_last_update(eversion_t e)
+{
+  m_subset_last_update = e;
+}
+
+/*
+ * setting:
+ * - m_subset_last_update
+ * - m_max_end
+ * - m_end
+ * - m_start
+ * By:
+ * - setting tentative range based on conf and divisor
+ * - requesting a partial list of elements from the backend;
+ * - handling some head/clones issues
+ * - ...
+ *
+ * The selected range is set directly into 'm_start' and 'm_end'
+ */
+bool PgScrubber::select_range()
+{
+  m_primary_scrubmap = ScrubMap{};
+  m_received_maps.clear();
+
+  /* get the start and end of our scrub chunk
+   *
+   * Our scrub chunk has an important restriction we're going to need to
+   * respect. We can't let head be start or end.
+   * Using a half-open interval means that if end == head,
+   * we'd scrub/lock head and the clone right next to head in different
+   * chunks which would allow us to miss clones created between
+   * scrubbing that chunk and scrubbing the chunk including head.
+   * This isn't true for any of the other clones since clones can
+   * only be created "just to the left of" head.  There is one exception
+   * to this: promotion of clones which always happens to the left of the
+   * left-most clone, but promote_object checks the scrubber in that
+   * case, so it should be ok.  Also, it's ok to "miss" clones at the
+   * left end of the range if we are a tier because they may legitimately
+   * not exist (see _scrub).
+   */
+  int min_idx = std::max<int64_t>(
+    3, m_pg->get_cct()->_conf->osd_scrub_chunk_min / preemption_data.chunk_divisor());
+
+  int max_idx = std::max<int64_t>(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max /
+                                            preemption_data.chunk_divisor());
+
+  // why mixing 'int' and int64_t? RRR
+
+  dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx
+          << " Div: " << preemption_data.chunk_divisor() << dendl;
+
+  hobject_t start = m_start;
+  hobject_t candidate_end;
+  std::vector<hobject_t> objects;
+  int ret = m_pg->get_pgbackend()->objects_list_partial(start, min_idx, max_idx, &objects,
+                                                       &candidate_end);
+  ceph_assert(ret >= 0);
+
+  if (!objects.empty()) {
+
+    hobject_t back = objects.back();
+    while (candidate_end.is_head() && candidate_end == back.get_head()) {
+      candidate_end = back;
+      objects.pop_back();
+      if (objects.empty()) {
+       ceph_assert(0 ==
+                   "Somehow we got more than 2 objects which "
+                   "have the same head but are not clones");
+      }
+      back = objects.back();
+    }
+
+    if (candidate_end.is_head()) {
+      ceph_assert(candidate_end != back.get_head());
+      candidate_end = candidate_end.get_object_boundary();
+    }
+
+  } else {
+    ceph_assert(candidate_end.is_max());
+  }
+
+  // is that range free for us? if not - we will be rescheduled later by whoever
+  // triggered us this time
+
+  if (!m_pg->_range_available_for_scrub(m_start, candidate_end)) {
+    // we'll be requeued by whatever made us unavailable for scrub
+    dout(10) << __func__ << ": scrub blocked somewhere in range "
+            << "[" << m_start << ", " << candidate_end << ")" << dendl;
+    return false;
+  }
+
+  m_end = candidate_end;
+  if (m_end > m_max_end)
+    m_max_end = m_end;
+
+  dout(15) << __func__ << " range selected: " << m_start << " //// " << m_end << " //// "
+          << m_max_end << dendl;
+  return true;
+}
+
+bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid)
+{
+  if (soid < m_start || soid >= m_end) {
+    return false;
+  }
+
+  dout(10) << __func__ << " " << soid << " can preempt? "
+          << preemption_data.is_preemptable() << dendl;
+  dout(10) << __func__ << " " << soid << " already? " << preemption_data.was_preempted()
+          << dendl;
+
+  if (preemption_data.is_preemptable()) {
+
+    if (!preemption_data.was_preempted()) {
+      dout(10) << __func__ << " " << soid << " preempted" << dendl;
+
+      // signal the preemption
+      preemption_data.do_preempt();
+
+    } else {
+      dout(10) << __func__ << " " << soid << " already preempted" << dendl;
+    }
+    return false;
+  }
+  return true;
+}
+
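
write_blocked_by_scrub() encodes a small decision table: a write outside [m_start, m_end) is never blocked; inside the range, a preemptable scrub is asked to give up the chunk and the write proceeds; only a non-preemptable scrub actually blocks the write. A self-contained toy of that logic (stand-in types, not the Ceph API):

    #include <iostream>

    // stand-in for preemption_data: whether the running scrub may be
    // preempted, and whether it already was
    struct ToyPreemption {
      bool preemptable = true;
      bool preempted = false;
    };

    // mirrors write_blocked_by_scrub(): true means the write must wait
    bool write_blocked(int soid, int start, int end, ToyPreemption& p) {
      if (soid < start || soid >= end)
        return false;        // outside the scrubbed chunk
      if (p.preemptable) {
        p.preempted = true;  // tell the scrubber to abandon the chunk
        return false;        // the write goes ahead
      }
      return true;           // non-preemptable scrub: the write waits
    }

    int main() {
      ToyPreemption p;
      std::cout << write_blocked(5, 0, 10, p)   // 0: preempts the scrub
                << write_blocked(15, 0, 10, p)  // 0: outside the range
                << p.preempted << '\n';         // 1: preemption recorded
    }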
+bool PgScrubber::range_intersects_scrub(const hobject_t& start, const hobject_t& end)
+{
+  // does [start, end] intersect [m_start, m_max_end)?
+  return (start < m_max_end && end >= m_start);
+}
+
+/**
+ *  if we are required to sleep:
+ *     arrange a callback some time later, and
+ *     be sure to be able to identify a stale callback.
+ *  Otherwise: perform a requeue (i.e. - reschedule through the OSD queue)
+ *    anyway.
+ */
+void PgScrubber::add_delayed_scheduling()
+{
+  milliseconds sleep_time{0ms};
+  if (m_needs_sleep) {
+    double scrub_sleep = 1000.0 * m_osds->osd->scrub_sleep_time(m_flags.required);
+    dout(10) << __func__ << " sleep: " << scrub_sleep << dendl;
+    sleep_time = milliseconds{long(scrub_sleep)};
+  }
+  dout(15) << __func__ << " sleep: " << sleep_time.count() << " needed? " << m_needs_sleep
+          << dendl;
+
+  if (sleep_time.count()) {
+    // schedule a transition for some 'sleep_time' ms in the future
+
+    m_needs_sleep = false;
+    m_sleep_started_at = ceph_clock_now();
+
+    // the 'delayer' for crimson is different. Will be factored out.
+
+    spg_t pgid = m_pg->get_pgid();
+    auto callbk = new LambdaContext([osds = m_osds, pgid,
+                                    scrbr = this]([[maybe_unused]] int r) mutable {
+      PGRef pg = osds->osd->lookup_lock_pg(pgid);
+      if (!pg) {
+       lgeneric_subdout(g_ceph_context, osd, 10)
+         << "scrub_requeue_callback: Could not find "
+         << "PG " << pgid << " can't complete scrub requeue after sleep" << dendl;
+       return;
+      }
+      scrbr->m_needs_sleep = true;
+      lgeneric_dout(scrbr->get_pg_cct(), 7)
+       << "scrub_requeue_callback: slept for "
+       << ceph_clock_now() - scrbr->m_sleep_started_at << ", re-queuing scrub" << dendl;
+
+      scrbr->m_sleep_started_at = utime_t{};
+      osds->queue_for_scrub_resched(&(*pg), Scrub::scrub_prio_t::low_priority);
+      pg->unlock();
+    });
+
+    std::lock_guard l(m_osds->sleep_lock);
+    m_osds->sleep_timer.add_event_after(sleep_time.count() / 1000.0f, callbk);
+
+  } else {
+    // just a requeue
+    m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority);
+  }
+}
+
+/**
+ *  walk the log to find the latest update that affects our chunk
+ */
+eversion_t PgScrubber::search_log_for_updates() const
+{
+  auto& projected = m_pg->projected_log.log;
+  auto pi = find_if(
+    projected.crbegin(), projected.crend(),
+    [this](const auto& e) -> bool { return e.soid >= m_start && e.soid < m_end; });
+
+  if (pi != projected.crend())
+    return pi->version;
+
+  // there was no relevant update entry in the log
+
+  auto& log = m_pg->recovery_state.get_pg_log().get_log().log;
+  auto p = find_if(log.crbegin(), log.crend(), [this](const auto& e) -> bool {
+    return e.soid >= m_start && e.soid < m_end;
+  });
+
+  if (p == log.crend())
+    return eversion_t{};
+  else
+    return p->version;
+}
+
+bool PgScrubber::get_replicas_maps(bool replica_can_preempt)
+{
+  dout(10) << __func__ << " epoch_start: " << m_epoch_start
+          << " pg same_interval_since: " << m_pg->info.history.same_interval_since
+          << dendl;
+
+  bool do_have_replicas = false;
+
+  m_primary_scrubmap_pos.reset();
+
+  // ask replicas to scan and send maps
+  for (const auto& i : m_pg->get_acting_recovery_backfill()) {
+
+    if (i == m_pg_whoami)
+      continue;
+
+    do_have_replicas = true;
+    m_maps_status.mark_replica_map_request(i);
+    _request_scrub_map(i, m_subset_last_update, m_start, m_end, m_is_deep,
+                      replica_can_preempt);
+  }
+
+  dout(10) << __func__ << " awaiting" << m_maps_status << dendl;
+  return do_have_replicas;
+}
+
+bool PgScrubber::was_epoch_changed() const
+{
+  // for crimson we have m_pg->get_info().history.same_interval_since
+  dout(10) << __func__ << " epoch_start: " << m_epoch_start
+          << " from pg: " << m_pg->get_history().same_interval_since << dendl;
+
+  return m_epoch_start < m_pg->get_history().same_interval_since;
+}
+
+void PgScrubber::mark_local_map_ready()
+{
+  m_maps_status.mark_local_map_ready();
+}
+
+bool PgScrubber::are_all_maps_available() const
+{
+  return m_maps_status.are_all_maps_available();
+}
+
+std::string PgScrubber::dump_awaited_maps() const
+{
+  return m_maps_status.dump();
+}
+
+void PgScrubber::_request_scrub_map(pg_shard_t replica,
+                                   eversion_t version,
+                                   hobject_t start,
+                                   hobject_t end,
+                                   bool deep,
+                                   bool allow_preemption)
+{
+  ceph_assert(replica != m_pg_whoami);
+  dout(10) << __func__ << " scrubmap from osd." << replica
+          << (deep ? " deep" : " shallow") << dendl;
+
+  auto repscrubop = new MOSDRepScrub(
+    spg_t(m_pg->info.pgid.pgid, replica.shard), version, m_pg->get_osdmap_epoch(),
+    m_pg->get_last_peering_reset(), start, end, deep, allow_preemption, m_flags.priority,
+    m_pg->ops_blocked_by_scrub());
+
+  // default priority. We want the replica-scrub processed prior to any recovery
+  // or client io messages (we are holding a lock!)
+  m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch());
+}
+
+void PgScrubber::cleanup_store(ObjectStore::Transaction* t)
+{
+  if (!m_store)
+    return;
+
+  struct OnComplete : Context {
+    std::unique_ptr<Scrub::Store> store;
+    explicit OnComplete(std::unique_ptr<Scrub::Store>&& store) : store(std::move(store))
+    {}
+    void finish(int) override {}
+  };
+  m_store->cleanup(t);
+  t->register_on_complete(new OnComplete(std::move(m_store)));
+  ceph_assert(!m_store);
+}
+
+void PgScrubber::on_init()
+{
+  // going upwards from 'inactive'
+  ceph_assert(!is_scrub_active());
+
+  preemption_data.reset();
+  m_pg->publish_stats_to_osd();
+  m_epoch_start = m_pg->get_history().same_interval_since;
+
+  dout(10) << __func__ << " start same_interval:" << m_epoch_start << dendl;
+
+  //  create a new store
+  {
+    ObjectStore::Transaction t;
+    cleanup_store(&t);
+    m_store.reset(
+      Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll));
+    m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+  }
+
+  m_start = m_pg->info.pgid.pgid.get_hobj_start();
+  m_active = true;
+}
+
+void PgScrubber::on_replica_init()
+{
+  ceph_assert(!m_active);
+  m_active = true;
+}
+
+void PgScrubber::_scan_snaps(ScrubMap& smap)
+{
+  hobject_t head;
+  SnapSet snapset;
+
+  // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
+  // caller using clean_meta_map(), and it works properly.
+  dout(15) << __func__ << " starts" << dendl;
+
+  for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) {
+
+    const hobject_t& hoid = i->first;
+    ScrubMap::object& o = i->second;
+
+    dout(20) << __func__ << " " << hoid << dendl;
+
+    ceph_assert(!hoid.is_snapdir());
+    if (hoid.is_head()) {
+      // parse the SnapSet
+      bufferlist bl;
+      if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
+       continue;
+      }
+      bl.push_back(o.attrs[SS_ATTR]);
+      auto p = bl.cbegin();
+      try {
+       decode(snapset, p);
+      } catch (...) {
+       continue;
+      }
+      head = hoid.get_head();
+      continue;
+    }
+
+    if (hoid.snap < CEPH_MAXSNAP) {
+      // check and if necessary fix snap_mapper
+      if (hoid.get_head() != head) {
+       derr << __func__ << " no head for " << hoid << " (have " << head << ")" << dendl;
+       continue;
+      }
+      set<snapid_t> obj_snaps;
+      auto p = snapset.clone_snaps.find(hoid.snap);
+      if (p == snapset.clone_snaps.end()) {
+       derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset << dendl;
+       continue;
+      }
+      obj_snaps.insert(p->second.begin(), p->second.end());
+      set<snapid_t> cur_snaps;
+      int r = m_pg->snap_mapper.get_snaps(hoid, &cur_snaps);
+      if (r != 0 && r != -ENOENT) {
+       derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
+       ceph_abort();
+      }
+      if (r == -ENOENT || cur_snaps != obj_snaps) {
+       ObjectStore::Transaction t;
+       OSDriver::OSTransaction _t(m_pg->osdriver.get_transaction(&t));
+       if (r == 0) {
+         r = m_pg->snap_mapper.remove_oid(hoid, &_t);
+         if (r != 0) {
+           derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
+           ceph_abort();
+         }
+         m_pg->osd->clog->error()
+           << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
+           << m_pg->info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps
+           << ", oi: " << obj_snaps << "...repaired";
+       } else {
+         m_pg->osd->clog->error()
+           << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
+           << m_pg->info.pgid << " oid " << hoid << " snaps missing in mapper"
+           << ", should be: " << obj_snaps << " was " << cur_snaps << " r " << r
+           << "...repaired";
+       }
+       m_pg->snap_mapper.add_oid(hoid, obj_snaps, &_t);
+
+       // wait for repair to apply to avoid confusing other bits of the system.
+       {
+         dout(15) << __func__ << " wait on repair!" << dendl;
+
+         ceph::condition_variable my_cond;
+         ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock");
+         int e = 0;
+         bool done;
+
+         t.register_on_applied_sync(new C_SafeCond(my_lock, my_cond, &done, &e));
+
+         e = m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t));
+         if (e != 0) {
+           derr << __func__ << ": queue_transaction got " << cpp_strerror(e) << dendl;
+         } else {
+           std::unique_lock l{my_lock};
+           my_cond.wait(l, [&done] { return done; });
+         }
+       }
+      }
+    }
+  }
+}
+
+int PgScrubber::build_primary_map_chunk()
+{
+  auto ret = build_scrub_map_chunk(m_primary_scrubmap, m_primary_scrubmap_pos, m_start,
+                                  m_end, m_is_deep);
+
+  if (ret == -EINPROGRESS)
+    m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority);
+
+  return ret;
+}
+
+int PgScrubber::build_replica_map_chunk()
+{
+  dout(10) << __func__ << " epoch start: " << m_epoch_start << " ep q: " << m_epoch_queued
+          << dendl;
+  dout(10) << __func__ << " deep: " << m_is_deep << dendl;
+
+  auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end,
+                                  m_is_deep);
+
+  if (ret == 0) {
+
+    // finished!
+    // In case we restarted smaller chunk, clear old data
+
+    ScrubMap for_meta_scrub;
+    m_cleaned_meta_map.clear_from(m_start);
+    m_cleaned_meta_map.insert(replica_scrubmap);
+    clean_meta_map(for_meta_scrub);
+    _scan_snaps(for_meta_scrub);
+  }
+
+  // previous version used low priority here. Now switched to using the priority
+  // of the original message
+  if (ret == -EINPROGRESS)
+    requeue_replica(m_replica_request_priority);
+
+  return ret;
+}
+
+int PgScrubber::build_scrub_map_chunk(
+  ScrubMap& map, ScrubMapBuilder& pos, hobject_t start, hobject_t end, bool deep)
+{
+  dout(10) << __func__ << " [" << start << "," << end << ") "
+          << " pos " << pos << " Deep: " << deep << dendl;
+
+  // start
+  while (pos.empty()) {
+
+    pos.deep = deep;
+    map.valid_through = m_pg->info.last_update;
+
+    // objects
+    vector<ghobject_t> rollback_obs;
+    pos.ret =
+      m_pg->get_pgbackend()->objects_list_range(start, end, &pos.ls, &rollback_obs);
+    dout(10) << __func__ << " while pos empty " << pos.ret << dendl;
+    if (pos.ret < 0) {
+      dout(5) << "objects_list_range error: " << pos.ret << dendl;
+      return pos.ret;
+    }
+    dout(10) << __func__ << " pos.ls.empty()? " << (pos.ls.empty() ? "+" : "-") << dendl;
+    if (pos.ls.empty()) {
+      break;
+    }
+    m_pg->_scan_rollback_obs(rollback_obs);
+    pos.pos = 0;
+    return -EINPROGRESS;
+  }
+
+  // scan objects
+  while (!pos.done()) {
+    int r = m_pg->get_pgbackend()->be_scan_list(map, pos);
+    dout(10) << __func__ << " be r " << r << dendl;
+    if (r == -EINPROGRESS) {
+      dout(8 /*20*/) << __func__ << " in progress" << dendl;
+      return r;
+    }
+  }
+
+  // finish
+  dout(8 /*20*/) << __func__ << " finishing" << dendl;
+  ceph_assert(pos.done());
+  m_pg->_repair_oinfo_oid(map);
+
+  dout(8 /*20*/) << __func__ << " done, got " << map.objects.size() << " items" << dendl;
+  return 0;
+}
+
+/**
+ * Split the accumulated metadata map: entries that can no longer gain
+ * clones from later chunks are moved into 'for_meta_scrub', while the
+ * trailing group of objects that might still share a head with not-yet-
+ * scanned objects is retained in m_cleaned_meta_map.
+ *
+ * @param for_meta_scrub [out] receives the entries to be scrubbed now
+ */
+void PgScrubber::clean_meta_map(ScrubMap& for_meta_scrub)
+{
+  if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) {
+    m_cleaned_meta_map.swap(for_meta_scrub);
+  } else {
+    auto iter = m_cleaned_meta_map.objects.end();
+    --iter;  // not empty, see 'if' clause
+    auto begin = m_cleaned_meta_map.objects.begin();
+    if (iter->first.has_snapset()) {
+      ++iter;
+    } else {
+      while (iter != begin) {
+       auto next = iter--;
+       if (next->first.get_head() != iter->first.get_head()) {
+         ++iter;
+         break;
+       }
+      }
+    }
+    for_meta_scrub.objects.insert(begin, iter);
+    m_cleaned_meta_map.objects.erase(begin, iter);
+  }
+}
+
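
A worked example of clean_meta_map()'s boundary rule (objects are ordered with clones before their head). Suppose m_cleaned_meta_map holds, in order:

    c1 (clone of X), c2 (clone of X), X (head), d1 (clone of Y)

d1's head group may still gain members from the next chunk, so only c1, c2 and X are moved into for_meta_scrub while d1 stays behind. Had the last entry been a head (has_snapset() returning true), or had m_end been the maximal object, the whole map would have been handed over.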
+void PgScrubber::run_callbacks()
+{
+  std::list<Context*> to_run;
+  to_run.swap(m_callbacks);
+
+  for (auto& tr : to_run) {
+    tr->complete(0);
+  }
+}
+
+void PgScrubber::maps_compare_n_cleanup()
+{
+  scrub_compare_maps();
+  m_start = m_end;
+  run_callbacks();
+  requeue_waiting();
+}
+
+Scrub::preemption_t* PgScrubber::get_preemptor()
+{
+  return &preemption_data;
+}
+
+void PgScrubber::requeue_replica(Scrub::scrub_prio_t is_high_priority)
+{
+  dout(10) << __func__ << dendl;
+  m_osds->queue_for_rep_scrub_resched(m_pg, is_high_priority, m_flags.priority);
+}
+
+/*
+ * Process note: called for the arriving "give me your map, replica!" request. Unlike
+ * the original implementation, we do not requeue the Op waiting for
+ * updates. Instead - we trigger the FSM.
+ */
+void PgScrubber::replica_scrub_op(OpRequestRef op)
+{
+  auto msg = op->get_req<MOSDRepScrub>();
+  dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch
+          << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl;
+
+  if (msg->map_epoch < m_pg->info.history.same_interval_since) {
+    dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch
+            << " < " << m_pg->info.history.same_interval_since << dendl;
+    return;
+  }
+
+  replica_scrubmap = ScrubMap{};
+  replica_scrubmap_pos = ScrubMapBuilder{};
+
+  // m_replica_epoch_start is overwritten if requeued waiting for active pushes
+  m_replica_epoch_start = m_pg->info.history.same_interval_since;
+  m_replica_min_epoch = msg->min_epoch;
+  m_start = msg->start;
+  m_end = msg->end;
+  m_max_end = msg->end;
+  m_is_deep = msg->deep;
+  m_epoch_start = m_pg->info.history.same_interval_since;
+  m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority
+                                                 : Scrub::scrub_prio_t::low_priority;
+  m_flags.priority = msg->priority ? msg->priority : m_pg->get_scrub_priority();
+
+  preemption_data.reset();
+  preemption_data.force_preemptability(msg->allow_preemption);
+
+  replica_scrubmap_pos.reset();
+
+  // make sure the FSM is at NotActive
+  m_fsm->assert_not_active();
+
+  m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority);
+}
+
+void PgScrubber::replica_scrub(epoch_t epoch_queued)
+{
+  dout(10) << __func__ << ": " << m_pg->pg_id << " epoch queued: " << epoch_queued
+          << dendl;
+  dout(20) << __func__ << " m_epoch_start: " << m_epoch_start
+          << " better be >= " << m_pg->info.history.same_interval_since << dendl;
+  dout(20) << __func__ << " m_is_deep: " << m_is_deep << dendl;
+
+  if (m_pg->pg_has_reset_since(epoch_queued)) {
+    dout(10) << "replica_scrub(epoch,) - reset!" << dendl;
+    send_epoch_changed();
+    return;
+  }
+
+  if (was_epoch_changed()) {
+    dout(10) << "replica_scrub(epoch,) - epoch!" << dendl;
+    send_epoch_changed();
+    return;
+  }
+  ceph_assert(!is_primary());  // as should have been caught by the epoch-changed check
+
+  send_start_replica();
+}
+
+void PgScrubber::replica_scrub_resched(epoch_t epoch_queued)
+{
+  dout(10) << __func__ << ": " << m_pg->pg_id << " epoch queued: " << epoch_queued
+          << dendl;
+
+  if (m_pg->pg_has_reset_since(epoch_queued)) {
+    dout(10) << "replica_scrub(epoch,) - reset!" << dendl;
+    send_epoch_changed();
+    return;
+  }
+
+  if (was_epoch_changed()) {
+    dout(10) << __func__ << " epoch changed!" << dendl;
+    send_epoch_changed();
+    return;
+  }
+  ceph_assert(!is_primary());  // as should have been caught by the epoch-changed check
+
+  send_sched_replica();
+}
+
+void PgScrubber::set_op_parameters(requested_scrub_t& request)
+{
+  dout(10) << __func__ << " input: " << request << dendl;
+
+  m_flags.check_repair = request.check_repair;
+  m_flags.auto_repair = request.auto_repair || request.need_auto;
+  m_flags.required = request.req_scrub || request.must_scrub;
+
+  m_flags.priority = (request.must_scrub || request.need_auto)
+                      ? get_pg_cct()->_conf->osd_requested_scrub_priority
+                      : m_pg->get_scrub_priority();
+
+  state_set(PG_STATE_SCRUBBING);
+
+  // will we be deep-scrubbing?
+  if (request.must_deep_scrub || request.need_auto || request.time_for_deep) {
+    state_set(PG_STATE_DEEP_SCRUB);
+  }
+
+  if (request.must_repair || m_flags.auto_repair) {
+    state_set(PG_STATE_REPAIR);
+  }
+
+  // the publishing here seems to be required for tests synchronization
+  m_pg->publish_stats_to_osd();
+  m_flags.deep_scrub_on_error = request.deep_scrub_on_error;
+  request = requested_scrub_t{};
+}
+
+/**
+ *  RRR \todo ask why we collect from acting+recovery+backfill, but use the size of
+ *  only the acting set
+ */
+void PgScrubber::scrub_compare_maps()
+{
+  dout(10) << __func__ << " has maps, analyzing" << dendl;
+
+  // construct authoritative scrub map for type-specific scrubbing
+  m_cleaned_meta_map.insert(m_primary_scrubmap);
+  map<hobject_t, pair<std::optional<uint32_t>, std::optional<uint32_t>>> missing_digest;
+
+  map<pg_shard_t, ScrubMap*> maps;
+  maps[m_pg_whoami] = &m_primary_scrubmap;
+
+  for (const auto& i : m_pg->get_acting_recovery_backfill()) {
+    if (i == m_pg_whoami)
+      continue;
+    dout(2) << __func__ << " replica " << i << " has "
+           << m_received_maps[i].objects.size() << " items" << dendl;
+    maps[i] = &m_received_maps[i];
+  }
+
+  set<hobject_t> master_set;
+
+  // Construct master set
+  for (const auto& map : maps) {
+    for (const auto& i : map.second->objects) {
+      master_set.insert(i.first);
+    }
+  }
+
+  stringstream ss;
+  m_pg->get_pgbackend()->be_omap_checks(maps, master_set, m_omap_stats, ss);
+
+  if (!ss.str().empty()) {
+    m_osds->clog->warn(ss);
+  }
+
+  if (m_pg->recovery_state.get_acting().size() > 1) {
+
+    // RRR add a comment here
+
+    dout(10) << __func__ << "  comparing replica scrub maps" << dendl;
+
+    // Map from object with errors to good peer
+    map<hobject_t, list<pg_shard_t>> authoritative;
+
+    dout(2) << __func__ << m_pg->get_primary() << " has "
+           << m_primary_scrubmap.objects.size() << " items" << dendl;
+
+    ss.str("");
+    ss.clear();
+
+    m_pg->get_pgbackend()->be_compare_scrubmaps(
+      maps, master_set, state_test(PG_STATE_REPAIR), m_missing, m_inconsistent,
+      authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(),
+      m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss);
+    dout(2) << ss.str() << dendl;
+
+    if (!ss.str().empty()) {
+      m_osds->clog->error(ss);
+    }
+
+    for (auto& i : authoritative) {
+      list<pair<ScrubMap::object, pg_shard_t>> good_peers;
+      for (list<pg_shard_t>::const_iterator j = i.second.begin(); j != i.second.end();
+          ++j) {
+       good_peers.emplace_back(maps[*j]->objects[i.first], *j);
+      }
+      m_authoritative.emplace(i.first, good_peers);
+    }
+
+    for (auto i = authoritative.begin(); i != authoritative.end(); ++i) {
+      m_cleaned_meta_map.objects.erase(i->first);
+      m_cleaned_meta_map.objects.insert(
+       *(maps[i->second.back()]->objects.find(i->first)));
+    }
+  }
+
+  ScrubMap for_meta_scrub;
+  clean_meta_map(for_meta_scrub);
+
+  // ok, do the pg-type specific scrubbing
+
+  // (Validates consistency of the object info and snap sets)
+  scrub_snapshot_metadata(for_meta_scrub, missing_digest);
+
+  // Called here on the primary. Can use an authoritative map if it isn't the primary.
+  _scan_snaps(for_meta_scrub);
+
+  if (!m_store->empty()) {
+
+    if (state_test(PG_STATE_REPAIR)) {
+      dout(10) << __func__ << ": discarding scrub results" << dendl;
+      m_store->flush(nullptr);
+    } else {
+      dout(10) << __func__ << ": updating scrub object" << dendl;
+      ObjectStore::Transaction t;
+      m_store->flush(&t);
+      m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+    }
+  }
+}
+
+void PgScrubber::replica_update_start_epoch()
+{
+  dout(10) << __func__ << " start:" << m_pg->info.history.same_interval_since << dendl;
+  m_replica_epoch_start = m_pg->info.history.same_interval_since;
+}
+
+/**
+ * Send the requested map back to the primary (or - if we
+ * were preempted - let the primary know).
+ */
+void PgScrubber::send_replica_map(bool was_preempted)
+{
+  dout(10) << __func__ << " min epoch:" << m_replica_min_epoch
+          << " epoch_start:" << m_replica_epoch_start << dendl;
+
+  auto reply = new MOSDRepScrubMap(spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard),
+                                  m_replica_min_epoch, m_pg_whoami);
+
+  reply->preempted = was_preempted;
+  ::encode(replica_scrubmap, reply->get_data());
+
+  m_osds->send_message_osd_cluster(m_pg->get_primary().osd, reply, m_replica_min_epoch);
+}
+
+/**
+ *  - if the replica lets us know it was interrupted, we mark the chunk as interrupted.
+ *    The state-machine will react to that when all replica maps are received.
+ *  - when all maps are received, we signal the FSM with the GotReplicas event (see
+ *    scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the
+ *    FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to
+ *    handle (well - the incoming message is marked for fast dispatching, which is an
+ *    even better reason for handling it via the queue).
+ */
+void PgScrubber::map_from_replica(OpRequestRef op)
+{
+  auto m = op->get_req<MOSDRepScrubMap>();
+  dout(15) << __func__ << " " << *m << dendl;
+
+  if (m->map_epoch < m_pg->info.history.same_interval_since) {
+    dout(10) << __func__ << " discarding old from " << m->map_epoch << " < "
+            << m_pg->info.history.same_interval_since << dendl;
+    return;
+  }
+
+  auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
+
+  m_received_maps[m->from].decode(p, m_pg->info.pgid.pool());
+  dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl;
+
+  [[maybe_unused]] auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from);
+  ceph_assert(is_ok);  // and not an error message, following the original code
+
+  if (m->preempted) {
+    dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
+    ceph_assert(preemption_data.is_preemptable());  // otherwise - how dare the replica!
+    preemption_data.do_preempt();
+  }
+
+  if (m_maps_status.are_all_maps_available()) {
+    dout(10) << __func__ << " osd-queuing GotReplicas" << dendl;
+    m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops());
+  }
+}
+
+/**
+ *  we are a replica being asked by the Primary to reserve OSD resources for
+ * scrubbing
+ */
+void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
+{
+  dout(10) << __func__ << " " << *op->get_req() << dendl;
+  op->mark_started();
+
+  if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_reserved()) {
+    dout(10) << __func__ << " ignoring reserve request: Already reserved" << dendl;
+    return;
+  }
+
+  bool granted{false};
+
+  if (m_pg->cct->_conf->osd_scrub_during_recovery || !m_osds->is_recovery_active()) {
+
+    m_remote_osd_resource.emplace(m_pg, m_osds);
+    // OSD resources allocated?
+    granted = m_remote_osd_resource->is_reserved();
+    if (!granted) {
+      // just forget it
+      m_remote_osd_resource.reset();
+      dout(20) << __func__ << ": failed to reserve remotely" << dendl;
+    }
+  }
+
+  dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
+
+  auto m = op->get_req<MOSDScrubReserve>();
+  Message* reply = new MOSDScrubReserve(
+    spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), m->map_epoch,
+    granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami);
+
+  m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
+}
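
The reservation object emplaced above is RAII-style: construction attempts to take a slot from the OSD's scrub budget, and destruction (via optional::reset()) returns it. Here is a rough sketch of that pattern, assuming a simple counter-based budget; ScrubBudget and ReservedForPrimary are illustrative names, not the real Ceph types.

#include <iostream>
#include <optional>

// hypothetical per-OSD budget of concurrent scrub reservations
struct ScrubBudget {
  int in_use{0};
  int limit{1};
  bool try_take() { return (in_use < limit) ? (++in_use, true) : false; }
  void put() { --in_use; }
};

// RAII: the constructor tries to reserve; the destructor releases if needed
class ReservedForPrimary {
  ScrubBudget& m_budget;
  bool m_reserved;
 public:
  explicit ReservedForPrimary(ScrubBudget& b) : m_budget{b}, m_reserved{b.try_take()} {}
  ReservedForPrimary(const ReservedForPrimary&) = delete;
  ~ReservedForPrimary() { if (m_reserved) m_budget.put(); }
  bool is_reserved() const { return m_reserved; }
};

int main()
{
  ScrubBudget budget;
  std::optional<ReservedForPrimary> slot;

  slot.emplace(budget);                // mirrors m_remote_osd_resource.emplace(...)
  bool granted = slot->is_reserved();  // were the resources actually taken?
  if (!granted)
    slot.reset();                      // "just forget it"
  std::cout << "granted: " << std::boolalpha << granted << '\n';

  slot.reset();  // release - cf. handle_scrub_reserve_release()
}
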
+
+void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
+{
+  dout(10) << __func__ << " " << *op->get_req() << dendl;
+  op->mark_started();
+
+  if (m_reservations.has_value()) {
+    m_reservations->handle_reserve_grant(op, from);
+  } else {
+    derr << __func__ << ": received a reservation grant, but there is no active"
+           " reservation process. The replica's reservation will be leaked!" << dendl;
+  }
+}
+
+void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
+{
+  dout(10) << __func__ << " " << *op->get_req() << dendl;
+  op->mark_started();
+
+  if (m_reservations.has_value()) {
+    // there is an active reservation process to update. Otherwise - no action
+    // is required.
+    m_reservations->handle_reserve_reject(op, from);
+  }
+}
+
+void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
+{
+  dout(10) << __func__ << " " << *op->get_req() << dendl;
+  op->mark_started();
+  m_remote_osd_resource.reset();
+}
+
+void PgScrubber::clear_scrub_reservations()
+{
+  dout(10) << __func__ << dendl;
+  m_reservations.reset();        // the remote reservations
+  m_local_osd_resource.reset();          // the local reservation
+  m_remote_osd_resource.reset();  // we as replica reserved for a Primary
+}
+
+void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text)
+{
+  ceph_assert(m_pg->recovery_state.get_backfill_targets().empty());
+  // RRR: (the code was copied as-is) why is this check needed here?
+
+  std::vector<std::pair<int, Message*>> messages;
+  messages.reserve(m_pg->get_actingset().size());
+
+  epoch_t epch = get_osdmap_epoch();
+
+  for (auto& p : m_pg->get_actingset()) {
+
+    if (p == m_pg_whoami)
+      continue;
+
+    dout(10) << "scrub requesting " << op_text << " from osd." << p << " Epoch: " << epch
+            << dendl;
+    Message* m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epch, opcode,
+                                     m_pg_whoami);
+    messages.push_back(std::make_pair(p.osd, m));
+  }
+
+  if (!messages.empty()) {
+    m_osds->send_message_osd_cluster(messages, epch);
+  }
+}
+
+void PgScrubber::unreserve_replicas()
+{
+  dout(10) << __func__ << dendl;
+  m_reservations.reset();
+}
+
+[[nodiscard]] bool PgScrubber::scrub_process_inconsistent()
+{
+  dout(10) << __func__ << ": checking authoritative" << dendl;
+
+  bool repair = state_test(PG_STATE_REPAIR);
+  const bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
+  const char* mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
+  dout(20) << __func__ << " deep_scrub: " << deep_scrub << " m_is_deep: " << m_is_deep
+          << " repair: " << repair << dendl;
+
+  // m_authoritative only stores objects that are missing or inconsistent.
+  if (!m_authoritative.empty()) {
+
+    stringstream ss;
+    ss << m_pg->info.pgid << " " << mode << " " << m_missing.size() << " missing, "
+       << m_inconsistent.size() << " inconsistent objects";
+    dout(2) << ss.str() << dendl;
+    m_osds->clog->error(ss);
+
+    if (repair) {
+      state_clear(PG_STATE_CLEAN);
+
+      for (const auto& [hobj, shrd_list] : m_authoritative) {
+
+       auto missing_entry = m_missing.find(hobj);
+
+       if (missing_entry != m_missing.end()) {
+         m_pg->repair_object(hobj, shrd_list, missing_entry->second);
+         m_fixed_count += missing_entry->second.size();
+       }
+
+       if (m_inconsistent.count(hobj)) {
+         m_pg->repair_object(hobj, shrd_list, m_inconsistent[hobj]);
+         m_fixed_count += m_inconsistent[hobj].size();
+       }
+      }
+    }
+  }
+  return (!m_authoritative.empty() && repair);
+}
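
The repair accounting above is simply: for every object with an authoritative copy, count one fix per peer that was missing the object or held an inconsistent shard. A toy restatement of the same tally, with plain strings standing in for hobject_t and pg_shard_t:

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using object_id = std::string;
using peer_id = std::string;

int count_fixes(const std::vector<object_id>& authoritative,
                const std::map<object_id, std::set<peer_id>>& missing,
                const std::map<object_id, std::set<peer_id>>& inconsistent)
{
  int fixed = 0;
  for (const auto& obj : authoritative) {
    // one repair push per peer that is missing the object...
    if (auto it = missing.find(obj); it != missing.end())
      fixed += it->second.size();
    // ...and one per peer holding an inconsistent copy
    if (auto it = inconsistent.find(obj); it != inconsistent.end())
      fixed += it->second.size();
  }
  return fixed;
}

int main()
{
  std::map<object_id, std::set<peer_id>> missing{{"obj-a", {"osd.2"}}};
  std::map<object_id, std::set<peer_id>> inconsistent{{"obj-a", {"osd.3"}},
                                                      {"obj-b", {"osd.2", "osd.3"}}};
  std::cout << count_fixes({"obj-a", "obj-b"}, missing, inconsistent) << '\n';  // 4
}
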
+
+/*
+ * note: only called for the Primary.
+ */
+void PgScrubber::scrub_finish()
+{
+  dout(10) << __func__ << " before flags: " << m_flags
+          << " deep_scrub_on_error: " << m_flags.deep_scrub_on_error << dendl;
+
+  ceph_assert(m_pg->is_locked());
+
+  // if the repair request came from auto-repair and the number of errors exceeds
+  // the configured limit, we cancel the auto-repair
+
+  bool repair = state_test(PG_STATE_REPAIR);
+  if (repair && m_flags.auto_repair &&
+      m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
+
+    dout(10) << __func__ << " undoing the repair" << dendl;
+    state_clear(PG_STATE_REPAIR);
+    repair = false;
+  }
+
+  bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
+  const char* mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
+  bool do_auto_scrub = false;
+
+  // if a regular scrub had errors within the limit, do a deep scrub to auto repair
+  if (m_flags.deep_scrub_on_error && m_authoritative.size() &&
+      m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
+    ceph_assert(!deep_scrub);
+    do_auto_scrub = true;
+    dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl;
+  }
+
+  m_flags.deep_scrub_on_error = false;
+
+  // type-specific finish (can tally more errors)
+  _scrub_finish();
+
+  bool has_error = scrub_process_inconsistent();
+
+  {
+    stringstream oss;
+    oss << m_pg->info.pgid.pgid << " " << mode << " ";
+    int total_errors = m_shallow_errors + m_deep_errors;
+    if (total_errors)
+      oss << total_errors << " errors";
+    else
+      oss << "ok";
+    if (!deep_scrub && m_pg->info.stats.stats.sum.num_deep_scrub_errors)
+      oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors
+         << " remaining deep scrub error details lost)";
+    if (repair)
+      oss << ", " << m_fixed_count << " fixed";
+    if (total_errors)
+      m_osds->clog->error(oss);
+    else
+      m_osds->clog->debug(oss);
+  }
+
+  // Since we don't know which errors were fixed, we can only clear them
+  // when every one has been fixed.
+  if (repair) {
+    if (m_fixed_count == m_shallow_errors + m_deep_errors) {
+
+      ceph_assert(deep_scrub);
+      m_shallow_errors = 0;
+      m_deep_errors = 0;
+      dout(20) << __func__ << " All may be fixed" << dendl;
+
+    } else if (has_error) {
+
+      // Deep scrub in order to get corrected error counts
+      m_pg->scrub_after_recovery = true;
+      m_pg->m_planned_scrub.req_scrub =
+       m_pg->m_planned_scrub.req_scrub || m_flags.required;
+
+      dout(20) << __func__ << " Current 'required': " << m_flags.required
+              << " Planned 'req_scrub': " << m_pg->m_planned_scrub.req_scrub << dendl;
+
+    } else if (m_shallow_errors || m_deep_errors) {
+
+      // We have errors but nothing can be fixed, so there is no repair
+      // possible.
+      state_set(PG_STATE_FAILED_REPAIR);
+      dout(10) << __func__ << " " << (m_shallow_errors + m_deep_errors)
+              << " error(s) present with no repair possible" << dendl;
+    }
+  }
+
+  {
+    // finish up
+    ObjectStore::Transaction t;
+    m_pg->recovery_state.update_stats(
+      [this, deep_scrub](auto& history, auto& stats) {
+       dout(10) << "m_pg->recovery_state.update_stats()" << dendl;
+       utime_t now = ceph_clock_now();
+       history.last_scrub = m_pg->recovery_state.get_info().last_update;
+       history.last_scrub_stamp = now;
+       if (m_is_deep) {
+         history.last_deep_scrub = m_pg->recovery_state.get_info().last_update;
+         history.last_deep_scrub_stamp = now;
+       }
+
+       if (deep_scrub) {
+         if ((m_shallow_errors == 0) && (m_deep_errors == 0))
+           history.last_clean_scrub_stamp = now;
+         stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
+         stats.stats.sum.num_deep_scrub_errors = m_deep_errors;
+         stats.stats.sum.num_large_omap_objects = m_omap_stats.large_omap_objects;
+         stats.stats.sum.num_omap_bytes = m_omap_stats.omap_bytes;
+         stats.stats.sum.num_omap_keys = m_omap_stats.omap_keys;
+         dout(10 /*25*/) << "scrub_finish shard " << m_pg_whoami
+                         << " num_omap_bytes = " << stats.stats.sum.num_omap_bytes
+                         << " num_omap_keys = " << stats.stats.sum.num_omap_keys
+                         << dendl;
+       } else {
+         stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
+         // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
+         // because of deep-scrub errors
+         if (m_shallow_errors == 0)
+           history.last_clean_scrub_stamp = now;
+       }
+       stats.stats.sum.num_scrub_errors = stats.stats.sum.num_shallow_scrub_errors +
+                                          stats.stats.sum.num_deep_scrub_errors;
+       if (m_flags.check_repair) {
+         m_flags.check_repair = false;
+         if (m_pg->info.stats.stats.sum.num_scrub_errors) {
+           state_set(PG_STATE_FAILED_REPAIR);
+           dout(10) << "scrub_finish " << m_pg->info.stats.stats.sum.num_scrub_errors
+                    << " error(s) still present after re-scrub" << dendl;
+         }
+       }
+       return true;
+      },
+      &t);
+    int tr = m_osds->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+    ceph_assert(tr == 0);
+
+    if (!m_pg->snap_trimq.empty()) {
+      dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
+      m_pg->snap_trimmer_scrub_complete();
+    }
+  }
+
+  if (has_error) {
+    m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared<PGPeeringEvent>(
+      get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery())));
+  } else {
+    state_clear(PG_STATE_REPAIR);
+  }
+
+  cleanup_on_finish();
+  if (do_auto_scrub) {
+    request_rescrubbing(m_pg->m_planned_scrub);
+  }
+
+  if (m_pg->is_active() && m_pg->is_primary()) {
+    m_pg->recovery_state.share_pg_info();
+  }
+}
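
The repair bookkeeping at the end of scrub_finish() collapses to a three-way decision on the error counters. A distilled, self-contained restatement of that branch logic (the names here are illustrative; this models the flow above, not the actual code path):

#include <cassert>

enum class RepairOutcome {
  all_fixed,       // every error was repaired - clear the counters
  rescrub_needed,  // some repairs were made - deep-scrub again for fresh counts
  failed_repair    // errors remain and nothing more can be fixed
};

RepairOutcome classify(int fixed, int shallow_errors, int deep_errors,
                       bool has_error /* were any repairs initiated? */)
{
  if (fixed == shallow_errors + deep_errors)
    return RepairOutcome::all_fixed;
  if (has_error)
    return RepairOutcome::rescrub_needed;
  if (shallow_errors || deep_errors)
    return RepairOutcome::failed_repair;
  return RepairOutcome::all_fixed;  // no errors at all
}

int main()
{
  assert(classify(3, 2, 1, true) == RepairOutcome::all_fixed);
  assert(classify(1, 2, 1, true) == RepairOutcome::rescrub_needed);
  assert(classify(0, 2, 0, false) == RepairOutcome::failed_repair);
}
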
+
+Scrub::FsmNext PgScrubber::on_digest_updates()
+{
+  dout(10) << __func__ << " #pending: " << num_digest_updates_pending
+          << (m_end.is_max() ? " <last chunk> " : " <mid chunk> ") << dendl;
+
+  if (num_digest_updates_pending == 0) {
+
+    // got all updates, and finished with this chunk. Any more?
+    if (m_end.is_max()) {
+      scrub_finish();
+      return Scrub::FsmNext::goto_notactive;
+    } else {
+      // go get a new chunk (via "requeue")
+      preemption_data.reset();
+      return Scrub::FsmNext::next_chunk;
+    }
+  } else {
+    return Scrub::FsmNext::do_discard;
+  }
+}
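
In other words, the decision above depends on exactly two inputs: are digest updates still pending, and was this the last chunk. A compact restatement of that mapping (a sketch, not the actual implementation):

#include <cassert>

enum class FsmNext { goto_notactive, next_chunk, do_discard };

FsmNext next_step(int pending_updates, bool last_chunk)
{
  if (pending_updates > 0)
    return FsmNext::do_discard;               // not all digests are in yet
  return last_chunk ? FsmNext::goto_notactive  // done: scrub_finish() runs
                    : FsmNext::next_chunk;     // requeue to fetch the next chunk
}

int main()
{
  assert(next_step(2, false) == FsmNext::do_discard);
  assert(next_step(0, false) == FsmNext::next_chunk);
  assert(next_step(0, true) == FsmNext::goto_notactive);
}
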
+
+/*
+ * note that the flags-set fetched from the PG (m_pg->m_planned_scrub)
+ * is cleared once scrubbing starts; some of the values dumped here are
+ * thus transitory.
+ */
+void PgScrubber::dump(ceph::Formatter* f) const
+{
+  f->open_object_section("scrubber");
+  f->dump_stream("epoch_start") << m_epoch_start;
+  f->dump_bool("active", m_active);
+  if (m_active) {
+    f->dump_stream("start") << m_start;
+    f->dump_stream("end") << m_end;
+    f->dump_stream("m_max_end") << m_max_end;
+    f->dump_stream("subset_last_update") << m_subset_last_update;
+    f->dump_bool("deep", m_is_deep);
+    f->dump_bool("must_scrub", (m_pg->m_planned_scrub.must_scrub || m_flags.required));
+    f->dump_bool("must_deep_scrub", m_pg->m_planned_scrub.must_deep_scrub);
+    f->dump_bool("must_repair", m_pg->m_planned_scrub.must_repair);
+    f->dump_bool("need_auto", m_pg->m_planned_scrub.need_auto);
+    f->dump_bool("req_scrub", m_flags.required);
+    f->dump_bool("time_for_deep", m_pg->m_planned_scrub.time_for_deep);
+    f->dump_bool("auto_repair", m_flags.auto_repair);
+    f->dump_bool("check_repair", m_flags.check_repair);
+    f->dump_bool("deep_scrub_on_error", m_flags.deep_scrub_on_error);
+    f->dump_stream("scrub_reg_stamp") << m_scrub_reg_stamp;  // utime_t
+    f->dump_unsigned("priority", m_flags.priority);
+    f->dump_int("shallow_errors", m_shallow_errors);
+    f->dump_int("deep_errors", m_deep_errors);
+    f->dump_int("fixed", m_fixed_count);
+    {
+      f->open_array_section("waiting_on_whom");
+      for (const auto& p : m_maps_status.get_awaited()) {
+       f->dump_stream("shard") << p;
+      }
+      f->close_section();
+    }
+  }
+  f->close_section();
+}
+
+
+void PgScrubber::handle_query_state(ceph::Formatter* f)
+{
+  dout(10) << __func__ << dendl;
+
+  f->open_object_section("scrub");
+  f->dump_stream("scrubber.epoch_start") << m_epoch_start;
+  f->dump_bool("scrubber.active", m_active);
+  f->dump_stream("scrubber.start") << m_start;
+  f->dump_stream("scrubber.end") << m_end;
+  f->dump_stream("scrubber.m_max_end") << m_max_end;
+  f->dump_stream("scrubber.m_subset_last_update") << m_subset_last_update;
+  f->dump_bool("scrubber.deep", m_is_deep);
+  {
+    f->open_array_section("scrubber.waiting_on_whom");
+    for (const auto& p : m_maps_status.get_awaited()) {
+      f->dump_stream("shard") << p;
+    }
+    f->close_section();
+  }
+
+  f->dump_string("comment", "DEPRECATED - may be removed in the next release");
+
+  f->close_section();
+}
+
+PgScrubber::~PgScrubber()
+{
+  dout(10) << __func__ << dendl;
+}
+
+PgScrubber::PgScrubber(PG* pg)
+    : m_pg{pg}
+    , m_pg_id{pg->pg_id}
+    , m_osds{m_pg->osd}
+    , m_pg_whoami{pg->pg_whoami}
+    , m_epoch_queued{0}
+    , preemption_data{pg}
+{
+  dout(20) << " creating PgScrubber for " << pg->pg_id << " / " << m_pg_whoami << dendl;
+  m_fsm = std::make_unique<ScrubMachine>(m_pg, this);
+  m_fsm->initiate();
+}
+
+void PgScrubber::reserve_replicas()
+{
+  dout(10) << __func__ << dendl;
+  m_reservations.emplace(m_pg, m_pg_whoami);
+}
+
+//  called only for normal end-of-scrub, and only for a Primary
+void PgScrubber::cleanup_on_finish()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(m_pg->is_locked());
+
+  state_clear(PG_STATE_SCRUBBING);
+  state_clear(PG_STATE_DEEP_SCRUB);
+  m_pg->publish_stats_to_osd();
+
+  m_reservations.reset();
+  m_local_osd_resource.reset();
+
+  m_pg->requeue_ops(m_pg->waiting_for_scrub);
+
+  reset_internal_state();
+  // type-specific state clear
+  _scrub_clear_state();
+}
+
+// uses process_event(), so must be invoked externally
+void PgScrubber::scrub_clear_state(bool keep_repair_state)
+{
+  dout(10) << __func__ << dendl;
+
+  clear_pgscrub_state(keep_repair_state);
+  m_fsm->process_event(FullReset{});
+}
+
+/*
+ * note: does not access the state-machine
+ */
+void PgScrubber::clear_pgscrub_state(bool keep_repair_state)
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(m_pg->is_locked());
+
+  state_clear(PG_STATE_SCRUBBING);
+  state_clear(PG_STATE_DEEP_SCRUB);
+  if (!keep_repair_state)
+    state_clear(PG_STATE_REPAIR);
+
+  clear_scrub_reservations();
+  m_pg->publish_stats_to_osd();
+
+  m_pg->requeue_ops(m_pg->waiting_for_scrub);
+
+  reset_internal_state();
+
+  // type-specific state clear
+  _scrub_clear_state();
+}
+
+void PgScrubber::replica_handling_done()
+{
+  dout(10) << __func__ << dendl;
+
+  state_clear(PG_STATE_SCRUBBING);
+  state_clear(PG_STATE_DEEP_SCRUB);
+
+  // make sure we cleared the reservations!
+
+  preemption_data.reset();
+  m_maps_status.reset();
+  m_received_maps.clear();
+
+  m_start = hobject_t{};
+  m_end = hobject_t{};
+  m_max_end = hobject_t{};
+  m_subset_last_update = eversion_t{};
+  m_shallow_errors = 0;
+  m_deep_errors = 0;
+  m_fixed_count = 0;
+  m_omap_stats = (const struct omap_stat_t){0};
+
+  run_callbacks();
+  m_inconsistent.clear();
+  m_missing.clear();
+  m_authoritative.clear();
+  num_digest_updates_pending = 0;
+  replica_scrubmap = ScrubMap{};
+  replica_scrubmap_pos.reset();
+
+  m_cleaned_meta_map = ScrubMap{};
+  m_needs_sleep = true;
+  m_sleep_started_at = utime_t{};
+
+  m_active = false;
+  m_pg->publish_stats_to_osd();
+}
+
+/*
+ * note: performs run_callbacks()
+ * note: reservations-related variables are not reset here
+ */
+void PgScrubber::reset_internal_state()
+{
+  dout(10) << __func__ << dendl;
+
+  preemption_data.reset();
+  m_maps_status.reset();
+  m_received_maps.clear();
+
+  m_start = hobject_t{};
+  m_end = hobject_t{};
+  m_max_end = hobject_t{};
+  m_subset_last_update = eversion_t{};
+  m_shallow_errors = 0;
+  m_deep_errors = 0;
+  m_fixed_count = 0;
+  m_omap_stats = (const struct omap_stat_t){0};
+
+  run_callbacks();
+
+  m_inconsistent.clear();
+  m_missing.clear();
+  m_authoritative.clear();
+  num_digest_updates_pending = 0;
+  m_primary_scrubmap = ScrubMap{};
+  m_primary_scrubmap_pos.reset();
+  replica_scrubmap = ScrubMap{};
+  replica_scrubmap_pos.reset();
+  m_cleaned_meta_map = ScrubMap{};
+  m_needs_sleep = true;
+  m_sleep_started_at = utime_t{};
+
+  m_flags = scrub_flags_t{};
+
+  m_active = false;
+}
+
+const OSDMapRef& PgScrubber::get_osdmap() const
+{
+  return m_pg->get_osdmap();
+}
+
+ostream& operator<<(ostream& out, const PgScrubber& scrubber)
+{
+  return out << scrubber.m_flags;
+}
+
+ostream& PgScrubber::show(ostream& out) const
+{
+  return out << " [ " << m_pg_id << ": " << /*for now*/ m_flags << " ] ";
+}
+
 // ///////////////////// preemption_data_t //////////////////////////////////
 
 PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg}
index 760c34310c6966aadfdc4920c0450522a25566d1..0a390ce18e504741d11070f010beb0f947b9dad2 100644 (file)
@@ -133,9 +133,458 @@ class MapsCollectionStatus {
 
 }  // namespace Scrub
 
-// an almost-empty PgScrubber for this commit:
+/**
+ * the scrub operation flags. Primary only.
+ * Set at scrub start. Checked in multiple locations - mostly
+ * at finish.
+ */
+struct scrub_flags_t {
+
+  unsigned int priority{0};
+
+  /**
+   * set by queue_scrub() if either planned_scrub.auto_repair or
+   * need_auto was set.
+   * Tested at scrub end.
+   */
+  bool auto_repair{false};
+
+  /// this flag indicates that we are scrubbing post repair to verify everything is fixed
+  bool check_repair{false};
+
+  /// checked at the end of the scrub, to possibly initiate a deep-scrub
+  bool deep_scrub_on_error{false};
+
+  /**
+   * scrub must not be aborted.
+   * Set for explicitly requested scrubs, and for scrubs originated by the pairing
+   * process with the 'repair' flag set (in the RequestScrub event).
+   */
+  bool required{false};
+};
+
+ostream& operator<<(ostream& out, const scrub_flags_t& sf);
+
+
+/**
+ * The part of PG-scrubbing code that isn't state-machine wiring.
+ *
+ * Why the separation? I wish to move to a different FSM implementation. Thus I
+ * am forced to strongly decouple the state-machine implementation details from
+ * the actual scrubbing code.
+ */
 class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
 
+ public:
+  explicit PgScrubber(PG* pg);
+
+  //  ------------------  the I/F exposed to the PG (ScrubPgIF) -------------
+
+  /// are we waiting for resource reservation grants from our replicas?
+  [[nodiscard]] bool is_reserving() const final;
+
+  void send_start_scrub() final;
+
+  void send_start_after_repair() final;
+
+  void send_scrub_resched() final;
+
+  void active_pushes_notification() final;
+
+  void update_applied_notification(epoch_t epoch_queued) final;
+
+  void send_scrub_unblock() final;
+
+  void digest_update_notification() final;
+
+  void send_replica_maps_ready() final;
+
+  void send_replica_pushes_upd() final;
+
+  void reset_epoch(epoch_t epoch_queued) final;
+
+  /**
+   *  we allow a limited number of preemptions of the scrub, during which we
+   *  do not block incoming writes. Once that budget is exhausted we start
+   *  blocking, and we do not stop until the scrub range is completed.
+   *  (a simplified model of the budget is sketched after this class.)
+   */
+  bool write_blocked_by_scrub(const hobject_t& soid) final;
+
+  /// true if the given range intersects the scrub interval in any way
+  bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;
+
+  void handle_scrub_reserve_request(OpRequestRef op) final;
+  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
+  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
+  void handle_scrub_reserve_release(OpRequestRef op) final;
+  void clear_scrub_reservations() final;  // PG::clear... fwds to here
+  void unreserve_replicas() final;
+
+  // managing scrub op registration
+
+  void reg_next_scrub(const requested_scrub_t& request_flags) final;
+
+  void unreg_next_scrub() final;
+
+  void scrub_requested(scrub_level_t scrub_level,
+                      scrub_type_t scrub_type,
+                      requested_scrub_t& req_flags) final;
+
+  /**
+   * Reserve local scrub resources (managed by the OSD)
+   *
+   * Fails if OSD's local-scrubs budget was exhausted
+   * \returns were local resources reserved?
+   */
+  bool reserve_local() final;
+
+  void handle_query_state(ceph::Formatter* f) final;
+
+  void dump(ceph::Formatter* f) const override;
+
+  // used if we are a replica
+
+  void replica_scrub_op(OpRequestRef op) final;
+  void replica_scrub(epoch_t epoch_queued) final;
+  void replica_scrub_resched(epoch_t epoch_queued) final;
+
+  /// the op priority, taken from the primary's request message
+  Scrub::scrub_prio_t replica_op_priority() const final
+  {
+    return m_replica_request_priority;
+  };
+
+  unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+                                     unsigned int suggested_priority) const final;
+  /// the version that refers to m_flags.priority
+  unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final;
+
+  void add_callback(Context* context) final { m_callbacks.push_back(context); }
+
+  [[nodiscard]] bool are_callbacks_pending() const final  // used for an assert in PG.cc
+  {
+    return !m_callbacks.empty();
+  }
+
+  /// handle a message carrying a replica map
+  void map_from_replica(OpRequestRef op) final;
+
+  /**
+   *  should we requeue blocked ops?
+   *  Applicable to the PrimaryLogScrub derived class.
+   */
+  [[nodiscard]] virtual bool should_requeue_blocked_ops(
+    eversion_t last_recovery_applied) const override
+  {
+    return false;
+  }
+
+  void scrub_clear_state(bool keep_repair_state = false) final;
+
+  /**
+   *  add to scrub statistics, but only if the soid is below the scrub start
+   */
+  virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+                                       const hobject_t& soid) override
+  {
+    ceph_assert(false);
+  }
+
+  /**
+   * finalize the parameters of the initiated scrubbing session:
+   *
+   * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set;
+   * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set.
+   */
+  void set_op_parameters(requested_scrub_t& request) final;
+
+  void cleanup_store(ObjectStore::Transaction* t) final;
+
+  bool get_store_errors(const scrub_ls_arg_t& arg,
+                       scrub_ls_result_t& res_inout) const override
+  {
+    return false;
+  };
+
+  // -------------------------------------------------------------------------------------------
+  // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener)
+
+  bool select_range() final;
+
+  /// walk the log to find the latest update that affects our chunk
+  eversion_t search_log_for_updates() const final;
+
+  eversion_t get_last_update_applied() const final
+  {
+    return m_pg->recovery_state.get_last_update_applied();
+  }
+
+  void requeue_waiting() const final { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
+
+  int pending_active_pushes() const final { return m_pg->active_pushes; }
+
+  void scrub_compare_maps() final;
+
+  void on_init() final;
+  void on_replica_init() final;
+  void replica_handling_done() final;
+
+  /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
+  /// (thus can be called from FSM reactions)
+  void clear_pgscrub_state(bool keep_repair_state) final;
+
+  void add_delayed_scheduling() final;
+
+  /**
+   * @returns have we asked at least one replica?
+   * 'false' means we are configured with no replicas, and
+   * should expect no maps to arrive.
+   */
+  bool get_replicas_maps(bool replica_can_preempt) final;
+
+  Scrub::FsmNext on_digest_updates() final;
+
+  void send_replica_map(bool was_preempted) final;
+
+  void send_remotes_reserved() final;
+  void send_reservation_failure() final;
+
+  /**
+   *  does the PG have newer updates than what we (the scrubber) know?
+   */
+  [[nodiscard]] bool has_pg_marked_new_updates() const final;
+
+  void set_subset_last_update(eversion_t e) final;
+
+  void replica_update_start_epoch() final;
+
+  void maps_compare_n_cleanup() final;
+
+  Scrub::preemption_t* get_preemptor() final;
+
+  int build_primary_map_chunk() final;
+
+  int build_replica_map_chunk() final;
+
+  void reserve_replicas() final;
+
+  [[nodiscard]] bool was_epoch_changed() const final;
+
+  void mark_local_map_ready() final;
+
+  [[nodiscard]] bool are_all_maps_available() const final;
+
+  std::string dump_awaited_maps() const final;
+
+ protected:
+  bool state_test(uint64_t m) const { return m_pg->state_test(m); }
+  void state_set(uint64_t m) { m_pg->state_set(m); }
+  void state_clear(uint64_t m) { m_pg->state_clear(m); }
+
+  [[nodiscard]] bool is_primary() const { return m_pg->recovery_state.is_primary(); }
+
+  [[nodiscard]] bool is_scrub_registered() const;
+
+  virtual void _scrub_clear_state() {}
+
+  utime_t m_scrub_reg_stamp;  ///< stamp we registered for
+
+  ostream& show(ostream& out) const override;
+
+ public:
+  // -------------------------------------------------------------------------------------------
+
+  friend ostream& operator<<(ostream& out, const PgScrubber& scrubber);
+
+  static utime_t scrub_must_stamp() { return utime_t(1, 1); }
+
+  virtual ~PgScrubber();  // must be defined separately, in the .cc file
+
+  [[nodiscard]] bool is_scrub_active() const final;
+
+ private:
+  void reset_internal_state();
+
+  void _scan_snaps(ScrubMap& smap);  // note: the name (non-standard for a
+                                    // non-virtual function) is searched for by
+                                    // the QA standalone tests. Do not modify.
+
+  void clean_meta_map(ScrubMap& for_meta_scrub);
+
+  void run_callbacks();
+
+  /**
+   * are we still a clean & healthy scrubbing primary?
+   *
+   * relevant only after the initial sched_scrub
+   */
+  [[nodiscard]] bool is_event_relevant(epoch_t queued) const;
+
+  /**
+   * check the 'no scrub' configuration options.
+   */
+  [[nodiscard]] bool should_abort_scrub(epoch_t queued) const;
+
+  void send_epoch_changed();
+
+  /**
+   * return true if any inconsistency/missing is repaired, false otherwise
+   */
+  [[nodiscard]] bool scrub_process_inconsistent();
+
+  bool m_needs_sleep{true};  ///< should we sleep before being rescheduled? always
+                            ///< 'true', unless we just got out of a sleep period
+
+
+  // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed'
+  // to guarantee un-reserving when deleted.
+  std::optional<Scrub::ReplicaReservations> m_reservations;
+  std::optional<Scrub::LocalReservation> m_local_osd_resource;
+
+  /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing
+  std::optional<Scrub::ReservedByRemotePrimary> m_remote_osd_resource;
+
+  void cleanup_on_finish();  // scrub_clear_state() as called for a Primary when
+                            // Active->NotActive
+
+  /// the part that actually finalizes a scrub
+  void scrub_finish();
+
+  utime_t m_sleep_started_at;
+
+ protected:
+  PG* const m_pg;
+
+  /**
+   * the derivative-specific scrub-finishing touches:
+   */
+  virtual void _scrub_finish() {}
+
+  /**
+   * Validate consistency of the object info and snap sets.
+   */
+  virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest)
+  {}
+
+  // common code used by build_primary_map_chunk() and build_replica_map_chunk():
+  int build_scrub_map_chunk(ScrubMap& map,  // primary or replica?
+                           ScrubMapBuilder& pos,
+                           hobject_t start,
+                           hobject_t end,
+                           bool deep);
+
+  std::unique_ptr<Scrub::ScrubMachine> m_fsm;
+  const spg_t m_pg_id; ///< a local copy of m_pg->pg_id
+  OSDService* const m_osds;
+  const pg_shard_t m_pg_whoami;         ///< a local copy of m_pg->pg_whoami;
+
+  epoch_t m_epoch_start;  ///< epoch when scrubbing was first scheduled
+  epoch_t m_epoch_queued;
+  scrub_flags_t m_flags;
+
+  bool m_active{false};
+
+  eversion_t m_subset_last_update;
+
+  std::unique_ptr<Scrub::Store> m_store;
+
+  int num_digest_updates_pending{0};
+  hobject_t m_start, m_end;  ///< note: half-closed: [start,end)
+
+  /// Returns reference to current osdmap
+  const OSDMapRef& get_osdmap() const;
+
+  /// Returns epoch of current osdmap
+  epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); }
+
+  CephContext* get_pg_cct() const { return m_pg->cct; }
+
+  void send_start_replica();
+
+  void send_sched_replica();
+
+  // collected statistics
+  int m_shallow_errors{0};
+  int m_deep_errors{0};
+  int m_fixed_count{0};
+
+  /// Maps from objects with errors to missing peers
+  HobjToShardSetMapping m_missing;
+
+ private:
+  /**
+   * 'm_is_deep' - is the running scrub a deep one?
+   *
+   * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is
+   * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is
+   * meaningful both for the primary and the replicas, and is used as a parameter when
+   * building the scrub maps.
+   */
+  bool m_is_deep{false};
+
+  inline static int fake_count{2};  // unit-tests. To be removed
+
+  /**
+   * initiate a deep-scrub after the current scrub ended with errors.
+   */
+  void request_rescrubbing(requested_scrub_t& req_flags);
+
+  std::list<Context*> m_callbacks;
+
+  /**
+   * send a replica (un)reservation request to the acting set
+   *
+   * @param opcode - one of MOSDScrubReserve::REQUEST
+   *                  or MOSDScrubReserve::RELEASE
+   */
+  void message_all_replicas(int32_t opcode, std::string_view op_text);
+
+  hobject_t m_max_end; ///< Largest end that may have been sent to replicas
+  ScrubMap m_primary_scrubmap;
+  ScrubMapBuilder m_primary_scrubmap_pos;
+
+  std::map<pg_shard_t, ScrubMap> m_received_maps;
+
+  /// Cleaned std::map pending snap metadata scrub
+  ScrubMap m_cleaned_meta_map;
+
+  void _request_scrub_map(pg_shard_t replica,
+                         eversion_t version,
+                         hobject_t start,
+                         hobject_t end,
+                         bool deep,
+                         bool allow_preemption);
+
+
+  Scrub::MapsCollectionStatus m_maps_status;
+
+  omap_stat_t m_omap_stats = (const struct omap_stat_t){0};
+
+  /// Maps from objects with errors to inconsistent peers
+  HobjToShardSetMapping m_inconsistent;
+
+  /// Maps from object with errors to good peers
+  std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t>>> m_authoritative;
+
+  // ------------ members used if we are a replica
+
+  epoch_t m_replica_epoch_start;
+  epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message
+
+  ScrubMapBuilder replica_scrubmap_pos;         /// \todo document
+  ScrubMap replica_scrubmap;            /// \todo document
+  /**
+   * we mark the request priority as it arrived. It influences the queuing priority
+   * when we wait for local updates
+   */
+  Scrub::scrub_prio_t m_replica_request_priority;
+
+  /**
+   *  Queue a XX event to be sent to the replica, to trigger a re-check of the
+   * availability of the scrub map prepared by the backend.
+   */
+  void requeue_replica(Scrub::scrub_prio_t is_high_priority);
+
   /**
    * the 'preemption' "state-machine".
    * Note: I was considering an orthogonal sub-machine implementation, but as
@@ -223,4 +672,9 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
       return m_left > 0;
     }
   };
+
+  preemption_data_t preemption_data;
+
+  // debug/development temporary code:
+  void debug_dump_reservations(std::string_view header_txt) const;
 };
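
As promised in the write_blocked_by_scrub() comment above, here is a simplified single-threaded model of the preemption budget implemented by preemption_data_t: client writes may preempt the scrub a limited number of times (m_left), after which the scrub blocks them instead. The real class additionally protects its counters with a lock and is reset per chunk; the sketch below captures only the counting logic.

#include <cassert>

class preemption_budget_t {
  int m_left;                // preemptions still allowed
  bool m_preempted{false};   // were we preempted during the current chunk?
 public:
  explicit preemption_budget_t(int budget) : m_left{budget} {}

  bool is_preemptable() const { return !m_preempted && m_left > 0; }

  // a conflicting client write arrived: preempt if we still may
  bool try_preempt() {
    if (!is_preemptable())
      return false;  // from here on, the write blocks instead
    --m_left;
    m_preempted = true;
    return true;
  }

  void reset_for_new_chunk() { m_preempted = false; }
};

int main()
{
  preemption_budget_t pd{2};
  assert(pd.try_preempt());   // 1st chunk: preempted
  pd.reset_for_new_chunk();
  assert(pd.try_preempt());   // 2nd chunk: preempted, budget exhausted
  pd.reset_for_new_chunk();
  assert(!pd.try_preempt());  // now the scrub blocks the write instead
}
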
index 13c360b0323f0f518187c3e0e48701bccec5a2ef..3d6fb9aaac73621e3a03f0148c61955a6f3f8d1c 100644 (file)
@@ -46,6 +46,30 @@ void PGSnapTrim::run(
   pg->unlock();
 }
 
+void PGScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
+{
+  pg->scrub(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGScrubAfterRepair::run(OSD* osd,
+                         OSDShard* sdata,
+                         PGRef& pg,
+                         ThreadPool::TPHandle& handle)
+{
+  pg->recovery_scrub(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGScrubResched::run(OSD* osd,
+                        OSDShard* sdata,
+                        PGRef& pg,
+                        ThreadPool::TPHandle& handle)
+{
+  pg->scrub_send_scrub_resched(epoch_queued, handle);
+  pg->unlock();
+}
+
 void PGScrubResourcesOK::run(OSD* osd,
                             OSDShard* sdata,
                             PGRef& pg,
@@ -64,13 +88,72 @@ void PGScrubDenied::run(OSD* osd,
   pg->unlock();
 }
 
-void PGScrub::run(
-  OSD *osd,
-  OSDShard *sdata,
-  PGRef& pg,
-  ThreadPool::TPHandle &handle)
+void PGScrubPushesUpdate::run(OSD* osd,
+                             OSDShard* sdata,
+                             PGRef& pg,
+                             ThreadPool::TPHandle& handle)
 {
-  pg->scrub(epoch_queued, handle);
+  pg->scrub_send_pushes_update(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGScrubAppliedUpdate::run(OSD* osd,
+                              OSDShard* sdata,
+                              PGRef& pg,
+                              ThreadPool::TPHandle& handle)
+{
+  pg->scrub_send_applied_update(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGScrubUnblocked::run(OSD* osd,
+                          OSDShard* sdata,
+                          PGRef& pg,
+                          ThreadPool::TPHandle& handle)
+{
+  pg->scrub_send_unblocking(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGScrubDigestUpdate::run(OSD* osd,
+                             OSDShard* sdata,
+                             PGRef& pg,
+                             ThreadPool::TPHandle& handle)
+{
+  pg->scrub_send_digest_update(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGScrubGotReplMaps::run(OSD* osd,
+                            OSDShard* sdata,
+                            PGRef& pg,
+                            ThreadPool::TPHandle& handle)
+{
+  pg->scrub_send_replmaps_ready(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGRepScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
+{
+  pg->replica_scrub(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGRepScrubResched::run(OSD* osd,
+                           OSDShard* sdata,
+                           PGRef& pg,
+                           ThreadPool::TPHandle& handle)
+{
+  pg->replica_scrub_resched(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGScrubReplicaPushes::run([[maybe_unused]] OSD* osd,
+                             OSDShard* sdata,
+                             PGRef& pg,
+                             ThreadPool::TPHandle& handle)
+{
+  pg->scrub_send_replica_pushes(epoch_queued, handle);
   pg->unlock();
 }
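
All of these run() bodies follow one template: take the already-locked PG, forward the queued epoch to the matching scrub-event method, and unlock. A generic sketch of that dispatch pattern, with simplified stand-ins for the scheduler types (PG here is a toy, not the real class):

#include <iostream>
#include <memory>
#include <string>

struct PG {
  void scrub_send_digest_update(unsigned epoch) {
    std::cout << "DigestUpdate @" << epoch << '\n';
  }
  void unlock() {}
};

// base class: every queued scrub event remembers the epoch it was queued at
class PGScrubItem {
 protected:
  unsigned epoch_queued;
  std::string message_name;
 public:
  PGScrubItem(unsigned epoch, std::string name)
      : epoch_queued{epoch}, message_name{std::move(name)} {}
  virtual ~PGScrubItem() = default;
  virtual void run(PG& pg) = 0;  // called with the PG lock held
};

class PGScrubDigestUpdate : public PGScrubItem {
 public:
  explicit PGScrubDigestUpdate(unsigned epoch)
      : PGScrubItem{epoch, "PGScrubDigestUpdate"} {}
  void run(PG& pg) final {
    pg.scrub_send_digest_update(epoch_queued);  // forward the queued epoch
    pg.unlock();                                // release the PG lock
  }
};

int main()
{
  PG pg;
  std::unique_ptr<PGScrubItem> item = std::make_unique<PGScrubDigestUpdate>(42);
  item->run(pg);
}
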
 
index 6850c180a85575d6892ecb3888b372f3eda6a054..afa363e19e6895f7311819c5a2d305d3321b5499 100644 (file)
@@ -348,6 +348,14 @@ class PGScrubItem : public PGOpQueueable {
   }
 };
 
+class PGScrubResched : public PGScrubItem {
+ public:
+  PGScrubResched(spg_t pg, epoch_t epoch_queued)
+      : PGScrubItem{pg, epoch_queued, "PGScrubResched"}
+  {}
+  void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
 /**
  *  all replicas have granted our scrub resources request
  */
@@ -370,6 +378,87 @@ class PGScrubDenied : public PGScrubItem {
   void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
 };
 
+/**
+ *  called when a repair process completes, to initiate scrubbing. No local/remote
+ *  resources are allocated.
+ */
+class PGScrubAfterRepair : public PGScrubItem {
+ public:
+  PGScrubAfterRepair(spg_t pg, epoch_t epoch_queued)
+      : PGScrubItem{pg, epoch_queued, "PGScrubAfterRepair"}
+  {}
+  void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubPushesUpdate : public PGScrubItem {
+ public:
+  PGScrubPushesUpdate(spg_t pg, epoch_t epoch_queued)
+      : PGScrubItem{pg, epoch_queued, "PGScrubPushesUpdate"}
+  {}
+  void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubAppliedUpdate : public PGScrubItem {
+ public:
+  PGScrubAppliedUpdate(spg_t pg, epoch_t epoch_queued)
+      : PGScrubItem{pg, epoch_queued, "PGScrubAppliedUpdate"}
+  {}
+  void run(OSD* osd,
+          OSDShard* sdata,
+          PGRef& pg,
+          [[maybe_unused]] ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubUnblocked : public PGScrubItem {
+ public:
+  PGScrubUnblocked(spg_t pg, epoch_t epoch_queued)
+      : PGScrubItem{pg, epoch_queued, "PGScrubUnblocked"}
+  {}
+  void run(OSD* osd,
+          OSDShard* sdata,
+          PGRef& pg,
+          [[maybe_unused]] ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubDigestUpdate : public PGScrubItem {
+ public:
+  PGScrubDigestUpdate(spg_t pg, epoch_t epoch_queued)
+      : PGScrubItem{pg, epoch_queued, "PGScrubDigestUpdate"}
+  {}
+  void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubGotReplMaps : public PGScrubItem {
+ public:
+  PGScrubGotReplMaps(spg_t pg, epoch_t epoch_queued)
+      : PGScrubItem{pg, epoch_queued, "PGScrubGotReplMaps"}
+  {}
+  void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGRepScrub : public PGScrubItem {
+ public:
+  PGRepScrub(spg_t pg, epoch_t epoch_queued) : PGScrubItem{pg, epoch_queued, "PGRepScrub"}
+  {}
+  void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGRepScrubResched : public PGScrubItem {
+ public:
+  PGRepScrubResched(spg_t pg, epoch_t epoch_queued)
+      : PGScrubItem{pg, epoch_queued, "PGRepScrubResched"}
+  {}
+  void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubReplicaPushes : public PGScrubItem {
+ public:
+  PGScrubReplicaPushes(spg_t pg, epoch_t epoch_queued)
+      : PGScrubItem{pg, epoch_queued, "PGScrubReplicaPushes"}
+  {}
+  void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
 class PGRecovery : public PGOpQueueable {
   epoch_t epoch_queued;
   uint64_t reserved_pushes;