]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Extracting scrub code from PrimaryLogPG (a derivative of 'PG')
authorRonen Friedman <rfriedma@redhat.com>
Sun, 15 Nov 2020 20:13:52 +0000 (22:13 +0200)
committerRonen Friedman <rfriedma@redhat.com>
Thu, 10 Dec 2020 13:21:53 +0000 (15:21 +0200)
into a PrimaryLogScrub - a derivative of PgScrubber.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
src/osd/CMakeLists.txt
src/osd/PG.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/osd/PrimaryLogScrub.cc [new file with mode: 0644]
src/osd/PrimaryLogScrub.h [new file with mode: 0644]
src/osd/pg_scrubber.cc
src/osd/pg_scrubber.h

index d3b6c21c79e1c8a58859d7d406fc14d88c41f7e5..0d0ca63b347b34259dcd9a366a31b7d9dd53ecbc 100644 (file)
@@ -13,6 +13,7 @@ set(osd_srcs
   OSD.cc
   pg_scrubber.cc
   scrub_machine.cc
+  PrimaryLogScrub.cc
   Watch.cc
   ClassHandler.cc
   PG.cc
index 9119b2979380e8403e3c61056c93abb58a03cffd..a2403cc07d551fe757bfe7951061b02c6d954edc 100644 (file)
@@ -170,6 +170,7 @@ class PG : public DoutPrefixProvider, public PeeringState::PeeringListener {
   friend struct NamedState;
   friend class PeeringState;
   friend class PgScrubber;
+  friend class PrimaryLogScrub;
   friend class Scrub::ReplicaReservations;
   friend class Scrub::LocalReservation;  // dout()-only friendship
   friend class Scrub::ReservedByRemotePrimary;  //  dout()-only friendship
index e8038a6e955142d434d8856d81720293f09751ef..2df61761b0dcbb2466c87432d8fd751f91521fa3 100644 (file)
@@ -21,6 +21,7 @@
 #include "pg_scrubber.h"
 #include "PrimaryLogPG.h"
 #include "OSD.h"
+#include "PrimaryLogScrub.h"
 #include "OpRequest.h"
 #include "ScrubStore.h"
 #include "Session.h"
@@ -1609,6 +1610,53 @@ int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
 
   return r;
 }
+
+/**
+ * Releases locks
+ *
+ * @param manager [in] manager with locks to release
+ */
+void PrimaryLogPG::release_object_locks(
+  ObcLockManager &lock_manager) {
+  std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > to_req;
+  bool requeue_recovery = false;
+  bool requeue_snaptrim = false;
+  lock_manager.put_locks(
+    &to_req,
+    &requeue_recovery,
+    &requeue_snaptrim);
+  if (requeue_recovery)
+    queue_recovery();
+  if (requeue_snaptrim)
+    snap_trimmer_machine.process_event(TrimWriteUnblocked());
+
+  if (!to_req.empty()) {
+    // requeue at front of scrub blocking queue if we are blocked by scrub
+    for (auto &&p: to_req) {
+      if (m_scrubber->write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) {
+        for (auto& op : p.second) {
+          op->mark_delayed("waiting for scrub");
+        }
+
+       waiting_for_scrub.splice(
+         waiting_for_scrub.begin(),
+         p.second,
+         p.second.begin(),
+         p.second.end());
+      } else if (is_laggy()) {
+        for (auto& op : p.second) {
+          op->mark_delayed("waiting for readable");
+        }
+       waiting_for_readable.splice(
+         waiting_for_readable.begin(),
+         p.second,
+         p.second.begin(),
+         p.second.end());
+      } else {
+       requeue_ops(p.second);
+      }
+    }
+  }
 }
 
 PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
@@ -1628,8 +1676,7 @@ PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
     pgbackend->get_is_recoverable_predicate());
   snap_trimmer_machine.initiate();
 
-  m_scrubber = make_unique<PgScrubber>(this); // *not* the final code
-  // next commit: m_scrubber = make_unique<PrimaryLogScrub>(this);
+  m_scrubber = make_unique<PrimaryLogScrub>(this);
 }
 
 void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
@@ -14929,507 +14976,6 @@ bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin,
   return true;
 }
 
-static bool doing_clones(const std::optional<SnapSet> &snapset,
-                        const vector<snapid_t>::reverse_iterator &curclone) {
-    return snapset && curclone != snapset->clones.rend();
-}
-
-void PrimaryLogPG::log_missing(unsigned missing,
-                       const std::optional<hobject_t> &head,
-                       LogChannelRef clog,
-                       const spg_t &pgid,
-                       const char *func,
-                       const char *mode,
-                       bool allow_incomplete_clones)
-{
-  ceph_assert(head);
-  if (allow_incomplete_clones) {
-    dout(20) << func << " " << mode << " " << pgid << " " << *head
-            << " skipped " << missing << " clone(s) in cache tier" << dendl;
-  } else {
-    clog->info() << mode << " " << pgid << " " << *head
-                << " : " << missing << " missing clone(s)";
-  }
-}
-
-unsigned PrimaryLogPG::process_clones_to(const std::optional<hobject_t> &head,
-  const std::optional<SnapSet> &snapset,
-  LogChannelRef clog,
-  const spg_t &pgid,
-  const char *mode,
-  bool allow_incomplete_clones,
-  std::optional<snapid_t> target,
-  vector<snapid_t>::reverse_iterator *curclone,
-  inconsistent_snapset_wrapper &e)
-{
-  ceph_assert(head);
-  ceph_assert(snapset);
-  unsigned missing = 0;
-
-  // NOTE: clones are in descending order, thus **curclone > target test here
-  hobject_t next_clone(*head);
-  while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
-    ++missing;
-    // it is okay to be missing one or more clones in a cache tier.
-    // skip higher-numbered clones in the list.
-    if (!allow_incomplete_clones) {
-      next_clone.snap = **curclone;
-      clog->error() << mode << " " << pgid << " " << *head
-                        << " : expected clone " << next_clone << " " << missing
-                         << " missing";
-      ++scrubber.shallow_errors;
-      e.set_clone_missing(next_clone.snap);
-    }
-    // Clones are descending
-    ++(*curclone);
-  }
-  return missing;
-}
-
-/*
- * Validate consistency of the object info and snap sets.
- *
- * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
- * the comparison of the objects is against multiple snapset.clones. There are
- * multiple clone lists and in between lists we expect head.
- *
- * Example
- *
- * objects              expected
- * =======              =======
- * obj1 snap 1          head, unexpected obj1 snap 1
- * obj2 head            head, match
- *              [SnapSet clones 6 4 2 1]
- * obj2 snap 7          obj2 snap 6, unexpected obj2 snap 7
- * obj2 snap 6          obj2 snap 6, match
- * obj2 snap 4          obj2 snap 4, match
- * obj3 head            obj2 snap 2 (expected), obj2 snap 1 (expected), match
- *              [Snapset clones 3 1]
- * obj3 snap 3          obj3 snap 3 match
- * obj3 snap 1          obj3 snap 1 match
- * obj4 head            head, match
- *              [Snapset clones 4]
- * EOL                  obj4 snap 4, (expected)
- */
-void PrimaryLogPG::scrub_snapshot_metadata(
-  ScrubMap &scrubmap,
-  const map<hobject_t,
-            pair<std::optional<uint32_t>,
-                 std::optional<uint32_t>>> &missing_digest)
-{
-  dout(10) << __func__ << dendl;
-
-  bool repair = state_test(PG_STATE_REPAIR);
-  bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
-  const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
-  std::optional<snapid_t> all_clones;   // Unspecified snapid_t or std::nullopt
-
-  // traverse in reverse order.
-  std::optional<hobject_t> head;
-  std::optional<SnapSet> snapset; // If initialized so will head (above)
-  vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
-  unsigned missing = 0;
-  inconsistent_snapset_wrapper soid_error, head_error;
-  unsigned soid_error_count = 0;
-
-  for (map<hobject_t,ScrubMap::object>::reverse_iterator
-       p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
-    const hobject_t& soid = p->first;
-    ceph_assert(!soid.is_snapdir());
-    soid_error = inconsistent_snapset_wrapper{soid};
-    object_stat_sum_t stat;
-    std::optional<object_info_t> oi;
-
-    stat.num_objects++;
-
-    if (soid.nspace == cct->_conf->osd_hit_set_namespace)
-      stat.num_objects_hit_set_archive++;
-
-    if (soid.is_snap()) {
-      // it's a clone
-      stat.num_object_clones++;
-    }
-
-    // basic checks.
-    if (p->second.attrs.count(OI_ATTR) == 0) {
-      oi = std::nullopt;
-      osd->clog->error() << mode << " " << info.pgid << " " << soid
-                       << " : no '" << OI_ATTR << "' attr";
-      ++scrubber.shallow_errors;
-      soid_error.set_info_missing();
-    } else {
-      bufferlist bv;
-      bv.push_back(p->second.attrs[OI_ATTR]);
-      try {
-       oi = object_info_t(); // Initialize optional<> before decode into it
-       oi->decode(bv);
-      } catch (ceph::buffer::error& e) {
-       oi = std::nullopt;
-       osd->clog->error() << mode << " " << info.pgid << " " << soid
-               << " : can't decode '" << OI_ATTR << "' attr " << e.what();
-       ++scrubber.shallow_errors;
-       soid_error.set_info_corrupted();
-        soid_error.set_info_missing(); // Not available too
-      }
-    }
-
-    if (oi) {
-      if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
-       osd->clog->error() << mode << " " << info.pgid << " " << soid
-                          << " : on disk size (" << p->second.size
-                          << ") does not match object info size ("
-                          << oi->size << ") adjusted for ondisk to ("
-                          << pgbackend->be_get_ondisk_size(oi->size)
-                          << ")";
-       soid_error.set_size_mismatch();
-       ++scrubber.shallow_errors;
-      }
-
-      dout(20) << mode << "  " << soid << " " << *oi << dendl;
-
-      // A clone num_bytes will be added later when we have snapset
-      if (!soid.is_snap()) {
-        stat.num_bytes += oi->size;
-      }
-      if (soid.nspace == cct->_conf->osd_hit_set_namespace)
-       stat.num_bytes_hit_set_archive += oi->size;
-
-      if (oi->is_dirty())
-       ++stat.num_objects_dirty;
-      if (oi->is_whiteout())
-       ++stat.num_whiteouts;
-      if (oi->is_omap())
-       ++stat.num_objects_omap;
-      if (oi->is_cache_pinned())
-       ++stat.num_objects_pinned;
-      if (oi->has_manifest())
-       ++stat.num_objects_manifest;
-    }
-
-    // Check for any problems while processing clones
-    if (doing_clones(snapset, curclone)) {
-      std::optional<snapid_t> target;
-      // Expecting an object with snap for current head
-      if (soid.has_snapset() || soid.get_head() != head->get_head()) {
-
-       dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
-                << soid << " while processing " << *head << dendl;
-
-        target = all_clones;
-      } else {
-        ceph_assert(soid.is_snap());
-        target = soid.snap;
-      }
-
-      // Log any clones we were expecting to be there up to target
-      // This will set missing, but will be a no-op if snap.soid == *curclone.
-      missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
-                       pool.info.allow_incomplete_clones(), target, &curclone,
-                       head_error);
-    }
-    bool expected;
-    // Check doing_clones() again in case we ran process_clones_to()
-    if (doing_clones(snapset, curclone)) {
-      // A head would have processed all clones above
-      // or all greater than *curclone.
-      ceph_assert(soid.is_snap() && *curclone <= soid.snap);
-
-      // After processing above clone snap should match the expected curclone
-      expected = (*curclone == soid.snap);
-    } else {
-      // If we aren't doing clones any longer, then expecting head
-      expected = soid.has_snapset();
-    }
-    if (!expected) {
-      // If we couldn't read the head's snapset, just ignore clones
-      if (head && !snapset) {
-       osd->clog->error() << mode << " " << info.pgid << " " << soid
-                         << " : clone ignored due to missing snapset";
-      } else {
-       osd->clog->error() << mode << " " << info.pgid << " " << soid
-                          << " : is an unexpected clone";
-      }
-      ++scrubber.shallow_errors;
-      soid_error.set_headless();
-      scrubber.store->add_snap_error(pool.id, soid_error);
-      ++soid_error_count;
-      if (head && soid.get_head() == head->get_head())
-       head_error.set_clone(soid.snap);
-      continue;
-    }
-
-    // new snapset?
-    if (soid.has_snapset()) {
-
-      if (missing) {
-       log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
-                   pool.info.allow_incomplete_clones());
-      }
-
-      // Save previous head error information
-      if (head && (head_error.errors || soid_error_count))
-       scrubber.store->add_snap_error(pool.id, head_error);
-      // Set this as a new head object
-      head = soid;
-      missing = 0;
-      head_error = soid_error;
-      soid_error_count = 0;
-
-      dout(20) << __func__ << " " << mode << " new head " << head << dendl;
-
-      if (p->second.attrs.count(SS_ATTR) == 0) {
-       osd->clog->error() << mode << " " << info.pgid << " " << soid
-                         << " : no '" << SS_ATTR << "' attr";
-        ++scrubber.shallow_errors;
-       snapset = std::nullopt;
-       head_error.set_snapset_missing();
-      } else {
-       bufferlist bl;
-       bl.push_back(p->second.attrs[SS_ATTR]);
-       auto blp = bl.cbegin();
-        try {
-         snapset = SnapSet(); // Initialize optional<> before decoding into it
-         decode(*snapset, blp);
-          head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
-        } catch (ceph::buffer::error& e) {
-         snapset = std::nullopt;
-          osd->clog->error() << mode << " " << info.pgid << " " << soid
-               << " : can't decode '" << SS_ATTR << "' attr " << e.what();
-         ++scrubber.shallow_errors;
-         head_error.set_snapset_corrupted();
-        }
-      }
-
-      if (snapset) {
-       // what will be next?
-       curclone = snapset->clones.rbegin();
-
-       if (!snapset->clones.empty()) {
-         dout(20) << "  snapset " << *snapset << dendl;
-         if (snapset->seq == 0) {
-           osd->clog->error() << mode << " " << info.pgid << " " << soid
-                              << " : snaps.seq not set";
-           ++scrubber.shallow_errors;
-           head_error.set_snapset_error();
-          }
-       }
-      }
-    } else {
-      ceph_assert(soid.is_snap());
-      ceph_assert(head);
-      ceph_assert(snapset);
-      ceph_assert(soid.snap == *curclone);
-
-      dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
-
-      if (snapset->clone_size.count(soid.snap) == 0) {
-       osd->clog->error() << mode << " " << info.pgid << " " << soid
-                          << " : is missing in clone_size";
-       ++scrubber.shallow_errors;
-       soid_error.set_size_mismatch();
-      } else {
-        if (oi && oi->size != snapset->clone_size[soid.snap]) {
-         osd->clog->error() << mode << " " << info.pgid << " " << soid
-                            << " : size " << oi->size << " != clone_size "
-                            << snapset->clone_size[*curclone];
-         ++scrubber.shallow_errors;
-         soid_error.set_size_mismatch();
-        }
-
-        if (snapset->clone_overlap.count(soid.snap) == 0) {
-         osd->clog->error() << mode << " " << info.pgid << " " << soid
-                            << " : is missing in clone_overlap";
-         ++scrubber.shallow_errors;
-         soid_error.set_size_mismatch();
-       } else {
-         // This checking is based on get_clone_bytes().  The first 2 asserts
-         // can't happen because we know we have a clone_size and
-         // a clone_overlap.  Now we check that the interval_set won't
-         // cause the last assert.
-         uint64_t size = snapset->clone_size.find(soid.snap)->second;
-         const interval_set<uint64_t> &overlap =
-               snapset->clone_overlap.find(soid.snap)->second;
-         bool bad_interval_set = false;
-         for (interval_set<uint64_t>::const_iterator i = overlap.begin();
-              i != overlap.end(); ++i) {
-           if (size < i.get_len()) {
-             bad_interval_set = true;
-             break;
-           }
-           size -= i.get_len();
-         }
-
-         if (bad_interval_set) {
-           osd->clog->error() << mode << " " << info.pgid << " " << soid
-                              << " : bad interval_set in clone_overlap";
-           ++scrubber.shallow_errors;
-           soid_error.set_size_mismatch();
-         } else {
-            stat.num_bytes += snapset->get_clone_bytes(soid.snap);
-         }
-        }
-      }
-
-      // what's next?
-      ++curclone;
-      if (soid_error.errors) {
-        scrubber.store->add_snap_error(pool.id, soid_error);
-       ++soid_error_count;
-      }
-    }
-
-    scrub_cstat.add(stat);
-  }
-
-  if (doing_clones(snapset, curclone)) {
-    dout(10) << __func__ << " " << mode << " " << info.pgid
-            << " No more objects while processing " << *head << dendl;
-
-    missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
-                     pool.info.allow_incomplete_clones(), all_clones, &curclone,
-                     head_error);
-  }
-  // There could be missing found by the test above or even
-  // before dropping out of the loop for the last head.
-  if (missing) {
-    log_missing(missing, head, osd->clog, info.pgid, __func__,
-               mode, pool.info.allow_incomplete_clones());
-  }
-  if (head && (head_error.errors || soid_error_count))
-    scrubber.store->add_snap_error(pool.id, head_error);
-
-  for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
-    ceph_assert(!p->first.is_snapdir());
-    dout(10) << __func__ << " recording digests for " << p->first << dendl;
-    ObjectContextRef obc = get_object_context(p->first, false);
-    if (!obc) {
-      osd->clog->error() << info.pgid << " " << mode
-                        << " cannot get object context for object "
-                        << p->first;
-      continue;
-    } else if (obc->obs.oi.soid != p->first) {
-      osd->clog->error() << info.pgid << " " << mode
-                        << " " << p->first
-                        << " : object has a valid oi attr with a mismatched name, "
-                        << " obc->obs.oi.soid: " << obc->obs.oi.soid;
-      continue;
-    }
-    OpContextUPtr ctx = simple_opc_create(obc);
-    ctx->at_version = get_next_version();
-    ctx->mtime = utime_t();      // do not update mtime
-    if (p->second.first) {
-      ctx->new_obs.oi.set_data_digest(*p->second.first);
-    } else {
-      ctx->new_obs.oi.clear_data_digest();
-    }
-    if (p->second.second) {
-      ctx->new_obs.oi.set_omap_digest(*p->second.second);
-    } else {
-      ctx->new_obs.oi.clear_omap_digest();
-    }
-    finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
-
-    ctx->register_on_success(
-      [this]() {
-       dout(20) << "updating scrub digest" << dendl;
-       if (--scrubber.num_digest_updates_pending == 0) {
-         requeue_scrub();
-       }
-      });
-
-    simple_opc_submit(std::move(ctx));
-    ++scrubber.num_digest_updates_pending;
-  }
-
-  dout(10) << __func__ << " (" << mode << ") finish" << dendl;
-}
-
-void PrimaryLogPG::_scrub_clear_state()
-{
-  scrub_cstat = object_stat_collection_t();
-}
-
-void PrimaryLogPG::_scrub_finish()
-{
-  bool repair = state_test(PG_STATE_REPAIR);
-  bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
-  const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
-
-  if (info.stats.stats_invalid) {
-    recovery_state.update_stats(
-      [=](auto &history, auto &stats) {
-       stats.stats = scrub_cstat;
-       stats.stats_invalid = false;
-       return false;
-      });
-
-    if (agent_state)
-      agent_choose_mode();
-  }
-
-  dout(10) << mode << " got "
-          << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
-          << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
-          << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
-          << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
-          << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
-          << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
-          << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
-          << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
-          << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
-          << dendl;
-
-  if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
-      scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
-      (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
-       !info.stats.dirty_stats_invalid) ||
-      (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
-       !info.stats.omap_stats_invalid) ||
-      (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
-       !info.stats.pin_stats_invalid) ||
-      (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
-       !info.stats.hitset_stats_invalid) ||
-      (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
-       !info.stats.hitset_bytes_stats_invalid) ||
-      (scrub_cstat.sum.num_objects_manifest != info.stats.stats.sum.num_objects_manifest &&
-       !info.stats.manifest_stats_invalid) ||
-      scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
-      scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
-    osd->clog->error() << info.pgid << " " << mode
-                     << " : stat mismatch, got "
-                     << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
-                     << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
-                     << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
-                     << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
-                     << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
-                     << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
-                     << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
-                     << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
-                     << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
-                     << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
-    ++scrubber.shallow_errors;
-
-    if (repair) {
-      ++scrubber.fixed;
-      recovery_state.update_stats(
-       [this](auto &history, auto &stats) {
-         stats.stats = scrub_cstat;
-         stats.dirty_stats_invalid = false;
-         stats.omap_stats_invalid = false;
-         stats.hitset_stats_invalid = false;
-         stats.hitset_bytes_stats_invalid = false;
-         stats.pin_stats_invalid = false;
-         stats.manifest_stats_invalid = false;
-         return false;
-       });
-      publish_stats_to_osd();
-      recovery_state.share_pg_info();
-    }
-  }
-  // Clear object context cache to get repair information
-  if (repair)
-    object_contexts.clear();
-}
 
 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
 {
@@ -15491,6 +15037,13 @@ void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_t
   ldout(pg->cct, 20) << "exit " << state_name << dendl;
 }
 
+bool PrimaryLogPG::SnapTrimmer::permit_trim() {
+  return
+    pg->is_clean() &&
+    !pg->m_scrubber->is_scrub_active() &&
+    !pg->snap_trimq.empty();
+}
+
 /*---SnapTrimmer states---*/
 #undef dout_prefix
 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
index 211f8ede52277929503f2e19055cf266b78e3882..5a8c9f405d6cf1165f82d66f1ff0d48c049e896f 100644 (file)
@@ -58,6 +58,7 @@ struct inconsistent_snapset_wrapper;
 class PrimaryLogPG : public PG, public PGBackend::Listener {
   friend class OSD;
   friend class Watch;
+  friend class PrimaryLogScrub;
 
 public:
   MEMPOOL_CLASS_HELPERS();
@@ -367,8 +368,8 @@ public:
     return gen_prefix(out);
   }
 
-  const std::map<hobject_t, std::set<pg_shard_t>>
-    &get_missing_loc_shards() const override {
+  const HobjToShardSetMapping& get_missing_loc_shards() const override
+  {
     return recovery_state.get_missing_loc().get_missing_locs();
   }
   const std::map<pg_shard_t, pg_missing_t> &get_shard_missing() const override {
@@ -1391,14 +1392,6 @@ protected:
   // -- scrub --
   bool _range_available_for_scrub(
     const hobject_t &begin, const hobject_t &end) override;
-  void scrub_snapshot_metadata(
-    ScrubMap &map,
-    const std::map<hobject_t,
-                   std::pair<std::optional<uint32_t>,
-                        std::optional<uint32_t>>> &missing_digest) override;
-  void _scrub_clear_state() override;
-  void _scrub_finish() override;
-  object_stat_collection_t scrub_cstat;
 
   void _split_into(pg_t child_pgid, PG *child,
                    unsigned split_bits) override;
@@ -1554,22 +1547,6 @@ private:
     pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
     return  pri > 0 ? pri : cct->_conf->osd_recovery_op_priority;
   }
-  void log_missing(unsigned missing,
-                       const std::optional<hobject_t> &head,
-                       LogChannelRef clog,
-                       const spg_t &pgid,
-                       const char *func,
-                       const char *mode,
-                       bool allow_incomplete_clones);
-  unsigned process_clones_to(const std::optional<hobject_t> &head,
-    const std::optional<SnapSet> &snapset,
-    LogChannelRef clog,
-    const spg_t &pgid,
-    const char *mode,
-    bool allow_incomplete_clones,
-    std::optional<snapid_t> target,
-    std::vector<snapid_t>::reverse_iterator *curclone,
-    inconsistent_snapset_wrapper &snap_error);
 
 public:
   coll_t get_coll() {
@@ -1623,12 +1600,7 @@ private:
     explicit SnapTrimmer(PrimaryLogPG *pg) : pg(pg) {}
     void log_enter(const char *state_name);
     void log_exit(const char *state_name, utime_t duration);
-    bool permit_trim() {
-      return
-       pg->is_clean() &&
-       !pg->scrubber.active &&
-       !pg->snap_trimq.empty();
-    }
+    bool permit_trim();
     bool can_trim() {
       return
        permit_trim() &&
diff --git a/src/osd/PrimaryLogScrub.cc b/src/osd/PrimaryLogScrub.cc
new file mode 100644 (file)
index 0000000..6cafa25
--- /dev/null
@@ -0,0 +1,611 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PrimaryLogScrub.h"
+
+#include "common/scrub_types.h"
+
+#include "PeeringState.h"
+#include "PrimaryLogPG.h"
+#include "scrub_machine.h"
+
+#define dout_context (m_pg->cct)
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this->m_pg)
+
+template <class T> static ostream& _prefix(std::ostream* _dout, T* t)
+{
+  return t->gen_prefix(*_dout) << " PrimaryLog scrubber pg(" << t->pg_id << ") ";
+}
+
+using namespace Scrub;
+using Scrub::ScrubMachine;
+
+bool PrimaryLogScrub::get_store_errors(const scrub_ls_arg_t& arg,
+                                      scrub_ls_result_t& res_inout) const
+{
+  if (!m_store) {
+    return false;
+  }
+
+  if (arg.get_snapsets) {
+    res_inout.vals =
+      m_store->get_snap_errors(m_pg->get_pgid().pool(), arg.start_after, arg.max_return);
+  } else {
+    res_inout.vals = m_store->get_object_errors(m_pg->get_pgid().pool(), arg.start_after,
+                                               arg.max_return);
+  }
+  return true;
+}
+
+void PrimaryLogScrub::_scrub_finish()
+{
+  auto& info = m_pg->info;  ///< a temporary alias
+
+  dout(10) << __func__
+          << " info stats: " << (info.stats.stats_invalid ? "invalid" : "valid")
+          << dendl;
+
+  bool repair = state_test(PG_STATE_REPAIR);
+  bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
+  const char* mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
+
+  if (info.stats.stats_invalid) {
+    m_pl_pg->recovery_state.update_stats([=](auto& history, auto& stats) {
+      stats.stats = m_scrub_cstat;
+      stats.stats_invalid = false;
+      return false;
+    });
+
+    if (m_pl_pg->agent_state)
+      m_pl_pg->agent_choose_mode();
+  }
+
+  dout(10) << mode << " got " << m_scrub_cstat.sum.num_objects << "/"
+          << info.stats.stats.sum.num_objects << " objects, "
+          << m_scrub_cstat.sum.num_object_clones << "/"
+          << info.stats.stats.sum.num_object_clones << " clones, "
+          << m_scrub_cstat.sum.num_objects_dirty << "/"
+          << info.stats.stats.sum.num_objects_dirty << " dirty, "
+          << m_scrub_cstat.sum.num_objects_omap << "/"
+          << info.stats.stats.sum.num_objects_omap << " omap, "
+          << m_scrub_cstat.sum.num_objects_pinned << "/"
+          << info.stats.stats.sum.num_objects_pinned << " pinned, "
+          << m_scrub_cstat.sum.num_objects_hit_set_archive << "/"
+          << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
+          << m_scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes
+          << " bytes, " << m_scrub_cstat.sum.num_objects_manifest << "/"
+          << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
+          << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/"
+          << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
+          << dendl;
+
+  if (m_scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
+      m_scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
+      (m_scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
+       !info.stats.dirty_stats_invalid) ||
+      (m_scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
+       !info.stats.omap_stats_invalid) ||
+      (m_scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
+       !info.stats.pin_stats_invalid) ||
+      (m_scrub_cstat.sum.num_objects_hit_set_archive !=
+        info.stats.stats.sum.num_objects_hit_set_archive &&
+       !info.stats.hitset_stats_invalid) ||
+      (m_scrub_cstat.sum.num_bytes_hit_set_archive !=
+        info.stats.stats.sum.num_bytes_hit_set_archive &&
+       !info.stats.hitset_bytes_stats_invalid) ||
+      (m_scrub_cstat.sum.num_objects_manifest !=
+        info.stats.stats.sum.num_objects_manifest &&
+       !info.stats.manifest_stats_invalid) ||
+      m_scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
+      m_scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
+    m_osds->clog->error() << info.pgid << " " << mode << " : stat mismatch, got "
+                         << m_scrub_cstat.sum.num_objects << "/"
+                         << info.stats.stats.sum.num_objects << " objects, "
+                         << m_scrub_cstat.sum.num_object_clones << "/"
+                         << info.stats.stats.sum.num_object_clones << " clones, "
+                         << m_scrub_cstat.sum.num_objects_dirty << "/"
+                         << info.stats.stats.sum.num_objects_dirty << " dirty, "
+                         << m_scrub_cstat.sum.num_objects_omap << "/"
+                         << info.stats.stats.sum.num_objects_omap << " omap, "
+                         << m_scrub_cstat.sum.num_objects_pinned << "/"
+                         << info.stats.stats.sum.num_objects_pinned << " pinned, "
+                         << m_scrub_cstat.sum.num_objects_hit_set_archive << "/"
+                         << info.stats.stats.sum.num_objects_hit_set_archive
+                         << " hit_set_archive, " << m_scrub_cstat.sum.num_whiteouts
+                         << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
+                         << m_scrub_cstat.sum.num_bytes << "/"
+                         << info.stats.stats.sum.num_bytes << " bytes, "
+                         << m_scrub_cstat.sum.num_objects_manifest << "/"
+                         << info.stats.stats.sum.num_objects_manifest
+                         << " manifest objects, "
+                         << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/"
+                         << info.stats.stats.sum.num_bytes_hit_set_archive
+                         << " hit_set_archive bytes.";
+    ++m_shallow_errors;
+
+    if (repair) {
+      ++m_fixed_count;
+      m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) {
+       stats.stats = m_scrub_cstat;
+       stats.dirty_stats_invalid = false;
+       stats.omap_stats_invalid = false;
+       stats.hitset_stats_invalid = false;
+       stats.hitset_bytes_stats_invalid = false;
+       stats.pin_stats_invalid = false;
+       stats.manifest_stats_invalid = false;
+       return false;
+      });
+      m_pl_pg->publish_stats_to_osd();
+      m_pl_pg->recovery_state.share_pg_info();
+    }
+  }
+  // Clear object context cache to get repair information
+  if (repair)
+    m_pl_pg->object_contexts.clear();
+}
+
+static bool doing_clones(const std::optional<SnapSet>& snapset,
+                        const vector<snapid_t>::reverse_iterator& curclone)
+{
+  return snapset && curclone != snapset->clones.rend();
+}
+
+void PrimaryLogScrub::log_missing(int missing,
+                                 const std::optional<hobject_t>& head,
+                                 LogChannelRef clog,
+                                 const spg_t& pgid,
+                                 const char* func,
+                                 const char* mode,
+                                 bool allow_incomplete_clones)
+{
+  ceph_assert(head);
+  if (allow_incomplete_clones) {
+    dout(20) << func << " " << mode << " " << pgid << " " << *head << " skipped "
+            << missing << " clone(s) in cache tier" << dendl;
+  } else {
+    clog->info() << mode << " " << pgid << " " << *head << " : " << missing
+                << " missing clone(s)";
+  }
+}
+
+int PrimaryLogScrub::process_clones_to(const std::optional<hobject_t>& head,
+                                      const std::optional<SnapSet>& snapset,
+                                      LogChannelRef clog,
+                                      const spg_t& pgid,
+                                      const char* mode,
+                                      bool allow_incomplete_clones,
+                                      std::optional<snapid_t> target,
+                                      vector<snapid_t>::reverse_iterator* curclone,
+                                      inconsistent_snapset_wrapper& e)
+{
+  ceph_assert(head);
+  ceph_assert(snapset);
+  int missing_count = 0;
+
+  // NOTE: clones are in descending order, thus **curclone > target test here
+  hobject_t next_clone(*head);
+  while (doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
+
+    ++missing_count;
+    // it is okay to be missing one or more clones in a cache tier.
+    // skip higher-numbered clones in the list.
+    if (!allow_incomplete_clones) {
+      next_clone.snap = **curclone;
+      clog->error() << mode << " " << pgid << " " << *head << " : expected clone "
+                   << next_clone << " " << m_missing << " missing";
+      ++m_shallow_errors;
+      e.set_clone_missing(next_clone.snap);
+    }
+    // Clones are descending
+    ++(*curclone);
+  }
+  return missing_count;
+}
+
+/*
+ * Validate consistency of the object info and snap sets.
+ *
+ * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
+ * the comparison of the objects is against multiple snapset.clones. There are
+ * multiple clone lists and in between lists we expect head.
+ *
+ * Example
+ *
+ * objects              expected
+ * =======              =======
+ * obj1 snap 1          head, unexpected obj1 snap 1
+ * obj2 head            head, match
+ *              [SnapSet clones 6 4 2 1]
+ * obj2 snap 7          obj2 snap 6, unexpected obj2 snap 7
+ * obj2 snap 6          obj2 snap 6, match
+ * obj2 snap 4          obj2 snap 4, match
+ * obj3 head            obj2 snap 2 (expected), obj2 snap 1 (expected), match
+ *              [Snapset clones 3 1]
+ * obj3 snap 3          obj3 snap 3 match
+ * obj3 snap 1          obj3 snap 1 match
+ * obj4 head            head, match
+ *              [Snapset clones 4]
+ * EOL                  obj4 snap 4, (expected)
+ */
+void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap,
+                                             const missing_map_t& missing_digest)
+{
+  dout(10) << __func__ << " num stat obj " << m_pl_pg->info.stats.stats.sum.num_objects
+          << dendl;
+
+  auto& info = m_pl_pg->info;
+  const PGPool& pool = m_pl_pg->pool;
+  bool allow_incomplete_clones = pool.info.allow_incomplete_clones();
+
+  bool repair = state_test(PG_STATE_REPAIR);
+  bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
+  const char* mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
+
+  std::optional<snapid_t> all_clones;  // Unspecified snapid_t or std::nullopt
+
+  // traverse in reverse order.
+  std::optional<hobject_t> head;
+  std::optional<SnapSet> snapset;              // If initialized so will head (above)
+  vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
+  int missing = 0;
+  inconsistent_snapset_wrapper soid_error, head_error;
+  int soid_error_count = 0;
+
+  for (auto p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
+
+    const hobject_t& soid = p->first;
+    ceph_assert(!soid.is_snapdir());
+    soid_error = inconsistent_snapset_wrapper{soid};
+    object_stat_sum_t stat;
+    std::optional<object_info_t> oi;
+
+    stat.num_objects++;
+
+    if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
+      stat.num_objects_hit_set_archive++;
+
+    if (soid.is_snap()) {
+      // it's a clone
+      stat.num_object_clones++;
+    }
+
+    // basic checks.
+    if (p->second.attrs.count(OI_ATTR) == 0) {
+      oi = std::nullopt;
+      m_osds->clog->error() << mode << " " << info.pgid << " " << soid << " : no '"
+                           << OI_ATTR << "' attr";
+      ++m_shallow_errors;
+      soid_error.set_info_missing();
+    } else {
+      bufferlist bv;
+      bv.push_back(p->second.attrs[OI_ATTR]);
+      try {
+       oi = object_info_t();  // Initialize optional<> before decode into it
+       oi->decode(bv);
+      } catch (ceph::buffer::error& e) {
+       oi = std::nullopt;
+       m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+                             << " : can't decode '" << OI_ATTR << "' attr " << e.what();
+       ++m_shallow_errors;
+       soid_error.set_info_corrupted();
+       soid_error.set_info_missing();  // Not available too
+      }
+    }
+
+    if (oi) {
+      if (m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
+       m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+                             << " : on disk size (" << p->second.size
+                             << ") does not match object info size (" << oi->size
+                             << ") adjusted for ondisk to ("
+                             << m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) << ")";
+       soid_error.set_size_mismatch();
+       ++m_shallow_errors;
+      }
+
+      dout(20) << mode << "  " << soid << " " << *oi << dendl;
+
+      // A clone num_bytes will be added later when we have snapset
+      if (!soid.is_snap()) {
+       stat.num_bytes += oi->size;
+      }
+      if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
+       stat.num_bytes_hit_set_archive += oi->size;
+
+      if (oi->is_dirty())
+       ++stat.num_objects_dirty;
+      if (oi->is_whiteout())
+       ++stat.num_whiteouts;
+      if (oi->is_omap())
+       ++stat.num_objects_omap;
+      if (oi->is_cache_pinned())
+       ++stat.num_objects_pinned;
+      if (oi->has_manifest())
+       ++stat.num_objects_manifest;
+    }
+
+    // Check for any problems while processing clones
+    if (doing_clones(snapset, curclone)) {
+      std::optional<snapid_t> target;
+      // Expecting an object with snap for current head
+      if (soid.has_snapset() || soid.get_head() != head->get_head()) {
+
+       dout(10) << __func__ << " " << mode << " " << info.pgid << " new object " << soid
+                << " while processing " << *head << dendl;
+
+       target = all_clones;
+      } else {
+       ceph_assert(soid.is_snap());
+       target = soid.snap;
+      }
+
+      // Log any clones we were expecting to be there up to target
+      // This will set missing, but will be a no-op if snap.soid == *curclone.
+      missing +=
+       process_clones_to(head, snapset, m_osds->clog, info.pgid, mode,
+                         allow_incomplete_clones, target, &curclone, head_error);
+    }
+
+    bool expected;
+    // Check doing_clones() again in case we ran process_clones_to()
+    if (doing_clones(snapset, curclone)) {
+      // A head would have processed all clones above
+      // or all greater than *curclone.
+      ceph_assert(soid.is_snap() && *curclone <= soid.snap);
+
+      // After processing above clone snap should match the expected curclone
+      expected = (*curclone == soid.snap);
+    } else {
+      // If we aren't doing clones any longer, then expecting head
+      expected = soid.has_snapset();
+    }
+    if (!expected) {
+      // If we couldn't read the head's snapset, just ignore clones
+      if (head && !snapset) {
+       m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+                             << " : clone ignored due to missing snapset";
+      } else {
+       m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+                             << " : is an unexpected clone";
+      }
+      ++m_shallow_errors;
+      soid_error.set_headless();
+      m_store->add_snap_error(pool.id, soid_error);
+      ++soid_error_count;
+      if (head && soid.get_head() == head->get_head())
+       head_error.set_clone(soid.snap);
+      continue;
+    }
+
+    // new snapset?
+    if (soid.has_snapset()) {
+
+      if (missing) {
+       log_missing(missing, head, m_osds->clog, info.pgid, __func__, mode,
+                   pool.info.allow_incomplete_clones());
+      }
+
+      // Save previous head error information
+      if (head && (head_error.errors || soid_error_count))
+       m_store->add_snap_error(pool.id, head_error);
+      // Set this as a new head object
+      head = soid;
+      missing = 0;
+      head_error = soid_error;
+      soid_error_count = 0;
+
+      dout(20) << __func__ << " " << mode << " new head " << head << dendl;
+
+      if (p->second.attrs.count(SS_ATTR) == 0) {
+       m_osds->clog->error() << mode << " " << info.pgid << " " << soid << " : no '"
+                             << SS_ATTR << "' attr";
+       ++m_shallow_errors;
+       snapset = std::nullopt;
+       head_error.set_snapset_missing();
+      } else {
+       bufferlist bl;
+       bl.push_back(p->second.attrs[SS_ATTR]);
+       auto blp = bl.cbegin();
+       try {
+         snapset = SnapSet();  // Initialize optional<> before decoding into it
+         decode(*snapset, blp);
+         head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
+       } catch (ceph::buffer::error& e) {
+         snapset = std::nullopt;
+         m_osds->clog->error()
+           << mode << " " << info.pgid << " " << soid << " : can't decode '" << SS_ATTR
+           << "' attr " << e.what();
+         ++m_shallow_errors;
+         head_error.set_snapset_corrupted();
+       }
+      }
+
+      if (snapset) {
+       // what will be next?
+       curclone = snapset->clones.rbegin();
+
+       if (!snapset->clones.empty()) {
+         dout(20) << "  snapset " << *snapset << dendl;
+         if (snapset->seq == 0) {
+           m_osds->clog->error()
+             << mode << " " << info.pgid << " " << soid << " : snaps.seq not set";
+           ++m_shallow_errors;
+           head_error.set_snapset_error();
+         }
+       }
+      }
+    } else {
+      ceph_assert(soid.is_snap());
+      ceph_assert(head);
+      ceph_assert(snapset);
+      ceph_assert(soid.snap == *curclone);
+
+      dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
+
+      if (snapset->clone_size.count(soid.snap) == 0) {
+       m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+                             << " : is missing in clone_size";
+       ++m_shallow_errors;
+       soid_error.set_size_mismatch();
+      } else {
+       if (oi && oi->size != snapset->clone_size[soid.snap]) {
+         m_osds->clog->error()
+           << mode << " " << info.pgid << " " << soid << " : size " << oi->size
+           << " != clone_size " << snapset->clone_size[*curclone];
+         ++m_shallow_errors;
+         soid_error.set_size_mismatch();
+       }
+
+       if (snapset->clone_overlap.count(soid.snap) == 0) {
+         m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+                               << " : is missing in clone_overlap";
+         ++m_shallow_errors;
+         soid_error.set_size_mismatch();
+       } else {
+         // This checking is based on get_clone_bytes().  The first 2 asserts
+         // can't happen because we know we have a clone_size and
+         // a clone_overlap.  Now we check that the interval_set won't
+         // cause the last assert.
+         uint64_t size = snapset->clone_size.find(soid.snap)->second;
+         const interval_set<uint64_t>& overlap =
+           snapset->clone_overlap.find(soid.snap)->second;
+         bool bad_interval_set = false;
+         for (interval_set<uint64_t>::const_iterator i = overlap.begin();
+              i != overlap.end(); ++i) {
+           if (size < i.get_len()) {
+             bad_interval_set = true;
+             break;
+           }
+           size -= i.get_len();
+         }
+
+         if (bad_interval_set) {
+           m_osds->clog->error() << mode << " " << info.pgid << " " << soid
+                                 << " : bad interval_set in clone_overlap";
+           ++m_shallow_errors;
+           soid_error.set_size_mismatch();
+         } else {
+           stat.num_bytes += snapset->get_clone_bytes(soid.snap);
+         }
+       }
+      }
+
+      // what's next?
+      ++curclone;
+      if (soid_error.errors) {
+       m_store->add_snap_error(pool.id, soid_error);
+       ++soid_error_count;
+      }
+    }
+    m_scrub_cstat.add(stat);
+  }
+
+  if (doing_clones(snapset, curclone)) {
+    dout(10) << __func__ << " " << mode << " " << info.pgid
+            << " No more objects while processing " << *head << dendl;
+
+    missing +=
+      process_clones_to(head, snapset, m_osds->clog, info.pgid, mode,
+                       allow_incomplete_clones, all_clones, &curclone, head_error);
+  }
+
+  // There could be missing found by the test above or even
+  // before dropping out of the loop for the last head.
+  if (missing) {
+    log_missing(missing, head, m_osds->clog, info.pgid, __func__, mode,
+               allow_incomplete_clones);
+  }
+  if (head && (head_error.errors || soid_error_count))
+    m_store->add_snap_error(pool.id, head_error);
+
+  dout(20) << __func__ << " - " << missing << " (" << missing_digest.size() << ") missing"
+          << dendl;
+  for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
+
+    ceph_assert(!p->first.is_snapdir());
+    dout(10) << __func__ << " recording digests for " << p->first << dendl;
+
+    ObjectContextRef obc = m_pl_pg->get_object_context(p->first, false);
+    if (!obc) {
+      m_osds->clog->error() << info.pgid << " " << mode
+                           << " cannot get object context for object " << p->first;
+      continue;
+    }
+    if (obc->obs.oi.soid != p->first) {
+      m_osds->clog->error() << info.pgid << " " << mode << " " << p->first
+                           << " : object has a valid oi attr with a mismatched name, "
+                           << " obc->obs.oi.soid: " << obc->obs.oi.soid;
+      continue;
+    }
+    PrimaryLogPG::OpContextUPtr ctx = m_pl_pg->simple_opc_create(obc);
+    ctx->at_version = m_pl_pg->get_next_version();
+    ctx->mtime = utime_t();  // do not update mtime
+    if (p->second.first) {
+      ctx->new_obs.oi.set_data_digest(*p->second.first);
+    } else {
+      ctx->new_obs.oi.clear_data_digest();
+    }
+    if (p->second.second) {
+      ctx->new_obs.oi.set_omap_digest(*p->second.second);
+    } else {
+      ctx->new_obs.oi.clear_omap_digest();
+    }
+    m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
+
+    ctx->register_on_success([this]() {
+      dout(20) << "updating scrub digest " << num_digest_updates_pending << dendl;
+      if (--num_digest_updates_pending <= 0) {
+       m_osds->queue_scrub_digest_update(m_pl_pg, m_pl_pg->is_scrub_blocking_ops());
+      }
+    });
+
+    ++num_digest_updates_pending;
+    m_pl_pg->simple_opc_submit(std::move(ctx));
+  }
+
+  dout(10) << __func__ << " (" << mode << ") finish" << dendl;
+}
+
+PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg)
+    : PgScrubber{pg}, m_pl_pg{pg}
+{}
+
+void PrimaryLogScrub::_scrub_clear_state()
+{
+  dout(7) << __func__ << " - pg(" << m_pl_pg->pg_id << dendl;
+  m_scrub_cstat = object_stat_collection_t();
+}
+
+void PrimaryLogScrub::stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+                                              const hobject_t& soid)
+{
+  // We scrub objects in hobject_t order, so objects before m_start have already been
+  // scrubbed and their stats have already been added to the scrubber. Objects after that
+  // point haven't been included in the scrubber's stats accounting yet, so they will be
+  // included when the scrubber gets to that object.
+  dout(15) << __func__ << " soid: " << soid << " scrub is active? " << is_scrub_active()
+          << dendl;
+  if (is_primary() && is_scrub_active()) {
+    if (soid < m_start) {
+      dout(20) << __func__ << " " << soid << " < [" << m_start << "," << m_end << ")"
+              << dendl;
+      m_scrub_cstat.add(delta_stats);
+    } else {
+      dout(20) << __func__ << " " << soid << " >= [" << m_start << "," << m_end << ")"
+              << dendl;
+    }
+  }
+}
+
+bool PrimaryLogScrub::should_requeue_blocked_ops(eversion_t last_recovery_applied) const
+{
+  if (!is_scrub_active()) {
+    // just verify that things indeed are quiet
+    ceph_assert(m_start == m_end);
+    return false;
+  }
+
+  return last_recovery_applied >= m_subset_last_update;
+}
diff --git a/src/osd/PrimaryLogScrub.h b/src/osd/PrimaryLogScrub.h
new file mode 100644 (file)
index 0000000..d24fa09
--- /dev/null
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+// the './' includes are marked this way to affect clang-format
+#include "./pg_scrubber.h"
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "debug.h"
+
+#include "common/errno.h"
+#include "common/scrub_types.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrubReserve.h"
+
+#include "OSD.h"
+#include "scrub_machine.h"
+
+class PrimaryLogPG;
+
+/**
+ * The derivative of PgScrubber that is used by PrimaryLogPG.
+ */
+class PrimaryLogScrub : public PgScrubber {
+ public:
+  explicit PrimaryLogScrub(PrimaryLogPG* pg);
+
+  void _scrub_finish() final;
+
+  bool get_store_errors(const scrub_ls_arg_t& arg,
+                       scrub_ls_result_t& res_inout) const final;
+
+  /**
+   *  should we requeue blocked ops?
+   *  Yes - if our 'subset_last_applied' is less up-to-date than the
+   *  new recovery_state.get_last_update_applied().
+   *  (used by PrimaryLogPG::op_applied())
+   */
+  [[nodiscard]] bool should_requeue_blocked_ops(
+    eversion_t last_recovery_applied) const final;
+
+  void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+                               const hobject_t& soid) final;
+
+ private:
+  // we know our PG is actually a PrimaryLogPG. Let's alias the pointer to that object:
+  PrimaryLogPG* const m_pl_pg;
+
+  /**
+   * Validate consistency of the object info and snap sets.
+   */
+  void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) final;
+
+  void log_missing(int missing,
+                  const std::optional<hobject_t>& head,
+                  LogChannelRef clog,
+                  const spg_t& pgid,
+                  const char* func,
+                  const char* mode,
+                  bool allow_incomplete_clones);
+
+  int process_clones_to(const std::optional<hobject_t>& head,
+                       const std::optional<SnapSet>& snapset,
+                       LogChannelRef clog,
+                       const spg_t& pgid,
+                       const char* mode,
+                       bool allow_incomplete_clones,
+                       std::optional<snapid_t> target,
+                       std::vector<snapid_t>::reverse_iterator* curclone,
+                       inconsistent_snapset_wrapper& snap_error);
+
+
+  // handle our part in stats collection
+  object_stat_collection_t m_scrub_cstat;
+  void _scrub_clear_state() final;  // which just clears the stats
+};
index bcabad41f73e34a0c1870ffcdf410cb7ac1287a8..2afa689aed4b764ecd25857e41d3e80f126551c0 100644 (file)
@@ -325,7 +325,7 @@ bool PgScrubber::is_scrub_registered() const
 void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags)
 {
   if (!is_primary()) {
-    dout(20) << __func__ << ": not a primary!" << dendl;
+    // normal. No warning is required.
     return;
   }
 
@@ -375,18 +375,6 @@ void PgScrubber::unreg_next_scrub()
   }
 }
 
-/// debug/development temporary code:
-void PgScrubber::debug_dump_reservations(std::string_view header_txt) const
-{
-  std::string format;
-  auto f = Formatter::create(format, "json-pretty", "json-pretty");
-  m_osds->dump_scrub_reservations(f);
-  std::stringstream o;
-  f->flush(o);
-  dout(20) << header_txt << o.str() << dendl;
-  delete f;
-}
-
 void PgScrubber::scrub_requested(scrub_level_t scrub_level,
                                 scrub_type_t scrub_type,
                                 requested_scrub_t& req_flags)
@@ -396,8 +384,6 @@ void PgScrubber::scrub_requested(scrub_level_t scrub_level,
           << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered()
           << dendl;
 
-  debug_dump_reservations(" before_unreg ");
-
   unreg_next_scrub();
 
   req_flags.must_scrub = true;
@@ -409,24 +395,18 @@ void PgScrubber::scrub_requested(scrub_level_t scrub_level,
   req_flags.req_scrub = true;
 
   dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl;
-  debug_dump_reservations(" before_reg ");
 
   reg_next_scrub(req_flags);
-
-  debug_dump_reservations(" after_reg ");
 }
 
 void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags)
 {
   dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << " ## "
           << is_scrub_registered() << dendl;
-  debug_dump_reservations(" auto-scrub before ");
 
   unreg_next_scrub();
   req_flags.need_auto = true;
   reg_next_scrub(req_flags);
-
-  debug_dump_reservations(" auto-scrub after ");
 }
 
 bool PgScrubber::reserve_local()
@@ -469,7 +449,6 @@ void PgScrubber::set_subset_last_update(eversion_t e)
  * - setting tentative range based on conf and divisor
  * - requesting a partial list of elements from the backend;
  * - handling some head/clones issues
- * - ...
  *
  * The selected range is set directly into 'm_start' and 'm_end'
  */
@@ -500,8 +479,6 @@ bool PgScrubber::select_range()
   int max_idx = std::max<int64_t>(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max /
                                             preemption_data.chunk_divisor());
 
-  // why mixing 'int' and int64_t? RRR
-
   dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx
           << " Div: " << preemption_data.chunk_divisor() << dendl;
 
@@ -560,10 +537,9 @@ bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid)
     return false;
   }
 
-  dout(10) << __func__ << " " << soid << " can preempt? "
-          << preemption_data.is_preemptable() << dendl;
-  dout(10) << __func__ << " " << soid << " already? " << preemption_data.was_preempted()
-          << dendl;
+  dout(20) << __func__ << " " << soid << " can preempt? "
+          << preemption_data.is_preemptable() << " already preempted? "
+          << preemption_data.was_preempted() << dendl;
 
   if (preemption_data.is_preemptable()) {
 
@@ -1189,8 +1165,6 @@ void PgScrubber::scrub_compare_maps()
 
   if (m_pg->recovery_state.get_acting().size() > 1) {
 
-    // RRR add a comment here
-
     dout(10) << __func__ << "  comparing replica scrub maps" << dendl;
 
     // Map from object with errors to good peer
@@ -1395,8 +1369,7 @@ void PgScrubber::clear_scrub_reservations()
 
 void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text)
 {
-  ceph_assert(m_pg->recovery_state.get_backfill_targets()
-               .empty());  // RRR ask: (the code was copied as is) Why checking here?
+  ceph_assert(m_pg->recovery_state.get_backfill_targets().empty());
 
   std::vector<std::pair<int, Message*>> messages;
   messages.reserve(m_pg->get_actingset().size());
index 0a390ce18e504741d11070f010beb0f947b9dad2..e3936691578fcc416040f8ed2f59975b58cc0aca 100644 (file)
@@ -674,7 +674,4 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
   };
 
   preemption_data_t preemption_data;
-
-  // debug/development temporary code:
-  void debug_dump_reservations(std::string_view header_txt) const;
 };