set(osd_srcs
OSD.cc
- pg_scrubber.cc
- scrub_machine.cc
- PrimaryLogScrub.cc
Watch.cc
ClassHandler.cc
PG.cc
ECTransaction.cc
PGBackend.cc
OSDCap.cc
+ scrubber/pg_scrubber.cc
+ scrubber/PrimaryLogScrub.cc
+ scrubber/scrub_machine.cc
+ scrubber/ScrubStore.cc
Watch.cc
Session.cc
SnapMapper.cc
- ScrubStore.cc
osd_types.cc
ECUtil.cc
ExtentCache.cc
#endif
#include "osd/PG.h"
-#include "osd/scrub_machine.h"
-#include "osd/pg_scrubber.h"
+#include "osd/scrubber/scrub_machine.h"
+#include "osd/scrubber/pg_scrubber.h"
#include "include/types.h"
#include "include/compat.h"
#include "common/config.h"
#include "OSD.h"
#include "OpRequest.h"
-#include "ScrubStore.h"
-#include "pg_scrubber.h"
+#include "scrubber/ScrubStore.h"
+#include "scrubber/pg_scrubber.h"
#include "Session.h"
#include "osd/scheduler/OpSchedulerItem.h"
#include "common/errno.h"
#include "common/scrub_types.h"
#include "ReplicatedBackend.h"
-#include "ScrubStore.h"
+#include "scrubber/ScrubStore.h"
#include "ECBackend.h"
#include "PGBackend.h"
#include "OSD.h"
*
*/
-#include <errno.h>
-
-#include <charconv>
-#include <sstream>
-#include <utility>
+#include "PrimaryLogPG.h"
#include <boost/intrusive_ptr.hpp>
-#include <boost/tuple/tuple.hpp>
-
-#include "PG.h"
-#include "pg_scrubber.h"
-#include "PrimaryLogPG.h"
-#include "OSD.h"
-#include "PrimaryLogScrub.h"
-#include "OpRequest.h"
-#include "ScrubStore.h"
-#include "Session.h"
-#include "objclass/objclass.h"
-#include "osd/ClassHandler.h"
#include "cls/cas/cls_cas_ops.h"
+#include "common/EventTrace.h"
#include "common/ceph_crypto.h"
+#include "common/CDC.h"
#include "common/config.h"
#include "common/errno.h"
-#include "common/scrub_types.h"
-#include "common/perf_counters.h"
-#include "common/CDC.h"
#include "common/EventTrace.h"
-
-#include "messages/MOSDOp.h"
+#include "common/perf_counters.h"
+#include "common/scrub_types.h"
+#include "include/compat.h"
+#include "messages/MCommandReply.h"
#include "messages/MOSDBackoff.h"
-#include "messages/MOSDPGTrim.h"
-#include "messages/MOSDPGScan.h"
-#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDOp.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGScan.h"
+#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
-#include "messages/MCommandReply.h"
+#include "messages/MOSDRepScrub.h"
#include "messages/MOSDScrubReserve.h"
-
-#include "include/compat.h"
#include "mon/MonClient.h"
+#include "objclass/objclass.h"
+#include "osd/ClassHandler.h"
+#include "osd/OpRequest.h"
+#include "osd/Session.h"
#include "osdc/Objecter.h"
+#include "scrubber/PrimaryLogScrub.h"
+
+// the following includes must be kept in this exact order:
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/ceph_assert.h" // json_spirit clobbers it
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "PrimaryLogScrub.h"
-
-#include "common/scrub_types.h"
-#include "osd/osd_types_fmt.h"
-
-#include "PeeringState.h"
-#include "PrimaryLogPG.h"
-#include "scrub_machine.h"
-
-#define dout_context (m_pg->get_cct())
-#define dout_subsys ceph_subsys_osd
-#undef dout_prefix
-#define dout_prefix _prefix(_dout, this->m_pg)
-
-using std::vector;
-
-// Log-prefix helper referenced by the dout_prefix macro above: writes the
-// object's generic prefix, then the "PrimaryLog scrubber" tag with its pg id.
-template <class T> static ostream& _prefix(std::ostream* _dout, T* t)
-{
-  auto& out = t->gen_prefix(*_dout);
-  out << " PrimaryLog scrubber pg(" << t->pg_id << ") ";
-  return out;
-}
-
-using namespace Scrub;
-using Scrub::ScrubMachine;
-
-// Fetch stored scrub errors into res_inout.vals.
-// Returns false (leaving res_inout untouched) when there is no store;
-// otherwise fills res_inout.vals with either the snapset errors or the
-// object errors, as selected by arg.get_snapsets, and returns true.
-bool PrimaryLogScrub::get_store_errors(const scrub_ls_arg_t& arg,
-                                       scrub_ls_result_t& res_inout) const
-{
-  if (!m_store) {
-    return false;
-  }
-
-  const auto pool_id = m_pg->get_pgid().pool();
-  res_inout.vals =
-    arg.get_snapsets
-      ? m_store->get_snap_errors(pool_id, arg.start_after, arg.max_return)
-      : m_store->get_object_errors(pool_id, arg.start_after, arg.max_return);
-  return true;
-}
-
-/**
- * Finish the statistics part of a scrub.
- *
- * - If the PG's recorded stats are flagged invalid, replace them with the
- *   stats collected during this scrub (m_scrub_cstat).
- * - Compare the collected stats against the recorded ones; on a mismatch
- *   report it to the cluster log and count a shallow error. When repairing,
- *   the recorded stats are overwritten and the per-category "invalid" flags
- *   are cleared, then the stats are published and the pg info is shared.
- * - When repairing, the object context cache is cleared at the end.
- */
-void PrimaryLogScrub::_scrub_finish()
-{
-  auto& info = m_pg->get_pg_info(ScrubberPasskey{});  ///< a temporary alias
-
-  dout(10) << __func__
-           << " info stats: " << (info.stats.stats_invalid ? "invalid" : "valid")
-           << dendl;
-
-  if (info.stats.stats_invalid) {
-    // 'this' is captured explicitly: relying on [=] to capture it implicitly
-    // is deprecated in C++20, and the repair lambda below already uses [this].
-    m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) {
-      stats.stats = m_scrub_cstat;
-      stats.stats_invalid = false;
-      return false;
-    });
-
-    if (m_pl_pg->agent_state)
-      m_pl_pg->agent_choose_mode();
-  }
-
-  // 'scrubbed': counted during this scrub. 'recorded': held in the PG info.
-  // Both are references, so they always reflect the current values.
-  const auto& scrubbed = m_scrub_cstat.sum;
-  const auto& recorded = info.stats.stats.sum;
-
-  dout(10) << m_mode_desc << " got " << scrubbed.num_objects << "/"
-           << recorded.num_objects << " objects, "
-           << scrubbed.num_object_clones << "/"
-           << recorded.num_object_clones << " clones, "
-           << scrubbed.num_objects_dirty << "/"
-           << recorded.num_objects_dirty << " dirty, "
-           << scrubbed.num_objects_omap << "/"
-           << recorded.num_objects_omap << " omap, "
-           << scrubbed.num_objects_pinned << "/"
-           << recorded.num_objects_pinned << " pinned, "
-           << scrubbed.num_objects_hit_set_archive << "/"
-           << recorded.num_objects_hit_set_archive << " hit_set_archive, "
-           << scrubbed.num_bytes << "/" << recorded.num_bytes
-           << " bytes, " << scrubbed.num_objects_manifest << "/"
-           << recorded.num_objects_manifest << " manifest objects, "
-           << scrubbed.num_bytes_hit_set_archive << "/"
-           << recorded.num_bytes_hit_set_archive << " hit_set_archive bytes."
-           << dendl;
-
-  // Counters that have a matching "*_stats_invalid" flag are only compared
-  // when that flag is not set.
-  if (scrubbed.num_objects != recorded.num_objects ||
-      scrubbed.num_object_clones != recorded.num_object_clones ||
-      (scrubbed.num_objects_dirty != recorded.num_objects_dirty &&
-       !info.stats.dirty_stats_invalid) ||
-      (scrubbed.num_objects_omap != recorded.num_objects_omap &&
-       !info.stats.omap_stats_invalid) ||
-      (scrubbed.num_objects_pinned != recorded.num_objects_pinned &&
-       !info.stats.pin_stats_invalid) ||
-      (scrubbed.num_objects_hit_set_archive !=
-         recorded.num_objects_hit_set_archive &&
-       !info.stats.hitset_stats_invalid) ||
-      (scrubbed.num_bytes_hit_set_archive !=
-         recorded.num_bytes_hit_set_archive &&
-       !info.stats.hitset_bytes_stats_invalid) ||
-      (scrubbed.num_objects_manifest != recorded.num_objects_manifest &&
-       !info.stats.manifest_stats_invalid) ||
-      scrubbed.num_whiteouts != recorded.num_whiteouts ||
-      scrubbed.num_bytes != recorded.num_bytes) {
-    m_osds->clog->error() << info.pgid << " " << m_mode_desc << " : stat mismatch, got "
-                          << scrubbed.num_objects << "/"
-                          << recorded.num_objects << " objects, "
-                          << scrubbed.num_object_clones << "/"
-                          << recorded.num_object_clones << " clones, "
-                          << scrubbed.num_objects_dirty << "/"
-                          << recorded.num_objects_dirty << " dirty, "
-                          << scrubbed.num_objects_omap << "/"
-                          << recorded.num_objects_omap << " omap, "
-                          << scrubbed.num_objects_pinned << "/"
-                          << recorded.num_objects_pinned << " pinned, "
-                          << scrubbed.num_objects_hit_set_archive << "/"
-                          << recorded.num_objects_hit_set_archive
-                          << " hit_set_archive, " << scrubbed.num_whiteouts
-                          << "/" << recorded.num_whiteouts << " whiteouts, "
-                          << scrubbed.num_bytes << "/"
-                          << recorded.num_bytes << " bytes, "
-                          << scrubbed.num_objects_manifest << "/"
-                          << recorded.num_objects_manifest
-                          << " manifest objects, "
-                          << scrubbed.num_bytes_hit_set_archive << "/"
-                          << recorded.num_bytes_hit_set_archive
-                          << " hit_set_archive bytes.";
-    ++m_shallow_errors;
-
-    if (m_is_repair) {
-      ++m_fixed_count;
-      m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) {
-        stats.stats = m_scrub_cstat;
-        stats.dirty_stats_invalid = false;
-        stats.omap_stats_invalid = false;
-        stats.hitset_stats_invalid = false;
-        stats.hitset_bytes_stats_invalid = false;
-        stats.pin_stats_invalid = false;
-        stats.manifest_stats_invalid = false;
-        return false;
-      });
-      m_pl_pg->publish_stats_to_osd();
-      m_pl_pg->recovery_state.share_pg_info();
-    }
-  }
-  // Clear object context cache to get repair information
-  if (m_is_repair)
-    m_pl_pg->object_contexts.clear();
-}
-
-// True while a snapset is available and 'curclone' has not yet reached the
-// reverse-end of that snapset's clone list.
-static bool doing_clones(const std::optional<SnapSet>& snapset,
-                         const vector<snapid_t>::reverse_iterator& curclone)
-{
-  if (!snapset) {
-    return false;
-  }
-  return curclone != snapset->clones.rend();
-}
-
-// Report 'missing' absent clones of '*head' (which must be set).
-// When incomplete clones are allowed the event only goes to the debug log;
-// otherwise an informational entry is written to the cluster log.
-void PrimaryLogScrub::log_missing(int missing,
-                                  const std::optional<hobject_t>& head,
-                                  LogChannelRef clog,
-                                  const spg_t& pgid,
-                                  const char* func,
-                                  bool allow_incomplete_clones)
-{
-  ceph_assert(head);
-
-  if (!allow_incomplete_clones) {
-    clog->info() << m_mode_desc << " " << pgid << " " << *head << " : " << missing
-                 << " missing clone(s)";
-    return;
-  }
-
-  dout(20) << func << " " << m_mode_desc << " " << pgid << " " << *head << " skipped "
-           << missing << " clone(s) in cache tier" << dendl;
-}
-
-// Advance '*curclone' past every expected clone that is newer than 'target'
-// (or past all remaining clones when 'target' is empty), and return how many
-// clones were skipped. Unless incomplete clones are allowed, each skipped
-// clone is reported to the cluster log, counted as a shallow error and
-// recorded in 'e'. Both 'head' and 'snapset' must be set.
-int PrimaryLogScrub::process_clones_to(const std::optional<hobject_t>& head,
-                                       const std::optional<SnapSet>& snapset,
-                                       LogChannelRef clog,
-                                       const spg_t& pgid,
-                                       bool allow_incomplete_clones,
-                                       std::optional<snapid_t> target,
-                                       vector<snapid_t>::reverse_iterator* curclone,
-                                       inconsistent_snapset_wrapper& e)
-{
-  ceph_assert(head);
-  ceph_assert(snapset);
-
-  int skipped = 0;
-  hobject_t expected_clone(*head);
-
-  // The clone list is walked in descending snap order, hence the '>' test
-  // against the target. The iterator is advanced by the loop header.
-  for (; doing_clones(snapset, *curclone) && (!target || **curclone > *target);
-       ++(*curclone)) {
-
-    ++skipped;
-
-    // A cache tier may legitimately lack clones: nothing to report then.
-    if (allow_incomplete_clones) {
-      continue;
-    }
-
-    expected_clone.snap = **curclone;
-    // NOTE(review): 'm_missing' is a member not visible in this view -
-    // confirm it is the value intended to be printed here.
-    clog->error() << m_mode_desc << " " << pgid << " " << *head << " : expected clone "
-                  << expected_clone << " " << m_missing << " missing";
-    ++m_shallow_errors;
-    e.set_clone_missing(expected_clone.snap);
-  }
-  return skipped;
-}
-
-/*
- * Validate consistency of the object info and snap sets.
- *
- * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
- * the comparison of the objects is against multiple snapset.clones. There are
- * multiple clone lists and in between lists we expect head.
- *
- * Example
- *
- * objects expected
- * ======= =======
- * obj1 snap 1 head, unexpected obj1 snap 1
- * obj2 head head, match
- * [SnapSet clones 6 4 2 1]
- * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
- * obj2 snap 6 obj2 snap 6, match
- * obj2 snap 4 obj2 snap 4, match
- * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
- * [Snapset clones 3 1]
- * obj3 snap 3 obj3 snap 3 match
- * obj3 snap 1 obj3 snap 1 match
- * obj4 head head, match
- * [Snapset clones 4]
- * EOL obj4 snap 4, (expected)
- *
- * Besides the snapset checks, the loop accumulates per-object statistics
- * into m_scrub_cstat (only for objects that were expected), and records the
- * detected snapset inconsistencies in m_store.
- *
- * After the loop, for every entry of 'missing_digest' an update is submitted
- * that sets - or clears, when the corresponding value is empty - the
- * object's data digest (entry value '.first') and omap digest ('.second').
- *
- * scrubmap: the scrub map whose 'objects' are traversed in reverse order.
- * missing_digest: objects whose digests are to be (re)recorded.
- */
-void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap,
- const missing_map_t& missing_digest)
-{
- dout(10) << __func__ << " num stat obj " << m_pl_pg->info.stats.stats.sum.num_objects
- << dendl;
-
- auto& info = m_pl_pg->info;
- const PGPool& pool = m_pl_pg->pool;
- bool allow_incomplete_clones = pool.info.allow_incomplete_clones();
-
- std::optional<snapid_t> all_clones; // Unspecified snapid_t or std::nullopt
-
- // traverse in reverse order.
- std::optional<hobject_t> head;
- std::optional<SnapSet> snapset; // If initialized so will head (above)
- vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
- int missing = 0;
- inconsistent_snapset_wrapper soid_error, head_error;
- int soid_error_count = 0;
-
- for (auto p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
-
- const hobject_t& soid = p->first;
- ceph_assert(!soid.is_snapdir());
- soid_error = inconsistent_snapset_wrapper{soid}; // fresh error record for this object
- object_stat_sum_t stat; // this object's contribution to the scrub stats
- std::optional<object_info_t> oi;
-
- stat.num_objects++;
-
- if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
- stat.num_objects_hit_set_archive++;
-
- if (soid.is_snap()) {
- // it's a clone
- stat.num_object_clones++;
- }
-
- // basic checks.
- if (p->second.attrs.count(OI_ATTR) == 0) {
- oi = std::nullopt;
- m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
- << OI_ATTR << "' attr";
- ++m_shallow_errors;
- soid_error.set_info_missing();
- } else {
- bufferlist bv;
- bv.push_back(p->second.attrs[OI_ATTR]);
- try {
- oi = object_info_t(bv);
- } catch (ceph::buffer::error& e) {
- oi = std::nullopt;
- m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
- << " : can't decode '" << OI_ATTR << "' attr " << e.what();
- ++m_shallow_errors;
- soid_error.set_info_corrupted();
- soid_error.set_info_missing(); // Not available too
- }
- }
-
- if (oi) {
- if (m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
- m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
- << " : on disk size (" << p->second.size
- << ") does not match object info size (" << oi->size
- << ") adjusted for ondisk to ("
- << m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) << ")";
- soid_error.set_size_mismatch();
- ++m_shallow_errors;
- }
-
- dout(20) << m_mode_desc << " " << soid << " " << *oi << dendl;
-
- // A clone num_bytes will be added later when we have snapset
- if (!soid.is_snap()) {
- stat.num_bytes += oi->size;
- }
- if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
- stat.num_bytes_hit_set_archive += oi->size;
-
- if (oi->is_dirty())
- ++stat.num_objects_dirty;
- if (oi->is_whiteout())
- ++stat.num_whiteouts;
- if (oi->is_omap())
- ++stat.num_objects_omap;
- if (oi->is_cache_pinned())
- ++stat.num_objects_pinned;
- if (oi->has_manifest())
- ++stat.num_objects_manifest;
- }
-
- // Check for any problems while processing clones
- if (doing_clones(snapset, curclone)) {
- std::optional<snapid_t> target;
- // Expecting an object with snap for current head
- if (soid.has_snapset() || soid.get_head() != head->get_head()) {
-
- dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid << " new object " << soid
- << " while processing " << *head << dendl;
-
- target = all_clones;
- } else {
- ceph_assert(soid.is_snap());
- target = soid.snap;
- }
-
- // Log any clones we were expecting to be there up to target
- // This will set missing, but will be a no-op if snap.soid == *curclone.
- missing +=
- process_clones_to(head, snapset, m_osds->clog, info.pgid,
- allow_incomplete_clones, target, &curclone, head_error);
- }
-
- bool expected;
- // Check doing_clones() again in case we ran process_clones_to()
- if (doing_clones(snapset, curclone)) {
- // A head would have processed all clones above
- // or all greater than *curclone.
- ceph_assert(soid.is_snap() && *curclone <= soid.snap);
-
- // After processing above clone snap should match the expected curclone
- expected = (*curclone == soid.snap);
- } else {
- // If we aren't doing clones any longer, then expecting head
- expected = soid.has_snapset();
- }
- if (!expected) {
- // If we couldn't read the head's snapset, just ignore clones
- if (head && !snapset) {
- m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
- << " : clone ignored due to missing snapset";
- } else {
- m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
- << " : is an unexpected clone";
- }
- ++m_shallow_errors;
- soid_error.set_headless();
- m_store->add_snap_error(pool.id, soid_error);
- ++soid_error_count;
- if (head && soid.get_head() == head->get_head())
- head_error.set_clone(soid.snap);
- continue; // note: skips the m_scrub_cstat.add(stat) at the end of the loop body
- }
-
- // new snapset?
- if (soid.has_snapset()) {
-
- if (missing) {
- log_missing(missing, head, m_osds->clog, info.pgid, __func__,
- pool.info.allow_incomplete_clones()); // same call that initialized the local allow_incomplete_clones
- }
-
- // Save previous head error information
- if (head && (head_error.errors || soid_error_count))
- m_store->add_snap_error(pool.id, head_error);
- // Set this as a new head object
- head = soid;
- missing = 0;
- head_error = soid_error;
- soid_error_count = 0;
-
- dout(20) << __func__ << " " << m_mode_desc << " new head " << head << dendl;
-
- if (p->second.attrs.count(SS_ATTR) == 0) {
- m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
- << SS_ATTR << "' attr";
- ++m_shallow_errors;
- snapset = std::nullopt;
- head_error.set_snapset_missing();
- } else {
- bufferlist bl;
- bl.push_back(p->second.attrs[SS_ATTR]);
- auto blp = bl.cbegin();
- try {
- snapset = SnapSet(); // Initialize optional<> before decoding into it
- decode(*snapset, blp);
- head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
- } catch (ceph::buffer::error& e) {
- snapset = std::nullopt;
- m_osds->clog->error()
- << m_mode_desc << " " << info.pgid << " " << soid << " : can't decode '" << SS_ATTR
- << "' attr " << e.what();
- ++m_shallow_errors;
- head_error.set_snapset_corrupted();
- }
- }
-
- if (snapset) {
- // what will be next?
- curclone = snapset->clones.rbegin();
-
- if (!snapset->clones.empty()) {
- dout(20) << " snapset " << *snapset << dendl;
- if (snapset->seq == 0) {
- m_osds->clog->error()
- << m_mode_desc << " " << info.pgid << " " << soid << " : snaps.seq not set";
- ++m_shallow_errors;
- head_error.set_snapset_error();
- }
- }
- }
- } else {
- ceph_assert(soid.is_snap());
- ceph_assert(head);
- ceph_assert(snapset);
- ceph_assert(soid.snap == *curclone);
-
- dout(20) << __func__ << " " << m_mode_desc << " matched clone " << soid << dendl;
-
- if (snapset->clone_size.count(soid.snap) == 0) {
- m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
- << " : is missing in clone_size";
- ++m_shallow_errors;
- soid_error.set_size_mismatch();
- } else {
- if (oi && oi->size != snapset->clone_size[soid.snap]) {
- m_osds->clog->error()
- << m_mode_desc << " " << info.pgid << " " << soid << " : size " << oi->size
- << " != clone_size " << snapset->clone_size[*curclone]; // *curclone == soid.snap (asserted above)
- ++m_shallow_errors;
- soid_error.set_size_mismatch();
- }
-
- if (snapset->clone_overlap.count(soid.snap) == 0) {
- m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
- << " : is missing in clone_overlap";
- ++m_shallow_errors;
- soid_error.set_size_mismatch();
- } else {
- // This checking is based on get_clone_bytes(). The first 2 asserts
- // can't happen because we know we have a clone_size and
- // a clone_overlap. Now we check that the interval_set won't
- // cause the last assert.
- uint64_t size = snapset->clone_size.find(soid.snap)->second;
- const interval_set<uint64_t>& overlap =
- snapset->clone_overlap.find(soid.snap)->second;
- bool bad_interval_set = false;
- for (interval_set<uint64_t>::const_iterator i = overlap.begin();
- i != overlap.end(); ++i) {
- if (size < i.get_len()) {
- bad_interval_set = true;
- break;
- }
- size -= i.get_len();
- }
-
- if (bad_interval_set) {
- m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
- << " : bad interval_set in clone_overlap";
- ++m_shallow_errors;
- soid_error.set_size_mismatch();
- } else {
- stat.num_bytes += snapset->get_clone_bytes(soid.snap);
- }
- }
- }
-
- // what's next?
- ++curclone;
- if (soid_error.errors) {
- m_store->add_snap_error(pool.id, soid_error);
- ++soid_error_count;
- }
- }
- m_scrub_cstat.add(stat); // reached only for expected objects (see the 'continue' above)
- }
-
- if (doing_clones(snapset, curclone)) {
- dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid
- << " No more objects while processing " << *head << dendl;
-
- missing +=
- process_clones_to(head, snapset, m_osds->clog, info.pgid,
- allow_incomplete_clones, all_clones, &curclone, head_error);
- }
-
- // There could be missing found by the test above or even
- // before dropping out of the loop for the last head.
- if (missing) {
- log_missing(missing, head, m_osds->clog, info.pgid, __func__,
- allow_incomplete_clones);
- }
- if (head && (head_error.errors || soid_error_count))
- m_store->add_snap_error(pool.id, head_error);
-
- dout(20) << __func__ << " - " << missing << " (" << missing_digest.size() << ") missing"
- << dendl;
- for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
-
- ceph_assert(!p->first.is_snapdir());
- dout(10) << __func__ << " recording digests for " << p->first << dendl;
-
- ObjectContextRef obc = m_pl_pg->get_object_context(p->first, false);
- if (!obc) {
- m_osds->clog->error() << info.pgid << " " << m_mode_desc
- << " cannot get object context for object " << p->first;
- continue;
- }
- if (obc->obs.oi.soid != p->first) {
- m_osds->clog->error() << info.pgid << " " << m_mode_desc << " " << p->first
- << " : object has a valid oi attr with a mismatched name, "
- << " obc->obs.oi.soid: " << obc->obs.oi.soid;
- continue;
- }
- PrimaryLogPG::OpContextUPtr ctx = m_pl_pg->simple_opc_create(obc);
- ctx->at_version = m_pl_pg->get_next_version();
- ctx->mtime = utime_t(); // do not update mtime
- if (p->second.first) {
- ctx->new_obs.oi.set_data_digest(*p->second.first);
- } else {
- ctx->new_obs.oi.clear_data_digest();
- }
- if (p->second.second) {
- ctx->new_obs.oi.set_omap_digest(*p->second.second);
- } else {
- ctx->new_obs.oi.clear_omap_digest();
- }
- m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
-
- ++num_digest_updates_pending; // decremented by the on-success callback registered below
- ctx->register_on_success([this]() {
- dout(20) << "updating scrub digest " << num_digest_updates_pending << dendl;
- if (--num_digest_updates_pending <= 0) {
- m_osds->queue_scrub_digest_update(m_pl_pg, m_pl_pg->is_scrub_blocking_ops());
- }
- });
-
- m_pl_pg->simple_opc_submit(std::move(ctx));
- }
-
- dout(10) << __func__ << " (" << m_mode_desc << ") finish" << dendl;
-}
-
-// Hands the PG to the PgScrubber base and keeps the same pointer, typed as
-// PrimaryLogPG, in m_pl_pg.
-PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg)
-    : PgScrubber{pg}
-    , m_pl_pg{pg}
-{}
-
-// Resets the statistics collected by this scrubber to a value-initialized
-// collection.
-void PrimaryLogScrub::_scrub_clear_state()
-{
-  dout(15) << __func__ << dendl;
-
-  const auto cleared = object_stat_collection_t();
-  m_scrub_cstat = cleared;
-}
-
-// Account for a stats change of object 'soid' while a scrub is active on the
-// primary.
-//
-// Objects are scrubbed in hobject_t order. Anything ordered before m_start
-// has already been scrubbed and counted, so its delta must be folded into
-// the scrubber's stats here. Objects at or after m_start are not yet part of
-// the accounting and will be counted when the scrubber reaches them.
-void PrimaryLogScrub::stats_of_handled_objects(const object_stat_sum_t& delta_stats,
-                                               const hobject_t& soid)
-{
-  if (!is_primary() || !is_scrub_active()) {
-    return;
-  }
-
-  if (soid < m_start) {
-    dout(20) << fmt::format("{} {} < [{},{})", __func__, soid, m_start, m_end) << dendl;
-    m_scrub_cstat.add(delta_stats);
-    return;
-  }
-
-  dout(25) << fmt::format("{} {} >= [{},{})", __func__, soid, m_start, m_end) << dendl;
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#pragma once
-
-// the './' includes are marked this way to affect clang-format
-#include "./pg_scrubber.h"
-
-#include <iostream>
-#include <sstream>
-#include <vector>
-
-#include "debug.h"
-
-#include "common/errno.h"
-#include "common/scrub_types.h"
-#include "messages/MOSDOp.h"
-#include "messages/MOSDRepScrub.h"
-#include "messages/MOSDRepScrubMap.h"
-#include "messages/MOSDScrub.h"
-#include "messages/MOSDScrubReserve.h"
-
-#include "OSD.h"
-#include "scrub_machine.h"
-
-class PrimaryLogPG;
-
-/**
- * The PgScrubber specialization used by PrimaryLogPG.
- *
- * It adds the snapshot-metadata validation, access to the stored scrub
- * errors, and the collection of object statistics during the scrub.
- */
-class PrimaryLogScrub : public PgScrubber {
- public:
-  explicit PrimaryLogScrub(PrimaryLogPG* pg);
-
-  /// fetch the stored snapset or object errors (as selected by 'arg')
-  bool get_store_errors(const scrub_ls_arg_t& arg,
-                        scrub_ls_result_t& res_inout) const final;
-
-  /// fold the stats delta of an already-scrubbed object into our stats
-  void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
-                                const hobject_t& soid) final;
-
-  /// compare the collected stats with the PG's recorded stats
-  void _scrub_finish() final;
-
- private:
-  /**
-   * Validate consistency of the object info and snap sets.
-   */
-  void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) final;
-
-  /// report 'missing' absent clones of '*head'
-  void log_missing(int missing,
-                   const std::optional<hobject_t>& head,
-                   LogChannelRef clog,
-                   const spg_t& pgid,
-                   const char* func,
-                   bool allow_incomplete_clones);
-
-  /// advance '*curclone' up to 'target', returning the number of clones skipped
-  int process_clones_to(const std::optional<hobject_t>& head,
-                        const std::optional<SnapSet>& snapset,
-                        LogChannelRef clog,
-                        const spg_t& pgid,
-                        bool allow_incomplete_clones,
-                        std::optional<snapid_t> target,
-                        std::vector<snapid_t>::reverse_iterator* curclone,
-                        inconsistent_snapset_wrapper& snap_error);
-
-  /// our part of the state reset: just clears the collected stats
-  void _scrub_clear_state() final;
-
-  // Our PG is known to be a PrimaryLogPG; this is the same pointer the base
-  // class holds, typed accordingly.
-  PrimaryLogPG* const m_pl_pg;
-
-  // the object statistics collected during the scrub
-  object_stat_collection_t m_scrub_cstat;
-};
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "ScrubStore.h"
-#include "osd_types.h"
-#include "common/scrub_types.h"
-#include "include/rados/rados_types.hpp"
-
-using std::ostringstream;
-using std::string;
-using std::vector;
-
-using ceph::bufferlist;
-
-namespace {
-
-// Key prefixes of the two families of entries kept in the scrub store.
-constexpr const char* object_key_prefix = "SCRUB_OBJ_";
-constexpr const char* snap_key_prefix = "SCRUB_SS_";
-
-// The (temp) object that holds the scrub results of 'pgid'.
-ghobject_t make_scrub_object(const spg_t& pgid)
-{
-  ostringstream ss;
-  ss << "scrub_" << pgid;
-  return pgid.make_temp_ghobject(ss.str());
-}
-
-// Shared tail of all key builders: refresh the hash cache of 'hoid' and
-// return its string form, preceded by 'prefix'.
-string scrub_key(const char* prefix, hobject_t hoid)
-{
-  hoid.build_hash_cache();
-  return prefix + hoid.to_str();
-}
-
-// A key of an otherwise empty hobject_t of 'pool' carrying 'hash'.
-//
-// The scrub object is per spg_t, so the hash field is free to be (mis)used
-// for representing the minimal (0x00000000) and maximal (0xffffffff) keys.
-// This relies on how hobject_t::to_str() works: hex(pool).hex(revhash).
-string scrub_boundary_key(const char* prefix, int64_t pool, uint32_t hash)
-{
-  return scrub_key(prefix, hobject_t(object_t(),
-                                     "",
-                                     0,
-                                     hash,
-                                     pool,
-                                     ""));
-}
-
-string first_object_key(int64_t pool)
-{
-  return scrub_boundary_key(object_key_prefix, pool, 0x00000000);
-}
-
-// the object_key should be unique across pools
-string to_object_key(int64_t pool, const librados::object_id_t& oid)
-{
-  return scrub_key(object_key_prefix,
-                   hobject_t(object_t(oid.name),
-                             oid.locator,  // key
-                             oid.snap,
-                             0,  // hash
-                             pool,
-                             oid.nspace));
-}
-
-string last_object_key(int64_t pool)
-{
-  return scrub_boundary_key(object_key_prefix, pool, 0xffffffff);
-}
-
-string first_snap_key(int64_t pool)
-{
-  return scrub_boundary_key(snap_key_prefix, pool, 0x00000000);
-}
-
-// snapset keys all carry the same fixed hash, placing them between the
-// first (0x00000000) and last (0xffffffff) snap keys of the pool
-string to_snap_key(int64_t pool, const librados::object_id_t& oid)
-{
-  return scrub_key(snap_key_prefix,
-                   hobject_t(object_t(oid.name),
-                             oid.locator,  // key
-                             oid.snap,
-                             0x77777777,  // hash
-                             pool,
-                             oid.nspace));
-}
-
-string last_snap_key(int64_t pool)
-{
-  return scrub_boundary_key(snap_key_prefix, pool, 0xffffffff);
-}
-}
-
-namespace Scrub {
-
-// Factory: touches the per-PG scrub object within transaction 't' and
-// returns a heap-allocated Store bound to it. Both 'store' and 't' must be
-// non-null.
-Store*
-Store::create(ObjectStore* store,
-              ObjectStore::Transaction* t,
-              const spg_t& pgid,
-              const coll_t& coll)
-{
-  ceph_assert(store);
-  ceph_assert(t);
-  const ghobject_t scrub_oid = make_scrub_object(pgid);
-  t->touch(coll, scrub_oid);
-  return new Store{coll, scrub_oid, store};
-}
-
-Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store)
-    : coll(coll)
-    , hoid(oid)
-    , driver(store, coll, hoid)
-    , backend(&driver)
-{}
-
-// All collected results must have been flushed before destruction.
-Store::~Store()
-{
-  ceph_assert(results.empty());
-}
-
-// Encode an object inconsistency and keep it under its object key.
-void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e)
-{
-  bufferlist encoded;
-  e.encode(encoded);
-  results[to_object_key(pool, e.object)] = encoded;
-}
-
-// Encode a snapset inconsistency and keep it under its snap key.
-void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e)
-{
-  bufferlist encoded;
-  e.encode(encoded);
-  results[to_snap_key(pool, e.object)] = encoded;
-}
-
-// True when no result is waiting to be flushed.
-bool Store::empty() const
-{
-  return results.empty();
-}
-
-// Hand the pending results to the backend as part of 't' (when a
-// transaction is given); the pending results are dropped in any case.
-void Store::flush(ObjectStore::Transaction* t)
-{
-  if (t) {
-    auto txn = driver.get_transaction(t);
-    backend.set_keys(results, &txn);
-  }
-  results.clear();
-}
-
-// Queue the removal of the scrub object in 't'.
-void Store::cleanup(ObjectStore::Transaction* t)
-{
-  t->remove(coll, hoid);
-}
-
-// Up to 'max_return' stored snapset errors of 'pool', starting after
-// 'start' (or from the first snap key when 'start' carries no name).
-std::vector<bufferlist>
-Store::get_snap_errors(int64_t pool,
-                       const librados::object_id_t& start,
-                       uint64_t max_return) const
-{
-  const string from =
-    start.name.empty() ? first_snap_key(pool) : to_snap_key(pool, start);
-  const string upto = last_snap_key(pool);
-  return get_errors(from, upto, max_return);
-}
-
-// Up to 'max_return' stored object errors of 'pool', starting after
-// 'start' (or from the first object key when 'start' carries no name).
-std::vector<bufferlist>
-Store::get_object_errors(int64_t pool,
-                         const librados::object_id_t& start,
-                         uint64_t max_return) const
-{
-  const string from =
-    start.name.empty() ? first_object_key(pool) : to_object_key(pool, start);
-  const string upto = last_object_key(pool);
-  return get_errors(from, upto, max_return);
-}
-
-// Collect the values of the keys following 'begin', stopping at the first
-// key that is not below 'end', after 'max_return' values, or as soon as the
-// backend reports a non-zero result.
-std::vector<bufferlist>
-Store::get_errors(const string& begin,
-                  const string& end,
-                  uint64_t max_return) const
-{
-  vector<bufferlist> found;
-  auto cursor = std::make_pair(begin, bufferlist{});
-  for (; max_return != 0; --max_return) {
-    if (backend.get_next(cursor.first, &cursor)) {
-      break;
-    }
-    if (cursor.first >= end) {
-      break;
-    }
-    found.push_back(cursor.second);
-  }
-  return found;
-}
-
-} // namespace Scrub
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_SCRUB_RESULT_H
-#define CEPH_SCRUB_RESULT_H
-
-#include "SnapMapper.h" // for OSDriver
-#include "common/map_cacher.hpp"
-
-namespace librados {
- struct object_id_t;
-}
-
-struct inconsistent_obj_wrapper;
-struct inconsistent_snapset_wrapper;
-
-namespace Scrub {
-
-// Keeps the inconsistencies found while scrubbing: errors are first
-// collected in memory, then flushed into a dedicated object.
-class Store {
- public:
-  // the only way to create a Store (the constructor is private)
-  static Store* create(ObjectStore* store,
-                       ObjectStore::Transaction* t,
-                       const spg_t& pgid,
-                       const coll_t& coll);
-  ~Store();
-
-  void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e);
-  void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e);
-  bool empty() const;
-
-  void flush(ObjectStore::Transaction*);
-  void cleanup(ObjectStore::Transaction*);
-
-  std::vector<ceph::buffer::list> get_object_errors(
-    int64_t pool,
-    const librados::object_id_t& start,
-    uint64_t max_return) const;
-  std::vector<ceph::buffer::list> get_snap_errors(
-    int64_t pool,
-    const librados::object_id_t& start,
-    uint64_t max_return) const;
-
- private:
-  Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store);
-
-  std::vector<ceph::buffer::list> get_errors(const std::string& start,
-                                             const std::string& end,
-                                             uint64_t max_return) const;
-
-  const coll_t coll;
-  const ghobject_t hoid;
-  // a temp object holding mappings from seq-id to inconsistencies found in
-  // scrubbing
-  OSDriver driver;
-  mutable MapCacher::MapCacher<std::string, ceph::buffer::list> backend;
-  // the errors collected so far, not yet flushed
-  std::map<std::string, ceph::buffer::list> results;
-};
-}  // namespace Scrub
-
-#endif // CEPH_SCRUB_RESULT_H
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=2 sw=2 smarttab
-
-#include "./pg_scrubber.h" // the '.' notation used to affect clang-format order
-
-#include <iostream>
-#include <vector>
-
-#include "debug.h"
-
-#include "common/errno.h"
-#include "messages/MOSDOp.h"
-#include "messages/MOSDRepScrub.h"
-#include "messages/MOSDRepScrubMap.h"
-#include "messages/MOSDScrub.h"
-#include "messages/MOSDScrubReserve.h"
-
-#include "OSD.h"
-#include "ScrubStore.h"
-#include "scrub_machine.h"
-
-using std::list;
-using std::map;
-using std::pair;
-using std::set;
-using std::stringstream;
-using std::vector;
-using namespace Scrub;
-using namespace std::chrono;
-using namespace std::chrono_literals;
-using namespace std::literals;
-
-#define dout_context (m_pg->get_cct())
-#define dout_subsys ceph_subsys_osd
-#undef dout_prefix
-#define dout_prefix _prefix(_dout, this->m_pg)
-
-// Log-prefix helper referenced by the dout_prefix macro above: writes the
-// object's generic prefix, then the "scrubber" tag with its pg id.
-template <class T> static ostream& _prefix(std::ostream* _dout, T* t)
-{
-  auto& out = t->gen_prefix(*_dout);
-  out << " scrubber pg(" << t->pg_id << ") ";
-  return out;
-}
-
-// Prints one (space-prefixed) label per flag that is set in 'sf'.
-ostream& operator<<(ostream& out, const scrub_flags_t& sf)
-{
-  const auto emit_if = [&out](bool is_set, const char* label) {
-    if (is_set) {
-      out << label;
-    }
-  };
-
-  emit_if(sf.auto_repair, " AUTO_REPAIR");
-  emit_if(sf.check_repair, " CHECK_REPAIR");
-  emit_if(sf.deep_scrub_on_error, " DEEP_SCRUB_ON_ERROR");
-  emit_if(sf.required, " REQ_SCRUB");
-
-  return out;
-}
-
-// Prints one (space-prefixed) label per flag that is set in 'sf'.
-ostream& operator<<(ostream& out, const requested_scrub_t& sf)
-{
-  const auto emit_if = [&out](bool is_set, const char* label) {
-    if (is_set) {
-      out << label;
-    }
-  };
-
-  emit_if(sf.must_repair, " MUST_REPAIR");
-  emit_if(sf.auto_repair, " planned AUTO_REPAIR");
-  emit_if(sf.check_repair, " planned CHECK_REPAIR");
-  emit_if(sf.deep_scrub_on_error, " planned DEEP_SCRUB_ON_ERROR");
-  emit_if(sf.must_deep_scrub, " MUST_DEEP_SCRUB");
-  emit_if(sf.must_scrub, " MUST_SCRUB");
-  emit_if(sf.time_for_deep, " TIME_FOR_DEEP");
-  emit_if(sf.need_auto, " NEED_AUTO");
-  emit_if(sf.req_scrub, " planned REQ_SCRUB");
-
-  return out;
-}
-
-/*
- * if the incoming message is from a previous interval, it must mean
- * PrimaryLogPG::on_change() was called when that interval ended. We can safely discard
- * the stale message.
- */
-bool PgScrubber::check_interval(epoch_t epoch_to_verify)
-{
- return epoch_to_verify >= m_pg->get_same_interval_since();
-}
-
-bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify)
-{
- if (!m_active) {
- // not scrubbing. We can assume that the scrub was already terminated, and we
- // can silently discard the incoming event.
- return false;
- }
-
- // is this a message from before we started this scrub?
- if (epoch_to_verify < m_epoch_start) {
- return false;
- }
-
- // has a new interval started?
- if (!check_interval(epoch_to_verify)) {
- // if this is a new interval, on_change() has already terminated that
- // old scrub.
- return false;
- }
-
- ceph_assert(is_primary());
-
- // were we instructed to abort?
- return verify_against_abort(epoch_to_verify);
-}
-
-bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify)
-{
- if (!should_abort()) {
- return true;
- }
-
- dout(10) << __func__ << " aborting. incoming epoch: " << epoch_to_verify
- << " vs last-aborted: " << m_last_aborted << dendl;
-
- // if we were not aware of the abort before - kill the scrub.
- if (epoch_to_verify > m_last_aborted) {
- scrub_clear_state();
- m_last_aborted = std::max(epoch_to_verify, m_epoch_start);
- }
- return false;
-}
-
-bool PgScrubber::should_abort() const
-{
- if (m_flags.required) {
- return false; // not stopping 'required' scrubs for configuration changes
- }
-
- if (m_is_deep) {
- if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
- m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
- dout(10) << "nodeep_scrub set, aborting" << dendl;
- return true;
- }
- }
-
- if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
- m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) {
- dout(10) << "noscrub set, aborting" << dendl;
- return true;
- }
-
- return false;
-}
-
-// initiating state-machine events --------------------------------
-
-/*
- * a note re the checks performed before sending scrub-initiating messages:
- *
- * For those ('StartScrub', 'AfterRepairScrub') scrub-initiation messages that
- * possibly were in the queue while the PG changed state and became unavailable for
- * scrubbing:
- *
- * The check_interval() catches all major changes to the PG. As for the other conditions
- * we may check (and see is_message_relevant() above):
- *
- * - we are not 'active' yet, so must not check against is_active(), and:
- *
- * - the 'abort' flags were just verified (when the triggering message was queued). As
- * those are only modified in human speeds - they need not be queried again.
- *
- * Some of the considerations above are also relevant to the replica-side initiation
- * ('StartReplica' & 'StartReplicaNoWait').
- */
-
-void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
-{
- dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
- // we may have lost our Primary status while the message languished in the queue
- if (check_interval(epoch_queued)) {
- dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued << dendl;
- reset_epoch(epoch_queued);
- m_fsm->my_states();
- m_fsm->process_event(StartScrub{});
- dout(10) << "scrubber event --<< StartScrub" << dendl;
- }
-}
-
-void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued)
-{
- dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
- // we may have lost our Primary status while the message languished in the queue
- if (check_interval(epoch_queued)) {
- dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued << dendl;
- reset_epoch(epoch_queued);
- m_fsm->my_states();
- m_fsm->process_event(AfterRepairScrub{});
- dout(10) << "scrubber event --<< AfterRepairScrub" << dendl;
- }
-}
-
-void PgScrubber::send_scrub_unblock(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- if (is_message_relevant(epoch_queued)) {
- m_fsm->my_states();
- m_fsm->process_event(Unblocked{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_scrub_resched(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- if (is_message_relevant(epoch_queued)) {
- m_fsm->my_states();
- m_fsm->process_event(InternalSchedScrub{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
- << " token: " << token << dendl;
- if (is_primary()) {
- // shouldn't happen. Ignore
- dout(1) << "got a replica scrub request while Primary!" << dendl;
- return;
- }
-
- if (check_interval(epoch_queued) && is_token_current(token)) {
- m_fsm->my_states();
- // save us some time by not waiting for updates if there are none
- // to wait for. Affects the transition from NotActive into either
- // ReplicaWaitUpdates or ActiveReplica.
- if (pending_active_pushes())
- m_fsm->process_event(StartReplica{});
- else
- m_fsm->process_event(StartReplicaNoWait{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
- << " token: " << token << dendl;
- if (check_interval(epoch_queued) && is_token_current(token)) {
- m_fsm->my_states();
- m_fsm->process_event(SchedReplica{}); // retest for map availability
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::active_pushes_notification(epoch_t epoch_queued)
-{
- // note: Primary only
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- if (is_message_relevant(epoch_queued)) {
- m_fsm->my_states();
- m_fsm->process_event(ActivePushesUpd{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::update_applied_notification(epoch_t epoch_queued)
-{
- // note: Primary only
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- if (is_message_relevant(epoch_queued)) {
- m_fsm->my_states();
- m_fsm->process_event(UpdatesApplied{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::digest_update_notification(epoch_t epoch_queued)
-{
- // note: Primary only
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- if (is_message_relevant(epoch_queued)) {
- m_fsm->my_states();
- m_fsm->process_event(DigestUpdate{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_local_map_done(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- if (is_message_relevant(epoch_queued)) {
- m_fsm->my_states();
- m_fsm->process_event(Scrub::IntLocalMapDone{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- if (is_message_relevant(epoch_queued)) {
- m_fsm->my_states();
- m_fsm->process_event(GotReplicas{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- if (check_interval(epoch_queued)) {
- m_fsm->my_states();
- m_fsm->process_event(ReplicaPushesUpd{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_remotes_reserved(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- // note: scrub is not active yet
- if (check_interval(epoch_queued)) {
- m_fsm->my_states();
- m_fsm->process_event(RemotesReserved{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_reservation_failure(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- if (check_interval(epoch_queued)) { // do not check for 'active'!
- m_fsm->my_states();
- m_fsm->process_event(ReservationFailure{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_full_reset(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
- m_fsm->my_states();
- m_fsm->process_event(Scrub::FullReset{});
-
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_chunk_free(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- if (check_interval(epoch_queued)) {
- m_fsm->my_states();
- m_fsm->process_event(Scrub::SelectedChunkFree{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_chunk_busy(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- if (check_interval(epoch_queued)) {
- m_fsm->my_states();
- m_fsm->process_event(Scrub::ChunkIsBusy{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_get_next_chunk(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
- if (is_message_relevant(epoch_queued)) {
- m_fsm->my_states();
- m_fsm->process_event(Scrub::NextChunk{});
- }
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_scrub_is_finished(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
- // can't check for "active"
-
- m_fsm->my_states();
- m_fsm->process_event(Scrub::ScrubFinished{});
-
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-void PgScrubber::send_maps_compared(epoch_t epoch_queued)
-{
- dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-
- m_fsm->my_states();
- m_fsm->process_event(Scrub::MapsCompared{});
-
- dout(10) << "scrubber event --<< " << __func__ << dendl;
-}
-
-// -----------------
-
-bool PgScrubber::is_reserving() const
-{
- return m_fsm->is_reserving();
-}
-
-void PgScrubber::reset_epoch(epoch_t epoch_queued)
-{
- dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl;
- m_fsm->assert_not_active();
-
- m_epoch_start = epoch_queued;
- m_needs_sleep = true;
- m_is_deep = state_test(PG_STATE_DEEP_SCRUB);
- update_op_mode_text();
-}
-
-unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
-{
- unsigned int qu_priority = m_flags.priority;
-
- if (with_priority == Scrub::scrub_prio_t::high_priority) {
- qu_priority =
- std::max(qu_priority, (unsigned int)m_pg->get_cct()->_conf->osd_client_op_priority);
- }
- return qu_priority;
-}
-
-unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
- unsigned int suggested_priority) const
-{
- if (with_priority == Scrub::scrub_prio_t::high_priority) {
- suggested_priority = std::max(suggested_priority,
- (unsigned int)m_pg->cct->_conf->osd_client_op_priority);
- }
- return suggested_priority;
-}
-
-// ///////////////////////////////////////////////////////////////////// //
-// scrub-op registration handling
-
-bool PgScrubber::is_scrub_registered() const
-{
- return !m_scrub_reg_stamp.is_zero();
-}
-
-void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags)
-{
- if (!is_primary()) {
- // normal. No warning is required.
- return;
- }
-
- dout(10) << __func__ << " planned: must? " << request_flags.must_scrub << " need-auto? "
- << request_flags.need_auto << " stamp: " << m_pg->info.history.last_scrub_stamp
- << dendl;
-
- ceph_assert(!is_scrub_registered());
-
- utime_t reg_stamp;
- bool must = false;
-
- if (request_flags.must_scrub || request_flags.need_auto) {
- // Set the smallest time that isn't utime_t()
- reg_stamp = PgScrubber::scrub_must_stamp();
- must = true;
- } else if (m_pg->info.stats.stats_invalid &&
- m_pg->cct->_conf->osd_scrub_invalid_stats) {
- reg_stamp = ceph_clock_now();
- must = true;
- } else {
- reg_stamp = m_pg->info.history.last_scrub_stamp;
- }
-
- dout(15) << __func__ << " pg(" << m_pg_id << ") must: " << must
- << " required:" << m_flags.required << " flags: " << request_flags
- << " stamp: " << reg_stamp << dendl;
-
- const double scrub_min_interval =
- m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MIN_INTERVAL, 0.0);
- const double scrub_max_interval =
- m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0);
-
- // note the sched_time, so we can locate this scrub, and remove it later
- m_scrub_reg_stamp = m_osds->reg_pg_scrub(m_pg->info.pgid, reg_stamp, scrub_min_interval,
- scrub_max_interval, must);
- dout(15) << __func__ << " pg(" << m_pg_id << ") register next scrub, scrub time "
- << m_scrub_reg_stamp << ", must = " << (int)must << dendl;
-}
-
-void PgScrubber::unreg_next_scrub()
-{
- if (is_scrub_registered()) {
- dout(15) << __func__ << " existing-" << m_scrub_reg_stamp << dendl;
- m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp);
- m_scrub_reg_stamp = utime_t{};
- }
-}
-
-void PgScrubber::scrub_requested(scrub_level_t scrub_level,
- scrub_type_t scrub_type,
- requested_scrub_t& req_flags)
-{
- dout(10) << __func__ << (scrub_level == scrub_level_t::deep ? " deep " : " shallow ")
- << (scrub_type == scrub_type_t::do_repair ? " repair-scrub " : " not-repair ")
- << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered()
- << dendl;
-
- unreg_next_scrub();
-
- req_flags.must_scrub = true;
- req_flags.must_deep_scrub =
- (scrub_level == scrub_level_t::deep) || (scrub_type == scrub_type_t::do_repair);
- req_flags.must_repair = (scrub_type == scrub_type_t::do_repair);
- // User might intervene, so clear this
- req_flags.need_auto = false;
- req_flags.req_scrub = true;
-
- dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl;
-
- reg_next_scrub(req_flags);
-}
-
-void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags)
-{
- dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? "
- << is_scrub_registered() << dendl;
-
- unreg_next_scrub();
- req_flags.need_auto = true;
- reg_next_scrub(req_flags);
-}
-
-bool PgScrubber::reserve_local()
-{
- // try to create the reservation object (which translates into asking the
- // OSD for the local scrub resource). If failing - undo it immediately
-
- m_local_osd_resource.emplace(m_pg, m_osds);
- if (!m_local_osd_resource->is_reserved()) {
- m_local_osd_resource.reset();
- return false;
- }
-
- return true;
-}
-
-// ----------------------------------------------------------------------------
-
-bool PgScrubber::has_pg_marked_new_updates() const
-{
- auto last_applied = m_pg->recovery_state.get_last_update_applied();
- dout(10) << __func__ << " recovery last: " << last_applied
- << " vs. scrub's: " << m_subset_last_update << dendl;
-
- return last_applied >= m_subset_last_update;
-}
-
-void PgScrubber::set_subset_last_update(eversion_t e)
-{
- m_subset_last_update = e;
- dout(15) << __func__ << " last-update: " << e << dendl;
-}
-
-void PgScrubber::on_applied_when_primary(const eversion_t& applied_version)
-{
- // we are only interested in updates if we are the Primary, and in state
- // WaitLastUpdate
- if (m_fsm->is_accepting_updates() && (applied_version >= m_subset_last_update)) {
- m_osds->queue_scrub_applied_update(m_pg, m_pg->is_scrub_blocking_ops());
- dout(15) << __func__ << " update: " << applied_version
- << " vs. required: " << m_subset_last_update << dendl;
- }
-}
-
-/*
- * The selected range is set directly into 'm_start' and 'm_end'
- * setting:
- * - m_subset_last_update
- * - m_max_end
- * - end
- * - start
- */
-bool PgScrubber::select_range()
-{
- m_primary_scrubmap = ScrubMap{};
- m_received_maps.clear();
-
- /* get the start and end of our scrub chunk
- *
- * Our scrub chunk has an important restriction we're going to need to
- * respect. We can't let head be start or end.
- * Using a half-open interval means that if end == head,
- * we'd scrub/lock head and the clone right next to head in different
- * chunks which would allow us to miss clones created between
- * scrubbing that chunk and scrubbing the chunk including head.
- * This isn't true for any of the other clones since clones can
- * only be created "just to the left of" head. There is one exception
- * to this: promotion of clones which always happens to the left of the
- * left-most clone, but promote_object checks the scrubber in that
- * case, so it should be ok. Also, it's ok to "miss" clones at the
- * left end of the range if we are a tier because they may legitimately
- * not exist (see _scrub).
- */
- int min_idx = std::max<int64_t>(
- 3, m_pg->get_cct()->_conf->osd_scrub_chunk_min / preemption_data.chunk_divisor());
-
- int max_idx = std::max<int64_t>(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max /
- preemption_data.chunk_divisor());
-
- dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx
- << " Div: " << preemption_data.chunk_divisor() << dendl;
-
- hobject_t start = m_start;
- hobject_t candidate_end;
- std::vector<hobject_t> objects;
- int ret = m_pg->get_pgbackend()->objects_list_partial(start, min_idx, max_idx, &objects,
- &candidate_end);
- ceph_assert(ret >= 0);
-
- if (!objects.empty()) {
-
- hobject_t back = objects.back();
- while (candidate_end.is_head() && candidate_end == back.get_head()) {
- candidate_end = back;
- objects.pop_back();
- if (objects.empty()) {
- ceph_assert(0 ==
- "Somehow we got more than 2 objects which"
- "have the same head but are not clones");
- }
- back = objects.back();
- }
-
- if (candidate_end.is_head()) {
- ceph_assert(candidate_end != back.get_head());
- candidate_end = candidate_end.get_object_boundary();
- }
-
- } else {
- ceph_assert(candidate_end.is_max());
- }
-
- // is that range free for us? if not - we will be rescheduled later by whoever
- // triggered us this time
-
- if (!m_pg->_range_available_for_scrub(m_start, candidate_end)) {
- // we'll be requeued by whatever made us unavailable for scrub
- dout(10) << __func__ << ": scrub blocked somewhere in range "
- << "[" << m_start << ", " << candidate_end << ")" << dendl;
- return false;
- }
-
- m_end = candidate_end;
- if (m_end > m_max_end)
- m_max_end = m_end;
-
- dout(15) << __func__ << " range selected: " << m_start << " //// " << m_end << " //// "
- << m_max_end << dendl;
-
- // debug: be 'blocked' if told so by the 'pg scrub_debug block' asok command
- if (m_debug_blockrange > 0) {
- m_debug_blockrange--;
- return false;
- }
- return true;
-}
-
-void PgScrubber::select_range_n_notify()
-{
- if (select_range()) {
- // the next chunk to handle is not blocked
- dout(20) << __func__ << ": selection OK" << dendl;
- m_osds->queue_scrub_chunk_free(m_pg, Scrub::scrub_prio_t::low_priority);
-
- } else {
- // we will wait for the objects range to become available for scrubbing
- dout(10) << __func__ << ": selected chunk is busy" << dendl;
- m_osds->queue_scrub_chunk_busy(m_pg, Scrub::scrub_prio_t::low_priority);
- }
-}
-
-bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid)
-{
- if (soid < m_start || soid >= m_end) {
- return false;
- }
-
- dout(20) << __func__ << " " << soid << " can preempt? "
- << preemption_data.is_preemptable() << " already preempted? "
- << preemption_data.was_preempted() << dendl;
-
- if (preemption_data.was_preempted()) {
- // otherwise - write requests arriving while 'already preempted' is set
- // but 'preemptable' is not - will not be allowed to continue, and will
- // not be requeued on time.
- return false;
- }
-
- if (preemption_data.is_preemptable()) {
-
- dout(10) << __func__ << " " << soid << " preempted" << dendl;
-
- // signal the preemption
- preemption_data.do_preempt();
- m_end = m_start; // free the range we were scrubbing
-
- return false;
- }
- return true;
-}
-
-bool PgScrubber::range_intersects_scrub(const hobject_t& start, const hobject_t& end)
-{
- // does [start, end] intersect [scrubber.start, scrubber.m_max_end)
- return (start < m_max_end && end >= m_start);
-}
-
-Scrub::BlockedRangeWarning PgScrubber::acquire_blocked_alarm()
-{
- return std::make_unique<blocked_range_t>(m_osds, ceph::timespan{300s}, m_pg_id);
-}
-
-/**
- * if we are required to sleep:
- * arrange a callback sometimes later.
- * be sure to be able to identify a stale callback.
- * Otherwise: perform a requeue (i.e. - rescheduling thru the OSD queue)
- * anyway.
- */
-void PgScrubber::add_delayed_scheduling()
-{
- m_end = m_start; // not blocking any range now
-
- milliseconds sleep_time{0ms};
- if (m_needs_sleep) {
- double scrub_sleep = 1000.0 * m_osds->osd->scrub_sleep_time(m_flags.required);
- sleep_time = milliseconds{long(scrub_sleep)};
- }
- dout(15) << __func__ << " sleep: " << sleep_time.count() << "ms. needed? "
- << m_needs_sleep << dendl;
-
- if (sleep_time.count()) {
- // schedule a transition for some 'sleep_time' ms in the future
-
- m_needs_sleep = false;
- m_sleep_started_at = ceph_clock_now();
-
- // the following log line is used by osd-scrub-test.sh
- dout(20) << __func__ << " scrub state is PendingTimer, sleeping" << dendl;
-
- // the 'delayer' for crimson is different. Will be factored out.
-
- spg_t pgid = m_pg->get_pgid();
- auto callbk = new LambdaContext([osds = m_osds, pgid,
- scrbr = this]([[maybe_unused]] int r) mutable {
- PGRef pg = osds->osd->lookup_lock_pg(pgid);
- if (!pg) {
- lgeneric_subdout(g_ceph_context, osd, 10)
- << "scrub_requeue_callback: Could not find "
- << "PG " << pgid << " can't complete scrub requeue after sleep" << dendl;
- return;
- }
- scrbr->m_needs_sleep = true;
- lgeneric_dout(scrbr->get_pg_cct(), 7)
- << "scrub_requeue_callback: slept for "
- << ceph_clock_now() - scrbr->m_sleep_started_at << ", re-queuing scrub" << dendl;
-
- scrbr->m_sleep_started_at = utime_t{};
- osds->queue_for_scrub_resched(&(*pg), Scrub::scrub_prio_t::low_priority);
- pg->unlock();
- });
-
- std::lock_guard l(m_osds->sleep_lock);
- m_osds->sleep_timer.add_event_after(sleep_time.count() / 1000.0f, callbk);
-
- } else {
- // just a requeue
- m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority);
- }
-}
-
-eversion_t PgScrubber::search_log_for_updates() const
-{
- auto& projected = m_pg->projected_log.log;
- auto pi = find_if(
- projected.crbegin(), projected.crend(),
- [this](const auto& e) -> bool { return e.soid >= m_start && e.soid < m_end; });
-
- if (pi != projected.crend())
- return pi->version;
-
- // there was no relevant update entry in the log
-
- auto& log = m_pg->recovery_state.get_pg_log().get_log().log;
- auto p = find_if(log.crbegin(), log.crend(), [this](const auto& e) -> bool {
- return e.soid >= m_start && e.soid < m_end;
- });
-
- if (p == log.crend())
- return eversion_t{};
- else
- return p->version;
-}
-
-void PgScrubber::get_replicas_maps(bool replica_can_preempt)
-{
- dout(10) << __func__ << " started in epoch/interval: " << m_epoch_start << "/"
- << m_interval_start
- << " pg same_interval_since: " << m_pg->info.history.same_interval_since
- << dendl;
-
- m_primary_scrubmap_pos.reset();
-
- // ask replicas to scan and send maps
- for (const auto& i : m_pg->get_acting_recovery_backfill()) {
-
- if (i == m_pg_whoami)
- continue;
-
- m_maps_status.mark_replica_map_request(i);
- _request_scrub_map(i, m_subset_last_update, m_start, m_end, m_is_deep,
- replica_can_preempt);
- }
-
- dout(10) << __func__ << " awaiting" << m_maps_status << dendl;
-}
-
-bool PgScrubber::was_epoch_changed() const
-{
- // for crimson we have m_pg->get_info().history.same_interval_since
- dout(10) << __func__ << " epoch_start: " << m_interval_start
- << " from pg: " << m_pg->get_history().same_interval_since << dendl;
-
- return m_interval_start < m_pg->get_history().same_interval_since;
-}
-
-void PgScrubber::mark_local_map_ready()
-{
- m_maps_status.mark_local_map_ready();
-}
-
-bool PgScrubber::are_all_maps_available() const
-{
- return m_maps_status.are_all_maps_available();
-}
-
-std::string PgScrubber::dump_awaited_maps() const
-{
- return m_maps_status.dump();
-}
-
-void PgScrubber::update_op_mode_text()
-{
- auto visible_repair = state_test(PG_STATE_REPAIR);
- m_mode_desc = (visible_repair ? "repair" : (m_is_deep ? "deep-scrub" : "scrub"));
-
- dout(10) << __func__ << ": repair: visible: " << (visible_repair ? "true" : "false")
- << ", internal: " << (m_is_repair ? "true" : "false")
- << ". Displayed: " << m_mode_desc << dendl;
-}
-
-void PgScrubber::_request_scrub_map(pg_shard_t replica,
- eversion_t version,
- hobject_t start,
- hobject_t end,
- bool deep,
- bool allow_preemption)
-{
- ceph_assert(replica != m_pg_whoami);
- dout(10) << __func__ << " scrubmap from osd." << replica
- << (deep ? " deep" : " shallow") << dendl;
-
- auto repscrubop =
- new MOSDRepScrub(spg_t(m_pg->info.pgid.pgid, replica.shard), version,
- get_osdmap_epoch(), m_pg->get_last_peering_reset(), start, end, deep,
- allow_preemption, m_flags.priority, m_pg->ops_blocked_by_scrub());
-
- // default priority. We want the replica-scrub processed prior to any recovery
- // or client io messages (we are holding a lock!)
- m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch());
-}
-
-void PgScrubber::cleanup_store(ObjectStore::Transaction* t)
-{
- if (!m_store)
- return;
-
- struct OnComplete : Context {
- std::unique_ptr<Scrub::Store> store;
- explicit OnComplete(std::unique_ptr<Scrub::Store>&& store) : store(std::move(store))
- {}
- void finish(int) override {}
- };
- m_store->cleanup(t);
- t->register_on_complete(new OnComplete(std::move(m_store)));
- ceph_assert(!m_store);
-}
-
-void PgScrubber::on_init()
-{
- // going upwards from 'inactive'
- ceph_assert(!is_scrub_active());
-
- preemption_data.reset();
- m_pg->publish_stats_to_osd();
- m_interval_start = m_pg->get_history().same_interval_since;
-
- dout(10) << __func__ << " start same_interval:" << m_interval_start << dendl;
-
- // create a new store
- {
- ObjectStore::Transaction t;
- cleanup_store(&t);
- m_store.reset(
- Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll));
- m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
- }
-
- m_start = m_pg->info.pgid.pgid.get_hobj_start();
- m_active = true;
-}
-
-void PgScrubber::on_replica_init()
-{
- m_active = true;
-}
-
-void PgScrubber::_scan_snaps(ScrubMap& smap)
-{
- hobject_t head;
- SnapSet snapset;
-
- // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings
- // in this function
- dout(15) << "_scan_snaps starts" << dendl;
-
- for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) {
-
- const hobject_t& hoid = i->first;
- ScrubMap::object& o = i->second;
-
- dout(20) << __func__ << " " << hoid << dendl;
-
- ceph_assert(!hoid.is_snapdir());
- if (hoid.is_head()) {
- // parse the SnapSet
- bufferlist bl;
- if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
- continue;
- }
- bl.push_back(o.attrs[SS_ATTR]);
- auto p = bl.cbegin();
- try {
- decode(snapset, p);
- } catch (...) {
- continue;
- }
- head = hoid.get_head();
- continue;
- }
-
- if (hoid.snap < CEPH_MAXSNAP) {
- // check and if necessary fix snap_mapper
- if (hoid.get_head() != head) {
- derr << __func__ << " no head for " << hoid << " (have " << head << ")" << dendl;
- continue;
- }
- set<snapid_t> obj_snaps;
- auto p = snapset.clone_snaps.find(hoid.snap);
- if (p == snapset.clone_snaps.end()) {
- derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset << dendl;
- continue;
- }
- obj_snaps.insert(p->second.begin(), p->second.end());
- set<snapid_t> cur_snaps;
- int r = m_pg->snap_mapper.get_snaps(hoid, &cur_snaps);
- if (r != 0 && r != -ENOENT) {
- derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
- ceph_abort();
- }
- if (r == -ENOENT || cur_snaps != obj_snaps) {
- ObjectStore::Transaction t;
- OSDriver::OSTransaction _t(m_pg->osdriver.get_transaction(&t));
- if (r == 0) {
- r = m_pg->snap_mapper.remove_oid(hoid, &_t);
- if (r != 0) {
- derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
- ceph_abort();
- }
- m_pg->osd->clog->error()
- << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
- << m_pg->info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps
- << ", oi: " << obj_snaps << "...repaired";
- } else {
- m_pg->osd->clog->error()
- << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
- << m_pg->info.pgid << " oid " << hoid << " snaps missing in mapper"
- << ", should be: " << obj_snaps << " was " << cur_snaps << " r " << r
- << "...repaired";
- }
- m_pg->snap_mapper.add_oid(hoid, obj_snaps, &_t);
-
- // wait for repair to apply to avoid confusing other bits of the system.
- {
- dout(15) << __func__ << " wait on repair!" << dendl;
-
- ceph::condition_variable my_cond;
- ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock");
- int e = 0;
- bool done;
-
- t.register_on_applied_sync(new C_SafeCond(my_lock, my_cond, &done, &e));
-
- e = m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t));
- if (e != 0) {
- derr << __func__ << ": queue_transaction got " << cpp_strerror(e) << dendl;
- } else {
- std::unique_lock l{my_lock};
- my_cond.wait(l, [&done] { return done; });
- }
- }
- }
- }
- }
-}
-
-int PgScrubber::build_primary_map_chunk()
-{
- epoch_t map_building_since = m_pg->get_osdmap_epoch();
- dout(20) << __func__ << ": initiated at epoch " << map_building_since << dendl;
-
- auto ret = build_scrub_map_chunk(m_primary_scrubmap, m_primary_scrubmap_pos, m_start,
- m_end, m_is_deep);
-
- if (ret == -EINPROGRESS) {
- // reschedule another round of asking the backend to collect the scrub data
- m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::low_priority);
- }
- return ret;
-}
-
-int PgScrubber::build_replica_map_chunk()
-{
- dout(10) << __func__ << " interval start: " << m_interval_start
- << " current token: " << m_current_token << " epoch: " << m_epoch_start
- << " deep: " << m_is_deep << dendl;
-
- auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end,
- m_is_deep);
-
- switch (ret) {
-
- case -EINPROGRESS:
- // must wait for the backend to finish. No external event source.
- // (note: previous version used low priority here. Now switched to using the
- // priority of the original message)
- m_osds->queue_for_rep_scrub_resched(m_pg, m_replica_request_priority,
- m_flags.priority, m_current_token);
- break;
-
- case 0: {
- // finished!
- m_cleaned_meta_map.clear_from(m_start);
- m_cleaned_meta_map.insert(replica_scrubmap);
- auto for_meta_scrub = clean_meta_map();
- _scan_snaps(for_meta_scrub);
-
- // the local map has been created. Send it to the primary.
- // Note: once the message reaches the Primary, it may ask us for another
- // chunk - and we better be done with the current scrub. Thus - the preparation of
- // the reply message is separate, and we clear the scrub state before actually
- // sending it.
-
- auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption);
- replica_handling_done();
- dout(15) << __func__ << " chunk map sent " << dendl;
- send_replica_map(reply);
- } break;
-
- default:
- // negative retval: build_scrub_map_chunk() signalled an error
- // Pre-Pacific code ignored this option, treating it as a success.
- // \todo Add an error flag in the returning message.
- dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret
- << dendl;
- replica_handling_done();
- // only in debug mode for now:
- assert(false && "backend error");
- break;
- };
-
- return ret;
-}
-
-int PgScrubber::build_scrub_map_chunk(
- ScrubMap& map, ScrubMapBuilder& pos, hobject_t start, hobject_t end, bool deep)
-{
- dout(10) << __func__ << " [" << start << "," << end << ") "
- << " pos " << pos << " Deep: " << deep << dendl;
-
- // start
- while (pos.empty()) {
-
- pos.deep = deep;
- map.valid_through = m_pg->info.last_update;
-
- // objects
- vector<ghobject_t> rollback_obs;
- pos.ret =
- m_pg->get_pgbackend()->objects_list_range(start, end, &pos.ls, &rollback_obs);
- dout(10) << __func__ << " while pos empty " << pos.ret << dendl;
- if (pos.ret < 0) {
- dout(5) << "objects_list_range error: " << pos.ret << dendl;
- return pos.ret;
- }
- dout(10) << __func__ << " pos.ls.empty()? " << (pos.ls.empty() ? "+" : "-") << dendl;
- if (pos.ls.empty()) {
- break;
- }
- m_pg->_scan_rollback_obs(rollback_obs);
- pos.pos = 0;
- return -EINPROGRESS;
- }
-
- // scan objects
- while (!pos.done()) {
-
- int r = m_pg->get_pgbackend()->be_scan_list(map, pos);
- dout(30) << __func__ << " BE returned " << r << dendl;
- if (r == -EINPROGRESS) {
- dout(20) << __func__ << " in progress" << dendl;
- return r;
- }
- }
-
- // finish
- dout(20) << __func__ << " finishing" << dendl;
- ceph_assert(pos.done());
- m_pg->_repair_oinfo_oid(map);
-
- dout(20) << __func__ << " done, got " << map.objects.size() << " items" << dendl;
- return 0;
-}
-
-/*
- * Process:
- * Building a map of objects suitable for snapshot validation.
- * The data in m_cleaned_meta_map is the left over partial items that need to
- * be completed before they can be processed.
- *
- * Snapshots in maps precede the head object, which is why we are scanning backwards.
- */
-ScrubMap PgScrubber::clean_meta_map()
-{
- ScrubMap for_meta_scrub;
-
- if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) {
- m_cleaned_meta_map.swap(for_meta_scrub);
- } else {
- auto iter = m_cleaned_meta_map.objects.end();
- --iter; // not empty, see 'if' clause
- auto begin = m_cleaned_meta_map.objects.begin();
- if (iter->first.has_snapset()) {
- ++iter;
- } else {
- while (iter != begin) {
- auto next = iter--;
- if (next->first.get_head() != iter->first.get_head()) {
- ++iter;
- break;
- }
- }
- }
- for_meta_scrub.objects.insert(begin, iter);
- m_cleaned_meta_map.objects.erase(begin, iter);
- }
-
- return for_meta_scrub;
-}
-
-void PgScrubber::run_callbacks()
-{
- std::list<Context*> to_run;
- to_run.swap(m_callbacks);
-
- for (auto& tr : to_run) {
- tr->complete(0);
- }
-}
-
-void PgScrubber::maps_compare_n_cleanup()
-{
- scrub_compare_maps();
- m_start = m_end;
- run_callbacks();
- requeue_waiting();
- m_osds->queue_scrub_maps_compared(m_pg, Scrub::scrub_prio_t::low_priority);
-}
-
-Scrub::preemption_t& PgScrubber::get_preemptor()
-{
- return preemption_data;
-}
-
-/*
- * Process note: called for the arriving "give me your map, replica!" request. Unlike
- * the original implementation, we do not requeue the Op waiting for
- * updates. Instead - we trigger the FSM.
- */
-void PgScrubber::replica_scrub_op(OpRequestRef op)
-{
- op->mark_started();
- auto msg = op->get_req<MOSDRepScrub>();
- dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch
- << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl;
-
- // are we still processing a previous scrub-map request without noticing that the
- // interval changed? won't see it here, but rather at the reservation stage.
-
- if (msg->map_epoch < m_pg->info.history.same_interval_since) {
- dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch
- << " < " << m_pg->info.history.same_interval_since << dendl;
-
- // is there a general sync issue? are we holding a stale reservation?
- // not checking now - assuming we will actively react to interval change.
-
- return;
- }
-
- replica_scrubmap = ScrubMap{};
- replica_scrubmap_pos = ScrubMapBuilder{};
-
- m_replica_min_epoch = msg->min_epoch;
- m_start = msg->start;
- m_end = msg->end;
- m_max_end = msg->end;
- m_is_deep = msg->deep;
- m_interval_start = m_pg->info.history.same_interval_since;
- m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority
- : Scrub::scrub_prio_t::low_priority;
- m_flags.priority = msg->priority ? msg->priority : m_pg->get_scrub_priority();
-
- preemption_data.reset();
- preemption_data.force_preemptability(msg->allow_preemption);
-
- replica_scrubmap_pos.reset();
-
- // make sure the FSM is at NotActive
- m_fsm->assert_not_active();
-
- m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority,
- m_current_token);
-}
-
-void PgScrubber::set_op_parameters(requested_scrub_t& request)
-{
- dout(10) << __func__ << " input: " << request << dendl;
-
- // write down the epoch of starting a new scrub. Will be used
- // to discard stale messages from previous aborted scrubs.
- m_epoch_start = m_pg->get_osdmap_epoch();
-
- m_flags.check_repair = request.check_repair;
- m_flags.auto_repair = request.auto_repair || request.need_auto;
- m_flags.required = request.req_scrub || request.must_scrub;
-
- m_flags.priority = (request.must_scrub || request.need_auto)
- ? get_pg_cct()->_conf->osd_requested_scrub_priority
- : m_pg->get_scrub_priority();
-
- state_set(PG_STATE_SCRUBBING);
-
- // will we be deep-scrubbing?
- if (request.must_deep_scrub || request.need_auto || request.time_for_deep) {
- state_set(PG_STATE_DEEP_SCRUB);
- }
-
- // m_is_repair is set for either 'must_repair' or 'repair-on-the-go' (i.e.
- // deep-scrub with the auto_repair configuration flag set). m_is_repair value
- // determines the scrubber behavior.
- // PG_STATE_REPAIR, on the other hand, is only used for status reports (inc. the
- // PG status as appearing in the logs).
- m_is_repair = request.must_repair || m_flags.auto_repair;
- if (request.must_repair) {
- state_set(PG_STATE_REPAIR);
- // not calling update_op_mode_text() yet, as m_is_deep not set yet
- }
-
- // the publishing here seems to be required for tests synchronization
- m_pg->publish_stats_to_osd();
- m_flags.deep_scrub_on_error = request.deep_scrub_on_error;
-}
-
-void PgScrubber::scrub_compare_maps()
-{
- dout(10) << __func__ << " has maps, analyzing" << dendl;
-
- // construct authoritative scrub map for type-specific scrubbing
- m_cleaned_meta_map.insert(m_primary_scrubmap);
- map<hobject_t, pair<std::optional<uint32_t>, std::optional<uint32_t>>> missing_digest;
-
- map<pg_shard_t, ScrubMap*> maps;
- maps[m_pg_whoami] = &m_primary_scrubmap;
-
- for (const auto& i : m_pg->get_acting_recovery_backfill()) {
- if (i == m_pg_whoami)
- continue;
- dout(2) << __func__ << " replica " << i << " has "
- << m_received_maps[i].objects.size() << " items" << dendl;
- maps[i] = &m_received_maps[i];
- }
-
- set<hobject_t> master_set;
-
- // Construct master set
- for (const auto& map : maps) {
- for (const auto& i : map.second->objects) {
- master_set.insert(i.first);
- }
- }
-
- stringstream ss;
- m_pg->get_pgbackend()->be_omap_checks(maps, master_set, m_omap_stats, ss);
-
- if (!ss.str().empty()) {
- m_osds->clog->warn(ss);
- }
-
- if (m_pg->recovery_state.get_acting_recovery_backfill().size() > 1) {
-
- dout(10) << __func__ << " comparing replica scrub maps" << dendl;
-
- // Map from object with errors to good peer
- map<hobject_t, list<pg_shard_t>> authoritative;
-
- dout(2) << __func__ << ": primary (" << m_pg->get_primary() << ") has "
- << m_primary_scrubmap.objects.size() << " items" << dendl;
-
- ss.str("");
- ss.clear();
-
- m_pg->get_pgbackend()->be_compare_scrubmaps(
- maps, master_set, m_is_repair, m_missing, m_inconsistent,
- authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(),
- m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss);
-
- if (!ss.str().empty()) {
- m_osds->clog->error(ss);
- }
-
- for (auto& i : authoritative) {
- list<pair<ScrubMap::object, pg_shard_t>> good_peers;
- for (list<pg_shard_t>::const_iterator j = i.second.begin(); j != i.second.end();
- ++j) {
- good_peers.emplace_back(maps[*j]->objects[i.first], *j);
- }
- m_authoritative.emplace(i.first, good_peers);
- }
-
- for (auto i = authoritative.begin(); i != authoritative.end(); ++i) {
- m_cleaned_meta_map.objects.erase(i->first);
- m_cleaned_meta_map.objects.insert(
- *(maps[i->second.back()]->objects.find(i->first)));
- }
- }
-
- auto for_meta_scrub = clean_meta_map();
-
- // ok, do the pg-type specific scrubbing
-
- // (Validates consistency of the object info and snap sets)
- scrub_snapshot_metadata(for_meta_scrub, missing_digest);
-
- // Called here on the primary can use an authoritative map if it isn't the primary
- _scan_snaps(for_meta_scrub);
-
- if (!m_store->empty()) {
-
- if (m_is_repair) {
- dout(10) << __func__ << ": discarding scrub results" << dendl;
- m_store->flush(nullptr);
- } else {
- dout(10) << __func__ << ": updating scrub object" << dendl;
- ObjectStore::Transaction t;
- m_store->flush(&t);
- m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
- }
- }
-}
-
-ScrubMachineListener::MsgAndEpoch PgScrubber::prep_replica_map_msg(
- PreemptionNoted was_preempted)
-{
- dout(10) << __func__ << " min epoch:" << m_replica_min_epoch << dendl;
-
- auto reply =
- make_message<MOSDRepScrubMap>(spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard),
- m_replica_min_epoch, m_pg_whoami);
-
- reply->preempted = (was_preempted == PreemptionNoted::preempted);
- ::encode(replica_scrubmap, reply->get_data());
-
- return ScrubMachineListener::MsgAndEpoch{reply, m_replica_min_epoch};
-}
-
-void PgScrubber::send_replica_map(const MsgAndEpoch& preprepared)
-{
- m_pg->send_cluster_message(m_pg->get_primary().osd, preprepared.m_msg,
- preprepared.m_epoch, false);
-}
-
-void PgScrubber::send_preempted_replica()
-{
- auto reply =
- make_message<MOSDRepScrubMap>(spg_t{m_pg->info.pgid.pgid, m_pg->get_primary().shard},
- m_replica_min_epoch, m_pg_whoami);
-
- reply->preempted = true;
- ::encode(replica_scrubmap, reply->get_data()); // must not skip this
- m_pg->send_cluster_message(m_pg->get_primary().osd, reply, m_replica_min_epoch, false);
-}
-
-/*
- * - if the replica lets us know it was interrupted, we mark the chunk as interrupted.
- * The state-machine will react to that when all replica maps are received.
- * - when all maps are received, we signal the FSM with the GotReplicas event (see
- * scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the
- * FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to
- * handle.
- */
-void PgScrubber::map_from_replica(OpRequestRef op)
-{
- auto m = op->get_req<MOSDRepScrubMap>();
- dout(15) << __func__ << " " << *m << dendl;
-
- if (m->map_epoch < m_pg->info.history.same_interval_since) {
- dout(10) << __func__ << " discarding old from " << m->map_epoch << " < "
- << m_pg->info.history.same_interval_since << dendl;
- return;
- }
-
- auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
-
- m_received_maps[m->from].decode(p, m_pg->info.pgid.pool());
- dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl;
-
- auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from);
- if (!is_ok) {
- // previously an unexpected map was triggering an assert. Now, as scrubs can be
- // aborted at any time, the chances of this happening have increased, and aborting is
- // not justified
- dout(1) << __func__ << err_txt << " from OSD " << m->from << dendl;
- return;
- }
-
- if (m->preempted) {
- dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
- preemption_data.do_preempt();
- }
-
- if (m_maps_status.are_all_maps_available()) {
- dout(15) << __func__ << " all repl-maps available" << dendl;
- m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops());
- }
-}
-
-void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
-{
- dout(10) << __func__ << " " << *op->get_req() << dendl;
- op->mark_started();
- auto request_ep = op->get_req<MOSDScrubReserve>()->get_map_epoch();
-
- /*
- * if we are currently holding a reservation, then:
- * either (1) we, the scrubber, did not yet notice an interval change. The remembered
- * reservation epoch is from before our interval, and we can silently discard the
- * reservation (no message is required).
- * or:
- * (2) the interval hasn't changed, but the same Primary that (we think) holds the
- * lock just sent us a new request. Note that we know it's the same Primary, as
- * otherwise the interval would have changed.
- * Ostensibly we can discard & redo the reservation. But then we
- * will be temporarily releasing the OSD resource - and might not be able to grab it
- * again. Thus, we simply treat this as a successful new request
- * (but mark the fact that if there is a previous request from the primary to
- * scrub a specific chunk - that request is now defunct).
- */
-
- if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) {
- // we are holding a stale reservation from a past epoch
- m_remote_osd_resource.reset();
- dout(10) << __func__ << " stale reservation request" << dendl;
- }
-
- if (request_ep < m_pg->get_same_interval_since()) {
- // will not ack stale requests
- return;
- }
-
- bool granted{false};
- if (m_remote_osd_resource.has_value()) {
-
- dout(10) << __func__ << " already reserved." << dendl;
-
- /*
- * it might well be that we did not yet finish handling the latest scrub-op from
- * our primary. This happens, for example, if 'noscrub' was set via a command, then
- * reset. The primary in this scenario will remain in the same interval, but we do need
- * to reset our internal state (otherwise - the first renewed 'give me your scrub map'
- * from the primary will see us in active state, crashing the OSD).
- */
- advance_token();
- granted = true;
-
- } else if (m_pg->cct->_conf->osd_scrub_during_recovery ||
- !m_osds->is_recovery_active()) {
- m_remote_osd_resource.emplace(m_pg, m_osds, request_ep);
- // OSD resources allocated?
- granted = m_remote_osd_resource->is_reserved();
- if (!granted) {
- // just forget it
- m_remote_osd_resource.reset();
- dout(20) << __func__ << ": failed to reserve remotely" << dendl;
- }
- }
-
- dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
-
- Message* reply = new MOSDScrubReserve(
- spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), request_ep,
- granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami);
-
- m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
-}
-
-void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
-{
- dout(10) << __func__ << " " << *op->get_req() << dendl;
- op->mark_started();
-
- if (m_reservations.has_value()) {
- m_reservations->handle_reserve_grant(op, from);
- } else {
- derr << __func__ << ": received unsolicited reservation grant from osd " << from
- << " (" << op << ")" << dendl;
- }
-}
-
-void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
-{
- dout(10) << __func__ << " " << *op->get_req() << dendl;
- op->mark_started();
-
- if (m_reservations.has_value()) {
- // there is an active reservation process. No action is required otherwise.
- m_reservations->handle_reserve_reject(op, from);
- }
-}
-
-void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
-{
- dout(10) << __func__ << " " << *op->get_req() << dendl;
- op->mark_started();
-
- /*
- * this specific scrub session has terminated. All incoming events carrying the old
- * tag will be discarded.
- */
- advance_token();
- m_remote_osd_resource.reset();
-}
-
-void PgScrubber::discard_replica_reservations()
-{
- dout(10) << __func__ << dendl;
- if (m_reservations.has_value()) {
- m_reservations->discard_all();
- }
-}
-
-void PgScrubber::clear_scrub_reservations()
-{
- dout(10) << __func__ << dendl;
- m_reservations.reset(); // the remote reservations
- m_local_osd_resource.reset(); // the local reservation
- m_remote_osd_resource.reset(); // we as replica reserved for a Primary
-}
-
-void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text)
-{
- ceph_assert(m_pg->recovery_state.get_backfill_targets().empty());
-
- std::vector<std::pair<int, Message*>> messages;
- messages.reserve(m_pg->get_actingset().size());
-
- epoch_t epch = get_osdmap_epoch();
-
- for (auto& p : m_pg->get_actingset()) {
-
- if (p == m_pg_whoami)
- continue;
-
- dout(10) << "scrub requesting " << op_text << " from osd." << p << " Epoch: " << epch
- << dendl;
- Message* m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epch, opcode,
- m_pg_whoami);
- messages.push_back(std::make_pair(p.osd, m));
- }
-
- if (!messages.empty()) {
- m_osds->send_message_osd_cluster(messages, epch);
- }
-}
-
-void PgScrubber::unreserve_replicas()
-{
- dout(10) << __func__ << dendl;
- m_reservations.reset();
-}
-
-[[nodiscard]] bool PgScrubber::scrub_process_inconsistent()
-{
- dout(10) << __func__ << ": checking authoritative (mode="
- << m_mode_desc << ", auth remaining #: " << m_authoritative.size()
- << ")" << dendl;
-
- // authoritative only store objects which are missing or inconsistent.
- if (!m_authoritative.empty()) {
-
- stringstream ss;
- ss << m_pg->info.pgid << " " << m_mode_desc << " " << m_missing.size() << " missing, "
- << m_inconsistent.size() << " inconsistent objects";
- dout(2) << ss.str() << dendl;
- m_osds->clog->error(ss);
-
- if (m_is_repair) {
- state_clear(PG_STATE_CLEAN);
- // we know we have a problem, so it's OK to set the user-visible flag
- // even if we only reached here via auto-repair
- state_set(PG_STATE_REPAIR);
- update_op_mode_text();
-
- for (const auto& [hobj, shrd_list] : m_authoritative) {
-
- auto missing_entry = m_missing.find(hobj);
-
- if (missing_entry != m_missing.end()) {
- m_pg->repair_object(hobj, shrd_list, missing_entry->second);
- m_fixed_count += missing_entry->second.size();
- }
-
- if (m_inconsistent.count(hobj)) {
- m_pg->repair_object(hobj, shrd_list, m_inconsistent[hobj]);
- m_fixed_count += m_inconsistent[hobj].size();
- }
- }
- }
- }
- return (!m_authoritative.empty() && m_is_repair);
-}
-
-/*
- * note: only called for the Primary.
- */
-void PgScrubber::scrub_finish()
-{
- dout(10) << __func__ << " before flags: " << m_flags
- << ". repair state: " << (state_test(PG_STATE_REPAIR) ? "repair" : "no-repair")
- << ". deep_scrub_on_error: " << m_flags.deep_scrub_on_error << dendl;
-
- ceph_assert(m_pg->is_locked());
-
- m_pg->m_planned_scrub = requested_scrub_t{};
-
- // if the repair request comes from auto-repair and large number of errors,
- // we would like to cancel auto-repair
- if (m_is_repair && m_flags.auto_repair &&
- m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
-
- dout(10) << __func__ << " undoing the repair" << dendl;
- state_clear(PG_STATE_REPAIR); // not expected to be set, anyway
- m_is_repair = false;
- update_op_mode_text();
- }
-
- bool do_auto_scrub = false;
-
- // if a regular scrub had errors within the limit, do a deep scrub to auto repair
- if (m_flags.deep_scrub_on_error && !m_authoritative.empty() &&
- m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
- ceph_assert(!m_is_deep);
- do_auto_scrub = true;
- dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl;
- }
-
- m_flags.deep_scrub_on_error = false;
-
- // type-specific finish (can tally more errors)
- _scrub_finish();
-
- bool has_error = scrub_process_inconsistent();
-
- {
- stringstream oss;
- oss << m_pg->info.pgid.pgid << " " << m_mode_desc << " ";
- int total_errors = m_shallow_errors + m_deep_errors;
- if (total_errors)
- oss << total_errors << " errors";
- else
- oss << "ok";
- if (!m_is_deep && m_pg->info.stats.stats.sum.num_deep_scrub_errors)
- oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors
- << " remaining deep scrub error details lost)";
- if (m_is_repair)
- oss << ", " << m_fixed_count << " fixed";
- if (total_errors)
- m_osds->clog->error(oss);
- else
- m_osds->clog->debug(oss);
- }
-
- // Since we don't know which errors were fixed, we can only clear them
- // when every one has been fixed.
- if (m_is_repair) {
- if (m_fixed_count == m_shallow_errors + m_deep_errors) {
-
- ceph_assert(m_is_deep);
- m_shallow_errors = 0;
- m_deep_errors = 0;
- dout(20) << __func__ << " All may be fixed" << dendl;
-
- } else if (has_error) {
-
- // Deep scrub in order to get corrected error counts
- m_pg->scrub_after_recovery = true;
- m_pg->m_planned_scrub.req_scrub =
- m_pg->m_planned_scrub.req_scrub || m_flags.required;
-
- dout(20) << __func__ << " Current 'required': " << m_flags.required
- << " Planned 'req_scrub': " << m_pg->m_planned_scrub.req_scrub << dendl;
-
- } else if (m_shallow_errors || m_deep_errors) {
-
- // We have errors but nothing can be fixed, so there is no repair
- // possible.
- state_set(PG_STATE_FAILED_REPAIR);
- dout(10) << __func__ << " " << (m_shallow_errors + m_deep_errors)
- << " error(s) present with no repair possible" << dendl;
- }
- }
-
- {
- // finish up
- ObjectStore::Transaction t;
- m_pg->recovery_state.update_stats(
- [this](auto& history, auto& stats) {
- dout(10) << "m_pg->recovery_state.update_stats()" << dendl;
- utime_t now = ceph_clock_now();
- history.last_scrub = m_pg->recovery_state.get_info().last_update;
- history.last_scrub_stamp = now;
- if (m_is_deep) {
- history.last_deep_scrub = m_pg->recovery_state.get_info().last_update;
- history.last_deep_scrub_stamp = now;
- }
-
- if (m_is_deep) {
- if ((m_shallow_errors == 0) && (m_deep_errors == 0))
- history.last_clean_scrub_stamp = now;
- stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
- stats.stats.sum.num_deep_scrub_errors = m_deep_errors;
- stats.stats.sum.num_large_omap_objects = m_omap_stats.large_omap_objects;
- stats.stats.sum.num_omap_bytes = m_omap_stats.omap_bytes;
- stats.stats.sum.num_omap_keys = m_omap_stats.omap_keys;
- dout(25) << "scrub_finish shard " << m_pg_whoami
- << " num_omap_bytes = " << stats.stats.sum.num_omap_bytes
- << " num_omap_keys = " << stats.stats.sum.num_omap_keys << dendl;
- } else {
- stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
- // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
- // because of deep-scrub errors
- if (m_shallow_errors == 0)
- history.last_clean_scrub_stamp = now;
- }
- stats.stats.sum.num_scrub_errors = stats.stats.sum.num_shallow_scrub_errors +
- stats.stats.sum.num_deep_scrub_errors;
- if (m_flags.check_repair) {
- m_flags.check_repair = false;
- if (m_pg->info.stats.stats.sum.num_scrub_errors) {
- state_set(PG_STATE_FAILED_REPAIR);
- dout(10) << "scrub_finish " << m_pg->info.stats.stats.sum.num_scrub_errors
- << " error(s) still present after re-scrub" << dendl;
- }
- }
- return true;
- },
- &t);
- int tr = m_osds->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
- ceph_assert(tr == 0);
-
- if (!m_pg->snap_trimq.empty()) {
- dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
- m_pg->snap_trimmer_scrub_complete();
- }
- }
-
- if (has_error) {
- m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared<PGPeeringEvent>(
- get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery())));
- } else {
- m_is_repair = false;
- state_clear(PG_STATE_REPAIR);
- update_op_mode_text();
- }
-
- cleanup_on_finish();
- if (do_auto_scrub) {
- request_rescrubbing(m_pg->m_planned_scrub);
- }
-
- if (m_pg->is_active() && m_pg->is_primary()) {
- m_pg->recovery_state.share_pg_info();
- }
-}
-
-void PgScrubber::on_digest_updates()
-{
- dout(10) << __func__ << " #pending: " << num_digest_updates_pending << " pending? "
- << num_digest_updates_pending
- << (m_end.is_max() ? " <last chunk> " : " <mid chunk> ") << dendl;
-
- if (num_digest_updates_pending > 0) {
- // do nothing for now. We will be called again when new updates arrive
- return;
- }
-
- // got all updates, and finished with this chunk. Any more?
- if (m_end.is_max()) {
-
- scrub_finish();
- m_osds->queue_scrub_is_finished(m_pg);
-
- } else {
- // go get a new chunk (via "requeue")
- preemption_data.reset();
- m_osds->queue_scrub_next_chunk(m_pg, m_pg->is_scrub_blocking_ops());
- }
-}
-
-
-/*
- * note that the flags-set fetched from the PG (m_pg->m_planned_scrub)
- * is cleared once scrubbing starts; Some of the values dumped here are
- * thus transitory.
- */
-void PgScrubber::dump(ceph::Formatter* f) const
-{
- f->open_object_section("scrubber");
- f->dump_stream("epoch_start") << m_interval_start;
- f->dump_bool("active", m_active);
- if (m_active) {
- f->dump_stream("start") << m_start;
- f->dump_stream("end") << m_end;
- f->dump_stream("m_max_end") << m_max_end;
- f->dump_stream("subset_last_update") << m_subset_last_update;
- f->dump_bool("deep", m_is_deep);
- f->dump_bool("must_scrub", (m_pg->m_planned_scrub.must_scrub || m_flags.required));
- f->dump_bool("must_deep_scrub", m_pg->m_planned_scrub.must_deep_scrub);
- f->dump_bool("must_repair", m_pg->m_planned_scrub.must_repair);
- f->dump_bool("need_auto", m_pg->m_planned_scrub.need_auto);
- f->dump_bool("req_scrub", m_flags.required);
- f->dump_bool("time_for_deep", m_pg->m_planned_scrub.time_for_deep);
- f->dump_bool("auto_repair", m_flags.auto_repair);
- f->dump_bool("check_repair", m_flags.check_repair);
- f->dump_bool("deep_scrub_on_error", m_flags.deep_scrub_on_error);
- f->dump_stream("scrub_reg_stamp") << m_scrub_reg_stamp; // utime_t
- f->dump_unsigned("priority", m_flags.priority);
- f->dump_int("shallow_errors", m_shallow_errors);
- f->dump_int("deep_errors", m_deep_errors);
- f->dump_int("fixed", m_fixed_count);
- {
- f->open_array_section("waiting_on_whom");
- for (const auto& p : m_maps_status.get_awaited()) {
- f->dump_stream("shard") << p;
- }
- f->close_section();
- }
- }
- f->close_section();
-}
-
-
-void PgScrubber::handle_query_state(ceph::Formatter* f)
-{
- dout(10) << __func__ << dendl;
-
- f->open_object_section("scrub");
- f->dump_stream("scrubber.epoch_start") << m_interval_start;
- f->dump_bool("scrubber.active", m_active);
- f->dump_stream("scrubber.start") << m_start;
- f->dump_stream("scrubber.end") << m_end;
- f->dump_stream("scrubber.m_max_end") << m_max_end;
- f->dump_stream("scrubber.m_subset_last_update") << m_subset_last_update;
- f->dump_bool("scrubber.deep", m_is_deep);
- {
- f->open_array_section("scrubber.waiting_on_whom");
- for (const auto& p : m_maps_status.get_awaited()) {
- f->dump_stream("shard") << p;
- }
- f->close_section();
- }
-
- f->dump_string("comment", "DEPRECATED - may be removed in the next release");
-
- f->close_section();
-}
-
-PgScrubber::~PgScrubber() = default;
-
-PgScrubber::PgScrubber(PG* pg)
- : m_pg{pg}
- , m_pg_id{pg->pg_id}
- , m_osds{m_pg->osd}
- , m_pg_whoami{pg->pg_whoami}
- , preemption_data{pg}
-{
- m_fsm = std::make_unique<ScrubMachine>(m_pg, this);
- m_fsm->initiate();
-}
-
-void PgScrubber::reserve_replicas()
-{
- dout(10) << __func__ << dendl;
- m_reservations.emplace(m_pg, m_pg_whoami);
-}
-
-void PgScrubber::cleanup_on_finish()
-{
- dout(10) << __func__ << dendl;
- ceph_assert(m_pg->is_locked());
-
- state_clear(PG_STATE_SCRUBBING);
- state_clear(PG_STATE_DEEP_SCRUB);
- m_pg->publish_stats_to_osd();
-
- clear_scrub_reservations();
- m_pg->publish_stats_to_osd();
-
- requeue_waiting();
-
- reset_internal_state();
- m_flags = scrub_flags_t{};
-
- // type-specific state clear
- _scrub_clear_state();
-}
-
-// uses process_event(), so must be invoked externally
-void PgScrubber::scrub_clear_state()
-{
- dout(10) << __func__ << dendl;
-
- clear_pgscrub_state();
- m_fsm->process_event(FullReset{});
-}
-
-/*
- * note: does not access the state-machine
- */
-void PgScrubber::clear_pgscrub_state()
-{
- dout(10) << __func__ << dendl;
- ceph_assert(m_pg->is_locked());
-
- state_clear(PG_STATE_SCRUBBING);
- state_clear(PG_STATE_DEEP_SCRUB);
-
- state_clear(PG_STATE_REPAIR);
-
- clear_scrub_reservations();
- m_pg->publish_stats_to_osd();
-
- requeue_waiting();
-
- reset_internal_state();
- m_flags = scrub_flags_t{};
-
- // type-specific state clear
- _scrub_clear_state();
-}
-
-void PgScrubber::replica_handling_done()
-{
- dout(10) << __func__ << dendl;
-
- state_clear(PG_STATE_SCRUBBING);
- state_clear(PG_STATE_DEEP_SCRUB);
-
- reset_internal_state();
-
- m_pg->publish_stats_to_osd();
-}
-
-/*
- * note: performs run_callbacks()
- * note: reservations-related variables are not reset here
- */
-void PgScrubber::reset_internal_state()
-{
- dout(10) << __func__ << dendl;
-
- preemption_data.reset();
- m_maps_status.reset();
- m_received_maps.clear();
-
- m_start = hobject_t{};
- m_end = hobject_t{};
- m_max_end = hobject_t{};
- m_subset_last_update = eversion_t{};
- m_shallow_errors = 0;
- m_deep_errors = 0;
- m_fixed_count = 0;
- m_omap_stats = (const struct omap_stat_t){0};
-
- run_callbacks();
-
- m_inconsistent.clear();
- m_missing.clear();
- m_authoritative.clear();
- num_digest_updates_pending = 0;
- m_primary_scrubmap = ScrubMap{};
- m_primary_scrubmap_pos.reset();
- replica_scrubmap = ScrubMap{};
- replica_scrubmap_pos.reset();
- m_cleaned_meta_map = ScrubMap{};
- m_needs_sleep = true;
- m_sleep_started_at = utime_t{};
-
- m_active = false;
-}
-
-// note that only applicable to the Replica:
-void PgScrubber::advance_token()
-{
- dout(10) << __func__ << " was: " << m_current_token << dendl;
- m_current_token++;
-
- // when advance_token() is called, it is assumed that no scrubbing takes place.
- // We will, though, verify that. And if we are actually still handling a stale request -
- // both our internal state and the FSM state will be cleared.
- replica_handling_done();
- m_fsm->process_event(FullReset{});
-}
-
-bool PgScrubber::is_token_current(Scrub::act_token_t received_token)
-{
- if (received_token == 0 || received_token == m_current_token) {
- return true;
- }
- dout(5) << __func__ << " obsolete token (" << received_token
- << " vs current " << m_current_token << dendl;
-
- return false;
-}
-
-const OSDMapRef& PgScrubber::get_osdmap() const
-{
- return m_pg->get_osdmap();
-}
-
-ostream& operator<<(ostream& out, const PgScrubber& scrubber)
-{
- return out << scrubber.m_flags;
-}
-
-ostream& PgScrubber::show(ostream& out) const
-{
- return out << " [ " << m_pg_id << ": " << m_flags << " ] ";
-}
-
-int PgScrubber::asok_debug(std::string_view cmd,
- std::string param,
- Formatter* f,
- stringstream& ss)
-{
- dout(10) << __func__ << " cmd: " << cmd << " param: " << param << dendl;
-
- if (cmd == "block") {
- // set a flag that will cause the next 'select_range' to report a blocked object
- m_debug_blockrange = 1;
- } else if (cmd == "unblock") {
- // send an 'unblock' event, as if a blocked range was freed
- m_debug_blockrange = 0;
- m_fsm->process_event(Unblocked{});
- }
- return 0;
-}
-// ///////////////////// preemption_data_t //////////////////////////////////
-
-PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg}
-{
- m_left = static_cast<int>(
- m_pg->get_cct()->_conf.get_val<uint64_t>("osd_scrub_max_preemptions"));
-}
-
-void PgScrubber::preemption_data_t::reset()
-{
- std::lock_guard<std::mutex> lk{m_preemption_lock};
-
- m_preemptable = false;
- m_preempted = false;
- m_left =
- static_cast<int>(m_pg->cct->_conf.get_val<uint64_t>("osd_scrub_max_preemptions"));
- m_size_divisor = 1;
-}
-
-
-// ///////////////////// ReplicaReservations //////////////////////////////////
-namespace Scrub {
-
-void ReplicaReservations::release_replica(pg_shard_t peer, epoch_t epoch)
-{
- auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, peer.shard), epoch,
- MOSDScrubReserve::RELEASE, m_pg->pg_whoami);
- m_osds->send_message_osd_cluster(peer.osd, m, epoch);
-}
-
-ReplicaReservations::ReplicaReservations(PG* pg, pg_shard_t whoami)
- : m_pg{pg}
- , m_acting_set{pg->get_actingset()}
- , m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
- , m_pending{static_cast<int>(m_acting_set.size()) - 1}
- , m_pg_info{m_pg->get_pg_info(ScrubberPasskey())}
-{
- epoch_t epoch = m_pg->get_osdmap_epoch();
-
- // handle the special case of no replicas
- if (m_pending <= 0) {
- // just signal the scrub state-machine to continue
- send_all_done();
-
- } else {
-
- for (auto p : m_acting_set) {
- if (p == whoami)
- continue;
- auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, p.shard), epoch,
- MOSDScrubReserve::REQUEST, m_pg->pg_whoami);
- m_osds->send_message_osd_cluster(p.osd, m, epoch);
- m_waited_for_peers.push_back(p);
- dout(10) << __func__ << " <ReplicaReservations> reserve<-> " << p.osd << dendl;
- }
- }
-}
-
-void ReplicaReservations::send_all_done()
-{
- m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
-}
-
-void ReplicaReservations::send_reject()
-{
- m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
-}
-
-void ReplicaReservations::discard_all()
-{
- dout(10) << __func__ << " " << m_reserved_peers << dendl;
-
- m_had_rejections = true; // preventing late-coming responses from triggering events
- m_reserved_peers.clear();
- m_waited_for_peers.clear();
-}
-
-ReplicaReservations::~ReplicaReservations()
-{
- m_had_rejections = true; // preventing late-coming responses from triggering events
-
- // send un-reserve messages to all reserved replicas. We do not wait for answer (there
- // wouldn't be one). Other incoming messages will be discarded on the way, by our
- // owner.
- epoch_t epoch = m_pg->get_osdmap_epoch();
-
- for (auto& p : m_reserved_peers) {
- release_replica(p, epoch);
- }
- m_reserved_peers.clear();
-
- // note: the release will follow on the heels of the request. When tried otherwise,
- // grants that followed a reject arrived after the whole scrub machine-state was
- // reset, causing leaked reservations.
- for (auto& p : m_waited_for_peers) {
- release_replica(p, epoch);
- }
- m_waited_for_peers.clear();
-}
-
-/**
- * @ATTN we would not reach here if the ReplicaReservation object managed by the
- * scrubber was reset.
- */
-void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
-{
- dout(10) << __func__ << " <ReplicaReservations> granted-> " << from << dendl;
- op->mark_started();
-
- {
- // reduce the amount of extra release messages. Not a must, but the log is cleaner
- auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from);
- if (w != m_waited_for_peers.end())
- m_waited_for_peers.erase(w);
- }
-
- // are we forced to reject the reservation?
- if (m_had_rejections) {
-
- dout(10) << " rejecting late-coming reservation from " << from << dendl;
- release_replica(from, m_pg->get_osdmap_epoch());
-
- } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
- m_reserved_peers.end()) {
-
- dout(10) << " already had osd." << from << " reserved" << dendl;
-
- } else {
-
- dout(10) << " osd." << from << " scrub reserve = success" << dendl;
- m_reserved_peers.push_back(from);
- if (--m_pending == 0) {
- send_all_done();
- }
- }
-}
-
-void ReplicaReservations::handle_reserve_reject(OpRequestRef op, pg_shard_t from)
-{
- dout(10) << __func__ << " <ReplicaReservations> rejected-> " << from << dendl;
- dout(10) << __func__ << " " << *op->get_req() << dendl;
- op->mark_started();
-
- {
- // reduce the amount of extra release messages. Not a must, but the log is cleaner
- auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from);
- if (w != m_waited_for_peers.end())
- m_waited_for_peers.erase(w);
- }
-
- if (m_had_rejections) {
-
- // our failure was already handled when the first rejection arrived
- dout(15) << " ignoring late-coming rejection from " << from << dendl;
-
- } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
- m_reserved_peers.end()) {
-
- dout(10) << " already had osd." << from << " reserved" << dendl;
-
- } else {
-
- dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
- m_had_rejections = true; // preventing any additional notifications
- send_reject();
- }
-}
-
-
-// ///////////////////// LocalReservation //////////////////////////////////
-
-LocalReservation::LocalReservation(PG* pg, OSDService* osds)
- : m_pg{pg} // holding the "whole PG" for dout() sake
- , m_osds{osds}
-{
- if (!m_osds->inc_scrubs_local()) {
- dout(10) << __func__ << ": failed to reserve locally " << dendl;
- // the failure is signalled by not having m_holding_local_reservation set
- return;
- }
-
- dout(20) << __func__ << ": local OSD scrub resources reserved" << dendl;
- m_holding_local_reservation = true;
-}
-
-LocalReservation::~LocalReservation()
-{
- if (m_holding_local_reservation) {
- m_holding_local_reservation = false;
- m_osds->dec_scrubs_local();
- }
-}
-
-
-// ///////////////////// ReservedByRemotePrimary ///////////////////////////////
-
-ReservedByRemotePrimary::ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch)
- : m_pg{pg}, m_osds{osds}, m_reserved_at{epoch}
-{
- if (!m_osds->inc_scrubs_remote()) {
- dout(10) << __func__ << ": failed to reserve at Primary request" << dendl;
- // the failure is signalled by not having m_reserved_by_remote_primary set
- return;
- }
-
- dout(20) << __func__ << ": scrub resources reserved at Primary request" << dendl;
- m_reserved_by_remote_primary = true;
-}
-
-bool ReservedByRemotePrimary::is_stale() const
-{
- return m_reserved_at < m_pg->get_same_interval_since();
-}
-
-ReservedByRemotePrimary::~ReservedByRemotePrimary()
-{
- if (m_reserved_by_remote_primary) {
- m_reserved_by_remote_primary = false;
- m_osds->dec_scrubs_remote();
- }
-}
-
-// ///////////////////// MapsCollectionStatus ////////////////////////////////
-
-auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from)
- -> std::tuple<bool, std::string_view>
-{
- auto fe = std::find(m_maps_awaited_for.begin(), m_maps_awaited_for.end(), from);
- if (fe != m_maps_awaited_for.end()) {
- // we are indeed waiting for a map from this replica
- m_maps_awaited_for.erase(fe);
- return std::tuple{true, ""sv};
- } else {
- return std::tuple{false, " unsolicited scrub-map"sv};
- }
-}
-
-void MapsCollectionStatus::reset()
-{
- *this = MapsCollectionStatus{};
-}
-
-std::string MapsCollectionStatus::dump() const
-{
- std::string all;
- for (const auto& rp : m_maps_awaited_for) {
- all.append(rp.get_osd() + " "s);
- }
- return all;
-}
-
-ostream& operator<<(ostream& out, const MapsCollectionStatus& sf)
-{
- out << " [ ";
- for (const auto& rp : sf.m_maps_awaited_for) {
- out << rp.get_osd() << " ";
- }
- if (!sf.m_local_map_ready) {
- out << " local ";
- }
- return out << " ] ";
-}
-
-// ///////////////////// blocked_range_t ///////////////////////////////
-
-blocked_range_t::blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id)
- : m_osds{osds}
-{
- auto now_is = std::chrono::system_clock::now();
- m_callbk = new LambdaContext([now_is, pg_id, osds]([[maybe_unused]] int r) {
- std::time_t now_c = std::chrono::system_clock::to_time_t(now_is);
- char buf[50];
- strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", std::localtime(&now_c));
- lgeneric_subdout(g_ceph_context, osd, 10)
- << "PgScrubber: " << pg_id << " blocked on an object for too long (since " << buf
- << ")" << dendl;
- osds->clog->warn() << "osd." << osds->whoami << " PgScrubber: " << pg_id << " blocked on an object for too long (since " << buf << ")";
- return;
- });
-
- std::lock_guard l(m_osds->sleep_lock);
- m_osds->sleep_timer.add_event_after(waittime, m_callbk);
-}
-
-blocked_range_t::~blocked_range_t()
-{
- std::lock_guard l(m_osds->sleep_lock);
- m_osds->sleep_timer.cancel_event(m_callbk);
-}
-
-} // namespace Scrub
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#pragma once
-
-#include <cassert>
-#include <chrono>
-#include <memory>
-#include <mutex>
-#include <optional>
-#include <string>
-#include <string_view>
-#include <vector>
-
-#include "PG.h"
-#include "ScrubStore.h"
-#include "scrub_machine_lstnr.h"
-#include "scrubber_common.h"
-
-class Callback;
-
-namespace Scrub {
-class ScrubMachine;
-struct BuildMap;
-
-/**
- * Reserving/freeing scrub resources at the replicas.
- *
- * When constructed - sends reservation requests to the acting_set.
- * A rejection triggers a "couldn't acquire the replicas' scrub resources" event.
- * All previous requests, whether already granted or not, are explicitly released.
- *
- * A note re performance: I've measured a few container alternatives for
- * m_reserved_peers, with its specific usage pattern. Std::set is extremely slow, as
- * expected. flat_set is only slightly better. Surprisingly - std::vector (with no
- * sorting) is better than boost::small_vec. And for std::vector: no need to pre-reserve.
- */
-class ReplicaReservations {
- using OrigSet = decltype(std::declval<PG>().get_actingset());
-
- PG* m_pg;
- OrigSet m_acting_set;
- OSDService* m_osds;
- std::vector<pg_shard_t> m_waited_for_peers;
- std::vector<pg_shard_t> m_reserved_peers;
- bool m_had_rejections{false};
- int m_pending{-1};
- const pg_info_t& m_pg_info;
-
- void release_replica(pg_shard_t peer, epoch_t epoch);
-
- void send_all_done(); ///< all reservations are granted
-
- /// notify the scrubber that we have failed to reserve replicas' resources
- void send_reject();
-
- public:
- /**
- * quietly discard all knowledge about existing reservations. No messages
- * are sent to peers.
- * To be used upon interval change, as we know the the running scrub is no longer
- * relevant, and that the replicas had reset the reservations on their side.
- */
- void discard_all();
-
- ReplicaReservations(PG* pg, pg_shard_t whoami);
-
- ~ReplicaReservations();
-
- void handle_reserve_grant(OpRequestRef op, pg_shard_t from);
-
- void handle_reserve_reject(OpRequestRef op, pg_shard_t from);
-};
-
-/**
- * wraps the local OSD scrub resource reservation in an RAII wrapper
- */
-class LocalReservation {
- PG* m_pg;
- OSDService* m_osds;
- bool m_holding_local_reservation{false};
-
- public:
- LocalReservation(PG* pg, OSDService* osds);
- ~LocalReservation();
- bool is_reserved() const { return m_holding_local_reservation; }
-};
-
-/**
- * wraps the OSD resource we are using when reserved as a replica by a scrubbing master.
- */
-class ReservedByRemotePrimary {
- PG* m_pg;
- OSDService* m_osds;
- bool m_reserved_by_remote_primary{false};
- const epoch_t m_reserved_at;
-
- public:
- ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch);
- ~ReservedByRemotePrimary();
- [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; }
-
- /// compare the remembered reserved-at epoch to the current interval
- [[nodiscard]] bool is_stale() const;
-};
-
-/**
- * Once all replicas' scrub maps are received, we go on to compare the maps. That is -
- * unless we we have not yet completed building our own scrub map. MapsCollectionStatus
- * combines the status of waiting for both the local map and the replicas, without
- * resorting to adding dummy entries into a list.
- */
-class MapsCollectionStatus {
-
- bool m_local_map_ready{false};
- std::vector<pg_shard_t> m_maps_awaited_for;
-
- public:
- [[nodiscard]] bool are_all_maps_available() const
- {
- return m_local_map_ready && m_maps_awaited_for.empty();
- }
-
- void mark_local_map_ready() { m_local_map_ready = true; }
-
- void mark_replica_map_request(pg_shard_t from_whom)
- {
- m_maps_awaited_for.push_back(from_whom);
- }
-
- /// @returns true if indeed waiting for this one. Otherwise: an error string
- auto mark_arriving_map(pg_shard_t from) -> std::tuple<bool, std::string_view>;
-
- std::vector<pg_shard_t> get_awaited() const { return m_maps_awaited_for; }
-
- void reset();
-
- std::string dump() const;
-
- friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf);
-};
-
-
-} // namespace Scrub
-
-
-/**
- * the scrub operation flags. Primary only.
- * Set at scrub start. Checked in multiple locations - mostly
- * at finish.
- */
-struct scrub_flags_t {
-
- unsigned int priority{0};
-
- /**
- * set by queue_scrub() if either planned_scrub.auto_repair or
- * need_auto were set.
- * Tested at scrub end.
- */
- bool auto_repair{false};
-
- /// this flag indicates that we are scrubbing post repair to verify everything is fixed
- bool check_repair{false};
-
- /// checked at the end of the scrub, to possibly initiate a deep-scrub
- bool deep_scrub_on_error{false};
-
- /**
- * scrub must not be aborted.
- * Set for explicitly requested scrubs, and for scrubs originated by the pairing
- * process with the 'repair' flag set (in the RequestScrub event).
- */
- bool required{false};
-};
-
-ostream& operator<<(ostream& out, const scrub_flags_t& sf);
-
-
-/**
- * The part of PG-scrubbing code that isn't state-machine wiring.
- *
- * Why the separation? I wish to move to a different FSM implementation. Thus I
- * am forced to strongly decouple the state-machine implementation details from
- * the actual scrubbing code.
- */
-class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
-
- public:
- explicit PgScrubber(PG* pg);
-
- // ------------------ the I/F exposed to the PG (ScrubPgIF) -------------
-
- /// are we waiting for resource reservation grants form our replicas?
- [[nodiscard]] bool is_reserving() const final;
-
- void initiate_regular_scrub(epoch_t epoch_queued) final;
-
- void initiate_scrub_after_repair(epoch_t epoch_queued) final;
-
- void send_scrub_resched(epoch_t epoch_queued) final;
-
- void active_pushes_notification(epoch_t epoch_queued) final;
-
- void update_applied_notification(epoch_t epoch_queued) final;
-
- void send_scrub_unblock(epoch_t epoch_queued) final;
-
- void digest_update_notification(epoch_t epoch_queued) final;
-
- void send_replica_maps_ready(epoch_t epoch_queued) final;
-
- void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
-
- void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
-
- void send_replica_pushes_upd(epoch_t epoch_queued) final;
- /**
- * The PG has updated its 'applied version'. It might be that we are waiting for this
- * information: after selecting a range of objects to scrub, we've marked the latest
- * version of these objects in m_subset_last_update. We will not start the map building
- * before we know that the PG has reached this version.
- */
- void on_applied_when_primary(const eversion_t& applied_version) final;
-
- void send_full_reset(epoch_t epoch_queued) final;
-
- void send_chunk_free(epoch_t epoch_queued) final;
-
- void send_chunk_busy(epoch_t epoch_queued) final;
-
- void send_local_map_done(epoch_t epoch_queued) final;
-
- void send_maps_compared(epoch_t epoch_queued) final;
-
- void send_get_next_chunk(epoch_t epoch_queued) final;
-
- void send_scrub_is_finished(epoch_t epoch_queued) final;
-
- /**
- * we allow some number of preemptions of the scrub, which mean we do
- * not block. Then we start to block. Once we start blocking, we do
- * not stop until the scrub range is completed.
- */
- bool write_blocked_by_scrub(const hobject_t& soid) final;
-
- /// true if the given range intersects the scrub interval in any way
- bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;
-
- /**
- * we are a replica being asked by the Primary to reserve OSD resources for
- * scrubbing
- */
- void handle_scrub_reserve_request(OpRequestRef op) final;
-
- void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
- void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
- void handle_scrub_reserve_release(OpRequestRef op) final;
- void discard_replica_reservations() final;
- void clear_scrub_reservations() final; // PG::clear... fwds to here
- void unreserve_replicas() final;
-
- // managing scrub op registration
-
- void reg_next_scrub(const requested_scrub_t& request_flags) final;
-
- void unreg_next_scrub() final;
-
- void scrub_requested(scrub_level_t scrub_level,
- scrub_type_t scrub_type,
- requested_scrub_t& req_flags) final;
-
- /**
- * Reserve local scrub resources (managed by the OSD)
- *
- * Fails if OSD's local-scrubs budget was exhausted
- * \returns were local resources reserved?
- */
- bool reserve_local() final;
-
- void handle_query_state(ceph::Formatter* f) final;
-
- void dump(ceph::Formatter* f) const override;
-
- // used if we are a replica
-
- void replica_scrub_op(OpRequestRef op) final;
-
- /// the op priority, taken from the primary's request message
- Scrub::scrub_prio_t replica_op_priority() const final
- {
- return m_replica_request_priority;
- };
-
- unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
- unsigned int suggested_priority) const final;
- /// the version that refers to m_flags.priority
- unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final;
-
- void add_callback(Context* context) final { m_callbacks.push_back(context); }
-
- [[nodiscard]] bool are_callbacks_pending() const final // used for an assert in PG.cc
- {
- return !m_callbacks.empty();
- }
-
- /// handle a message carrying a replica map
- void map_from_replica(OpRequestRef op) final;
-
- void scrub_clear_state() final;
-
- /**
- * add to scrub statistics, but only if the soid is below the scrub start
- */
- virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
- const hobject_t& soid) override
- {
- ceph_assert(false);
- }
-
- /**
- * finalize the parameters of the initiated scrubbing session:
- *
- * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set;
- * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set.
- */
- void set_op_parameters(requested_scrub_t& request) final;
-
- void cleanup_store(ObjectStore::Transaction* t) final;
-
- bool get_store_errors(const scrub_ls_arg_t& arg,
- scrub_ls_result_t& res_inout) const override
- {
- return false;
- }
-
- int asok_debug(std::string_view cmd,
- std::string param,
- Formatter* f,
- std::stringstream& ss) override;
- int m_debug_blockrange{0};
-
- // -------------------------------------------------------------------------------------------
- // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener)
-
- [[nodiscard]] bool is_primary() const final { return m_pg->recovery_state.is_primary(); }
-
- void select_range_n_notify() final;
-
- Scrub::BlockedRangeWarning acquire_blocked_alarm() final;
-
- /// walk the log to find the latest update that affects our chunk
- eversion_t search_log_for_updates() const final;
-
- eversion_t get_last_update_applied() const final
- {
- return m_pg->recovery_state.get_last_update_applied();
- }
-
- int pending_active_pushes() const final { return m_pg->active_pushes; }
-
- void on_init() final;
- void on_replica_init() final;
- void replica_handling_done() final;
-
- /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
- /// (thus can be called from FSM reactions)
- void clear_pgscrub_state() final;
-
- /*
- * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep'
- * is asserted - after a configuration-dependent timeout.
- */
- void add_delayed_scheduling() final;
-
- void get_replicas_maps(bool replica_can_preempt) final;
-
- void on_digest_updates() final;
-
- ScrubMachineListener::MsgAndEpoch
- prep_replica_map_msg(Scrub::PreemptionNoted was_preempted) final;
-
- void send_replica_map(const ScrubMachineListener::MsgAndEpoch& preprepared) final;
-
- void send_preempted_replica() final;
-
- void send_remotes_reserved(epoch_t epoch_queued) final;
- void send_reservation_failure(epoch_t epoch_queued) final;
-
- /**
- * does the PG have newer updates than what we (the scrubber) know?
- */
- [[nodiscard]] bool has_pg_marked_new_updates() const final;
-
- void set_subset_last_update(eversion_t e) final;
-
- void maps_compare_n_cleanup() final;
-
- Scrub::preemption_t& get_preemptor() final;
-
- int build_primary_map_chunk() final;
-
- int build_replica_map_chunk() final;
-
- void reserve_replicas() final;
-
- [[nodiscard]] bool was_epoch_changed() const final;
-
- void mark_local_map_ready() final;
-
- [[nodiscard]] bool are_all_maps_available() const final;
-
- std::string dump_awaited_maps() const final;
-
- protected:
- bool state_test(uint64_t m) const { return m_pg->state_test(m); }
- void state_set(uint64_t m) { m_pg->state_set(m); }
- void state_clear(uint64_t m) { m_pg->state_clear(m); }
-
- [[nodiscard]] bool is_scrub_registered() const;
-
- virtual void _scrub_clear_state() {}
-
- utime_t m_scrub_reg_stamp; ///< stamp we registered for
-
- ostream& show(ostream& out) const override;
-
- public:
- // -------------------------------------------------------------------------------------------
-
- friend ostream& operator<<(ostream& out, const PgScrubber& scrubber);
-
- static utime_t scrub_must_stamp() { return utime_t(1, 1); }
-
- virtual ~PgScrubber(); // must be defined separately, in the .cc file
-
- [[nodiscard]] bool is_scrub_active() const final { return m_active; }
-
- private:
- void reset_internal_state();
-
- /**
- * the current scrubbing operation is done. We should mark that fact, so that
- * all events related to the previous operation can be discarded.
- */
- void advance_token();
-
- bool is_token_current(Scrub::act_token_t received_token);
-
- void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
-
- void _scan_snaps(ScrubMap& smap);
-
- ScrubMap clean_meta_map();
-
- /**
- * mark down some parameters of the initiated scrub:
- * - the epoch when started;
- * - the depth of the scrub requested (from the PG_STATE variable)
- */
- void reset_epoch(epoch_t epoch_queued);
-
- void run_callbacks();
-
- // ----- methods used to verify the relevance of incoming events:
-
- /**
- * is the incoming event still relevant, and should be processed?
- *
- * It isn't if:
- * - (1) we are no longer 'actively scrubbing'; or
- * - (2) the message is from an epoch prior to when we started the current scrub
- * session; or
- * - (3) the message epoch is from a previous interval; or
- * - (4) the 'abort' configuration flags were set.
- *
- * For (1) & (2) - teh incoming message is discarded, w/o further action.
- *
- * For (3): (see check_interval() for a full description) if we have not reacted yet
- * to this specific new interval, we do now:
- * - replica reservations are silently discarded (we count on the replicas to notice
- * the interval change and un-reserve themselves);
- * - the scrubbing is halted.
- *
- * For (4): the message will be discarded, but also:
- * if this is the first time we've noticed the 'abort' request, we perform the abort.
- *
- * \returns should the incoming event be processed?
- */
- bool is_message_relevant(epoch_t epoch_to_verify);
-
- /**
- * check the 'no scrub' configuration options.
- */
- [[nodiscard]] bool should_abort() const;
-
- /**
- * Check the 'no scrub' configuration flags.
- *
- * Reset everything if the abort was not handled before.
- * @returns false if the message was discarded due to abort flag.
- */
- [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify);
-
- [[nodiscard]] bool check_interval(epoch_t epoch_to_verify);
-
- epoch_t m_last_aborted{}; // last time we've noticed a request to abort
-
- /**
- * return true if any inconsistency/missing is repaired, false otherwise
- */
- [[nodiscard]] bool scrub_process_inconsistent();
-
- void scrub_compare_maps();
-
- bool m_needs_sleep{true}; ///< should we sleep before being rescheduled? always
- ///< 'true', unless we just got out of a sleep period
-
- utime_t m_sleep_started_at;
-
-
- // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed'
- // to guarantee un-reserving when deleted.
- std::optional<Scrub::ReplicaReservations> m_reservations;
- std::optional<Scrub::LocalReservation> m_local_osd_resource;
-
- /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing
- std::optional<Scrub::ReservedByRemotePrimary> m_remote_osd_resource;
-
- void cleanup_on_finish(); // scrub_clear_state() as called for a Primary when
- // Active->NotActive
-
- /// the part that actually finalizes a scrub
- void scrub_finish();
-
- protected:
- PG* const m_pg;
-
- /**
- * the derivative-specific scrub-finishing touches:
- */
- virtual void _scrub_finish() {}
-
- /**
- * Validate consistency of the object info and snap sets.
- */
- virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest)
- {}
-
- // common code used by build_primary_map_chunk() and build_replica_map_chunk():
- int build_scrub_map_chunk(ScrubMap& map, // primary or replica?
- ScrubMapBuilder& pos,
- hobject_t start,
- hobject_t end,
- bool deep);
-
- std::unique_ptr<Scrub::ScrubMachine> m_fsm;
- const spg_t m_pg_id; ///< a local copy of m_pg->pg_id
- OSDService* const m_osds;
- const pg_shard_t m_pg_whoami; ///< a local copy of m_pg->pg_whoami;
-
- epoch_t m_interval_start{0}; ///< interval's 'from' of when scrubbing was first scheduled
- /*
- * the exact epoch when the scrubbing actually started (started here - cleared checks
- * for no-scrub conf). Incoming events are verified against this, with stale events
- * discarded.
- */
- epoch_t m_epoch_start{0}; ///< the actual epoch when scrubbing started
-
- /**
- * (replica) a tag identifying a specific scrub "session". Incremented whenever the
- * Primary releases the replica scrub resources.
- * When the scrub session is terminated (even if the interval remains unchanged, as
- * might happen following an asok no-scrub command), stale scrub-resched messages
- * triggered by the backend will be discarded.
- */
- Scrub::act_token_t m_current_token{1};
-
- scrub_flags_t m_flags;
-
- bool m_active{false};
-
- eversion_t m_subset_last_update{};
-
- std::unique_ptr<Scrub::Store> m_store;
-
- int num_digest_updates_pending{0};
- hobject_t m_start, m_end; ///< note: half-closed: [start,end)
-
- /// Returns reference to current osdmap
- const OSDMapRef& get_osdmap() const;
-
- /// Returns epoch of current osdmap
- epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); }
-
- CephContext* get_pg_cct() const { return m_pg->cct; }
-
- // collected statistics
- int m_shallow_errors{0};
- int m_deep_errors{0};
- int m_fixed_count{0};
-
- /// Maps from objects with errors to missing peers
- HobjToShardSetMapping m_missing;
-
- protected:
- /**
- * 'm_is_deep' - is the running scrub a deep one?
- *
- * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is
- * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is
- * meaningful both for the primary and the replicas, and is used as a parameter when
- * building the scrub maps.
- */
- bool m_is_deep{false};
-
- /**
- * If set: affects the backend & scrubber-backend functions called after all
- * scrub maps are available.
- *
- * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be
- * a "user facing" status display only).
- */
- bool m_is_repair{false};
-
- /**
- * User-readable summary of the scrubber's current mode of operation. Used for
- * both osd.*.log and the cluster log.
- * One of:
- * "repair"
- * "deep-scrub",
- * "scrub
- *
- * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. for
- * auto_repair will show as "deep-scrub" and not as "repair" (until the first error
- * is detected).
- */
- std::string_view m_mode_desc;
-
- void update_op_mode_text();
-
-private:
-
- /**
- * initiate a deep-scrub after the current scrub ended with errors.
- */
- void request_rescrubbing(requested_scrub_t& req_flags);
-
- /*
- * Select a range of objects to scrub.
- *
- * By:
- * - setting tentative range based on conf and divisor
- * - requesting a partial list of elements from the backend;
- * - handling some head/clones issues
- *
- * The selected range is set directly into 'm_start' and 'm_end'
- */
- bool select_range();
-
- std::list<Context*> m_callbacks;
-
- /**
- * send a replica (un)reservation request to the acting set
- *
- * @param opcode - one of MOSDScrubReserve::REQUEST
- * or MOSDScrubReserve::RELEASE
- */
- void message_all_replicas(int32_t opcode, std::string_view op_text);
-
- hobject_t m_max_end; ///< Largest end that may have been sent to replicas
- ScrubMap m_primary_scrubmap;
- ScrubMapBuilder m_primary_scrubmap_pos;
-
- std::map<pg_shard_t, ScrubMap> m_received_maps;
-
- /// Cleaned std::map pending snap metadata scrub
- ScrubMap m_cleaned_meta_map;
-
- void _request_scrub_map(pg_shard_t replica,
- eversion_t version,
- hobject_t start,
- hobject_t end,
- bool deep,
- bool allow_preemption);
-
-
- Scrub::MapsCollectionStatus m_maps_status;
-
- omap_stat_t m_omap_stats = (const struct omap_stat_t){0};
-
- /// Maps from objects with errors to inconsistent peers
- HobjToShardSetMapping m_inconsistent;
-
- /// Maps from object with errors to good peers
- std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t>>> m_authoritative;
-
- // ------------ members used if we are a replica
-
- epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message
-
- ScrubMapBuilder replica_scrubmap_pos;
- ScrubMap replica_scrubmap;
-
- /**
- * we mark the request priority as it arrived. It influences the queuing priority
- * when we wait for local updates
- */
- Scrub::scrub_prio_t m_replica_request_priority;
-
- /**
- * the 'preemption' "state-machine".
- * Note: I was considering an orthogonal sub-machine implementation, but as
- * the state diagram is extremely simple, the added complexity wasn't justified.
- */
- class preemption_data_t : public Scrub::preemption_t {
- public:
- preemption_data_t(PG* pg); // the PG access is used for conf access (and logs)
-
- [[nodiscard]] bool is_preemptable() const final { return m_preemptable; }
-
- bool do_preempt() final
- {
- if (m_preempted || !m_preemptable)
- return false;
-
- std::lock_guard<std::mutex> lk{m_preemption_lock};
- if (!m_preemptable)
- return false;
-
- m_preempted = true;
- return true;
- }
-
- /// same as 'do_preempt()' but w/o checks (as once a replica
- /// was preempted, we cannot continue)
- void replica_preempted() { m_preempted = true; }
-
- void enable_preemption()
- {
- std::lock_guard<std::mutex> lk{m_preemption_lock};
- if (are_preemptions_left() && !m_preempted) {
- m_preemptable = true;
- }
- }
-
- /// used by a replica to set preemptability state according to the Primary's request
- void force_preemptability(bool is_allowed)
- {
- // note: no need to lock for a replica
- m_preempted = false;
- m_preemptable = is_allowed;
- }
-
- bool disable_and_test() final
- {
- std::lock_guard<std::mutex> lk{m_preemption_lock};
- m_preemptable = false;
- return m_preempted;
- }
-
- [[nodiscard]] bool was_preempted() const { return m_preempted; }
-
- [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; }
-
- void reset();
-
- void adjust_parameters() final
- {
- std::lock_guard<std::mutex> lk{m_preemption_lock};
-
- if (m_preempted) {
- m_preempted = false;
- m_preemptable = adjust_left();
- } else {
- m_preemptable = are_preemptions_left();
- }
- }
-
- private:
- PG* m_pg;
- mutable std::mutex m_preemption_lock;
- bool m_preemptable{false};
- bool m_preempted{false};
- int m_left;
- size_t m_size_divisor{1};
- bool are_preemptions_left() const { return m_left > 0; }
-
- bool adjust_left()
- {
- if (m_left > 0) {
- --m_left;
- m_size_divisor *= 2;
- }
- return m_left > 0;
- }
- };
-
- preemption_data_t preemption_data;
-};
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "scrub_machine.h"
-
-#include <chrono>
-#include <typeinfo>
-
-#include <boost/core/demangle.hpp>
-
-#include "OSD.h"
-#include "OpRequest.h"
-#include "ScrubStore.h"
-#include "scrub_machine_lstnr.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_osd
-#undef dout_prefix
-#define dout_prefix *_dout << " scrubberFSM "
-
-using namespace std::chrono;
-using namespace std::chrono_literals;
-namespace sc = boost::statechart;
-
-#define DECLARE_LOCALS \
- ScrubMachineListener* scrbr = context<ScrubMachine>().m_scrbr; \
- std::ignore = scrbr; \
- auto pg_id = context<ScrubMachine>().m_pg_id; \
- std::ignore = pg_id;
-
-namespace Scrub {
-
-// --------- trace/debug auxiliaries -------------------------------
-
-void on_event_creation(std::string_view nm)
-{
- dout(20) << " event: --vvvv---- " << nm << dendl;
-}
-
-void on_event_discard(std::string_view nm)
-{
- dout(20) << " event: --^^^^---- " << nm << dendl;
-}
-
-void ScrubMachine::my_states() const
-{
- for (auto si = state_begin(); si != state_end(); ++si) {
- const auto& siw{*si}; // prevents a warning re side-effects
- dout(20) << " state: " << boost::core::demangle(typeid(siw).name()) << dendl;
- }
-}
-
-void ScrubMachine::assert_not_active() const
-{
- ceph_assert(state_cast<const NotActive*>());
-}
-
-bool ScrubMachine::is_reserving() const
-{
- return state_cast<const ReservingReplicas*>();
-}
-
-bool ScrubMachine::is_accepting_updates() const
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- ceph_assert(scrbr->is_primary());
-
- return state_cast<const WaitLastUpdate*>();
-}
-
-// for the rest of the code in this file - we know what PG we are dealing with:
-#undef dout_prefix
-#define dout_prefix _prefix(_dout, this->context<ScrubMachine>().m_pg)
-template <class T> static ostream& _prefix(std::ostream* _dout, T* t)
-{
- return t->gen_prefix(*_dout) << " scrubberFSM pg(" << t->pg_id << ") ";
-}
-
-// ////////////// the actual actions
-
-// ----------------------- NotActive -----------------------------------------
-
-NotActive::NotActive(my_context ctx) : my_base(ctx)
-{
- dout(10) << "-- state -->> NotActive" << dendl;
-}
-
-// ----------------------- ReservingReplicas ---------------------------------
-
-ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx)
-{
- dout(10) << "-- state -->> ReservingReplicas" << dendl;
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- scrbr->reserve_replicas();
-}
-
-sc::result ReservingReplicas::react(const ReservationFailure&)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl;
-
- // the Scrubber must release all resources and abort the scrubbing
- scrbr->clear_pgscrub_state();
- return transit<NotActive>();
-}
-
-/**
- * note: the event poster is handling the scrubber reset
- */
-sc::result ReservingReplicas::react(const FullReset&)
-{
- dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl;
- return transit<NotActive>();
-}
-
-// ----------------------- ActiveScrubbing -----------------------------------
-
-ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx)
-{
- dout(10) << "-- state -->> ActiveScrubbing" << dendl;
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- scrbr->on_init();
-}
-
-/**
- * upon exiting the Active state
- */
-ActiveScrubbing::~ActiveScrubbing()
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(15) << __func__ << dendl;
- scrbr->unreserve_replicas();
-}
-
-/*
- * The only source of an InternalError event as of now is the BuildMap state,
- * when encountering a backend error.
- * We kill the scrub and reset the FSM.
- */
-sc::result ActiveScrubbing::react(const InternalError&)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << __func__ << dendl;
- scrbr->clear_pgscrub_state();
- return transit<NotActive>();
-}
-
-sc::result ActiveScrubbing::react(const FullReset&)
-{
- dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl;
- // caller takes care of clearing the scrubber & FSM states
- return transit<NotActive>();
-}
-
-// ----------------------- RangeBlocked -----------------------------------
-
-/*
- * Blocked. Will be released by kick_object_context_blocked() (or upon
- * an abort)
- *
- * Note: we are never expected to be waiting for long for a blocked object.
- * Unfortunately we know from experience that a bug elsewhere might result
- * in an indefinite wait in this state, for an object that is never released.
- * If that happens, all we can do is to issue a warning message to help
- * with the debugging.
- */
-RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx)
-{
- dout(10) << "-- state -->> Act/RangeBlocked" << dendl;
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
-
- // arrange to have a warning message issued if we are stuck in this
- // state for longer than some reasonable number of minutes.
- m_timeout = scrbr->acquire_blocked_alarm();
-}
-
-// ----------------------- PendingTimer -----------------------------------
-
-/**
- * Sleeping till timer reactivation - or just requeuing
- */
-PendingTimer::PendingTimer(my_context ctx) : my_base(ctx)
-{
- dout(10) << "-- state -->> Act/PendingTimer" << dendl;
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
-
- scrbr->add_delayed_scheduling();
-}
-
-// ----------------------- NewChunk -----------------------------------
-
-/**
- * Preconditions:
- * - preemption data was set
- * - epoch start was updated
- */
-NewChunk::NewChunk(my_context ctx) : my_base(ctx)
-{
- dout(10) << "-- state -->> Act/NewChunk" << dendl;
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
-
- scrbr->get_preemptor().adjust_parameters();
-
- // choose range to work on
- // select_range_n_notify() will signal either SelectedChunkFree or
- // ChunkIsBusy. If 'busy', we transition to Blocked, and wait for the
- // range to become available.
- scrbr->select_range_n_notify();
-}
-
-sc::result NewChunk::react(const SelectedChunkFree&)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "NewChunk::react(const SelectedChunkFree&)" << dendl;
-
- scrbr->set_subset_last_update(scrbr->search_log_for_updates());
- return transit<WaitPushes>();
-}
-
-// ----------------------- WaitPushes -----------------------------------
-
-WaitPushes::WaitPushes(my_context ctx) : my_base(ctx)
-{
- dout(10) << " -- state -->> Act/WaitPushes" << dendl;
- post_event(ActivePushesUpd{});
-}
-
-/*
- * Triggered externally, by the entity that had an update re pushes
- */
-sc::result WaitPushes::react(const ActivePushesUpd&)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "WaitPushes::react(const ActivePushesUpd&) pending_active_pushes: "
- << scrbr->pending_active_pushes() << dendl;
-
- if (!scrbr->pending_active_pushes()) {
- // done waiting
- return transit<WaitLastUpdate>();
- }
-
- return discard_event();
-}
-
-// ----------------------- WaitLastUpdate -----------------------------------
-
-WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx)
-{
- dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl;
- post_event(UpdatesApplied{});
-}
-
-/**
- * Note:
- * Updates are locally readable immediately. Thus, on the replicas we do need
- * to wait for the update notifications before scrubbing. For the Primary it's
- * a bit different: on EC (and only there) rmw operations have an additional
- * read roundtrip. That means that on the Primary we need to wait for
- * last_update_applied (the replica side, even on EC, is still safe
- * since the actual transaction will already be readable by commit time.
- */
-void WaitLastUpdate::on_new_updates(const UpdatesApplied&)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "WaitLastUpdate::on_new_updates(const UpdatesApplied&)" << dendl;
-
- if (scrbr->has_pg_marked_new_updates()) {
- post_event(InternalAllUpdates{});
- } else {
- // will be requeued by op_applied
- dout(10) << "wait for EC read/modify/writes to queue" << dendl;
- }
-}
-
-/*
- * request maps from the replicas in the acting set
- */
-sc::result WaitLastUpdate::react(const InternalAllUpdates&)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl;
-
- scrbr->get_replicas_maps(scrbr->get_preemptor().is_preemptable());
- return transit<BuildMap>();
-}
-
-// ----------------------- BuildMap -----------------------------------
-
-BuildMap::BuildMap(my_context ctx) : my_base(ctx)
-{
- dout(10) << " -- state -->> Act/BuildMap" << dendl;
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
-
- // no need to check for an epoch change, as all possible flows that brought us here have
- // a check_interval() verification of their final event.
-
- if (scrbr->get_preemptor().was_preempted()) {
-
- // we were preempted, either directly or by a replica
- dout(10) << __func__ << " preempted!!!" << dendl;
- scrbr->mark_local_map_ready();
- post_event(IntBmPreempted{});
-
- } else {
-
- auto ret = scrbr->build_primary_map_chunk();
-
- if (ret == -EINPROGRESS) {
- // must wait for the backend to finish. No specific event provided.
- // build_primary_map_chunk() has already requeued us.
- dout(20) << "waiting for the backend..." << dendl;
-
- } else if (ret < 0) {
-
- dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl;
- post_event(InternalError{});
-
- } else {
-
- // the local map was created
- post_event(IntLocalMapDone{});
- }
- }
-}
-
-sc::result BuildMap::react(const IntLocalMapDone&)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "BuildMap::react(const IntLocalMapDone&)" << dendl;
-
- scrbr->mark_local_map_ready();
- return transit<WaitReplicas>();
-}
-
-// ----------------------- DrainReplMaps -----------------------------------
-
-DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx)
-{
- dout(10) << "-- state -->> Act/DrainReplMaps" << dendl;
- // we may have received all maps already. Send the event that will make us check.
- post_event(GotReplicas{});
-}
-
-sc::result DrainReplMaps::react(const GotReplicas&)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "DrainReplMaps::react(const GotReplicas&)" << dendl;
-
- if (scrbr->are_all_maps_available()) {
- // NewChunk will handle the preemption that brought us to this state
- return transit<PendingTimer>();
- }
-
- dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: "
- << scrbr->dump_awaited_maps() << dendl;
- return discard_event();
-}
-
-// ----------------------- WaitReplicas -----------------------------------
-
-WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx)
-{
- dout(10) << "-- state -->> Act/WaitReplicas" << dendl;
- post_event(GotReplicas{});
-}
-
-/**
- * note: now that maps_compare_n_cleanup() is "futurized"(*), and we remain in this state
- * for a while even after we got all our maps, we must prevent are_all_maps_available()
- * (actually - the code after the if()) from being called more than once.
- * This is basically a separate state, but it's too transitory and artificial to justify
- * the cost of a separate state.
-
- * (*) "futurized" - in Crimson, the call to maps_compare_n_cleanup() returns immediately
- * after initiating the process. The actual termination of the maps comparing etc' is
- * signalled via an event. As we share the code with "classic" OSD, here too
- * maps_compare_n_cleanup() is responsible for signalling the completion of the
- * processing.
- */
-sc::result WaitReplicas::react(const GotReplicas&)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "WaitReplicas::react(const GotReplicas&)" << dendl;
-
- if (!all_maps_already_called && scrbr->are_all_maps_available()) {
- dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl;
-
- all_maps_already_called = true;
-
- // were we preempted?
- if (scrbr->get_preemptor().disable_and_test()) { // a test&set
-
-
- dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" << dendl;
- return transit<PendingTimer>();
-
- } else {
-
- // maps_compare_n_cleanup() will arrange for MapsCompared event to be sent:
- scrbr->maps_compare_n_cleanup();
- return discard_event();
- }
- } else {
- return discard_event();
- }
-}
-
-// ----------------------- WaitDigestUpdate -----------------------------------
-
-WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx)
-{
- dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl;
- // perform an initial check: maybe we already
- // have all the updates we need:
- // (note that DigestUpdate is usually an external event)
- post_event(DigestUpdate{});
-}
-
-sc::result WaitDigestUpdate::react(const DigestUpdate&)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "WaitDigestUpdate::react(const DigestUpdate&)" << dendl;
-
- // on_digest_updates() will either:
- // - do nothing - if we are still waiting for updates, or
- // - finish the scrubbing of the current chunk, and:
- // - send NextChunk, or
- // - send ScrubFinished
-
- scrbr->on_digest_updates();
- return discard_event();
-}
-
-ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub)
- : m_pg{pg}, m_pg_id{pg->pg_id}, m_scrbr{pg_scrub}
-{
- dout(15) << "ScrubMachine created " << m_pg_id << dendl;
-}
-
-ScrubMachine::~ScrubMachine() = default;
-
-// -------- for replicas -----------------------------------------------------
-
-// ----------------------- ReplicaWaitUpdates --------------------------------
-
-ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx)
-{
- dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl;
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- scrbr->on_replica_init();
-}
-
-/*
- * Triggered externally, by the entity that had an update re pushes
- */
-sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): "
- << scrbr->pending_active_pushes() << dendl;
-
- if (scrbr->pending_active_pushes() == 0) {
-
- // done waiting
- return transit<ActiveReplica>();
- }
-
- return discard_event();
-}
-
-/**
- * the event poster is handling the scrubber reset
- */
-sc::result ReplicaWaitUpdates::react(const FullReset&)
-{
- dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl;
- return transit<NotActive>();
-}
-
-// ----------------------- ActiveReplica -----------------------------------
-
-ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "-- state -->> ActiveReplica" << dendl;
- scrbr->on_replica_init(); // as we might have skipped ReplicaWaitUpdates
- post_event(SchedReplica{});
-}
-
-sc::result ActiveReplica::react(const SchedReplica&)
-{
- DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
- dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? "
- << scrbr->get_preemptor().is_preemptable() << dendl;
-
- if (scrbr->get_preemptor().was_preempted()) {
- dout(10) << "replica scrub job preempted" << dendl;
-
- scrbr->send_preempted_replica();
- scrbr->replica_handling_done();
- return transit<NotActive>();
- }
-
- // start or check progress of build_replica_map_chunk()
- auto ret_init = scrbr->build_replica_map_chunk();
- if (ret_init != -EINPROGRESS) {
- return transit<NotActive>();
- }
-
- return discard_event();
-}
-
-/**
- * the event poster is handling the scrubber reset
- */
-sc::result ActiveReplica::react(const FullReset&)
-{
- dout(10) << "ActiveReplica::react(const FullReset&)" << dendl;
- return transit<NotActive>();
-}
-
-} // namespace Scrub
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#pragma once
-
-#include <string>
-
-#include <boost/statechart/custom_reaction.hpp>
-#include <boost/statechart/deferral.hpp>
-#include <boost/statechart/event.hpp>
-#include <boost/statechart/event_base.hpp>
-#include <boost/statechart/in_state_reaction.hpp>
-#include <boost/statechart/simple_state.hpp>
-#include <boost/statechart/state.hpp>
-#include <boost/statechart/state_machine.hpp>
-#include <boost/statechart/transition.hpp>
-
-#include "common/version.h"
-#include "include/Context.h"
-
-#include "scrub_machine_lstnr.h"
-#include "scrubber_common.h"
-
-using namespace std::string_literals;
-
-class PG; // holding a pointer to that one - just for testing
-class PgScrubber;
-namespace Scrub {
-
-namespace sc = ::boost::statechart;
-namespace mpl = ::boost::mpl;
-
-//
-// EVENTS
-//
-
-void on_event_creation(std::string_view nm);
-void on_event_discard(std::string_view nm);
-
-#define MEV(E) \
- struct E : sc::event<E> { \
- inline static int actv{0}; \
- E() \
- { \
- if (!actv++) \
- on_event_creation(#E); \
- } \
- ~E() \
- { \
- if (!--actv) \
- on_event_discard(#E); \
- } \
- void print(std::ostream* out) const { *out << #E; } \
- std::string_view print() const { return #E; } \
- };
-
-MEV(RemotesReserved) ///< all replicas have granted our reserve request
-
-MEV(ReservationFailure) ///< a reservation request has failed
-
-MEV(StartScrub) ///< initiate a new scrubbing session (relevant if we are a Primary)
-
-MEV(AfterRepairScrub) ///< initiate a new scrubbing session. Only triggered at Recovery
- ///< completion.
-
-MEV(Unblocked) ///< triggered when the PG unblocked an object that was marked for
- ///< scrubbing. Via the PGScrubUnblocked op
-
-MEV(InternalSchedScrub)
-
-MEV(SelectedChunkFree)
-
-MEV(ChunkIsBusy)
-
-MEV(ActivePushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery
- ///< that is in-flight to the local ObjectStore
-
-MEV(UpdatesApplied) ///< (Primary only) all updates are committed
-
-MEV(InternalAllUpdates) ///< the internal counterpart of UpdatesApplied
-
-MEV(GotReplicas) ///< got a map from a replica
-
-MEV(IntBmPreempted) ///< internal - BuildMap preempted. Required, as detected within the
- ///< ctor
-
-MEV(InternalError)
-
-MEV(IntLocalMapDone)
-
-MEV(DigestUpdate) ///< external. called upon success of a MODIFY op. See
- ///< scrub_snapshot_metadata()
-
-MEV(MapsCompared) ///< (Crimson) maps_compare_n_cleanup() transactions are done
-
-MEV(StartReplica) ///< initiating replica scrub.
-
-MEV(StartReplicaNoWait) ///< 'start replica' when there are no pending updates
-
-MEV(SchedReplica)
-
-MEV(ReplicaPushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery
- ///< that is in-flight to the local ObjectStore
-
-MEV(FullReset) ///< guarantee that the FSM is in the quiescent state (i.e. NotActive)
-
-MEV(NextChunk) ///< finished handling this chunk. Go get the next one
-
-MEV(ScrubFinished) ///< all chunks handled
-
-
-struct NotActive; ///< the quiescent state. No active scrubbing.
-struct ReservingReplicas; ///< securing scrub resources from replicas' OSDs
-struct ActiveScrubbing; ///< the active state for a Primary. A sub-machine.
-struct ReplicaWaitUpdates; ///< an active state for a replica. Waiting for all active
- ///< operations to finish.
-struct ActiveReplica; ///< an active state for a replica.
-
-
-class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
- public:
- friend class PgScrubber;
-
- public:
- explicit ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub);
- ~ScrubMachine();
-
- PG* m_pg; // only used for dout messages
- spg_t m_pg_id;
- ScrubMachineListener* m_scrbr;
-
- void my_states() const;
- void assert_not_active() const;
- [[nodiscard]] bool is_reserving() const;
- [[nodiscard]] bool is_accepting_updates() const;
-};
-
-/**
- * The Scrubber's base (quiescent) state.
- * Scrubbing is triggered by one of the following events:
- * - (standard scenario for a Primary): 'StartScrub'. Initiates the OSDs resources
- * reservation process. Will be issued by PG::scrub(), following a
- * queued "PGScrub" op.
- * - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is
- * not required to reserve resources.
- * - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming
- * MOSDRepScrub message.
- *
- * note (20.8.21): originally, AfterRepairScrub was triggering a scrub without waiting
- * for replica resources to be acquired. But once replicas started using the
- * resource-request to identify and tag the scrub session, this bypass cannot be
- * supported anymore.
- */
-struct NotActive : sc::state<NotActive, ScrubMachine> {
- explicit NotActive(my_context ctx);
-
- using reactions = mpl::list<sc::transition<StartScrub, ReservingReplicas>,
- // a scrubbing that was initiated at recovery completion,
- // and requires no resource reservations:
- sc::transition<AfterRepairScrub, ReservingReplicas>,
- sc::transition<StartReplica, ReplicaWaitUpdates>,
- sc::transition<StartReplicaNoWait, ActiveReplica>>;
-};
-
-struct ReservingReplicas : sc::state<ReservingReplicas, ScrubMachine> {
-
- explicit ReservingReplicas(my_context ctx);
- using reactions = mpl::list<sc::custom_reaction<FullReset>,
- // all replicas granted our resources request
- sc::transition<RemotesReserved, ActiveScrubbing>,
- sc::custom_reaction<ReservationFailure>>;
-
- sc::result react(const FullReset&);
-
- /// at least one replica denied us the scrub resources we've requested
- sc::result react(const ReservationFailure&);
-};
-
-
-// the "active" sub-states
-
-struct RangeBlocked; ///< the objects range is blocked
-struct PendingTimer; ///< either delaying the scrub by some time and requeuing, or just
- ///< requeue
-struct NewChunk; ///< select a chunk to scrub, and verify its availability
-struct WaitPushes;
-struct WaitLastUpdate;
-struct BuildMap;
-struct DrainReplMaps; ///< a problem during BuildMap. Wait for all replicas to report,
- ///< then restart.
-struct WaitReplicas; ///< wait for all replicas to report
-struct WaitDigestUpdate;
-
-struct ActiveScrubbing : sc::state<ActiveScrubbing, ScrubMachine, PendingTimer> {
-
- explicit ActiveScrubbing(my_context ctx);
- ~ActiveScrubbing();
-
- using reactions = mpl::list<
- sc::custom_reaction<InternalError>,
- sc::custom_reaction<FullReset>>;
-
- sc::result react(const FullReset&);
- sc::result react(const InternalError&);
-};
-
-struct RangeBlocked : sc::state<RangeBlocked, ActiveScrubbing> {
- explicit RangeBlocked(my_context ctx);
- using reactions = mpl::list<sc::transition<Unblocked, PendingTimer>>;
-
- Scrub::BlockedRangeWarning m_timeout;
-};
-
-struct PendingTimer : sc::state<PendingTimer, ActiveScrubbing> {
-
- explicit PendingTimer(my_context ctx);
-
- using reactions = mpl::list<sc::transition<InternalSchedScrub, NewChunk>>;
-};
-
-struct NewChunk : sc::state<NewChunk, ActiveScrubbing> {
-
- explicit NewChunk(my_context ctx);
-
- using reactions = mpl::list<sc::transition<ChunkIsBusy, RangeBlocked>,
- sc::custom_reaction<SelectedChunkFree>>;
-
- sc::result react(const SelectedChunkFree&);
-};
-
-/**
- * initiate the update process for this chunk
- *
- * Wait fo 'active_pushes' to clear.
- * 'active_pushes' represents recovery that is in-flight to the local Objectstore, hence
- * scrub waits until the correct data is readable (in-flight data to the Objectstore is
- * not readable until written to disk, termed 'applied' here)
- */
-struct WaitPushes : sc::state<WaitPushes, ActiveScrubbing> {
-
- explicit WaitPushes(my_context ctx);
-
- using reactions = mpl::list<sc::custom_reaction<ActivePushesUpd>>;
-
- sc::result react(const ActivePushesUpd&);
-};
-
-struct WaitLastUpdate : sc::state<WaitLastUpdate, ActiveScrubbing> {
-
- explicit WaitLastUpdate(my_context ctx);
-
- void on_new_updates(const UpdatesApplied&);
-
- using reactions = mpl::list<sc::custom_reaction<InternalAllUpdates>,
- sc::in_state_reaction<UpdatesApplied,
- WaitLastUpdate,
- &WaitLastUpdate::on_new_updates>>;
-
- sc::result react(const InternalAllUpdates&);
-};
-
-struct BuildMap : sc::state<BuildMap, ActiveScrubbing> {
- explicit BuildMap(my_context ctx);
-
- // possible error scenarios:
- // - an error reported by the backend will trigger an 'InternalError' event,
- // handled by our parent state;
- // - if preempted, we switch to DrainReplMaps, where we will wait for all
- // replicas to send their maps before acknowledging the preemption;
- // - an interval change will be handled by the relevant 'send-event' functions,
- // and will translated into a 'FullReset' event.
- using reactions =
- mpl::list<sc::transition<IntBmPreempted, DrainReplMaps>,
- sc::transition<InternalSchedScrub, BuildMap>, // looping, waiting
- // for the backend to
- // finish
- sc::custom_reaction<IntLocalMapDone>>;
-
- sc::result react(const IntLocalMapDone&);
-};
-
-/*
- * "drain" scrub-maps responses from replicas
- */
-struct DrainReplMaps : sc::state<DrainReplMaps, ActiveScrubbing> {
- explicit DrainReplMaps(my_context ctx);
-
- using reactions =
- mpl::list<sc::custom_reaction<GotReplicas> // all replicas are accounted for
- >;
-
- sc::result react(const GotReplicas&);
-};
-
-struct WaitReplicas : sc::state<WaitReplicas, ActiveScrubbing> {
- explicit WaitReplicas(my_context ctx);
-
- using reactions =
- mpl::list<sc::custom_reaction<GotReplicas>, // all replicas are accounted for
- sc::transition<MapsCompared, WaitDigestUpdate>,
- sc::deferral<DigestUpdate> // might arrive before we've reached WDU
- >;
-
- sc::result react(const GotReplicas&);
-
- bool all_maps_already_called{false}; // see comment in react code
-};
-
-struct WaitDigestUpdate : sc::state<WaitDigestUpdate, ActiveScrubbing> {
- explicit WaitDigestUpdate(my_context ctx);
-
- using reactions = mpl::list<sc::custom_reaction<DigestUpdate>,
- sc::transition<NextChunk, PendingTimer>,
- sc::transition<ScrubFinished, NotActive>>;
- sc::result react(const DigestUpdate&);
-};
-
-// ----------------------------- the "replica active" states -----------------------
-
-/*
- * Waiting for 'active_pushes' to complete
- *
- * When in this state:
- * - the details of the Primary's request were internalized by PgScrubber;
- * - 'active' scrubbing is set
- */
-struct ReplicaWaitUpdates : sc::state<ReplicaWaitUpdates, ScrubMachine> {
- explicit ReplicaWaitUpdates(my_context ctx);
- using reactions =
- mpl::list<sc::custom_reaction<ReplicaPushesUpd>, sc::custom_reaction<FullReset>>;
-
- sc::result react(const ReplicaPushesUpd&);
- sc::result react(const FullReset&);
-};
-
-
-struct ActiveReplica : sc::state<ActiveReplica, ScrubMachine> {
- explicit ActiveReplica(my_context ctx);
- using reactions = mpl::list<sc::custom_reaction<SchedReplica>,
- sc::custom_reaction<FullReset>,
- sc::transition<ScrubFinished, NotActive>>;
-
- sc::result react(const SchedReplica&);
- sc::result react(const FullReset&);
-};
-
-} // namespace Scrub
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#pragma once
-/**
- * \file the PgScrubber interface used by the scrub FSM
- */
-#include "common/version.h"
-#include "include/Context.h"
-
-#include "osd_types.h"
-
-namespace Scrub {
-
-enum class PreemptionNoted { no_preemption, preempted };
-
-/// the interface exposed by the PgScrubber into its internal
-/// preemption_data object
-struct preemption_t {
-
- virtual ~preemption_t() = default;
-
- [[nodiscard]] virtual bool is_preemptable() const = 0;
-
- [[nodiscard]] virtual bool was_preempted() const = 0;
-
- virtual void adjust_parameters() = 0;
-
- /**
- * Try to preempt the scrub.
- * 'true' (i.e. - preempted) if:
- * preemptable && not already preempted
- */
- virtual bool do_preempt() = 0;
-
- /**
- * disables preemptions.
- * Returns 'true' if we were already preempted
- */
- virtual bool disable_and_test() = 0;
-};
-
-/// an aux used when blocking on a busy object.
-/// Issues a log warning if still blocked after 'waittime'.
-struct blocked_range_t {
- blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id);
- ~blocked_range_t();
-
- OSDService* m_osds;
- Context* m_callbk;
-};
-
-using BlockedRangeWarning = std::unique_ptr<blocked_range_t>;
-
-} // namespace Scrub
-
-struct ScrubMachineListener {
-
- struct MsgAndEpoch {
- MessageRef m_msg;
- epoch_t m_epoch;
- };
-
- virtual ~ScrubMachineListener() = default;
-
- [[nodiscard]] virtual bool is_primary() const = 0;
-
- virtual void select_range_n_notify() = 0;
-
- virtual Scrub::BlockedRangeWarning acquire_blocked_alarm() = 0;
-
- /// walk the log to find the latest update that affects our chunk
- virtual eversion_t search_log_for_updates() const = 0;
-
- virtual eversion_t get_last_update_applied() const = 0;
-
- virtual int pending_active_pushes() const = 0;
-
- virtual int build_primary_map_chunk() = 0;
-
- virtual int build_replica_map_chunk() = 0;
-
- virtual void on_init() = 0;
-
- virtual void on_replica_init() = 0;
-
- virtual void replica_handling_done() = 0;
-
- /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
- /// (thus can be called from FSM reactions)
- virtual void clear_pgscrub_state() = 0;
-
- /*
- * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep'
- * is asserted - after a configuration-dependent timeout.
- */
- virtual void add_delayed_scheduling() = 0;
-
- /**
- * Ask all replicas for their scrub maps for the current chunk.
- */
- virtual void get_replicas_maps(bool replica_can_preempt) = 0;
-
- virtual void on_digest_updates() = 0;
-
- /**
- * Prepare a MOSDRepScrubMap message carrying the requested scrub map
- * @param was_preempted - were we preempted?
- * @return the message, and the current value of 'm_replica_min_epoch' (which is
- * used when sending the message, but will be overwritten before that).
- */
- [[nodiscard]] virtual MsgAndEpoch prep_replica_map_msg(
- Scrub::PreemptionNoted was_preempted) = 0;
-
- /**
- * Send to the primary the pre-prepared message containing the requested map
- */
- virtual void send_replica_map(const MsgAndEpoch& preprepared) = 0;
-
- /**
- * Let the primary know that we were preempted while trying to build the
- * requested map.
- */
- virtual void send_preempted_replica() = 0;
-
- [[nodiscard]] virtual bool has_pg_marked_new_updates() const = 0;
-
- virtual void set_subset_last_update(eversion_t e) = 0;
-
- [[nodiscard]] virtual bool was_epoch_changed() const = 0;
-
- virtual Scrub::preemption_t& get_preemptor() = 0;
-
- /**
- * a "technical" collection of the steps performed once all
- * rep maps are available:
- * - the maps are compared
- * - the scrub region markers (start_ & end_) are advanced
- * - callbacks and ops that were pending are allowed to run
- */
- virtual void maps_compare_n_cleanup() = 0;
-
- /**
- * order the PgScrubber to initiate the process of reserving replicas' scrub
- * resources.
- */
- virtual void reserve_replicas() = 0;
-
- virtual void unreserve_replicas() = 0;
-
- /**
- * the FSM interface into the "are we waiting for maps, either our own or from
- * replicas" state.
- * The FSM can only:
- * - mark the local map as available, and
- * - query status
- */
- virtual void mark_local_map_ready() = 0;
-
- [[nodiscard]] virtual bool are_all_maps_available() const = 0;
-
- /// a log/debug interface
- virtual std::string dump_awaited_maps() const = 0;
-};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PrimaryLogScrub.h"
+
+#include "common/scrub_types.h"
+#include "osd/osd_types_fmt.h"
+
+#include "osd/PeeringState.h"
+#include "osd/PrimaryLogPG.h"
+#include "scrub_machine.h"
+
+#define dout_context (m_pg->get_cct())
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this->m_pg)
+
+using std::vector;
+
+/// Builds the prefix for the dout() lines emitted from this file: whatever
+/// t->gen_prefix() writes, followed by a scrubber tag and the PG id.
+/// The return type is spelled std::ostream& (matching the std::ostream*
+/// parameter), so that this file does not depend on a using-directive
+/// leaking out of one of the included headers.
+template <class T> static std::ostream& _prefix(std::ostream* _dout, T* t)
+{
+  return t->gen_prefix(*_dout) << " PrimaryLog scrubber pg(" << t->pg_id << ") ";
+}
+
+using namespace Scrub;
+using Scrub::ScrubMachine;
+
+/**
+ * Fills res_inout.vals with the errors held by the scrub store (m_store).
+ *
+ * @param arg        the query: 'get_snapsets' selects get_snap_errors() over
+ *                   get_object_errors(); 'start_after' and 'max_return' are
+ *                   forwarded to the store unchanged.
+ * @param res_inout  only its 'vals' member is written here.
+ * @return false (with res_inout left untouched) when there is no store;
+ *         true otherwise.
+ */
+bool PrimaryLogScrub::get_store_errors(const scrub_ls_arg_t& arg,
+                                       scrub_ls_result_t& res_inout) const
+{
+  if (m_store) {
+    // both store queries are keyed by the pool this PG belongs to
+    const auto pool_id = m_pg->get_pgid().pool();
+
+    if (arg.get_snapsets) {
+      res_inout.vals =
+        m_store->get_snap_errors(pool_id, arg.start_after, arg.max_return);
+    } else {
+      res_inout.vals =
+        m_store->get_object_errors(pool_id, arg.start_after, arg.max_return);
+    }
+    return true;
+  }
+
+  // no store - nothing to report
+  return false;
+}
+
+/**
+ * The PrimaryLogPG-specific wrap-up of a scrub: reconcile the object
+ * statistics collected while scrubbing (m_scrub_cstat) with the statistics
+ * kept in the PG info.
+ *
+ * - If the PG's stats are flagged invalid, they are replaced by the scrubbed
+ *   values, and (if the PG has an agent state) agent_choose_mode() is called.
+ * - Any disagreement between the two sets of counters is reported to the
+ *   cluster log and counted as one shallow error. A counter whose matching
+ *   '*_stats_invalid' flag is set in the PG info is not compared.
+ * - When repairing, a mismatch is fixed by overwriting the PG stats with the
+ *   scrubbed values, clearing the per-counter 'invalid' flags, publishing the
+ *   stats and calling share_pg_info().
+ * - When repairing, the object-context cache is cleared at the end.
+ */
+void PrimaryLogScrub::_scrub_finish()
+{
+  auto& info = m_pg->get_pg_info(ScrubberPasskey{});  ///< a temporary alias
+
+  dout(10) << __func__
+           << " info stats: " << (info.stats.stats_invalid ? "invalid" : "valid")
+           << dendl;
+
+  if (info.stats.stats_invalid) {
+    // only members are used inside the lambda: capture 'this' explicitly
+    // (the implicit capture of 'this' by [=] is deprecated in C++20)
+    m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) {
+      stats.stats = m_scrub_cstat;
+      stats.stats_invalid = false;
+      return false;
+    });
+
+    if (m_pl_pg->agent_state)
+      m_pl_pg->agent_choose_mode();
+  }
+
+  dout(10) << m_mode_desc << " got " << m_scrub_cstat.sum.num_objects << "/"
+           << info.stats.stats.sum.num_objects << " objects, "
+           << m_scrub_cstat.sum.num_object_clones << "/"
+           << info.stats.stats.sum.num_object_clones << " clones, "
+           << m_scrub_cstat.sum.num_objects_dirty << "/"
+           << info.stats.stats.sum.num_objects_dirty << " dirty, "
+           << m_scrub_cstat.sum.num_objects_omap << "/"
+           << info.stats.stats.sum.num_objects_omap << " omap, "
+           << m_scrub_cstat.sum.num_objects_pinned << "/"
+           << info.stats.stats.sum.num_objects_pinned << " pinned, "
+           << m_scrub_cstat.sum.num_objects_hit_set_archive << "/"
+           << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
+           << m_scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes
+           << " bytes, " << m_scrub_cstat.sum.num_objects_manifest << "/"
+           << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
+           << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/"
+           << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
+           << dendl;
+
+  // a counter takes part in the comparison only if the PG info does not mark
+  // it as invalid. Objects, clones, whiteouts and bytes are always compared.
+  if (m_scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
+      m_scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
+      (m_scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
+       !info.stats.dirty_stats_invalid) ||
+      (m_scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
+       !info.stats.omap_stats_invalid) ||
+      (m_scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
+       !info.stats.pin_stats_invalid) ||
+      (m_scrub_cstat.sum.num_objects_hit_set_archive !=
+         info.stats.stats.sum.num_objects_hit_set_archive &&
+       !info.stats.hitset_stats_invalid) ||
+      (m_scrub_cstat.sum.num_bytes_hit_set_archive !=
+         info.stats.stats.sum.num_bytes_hit_set_archive &&
+       !info.stats.hitset_bytes_stats_invalid) ||
+      (m_scrub_cstat.sum.num_objects_manifest !=
+         info.stats.stats.sum.num_objects_manifest &&
+       !info.stats.manifest_stats_invalid) ||
+      m_scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
+      m_scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
+    m_osds->clog->error() << info.pgid << " " << m_mode_desc << " : stat mismatch, got "
+                          << m_scrub_cstat.sum.num_objects << "/"
+                          << info.stats.stats.sum.num_objects << " objects, "
+                          << m_scrub_cstat.sum.num_object_clones << "/"
+                          << info.stats.stats.sum.num_object_clones << " clones, "
+                          << m_scrub_cstat.sum.num_objects_dirty << "/"
+                          << info.stats.stats.sum.num_objects_dirty << " dirty, "
+                          << m_scrub_cstat.sum.num_objects_omap << "/"
+                          << info.stats.stats.sum.num_objects_omap << " omap, "
+                          << m_scrub_cstat.sum.num_objects_pinned << "/"
+                          << info.stats.stats.sum.num_objects_pinned << " pinned, "
+                          << m_scrub_cstat.sum.num_objects_hit_set_archive << "/"
+                          << info.stats.stats.sum.num_objects_hit_set_archive
+                          << " hit_set_archive, " << m_scrub_cstat.sum.num_whiteouts
+                          << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
+                          << m_scrub_cstat.sum.num_bytes << "/"
+                          << info.stats.stats.sum.num_bytes << " bytes, "
+                          << m_scrub_cstat.sum.num_objects_manifest << "/"
+                          << info.stats.stats.sum.num_objects_manifest
+                          << " manifest objects, "
+                          << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/"
+                          << info.stats.stats.sum.num_bytes_hit_set_archive
+                          << " hit_set_archive bytes.";
+    ++m_shallow_errors;
+
+    if (m_is_repair) {
+      ++m_fixed_count;
+      m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) {
+        stats.stats = m_scrub_cstat;
+        stats.dirty_stats_invalid = false;
+        stats.omap_stats_invalid = false;
+        stats.hitset_stats_invalid = false;
+        stats.hitset_bytes_stats_invalid = false;
+        stats.pin_stats_invalid = false;
+        stats.manifest_stats_invalid = false;
+        return false;
+      });
+      m_pl_pg->publish_stats_to_osd();
+      m_pl_pg->recovery_state.share_pg_info();
+    }
+  }
+  // Clear object context cache to get repair information
+  if (m_is_repair)
+    m_pl_pg->object_contexts.clear();
+}
+
+/// True while a snapset is known and 'curclone' has not yet reached the
+/// reverse-end of that snapset's clone list.
+static bool doing_clones(const std::optional<SnapSet>& snapset,
+                         const vector<snapid_t>::reverse_iterator& curclone)
+{
+  if (!snapset) {
+    return false;
+  }
+  return curclone != snapset->clones.rend();
+}
+
+/**
+ * Report the number of clones found missing for 'head'.
+ *
+ * When incomplete clones are allowed, this is only a debug-log entry
+ * (level 20); otherwise an 'info' entry is sent to 'clog'.
+ *
+ * @param missing  the number of missing clones to report
+ * @param head     the head object the clones belong to. Must hold a value.
+ * @param clog     the log channel used for the 'info' entry
+ * @param pgid     the PG id, as it should appear in the message
+ * @param func     the caller's name, for the debug-log entry
+ * @param allow_incomplete_clones  selects between the two reporting modes
+ */
+void PrimaryLogScrub::log_missing(int missing,
+                                  const std::optional<hobject_t>& head,
+                                  LogChannelRef clog,
+                                  const spg_t& pgid,
+                                  const char* func,
+                                  bool allow_incomplete_clones)
+{
+  ceph_assert(head);
+
+  if (!allow_incomplete_clones) {
+    clog->info() << m_mode_desc << " " << pgid << " " << *head << " : " << missing
+                 << " missing clone(s)";
+    return;
+  }
+
+  dout(20) << func << " " << m_mode_desc << " " << pgid << " " << *head << " skipped "
+           << missing << " clone(s) in cache tier" << dendl;
+}
+
+/**
+ * Advance '*curclone' over all the clones of 'snapset' that are newer than
+ * 'target' (or over all the remaining clones, if 'target' holds no value).
+ * Each clone stepped over is one that was expected but not encountered.
+ *
+ * Unless incomplete clones are allowed, each such clone is reported as an
+ * error to 'clog', counted as a shallow error and noted in 'e'.
+ *
+ * @param head     the head object being processed. Must hold a value.
+ * @param snapset  the snapset of that head. Must hold a value.
+ * @param curclone in/out: the current position in the (reversed) clone list
+ * @return the number of clones stepped over
+ */
+int PrimaryLogScrub::process_clones_to(const std::optional<hobject_t>& head,
+                                       const std::optional<SnapSet>& snapset,
+                                       LogChannelRef clog,
+                                       const spg_t& pgid,
+                                       bool allow_incomplete_clones,
+                                       std::optional<snapid_t> target,
+                                       vector<snapid_t>::reverse_iterator* curclone,
+                                       inconsistent_snapset_wrapper& e)
+{
+  ceph_assert(head);
+  ceph_assert(snapset);
+
+  int skipped = 0;
+  hobject_t expected_clone(*head);
+
+  // the clones are visited in descending order, hence the '>' against target.
+  // The loop step moves on to the next (lower-numbered) clone.
+  for (; doing_clones(snapset, *curclone) && (!target || **curclone > *target);
+       ++(*curclone)) {
+
+    ++skipped;
+
+    if (allow_incomplete_clones) {
+      // it is okay to be missing one or more clones in a cache tier.
+      // skip higher-numbered clones in the list.
+      continue;
+    }
+
+    expected_clone.snap = **curclone;
+    clog->error() << m_mode_desc << " " << pgid << " " << *head << " : expected clone "
+                  << expected_clone << " " << m_missing << " missing";
+    ++m_shallow_errors;
+    e.set_clone_missing(expected_clone.snap);
+  }
+  return skipped;
+}
+
+/*
+ * Validate consistency of the object info and snap sets.
+ *
+ * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
+ * the comparison of the objects is against multiple snapset.clones. There are
+ * multiple clone lists and in between lists we expect head.
+ *
+ * Example
+ *
+ * objects expected
+ * ======= =======
+ * obj1 snap 1 head, unexpected obj1 snap 1
+ * obj2 head head, match
+ * [SnapSet clones 6 4 2 1]
+ * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
+ * obj2 snap 6 obj2 snap 6, match
+ * obj2 snap 4 obj2 snap 4, match
+ * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
+ * [Snapset clones 3 1]
+ * obj3 snap 3 obj3 snap 3 match
+ * obj3 snap 1 obj3 snap 1 match
+ * obj4 head head, match
+ * [Snapset clones 4]
+ * EOL obj4 snap 4, (expected)
+ *
+ * The function works in two phases:
+ * - scrubmap.objects is traversed in reverse order. Per-object statistics are
+ *   accumulated into m_scrub_cstat, problems are reported to the cluster log
+ *   and counted in m_shallow_errors, and snapset inconsistencies are recorded
+ *   in the scrub store (m_store).
+ * - for each entry of missing_digest, an update is submitted that sets (or
+ *   clears) the data digest and the omap digest in that object's object-info.
+ */
+void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap,
+ const missing_map_t& missing_digest)
+{
+ dout(10) << __func__ << " num stat obj " << m_pl_pg->info.stats.stats.sum.num_objects
+ << dendl;
+
+ auto& info = m_pl_pg->info;
+ const PGPool& pool = m_pl_pg->pool;
+ bool allow_incomplete_clones = pool.info.allow_incomplete_clones();
+
+ // never assigned: used as the "no target - consume all remaining clones"
+ // argument to process_clones_to()
+ std::optional<snapid_t> all_clones; // Unspecified snapid_t or std::nullopt
+
+ // traverse in reverse order.
+ std::optional<hobject_t> head;
+ std::optional<SnapSet> snapset; // If initialized so will head (above)
+ vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
+ int missing = 0;
+ inconsistent_snapset_wrapper soid_error, head_error;
+ int soid_error_count = 0;
+
+ for (auto p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
+
+ const hobject_t& soid = p->first;
+ ceph_assert(!soid.is_snapdir());
+ soid_error = inconsistent_snapset_wrapper{soid};
+ // the stats of this one object. Added to m_scrub_cstat at the bottom of
+ // the loop body.
+ object_stat_sum_t stat;
+ std::optional<object_info_t> oi;
+
+ stat.num_objects++;
+
+ if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
+ stat.num_objects_hit_set_archive++;
+
+ if (soid.is_snap()) {
+ // it's a clone
+ stat.num_object_clones++;
+ }
+
+ // basic checks.
+ if (p->second.attrs.count(OI_ATTR) == 0) {
+ oi = std::nullopt;
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
+ << OI_ATTR << "' attr";
+ ++m_shallow_errors;
+ soid_error.set_info_missing();
+ } else {
+ bufferlist bv;
+ bv.push_back(p->second.attrs[OI_ATTR]);
+ try {
+ oi = object_info_t(bv);
+ } catch (ceph::buffer::error& e) {
+ oi = std::nullopt;
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : can't decode '" << OI_ATTR << "' attr " << e.what();
+ ++m_shallow_errors;
+ soid_error.set_info_corrupted();
+ soid_error.set_info_missing(); // Not available too
+ }
+ }
+
+ // the checks and the stats that depend on a decoded object-info
+ if (oi) {
+ if (m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : on disk size (" << p->second.size
+ << ") does not match object info size (" << oi->size
+ << ") adjusted for ondisk to ("
+ << m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) << ")";
+ soid_error.set_size_mismatch();
+ ++m_shallow_errors;
+ }
+
+ dout(20) << m_mode_desc << " " << soid << " " << *oi << dendl;
+
+ // A clone num_bytes will be added later when we have snapset
+ if (!soid.is_snap()) {
+ stat.num_bytes += oi->size;
+ }
+ if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
+ stat.num_bytes_hit_set_archive += oi->size;
+
+ if (oi->is_dirty())
+ ++stat.num_objects_dirty;
+ if (oi->is_whiteout())
+ ++stat.num_whiteouts;
+ if (oi->is_omap())
+ ++stat.num_objects_omap;
+ if (oi->is_cache_pinned())
+ ++stat.num_objects_pinned;
+ if (oi->has_manifest())
+ ++stat.num_objects_manifest;
+ }
+
+ // Check for any problems while processing clones
+ if (doing_clones(snapset, curclone)) {
+ std::optional<snapid_t> target;
+ // Expecting an object with snap for current head
+ if (soid.has_snapset() || soid.get_head() != head->get_head()) {
+
+ dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid << " new object " << soid
+ << " while processing " << *head << dendl;
+
+ target = all_clones;
+ } else {
+ ceph_assert(soid.is_snap());
+ target = soid.snap;
+ }
+
+ // Log any clones we were expecting to be there up to target
+ // This will set missing, but will be a no-op if snap.soid == *curclone.
+ missing +=
+ process_clones_to(head, snapset, m_osds->clog, info.pgid,
+ allow_incomplete_clones, target, &curclone, head_error);
+ }
+
+ bool expected;
+ // Check doing_clones() again in case we ran process_clones_to()
+ if (doing_clones(snapset, curclone)) {
+ // A head would have processed all clones above
+ // or all greater than *curclone.
+ ceph_assert(soid.is_snap() && *curclone <= soid.snap);
+
+ // After processing above clone snap should match the expected curclone
+ expected = (*curclone == soid.snap);
+ } else {
+ // If we aren't doing clones any longer, then expecting head
+ expected = soid.has_snapset();
+ }
+ if (!expected) {
+ // If we couldn't read the head's snapset, just ignore clones
+ if (head && !snapset) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : clone ignored due to missing snapset";
+ } else {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : is an unexpected clone";
+ }
+ ++m_shallow_errors;
+ soid_error.set_headless();
+ m_store->add_snap_error(pool.id, soid_error);
+ ++soid_error_count;
+ if (head && soid.get_head() == head->get_head())
+ head_error.set_clone(soid.snap);
+ // note: this skips the m_scrub_cstat.add(stat) at the bottom of the loop
+ // body, i.e. the stats of an unexpected object are not accumulated.
+ continue;
+ }
+
+ // new snapset?
+ if (soid.has_snapset()) {
+
+ // wrap up the previous head (if any) before switching to the new one
+ if (missing) {
+ log_missing(missing, head, m_osds->clog, info.pgid, __func__,
+ pool.info.allow_incomplete_clones());
+ }
+
+ // Save previous head error information
+ if (head && (head_error.errors || soid_error_count))
+ m_store->add_snap_error(pool.id, head_error);
+ // Set this as a new head object
+ head = soid;
+ missing = 0;
+ head_error = soid_error;
+ soid_error_count = 0;
+
+ dout(20) << __func__ << " " << m_mode_desc << " new head " << head << dendl;
+
+ if (p->second.attrs.count(SS_ATTR) == 0) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
+ << SS_ATTR << "' attr";
+ ++m_shallow_errors;
+ snapset = std::nullopt;
+ head_error.set_snapset_missing();
+ } else {
+ bufferlist bl;
+ bl.push_back(p->second.attrs[SS_ATTR]);
+ auto blp = bl.cbegin();
+ try {
+ snapset = SnapSet(); // Initialize optional<> before decoding into it
+ decode(*snapset, blp);
+ head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
+ } catch (ceph::buffer::error& e) {
+ snapset = std::nullopt;
+ m_osds->clog->error()
+ << m_mode_desc << " " << info.pgid << " " << soid << " : can't decode '" << SS_ATTR
+ << "' attr " << e.what();
+ ++m_shallow_errors;
+ head_error.set_snapset_corrupted();
+ }
+ }
+
+ if (snapset) {
+ // what will be next?
+ curclone = snapset->clones.rbegin();
+
+ if (!snapset->clones.empty()) {
+ dout(20) << " snapset " << *snapset << dendl;
+ if (snapset->seq == 0) {
+ m_osds->clog->error()
+ << m_mode_desc << " " << info.pgid << " " << soid << " : snaps.seq not set";
+ ++m_shallow_errors;
+ head_error.set_snapset_error();
+ }
+ }
+ }
+ } else {
+ // an expected clone: it matches the current position in the clone list
+ ceph_assert(soid.is_snap());
+ ceph_assert(head);
+ ceph_assert(snapset);
+ ceph_assert(soid.snap == *curclone);
+
+ dout(20) << __func__ << " " << m_mode_desc << " matched clone " << soid << dendl;
+
+ if (snapset->clone_size.count(soid.snap) == 0) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : is missing in clone_size";
+ ++m_shallow_errors;
+ soid_error.set_size_mismatch();
+ } else {
+ if (oi && oi->size != snapset->clone_size[soid.snap]) {
+ m_osds->clog->error()
+ << m_mode_desc << " " << info.pgid << " " << soid << " : size " << oi->size
+ << " != clone_size " << snapset->clone_size[*curclone];
+ ++m_shallow_errors;
+ soid_error.set_size_mismatch();
+ }
+
+ if (snapset->clone_overlap.count(soid.snap) == 0) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : is missing in clone_overlap";
+ ++m_shallow_errors;
+ soid_error.set_size_mismatch();
+ } else {
+ // This checking is based on get_clone_bytes(). The first 2 asserts
+ // can't happen because we know we have a clone_size and
+ // a clone_overlap. Now we check that the interval_set won't
+ // cause the last assert.
+ uint64_t size = snapset->clone_size.find(soid.snap)->second;
+ const interval_set<uint64_t>& overlap =
+ snapset->clone_overlap.find(soid.snap)->second;
+ bool bad_interval_set = false;
+ // the overlap lengths, summed up, must not exceed the clone size
+ for (interval_set<uint64_t>::const_iterator i = overlap.begin();
+ i != overlap.end(); ++i) {
+ if (size < i.get_len()) {
+ bad_interval_set = true;
+ break;
+ }
+ size -= i.get_len();
+ }
+
+ if (bad_interval_set) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : bad interval_set in clone_overlap";
+ ++m_shallow_errors;
+ soid_error.set_size_mismatch();
+ } else {
+ stat.num_bytes += snapset->get_clone_bytes(soid.snap);
+ }
+ }
+ }
+
+ // what's next?
+ ++curclone;
+ if (soid_error.errors) {
+ m_store->add_snap_error(pool.id, soid_error);
+ ++soid_error_count;
+ }
+ }
+ m_scrub_cstat.add(stat);
+ }
+
+ // the map is exhausted. Any clones still expected for the last head are
+ // missing.
+ if (doing_clones(snapset, curclone)) {
+ dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid
+ << " No more objects while processing " << *head << dendl;
+
+ missing +=
+ process_clones_to(head, snapset, m_osds->clog, info.pgid,
+ allow_incomplete_clones, all_clones, &curclone, head_error);
+ }
+
+ // There could be missing found by the test above or even
+ // before dropping out of the loop for the last head.
+ if (missing) {
+ log_missing(missing, head, m_osds->clog, info.pgid, __func__,
+ allow_incomplete_clones);
+ }
+ if (head && (head_error.errors || soid_error_count))
+ m_store->add_snap_error(pool.id, head_error);
+
+ dout(20) << __func__ << " - " << missing << " (" << missing_digest.size() << ") missing"
+ << dendl;
+ // second phase: record the digests. For each entry, p->second is a pair of
+ // optionals: .first is the data digest, .second the omap digest. A missing
+ // value clears the corresponding digest in the object-info.
+ for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
+
+ ceph_assert(!p->first.is_snapdir());
+ dout(10) << __func__ << " recording digests for " << p->first << dendl;
+
+ ObjectContextRef obc = m_pl_pg->get_object_context(p->first, false);
+ if (!obc) {
+ m_osds->clog->error() << info.pgid << " " << m_mode_desc
+ << " cannot get object context for object " << p->first;
+ continue;
+ }
+ if (obc->obs.oi.soid != p->first) {
+ m_osds->clog->error() << info.pgid << " " << m_mode_desc << " " << p->first
+ << " : object has a valid oi attr with a mismatched name, "
+ << " obc->obs.oi.soid: " << obc->obs.oi.soid;
+ continue;
+ }
+ PrimaryLogPG::OpContextUPtr ctx = m_pl_pg->simple_opc_create(obc);
+ ctx->at_version = m_pl_pg->get_next_version();
+ ctx->mtime = utime_t(); // do not update mtime
+ if (p->second.first) {
+ ctx->new_obs.oi.set_data_digest(*p->second.first);
+ } else {
+ ctx->new_obs.oi.clear_data_digest();
+ }
+ if (p->second.second) {
+ ctx->new_obs.oi.set_omap_digest(*p->second.second);
+ } else {
+ ctx->new_obs.oi.clear_omap_digest();
+ }
+ m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
+
+ // count the updates in flight. The success callback of the update that
+ // brings the count back to zero (or below) queues the
+ // 'scrub digest update' event.
+ ++num_digest_updates_pending;
+ ctx->register_on_success([this]() {
+ dout(20) << "updating scrub digest " << num_digest_updates_pending << dendl;
+ if (--num_digest_updates_pending <= 0) {
+ m_osds->queue_scrub_digest_update(m_pl_pg, m_pl_pg->is_scrub_blocking_ops());
+ }
+ });
+
+ m_pl_pg->simple_opc_submit(std::move(ctx));
+ }
+
+ dout(10) << __func__ << " (" << m_mode_desc << ") finish" << dendl;
+}
+
+/// The same PG pointer is handed to the PgScrubber base, and is also kept
+/// here as a PrimaryLogPG pointer (m_pl_pg).
+PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg)
+    : PgScrubber{pg}
+    , m_pl_pg{pg}
+{}
+
+/// Resets the statistics collected by this scrubber (m_scrub_cstat) to a
+/// value-initialized object.
+void PrimaryLogScrub::_scrub_clear_state()
+{
+  dout(15) << __func__ << dendl;
+  m_scrub_cstat = object_stat_collection_t{};
+}
+
+/**
+ * Account for a change (delta_stats) to the stats of object 'soid' while a
+ * scrub may be in progress.
+ *
+ * Nothing is done unless we are the primary and a scrub is active. Otherwise,
+ * the delta is added to the scrubber's stats only if 'soid' sorts before
+ * m_start.
+ */
+void PrimaryLogScrub::stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+                                               const hobject_t& soid)
+{
+  if (!is_primary() || !is_scrub_active()) {
+    return;
+  }
+
+  // We scrub objects in hobject_t order: whatever precedes m_start was already
+  // scrubbed, and its stats are already part of the scrubber's accounting, so
+  // a change to such an object must be applied here. Objects at or after
+  // m_start will be counted when the scrubber gets to them.
+  if (soid < m_start) {
+    dout(20) << fmt::format("{} {} < [{},{})", __func__, soid, m_start, m_end) << dendl;
+    m_scrub_cstat.add(delta_stats);
+    return;
+  }
+
+  dout(25) << fmt::format("{} {} >= [{},{})", __func__, soid, m_start, m_end) << dendl;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+// the './' includes are marked this way to affect clang-format
+#include "./pg_scrubber.h"
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "debug.h"
+
+#include "common/errno.h"
+#include "common/scrub_types.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrubReserve.h"
+
+#include "osd/OSD.h"
+#include "scrub_machine.h"
+
+class PrimaryLogPG;
+
+/**
+ * The derivative of PgScrubber that is used by PrimaryLogPG.
+ *
+ * Adds the scrub-time collection of object statistics (m_scrub_cstat) and the
+ * validation of object-info / snapset metadata.
+ */
+class PrimaryLogScrub : public PgScrubber {
+ public:
+ explicit PrimaryLogScrub(PrimaryLogPG* pg);
+
+ /// compares the statistics collected during the scrub with those in the PG
+ /// info, reporting (and, when repairing, fixing) mismatches
+ void _scrub_finish() final;
+
+ /// fills res_inout with the snapset- or object- errors recorded in the scrub
+ /// store. Returns false if there is no store.
+ bool get_store_errors(const scrub_ls_arg_t& arg,
+ scrub_ls_result_t& res_inout) const final;
+
+ /// adds delta_stats to the collected statistics, if we are an actively
+ /// scrubbing primary and 'soid' precedes the start of the current chunk
+ void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+ const hobject_t& soid) final;
+
+ private:
+ // we know our PG is actually a PrimaryLogPG. Let's alias the pointer to that object:
+ PrimaryLogPG* const m_pl_pg;
+
+ /**
+ * Validate consistency of the object info and snap sets.
+ */
+ void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) final;
+
+ /// reports the number of clones found missing for 'head': to the debug log
+ /// if incomplete clones are allowed, to 'clog' otherwise
+ void log_missing(int missing,
+ const std::optional<hobject_t>& head,
+ LogChannelRef clog,
+ const spg_t& pgid,
+ const char* func,
+ bool allow_incomplete_clones);
+
+ /// advances '*curclone' past the clones newer than 'target' (all remaining
+ /// clones, if 'target' is empty). Returns the number of clones stepped over.
+ int process_clones_to(const std::optional<hobject_t>& head,
+ const std::optional<SnapSet>& snapset,
+ LogChannelRef clog,
+ const spg_t& pgid,
+ bool allow_incomplete_clones,
+ std::optional<snapid_t> target,
+ std::vector<snapid_t>::reverse_iterator* curclone,
+ inconsistent_snapset_wrapper& snap_error);
+
+
+ // handle our part in stats collection
+ object_stat_collection_t m_scrub_cstat;
+ void _scrub_clear_state() final; // which just clears the stats
+};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ScrubStore.h"
+#include "osd/osd_types.h"
+#include "common/scrub_types.h"
+#include "include/rados/rados_types.hpp"
+
+using std::ostringstream;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+
+namespace {
+
+/// The name of the (temp) object that holds the scrub results of a PG.
+ghobject_t make_scrub_object(const spg_t& pgid)
+{
+  ostringstream ss;
+  ss << "scrub_" << pgid;
+  return pgid.make_temp_ghobject(ss.str());
+}
+
+// the prefixes separating the two families of keys stored in the scrub object
+constexpr const char* object_key_prefix = "SCRUB_OBJ_";
+constexpr const char* snap_key_prefix = "SCRUB_SS_";
+
+/// A key bounding the range of keys of one pool: built from a nameless
+/// object in that pool, carrying the given hash value.
+string boundary_key(const char* prefix, int64_t pool, uint32_t hash)
+{
+  auto hoid = hobject_t(object_t(),
+                        "",
+                        0,
+                        hash,
+                        pool,
+                        "");
+  hoid.build_hash_cache();
+  return prefix + hoid.to_str();
+}
+
+/// The key under which an error for object 'oid' is stored. The hash of the
+/// real object is not used: all keys of one family carry the same fixed hash.
+string oid_key(const char* prefix,
+               int64_t pool,
+               const librados::object_id_t& oid,
+               uint32_t hash)
+{
+  auto hoid = hobject_t(object_t(oid.name),
+                        oid.locator,  // key
+                        oid.snap,
+                        hash,
+                        pool,
+                        oid.nspace);
+  hoid.build_hash_cache();
+  return prefix + hoid.to_str();
+}
+
+string first_object_key(int64_t pool)
+{
+  return boundary_key(object_key_prefix, pool, 0x00000000);
+}
+
+// the object_key should be unique across pools
+string to_object_key(int64_t pool, const librados::object_id_t& oid)
+{
+  return oid_key(object_key_prefix, pool, oid, 0);
+}
+
+string last_object_key(int64_t pool)
+{
+  return boundary_key(object_key_prefix, pool, 0xffffffff);
+}
+
+string first_snap_key(int64_t pool)
+{
+  // the scrub object is a per-spg_t object, so we can misuse the hash
+  // (pg.seed) to represent the minimal and maximal keys. This relies on how
+  // hobject_t::to_str() works: hex(pool).hex(revhash).
+  return boundary_key(snap_key_prefix, pool, 0x00000000);
+}
+
+string to_snap_key(int64_t pool, const librados::object_id_t& oid)
+{
+  return oid_key(snap_key_prefix, pool, oid, 0x77777777);
+}
+
+string last_snap_key(int64_t pool)
+{
+  return boundary_key(snap_key_prefix, pool, 0xffffffff);
+}
+}
+
+namespace Scrub {
+
+/// Creates a Store for the PG. The 'touch' of the scrub object in collection
+/// 'coll' is added to transaction 't'. Both 'store' and 't' must be non-null.
+/// The Store is allocated with 'new' and returned as a raw pointer.
+Store*
+Store::create(ObjectStore* store,
+              ObjectStore::Transaction* t,
+              const spg_t& pgid,
+              const coll_t& coll)
+{
+  ceph_assert(store);
+  ceph_assert(t);
+
+  const ghobject_t scrub_obj = make_scrub_object(pgid);
+  t->touch(coll, scrub_obj);
+  return new Store{coll, scrub_obj, store};
+}
+
+// Note on the initializers: 'driver' is constructed from the 'coll' parameter
+// and from the already-initialized 'hoid' member (there is no parameter of
+// that name); 'backend' is handed a pointer to the 'driver' member.
+Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store)
+ : coll(coll),
+ hoid(oid),
+ driver(store, coll, hoid),
+ backend(&driver)
+{}
+
+// No pending results may remain when the Store is destroyed (flush() is the
+// call that empties 'results').
+Store::~Store()
+{
+ ceph_assert(results.empty());
+}
+
+/// Stores the encoded form of 'e' in the pending results, under the object
+/// key of e.object. An existing entry with the same key is overwritten.
+void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e)
+{
+  const string key = to_object_key(pool, e.object);
+  bufferlist encoded;
+  e.encode(encoded);
+  results[key] = encoded;
+}
+
+/// Stores the encoded form of 'e' in the pending results, under the snap key
+/// of e.object. An existing entry with the same key is overwritten.
+void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e)
+{
+  const string key = to_snap_key(pool, e.object);
+  bufferlist encoded;
+  e.encode(encoded);
+  results[key] = encoded;
+}
+
+// True if there are no pending (not yet flushed) results.
+bool Store::empty() const
+{
+ return results.empty();
+}
+
+/// Writes the pending results into transaction 't' (if one is given), then
+/// drops them. With a null 't' the pending results are simply discarded.
+void Store::flush(ObjectStore::Transaction* t)
+{
+  if (t != nullptr) {
+    auto os_txn = driver.get_transaction(t);
+    backend.set_keys(results, &os_txn);
+  }
+  results.clear();
+}
+
+// Adds the removal of the scrub object to transaction 't'.
+// 't' is dereferenced without a check.
+void Store::cleanup(ObjectStore::Transaction* t)
+{
+ t->remove(coll, hoid);
+}
+
+/// Fetches up to 'max_return' stored snapset errors of the pool. The scan
+/// starts from the key of 'start' or, if 'start' has no name, from the first
+/// possible snap key of the pool.
+std::vector<bufferlist>
+Store::get_snap_errors(int64_t pool,
+                       const librados::object_id_t& start,
+                       uint64_t max_return) const
+{
+  string begin;
+  if (start.name.empty()) {
+    begin = first_snap_key(pool);
+  } else {
+    begin = to_snap_key(pool, start);
+  }
+  return get_errors(begin, last_snap_key(pool), max_return);
+}
+
+/// Fetches up to 'max_return' stored object errors of the pool. The scan
+/// starts from the key of 'start' or, if 'start' has no name, from the first
+/// possible object key of the pool.
+std::vector<bufferlist>
+Store::get_object_errors(int64_t pool,
+                         const librados::object_id_t& start,
+                         uint64_t max_return) const
+{
+  string begin;
+  if (start.name.empty()) {
+    begin = first_object_key(pool);
+  } else {
+    begin = to_object_key(pool, start);
+  }
+  return get_errors(begin, last_object_key(pool), max_return);
+}
+
+/// Collects the values returned by successive backend.get_next() calls,
+/// starting with key 'begin'. Stops once 'max_return' values were collected,
+/// get_next() returns non-zero, or a key not smaller than 'end' is reached.
+std::vector<bufferlist>
+Store::get_errors(const string& begin,
+                  const string& end,
+                  uint64_t max_return) const
+{
+  vector<bufferlist> found;
+  std::pair<string, bufferlist> cursor{begin, bufferlist{}};
+
+  for (; max_return > 0; --max_return) {
+    if (backend.get_next(cursor.first, &cursor) != 0) {
+      break;
+    }
+    if (cursor.first >= end) {
+      break;
+    }
+    found.push_back(cursor.second);
+  }
+  return found;
+}
+
+} // namespace Scrub
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_SCRUB_RESULT_H
+#define CEPH_SCRUB_RESULT_H
+
+#include "osd/SnapMapper.h" // for OSDriver
+#include "common/map_cacher.hpp"
+
+namespace librados {
+ struct object_id_t;
+}
+
+struct inconsistent_obj_wrapper;
+struct inconsistent_snapset_wrapper;
+
+namespace Scrub {
+
+// Holds the inconsistencies found while scrubbing a PG. Errors are first
+// collected in memory ('results'), and are written to a dedicated object by
+// flush().
+class Store {
+public:
+ ~Store();
+ // the only way to create a Store (the constructor is private)
+ static Store* create(ObjectStore* store,
+ ObjectStore::Transaction* t,
+ const spg_t& pgid,
+ const coll_t& coll);
+ // add an encoded error to the pending results
+ void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e);
+ void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e);
+ // are there no pending results?
+ bool empty() const;
+ // write the pending results into the transaction (if not null), and drop them
+ void flush(ObjectStore::Transaction *);
+ // add the removal of the store's object to the transaction
+ void cleanup(ObjectStore::Transaction *);
+ // fetch up to 'max_return' of the stored errors of 'pool'
+ std::vector<ceph::buffer::list> get_snap_errors(int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const;
+ std::vector<ceph::buffer::list> get_object_errors(int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const;
+private:
+ Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store);
+ // the implementation shared by get_snap_errors() & get_object_errors()
+ std::vector<ceph::buffer::list> get_errors(const std::string& start, const std::string& end,
+ uint64_t max_return) const;
+private:
+ const coll_t coll;
+ const ghobject_t hoid;
+ // a temp object holding mappings from seq-id to inconsistencies found in
+ // scrubbing
+ OSDriver driver;
+ // 'mutable', as it is used by the const get_errors()
+ mutable MapCacher::MapCacher<std::string, ceph::buffer::list> backend;
+ // the errors added since the last flush(), by key
+ std::map<std::string, ceph::buffer::list> results;
+};
+}
+
+#endif // CEPH_SCRUB_RESULT_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=2 sw=2 smarttab
+
+#include "./pg_scrubber.h" // the '.' notation used to affect clang-format order
+
+#include <iostream>
+#include <vector>
+
+#include "debug.h"
+
+#include "common/errno.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrubReserve.h"
+
+#include "osd/OSD.h"
+#include "ScrubStore.h"
+#include "scrub_machine.h"
+
+using std::list;
+using std::map;
+using std::pair;
+using std::set;
+using std::stringstream;
+using std::vector;
+using namespace Scrub;
+using namespace std::chrono;
+using namespace std::chrono_literals;
+using namespace std::literals;
+
+#define dout_context (m_pg->get_cct())
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this->m_pg)
+
+/// Builds the prefix used by the dout macros in this file: whatever the PG's
+/// gen_prefix() emits, followed by a "scrubber" tag and the PG id.
+template <class T> static ostream& _prefix(std::ostream* _dout, T* t)
+{
+  auto& out = t->gen_prefix(*_dout);
+  out << " scrubber pg(" << t->pg_id << ") ";
+  return out;
+}
+
+/// Prints the name of each flag that is set, each preceded by a space.
+/// Nothing is printed if no flag is set.
+ostream& operator<<(ostream& out, const scrub_flags_t& sf)
+{
+  auto emit_if = [&out](bool is_set, const char* text) {
+    if (is_set) {
+      out << text;
+    }
+  };
+
+  emit_if(sf.auto_repair, " AUTO_REPAIR");
+  emit_if(sf.check_repair, " CHECK_REPAIR");
+  emit_if(sf.deep_scrub_on_error, " DEEP_SCRUB_ON_ERROR");
+  emit_if(sf.required, " REQ_SCRUB");
+
+  return out;
+}
+
+/// Prints the name of each flag that is set, each preceded by a space.
+/// Nothing is printed if no flag is set.
+ostream& operator<<(ostream& out, const requested_scrub_t& sf)
+{
+  auto emit_if = [&out](bool is_set, const char* text) {
+    if (is_set) {
+      out << text;
+    }
+  };
+
+  emit_if(sf.must_repair, " MUST_REPAIR");
+  emit_if(sf.auto_repair, " planned AUTO_REPAIR");
+  emit_if(sf.check_repair, " planned CHECK_REPAIR");
+  emit_if(sf.deep_scrub_on_error, " planned DEEP_SCRUB_ON_ERROR");
+  emit_if(sf.must_deep_scrub, " MUST_DEEP_SCRUB");
+  emit_if(sf.must_scrub, " MUST_SCRUB");
+  emit_if(sf.time_for_deep, " TIME_FOR_DEEP");
+  emit_if(sf.need_auto, " NEED_AUTO");
+  emit_if(sf.req_scrub, " planned REQ_SCRUB");
+
+  return out;
+}
+
+/*
+ * A message carrying an epoch that precedes the start of the current interval
+ * is stale: PrimaryLogPG::on_change() must have been called when that earlier
+ * interval ended, so such a message can safely be discarded.
+ *
+ * Returns 'true' if the epoch is not older than the PG's 'same interval since'.
+ */
+bool PgScrubber::check_interval(epoch_t epoch_to_verify)
+{
+  const auto interval_start = m_pg->get_same_interval_since();
+  return interval_start <= epoch_to_verify;
+}
+
+/**
+ * Should an event, queued at epoch 'epoch_to_verify', still be handled?
+ *
+ * The event is silently dropped ('false' is returned) if:
+ * - we are not scrubbing (the scrub can be assumed to have terminated);
+ * - the event was queued before the current scrub started;
+ * - a new interval has started since (on_change() has already terminated
+ *   that old scrub).
+ * Otherwise we must be the primary, and the answer depends on whether the
+ * scrub is to be aborted (see verify_against_abort()).
+ */
+bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify)
+{
+  if (!m_active || epoch_to_verify < m_epoch_start ||
+      !check_interval(epoch_to_verify)) {
+    return false;
+  }
+
+  ceph_assert(is_primary());
+
+  // were we instructed to abort?
+  return verify_against_abort(epoch_to_verify);
+}
+
+/**
+ * Returns 'true' if the scrub may carry on, 'false' if it is to be aborted.
+ *
+ * On the first event to notice the abort (i.e. one queued after the
+ * last-recorded abort epoch), the scrub state is cleared and the abort epoch
+ * is updated.
+ */
+bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify)
+{
+  if (should_abort()) {
+    dout(10) << __func__ << " aborting. incoming epoch: " << epoch_to_verify
+             << " vs last-aborted: " << m_last_aborted << dendl;
+
+    // if we were not aware of the abort before - kill the scrub.
+    const bool abort_is_news = epoch_to_verify > m_last_aborted;
+    if (abort_is_news) {
+      scrub_clear_state();
+      m_last_aborted = std::max(epoch_to_verify, m_epoch_start);
+    }
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * Do the 'no scrub' / 'no deep scrub' flags (of the OSD map or of the pool)
+ * forbid the scrub that is now running?
+ *
+ * A 'required' scrub is never aborted. The 'no deep scrub' flags are only
+ * consulted for a deep scrub.
+ */
+bool PgScrubber::should_abort() const
+{
+  if (m_flags.required) {
+    return false;  // not stopping 'required' scrubs for configuration changes
+  }
+
+  const bool deep_is_forbidden =
+    m_is_deep && (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+                  m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB));
+  if (deep_is_forbidden) {
+    dout(10) << "nodeep_scrub set, aborting" << dendl;
+    return true;
+  }
+
+  const bool scrub_is_forbidden =
+    get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+    m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB);
+  if (scrub_is_forbidden) {
+    dout(10) << "noscrub set, aborting" << dendl;
+    return true;
+  }
+
+  return false;
+}
+
+// initiating state-machine events --------------------------------
+
+/*
+ * a note re the checks performed before sending scrub-initiating messages:
+ *
+ * For those ('StartScrub', 'AfterRepairScrub') scrub-initiation messages that
+ * possibly were in the queue while the PG changed state and became unavailable for
+ * scrubbing:
+ *
+ * The check_interval() catches all major changes to the PG. As for the other conditions
+ * we may check (and see is_message_relevant() above):
+ *
+ * - we are not 'active' yet, so must not check against is_active(), and:
+ *
+ * - the 'abort' flags were just verified (when the triggering message was queued). As
+ * those are only modified at human speeds, they need not be queried again.
+ *
+ * Some of the considerations above are also relevant to the replica-side initiation
+ * ('StartReplica' & 'StartReplicaNoWait').
+ */
+
+/// Delivers a 'StartScrub' event to the scrubber's state machine, unless the
+/// request (queued at 'epoch_queued') belongs to a previous interval.
+void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
+{
+  dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
+
+  // we may have lost our Primary status while the message languished in the queue
+  if (!check_interval(epoch_queued)) {
+    return;
+  }
+
+  dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued << dendl;
+  reset_epoch(epoch_queued);
+  m_fsm->my_states();
+  m_fsm->process_event(StartScrub{});
+  dout(10) << "scrubber event --<< StartScrub" << dendl;
+}
+
+/// Delivers an 'AfterRepairScrub' event to the scrubber's state machine,
+/// unless the request (queued at 'epoch_queued') belongs to a previous
+/// interval.
+void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued)
+{
+  dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
+
+  // we may have lost our Primary status while the message languished in the queue
+  if (!check_interval(epoch_queued)) {
+    return;
+  }
+
+  dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued << dendl;
+  reset_epoch(epoch_queued);
+  m_fsm->my_states();
+  m_fsm->process_event(AfterRepairScrub{});
+  dout(10) << "scrubber event --<< AfterRepairScrub" << dendl;
+}
+
+/**
+ * Hands an 'Unblocked' event to the scrub state machine, provided that
+ * is_message_relevant() accepts the epoch at which the request was queued.
+ * The entry and exit log lines are written in either case.
+ */
+void PgScrubber::send_scrub_unblock(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool relevant = is_message_relevant(epoch_queued);
+  if (relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(Unblocked{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands an 'InternalSchedScrub' event to the scrub state machine, provided
+ * that is_message_relevant() accepts the epoch at which the request was queued.
+ * The entry and exit log lines are written in either case.
+ */
+void PgScrubber::send_scrub_resched(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool relevant = is_message_relevant(epoch_queued);
+  if (relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(InternalSchedScrub{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Replica side: starts handling a scrub request received from the Primary.
+ *
+ * The request is ignored (with a log line) if we are the Primary. Otherwise,
+ * if both check_interval() and is_token_current() accept it, the state machine
+ * receives either StartReplica (there are pending active pushes) or
+ * StartReplicaNoWait (there are none).
+ *
+ * @param epoch_queued the epoch at which the request was queued
+ * @param token        the activation token carried by the request
+ */
+void PgScrubber::send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
+           << " token: " << token << dendl;
+
+  if (is_primary()) {
+    // shouldn't happen. Ignore
+    dout(1) << "got a replica scrub request while Primary!" << dendl;
+    return;
+  }
+
+  const bool still_valid = check_interval(epoch_queued) && is_token_current(token);
+  if (still_valid) {
+    m_fsm->my_states();
+
+    // save us some time by not waiting for updates if there are none
+    // to wait for. Affects the transition from NotActive into either
+    // ReplicaWaitUpdates or ActiveReplica.
+    if (!pending_active_pushes()) {
+      m_fsm->process_event(StartReplicaNoWait{});
+    } else {
+      m_fsm->process_event(StartReplica{});
+    }
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands a 'SchedReplica' event to the scrub state machine, provided that both
+ * check_interval() and is_token_current() accept the queued request.
+ *
+ * @param epoch_queued the epoch at which the request was queued
+ * @param token        the activation token carried by the request
+ */
+void PgScrubber::send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
+           << " token: " << token << dendl;
+
+  const bool still_valid = check_interval(epoch_queued) && is_token_current(token);
+  if (still_valid) {
+    m_fsm->my_states();
+    m_fsm->process_event(SchedReplica{});  // retest for map availability
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands an 'ActivePushesUpd' event to the scrub state machine, provided that
+ * is_message_relevant() accepts the epoch at which the request was queued.
+ */
+void PgScrubber::active_pushes_notification(epoch_t epoch_queued)
+{
+  // note: Primary only
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool relevant = is_message_relevant(epoch_queued);
+  if (relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(ActivePushesUpd{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands an 'UpdatesApplied' event to the scrub state machine, provided that
+ * is_message_relevant() accepts the epoch at which the request was queued.
+ */
+void PgScrubber::update_applied_notification(epoch_t epoch_queued)
+{
+  // note: Primary only
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool relevant = is_message_relevant(epoch_queued);
+  if (relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(UpdatesApplied{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands a 'DigestUpdate' event to the scrub state machine, provided that
+ * is_message_relevant() accepts the epoch at which the request was queued.
+ */
+void PgScrubber::digest_update_notification(epoch_t epoch_queued)
+{
+  // note: Primary only
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool relevant = is_message_relevant(epoch_queued);
+  if (relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(DigestUpdate{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands an 'IntLocalMapDone' event to the scrub state machine, provided that
+ * is_message_relevant() accepts the epoch at which the request was queued.
+ */
+void PgScrubber::send_local_map_done(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool relevant = is_message_relevant(epoch_queued);
+  if (relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(Scrub::IntLocalMapDone{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands a 'GotReplicas' event to the scrub state machine, provided that
+ * is_message_relevant() accepts the epoch at which the request was queued.
+ */
+void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool relevant = is_message_relevant(epoch_queued);
+  if (relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(GotReplicas{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands a 'ReplicaPushesUpd' event to the scrub state machine, provided that
+ * check_interval() accepts the epoch at which the request was queued.
+ */
+void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool same_interval = check_interval(epoch_queued);
+  if (same_interval) {
+    m_fsm->my_states();
+    m_fsm->process_event(ReplicaPushesUpd{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands a 'RemotesReserved' event to the scrub state machine, provided that
+ * check_interval() accepts the epoch at which the request was queued.
+ */
+void PgScrubber::send_remotes_reserved(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  // note: scrub is not active yet
+  const bool same_interval = check_interval(epoch_queued);
+  if (same_interval) {
+    m_fsm->my_states();
+    m_fsm->process_event(RemotesReserved{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands a 'ReservationFailure' event to the scrub state machine, provided that
+ * check_interval() accepts the epoch at which the request was queued.
+ */
+void PgScrubber::send_reservation_failure(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  // do not check for 'active'!
+  const bool same_interval = check_interval(epoch_queued);
+  if (same_interval) {
+    m_fsm->my_states();
+    m_fsm->process_event(ReservationFailure{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Unconditionally hands a 'FullReset' event to the scrub state machine.
+ * No interval or relevance check is made; 'epoch_queued' is only logged.
+ */
+void PgScrubber::send_full_reset(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  auto& fsm = *m_fsm;
+  fsm.my_states();
+  fsm.process_event(Scrub::FullReset{});
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands a 'SelectedChunkFree' event to the scrub state machine, provided that
+ * check_interval() accepts the epoch at which the request was queued.
+ */
+void PgScrubber::send_chunk_free(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool same_interval = check_interval(epoch_queued);
+  if (same_interval) {
+    m_fsm->my_states();
+    m_fsm->process_event(Scrub::SelectedChunkFree{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands a 'ChunkIsBusy' event to the scrub state machine, provided that
+ * check_interval() accepts the epoch at which the request was queued.
+ */
+void PgScrubber::send_chunk_busy(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool same_interval = check_interval(epoch_queued);
+  if (same_interval) {
+    m_fsm->my_states();
+    m_fsm->process_event(Scrub::ChunkIsBusy{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Hands a 'NextChunk' event to the scrub state machine, provided that
+ * is_message_relevant() accepts the epoch at which the request was queued.
+ */
+void PgScrubber::send_get_next_chunk(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  const bool relevant = is_message_relevant(epoch_queued);
+  if (relevant) {
+    m_fsm->my_states();
+    m_fsm->process_event(Scrub::NextChunk{});
+  }
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Unconditionally hands a 'ScrubFinished' event to the scrub state machine.
+ * 'epoch_queued' is only logged.
+ */
+void PgScrubber::send_scrub_is_finished(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  // can't check for "active"
+
+  auto& fsm = *m_fsm;
+  fsm.my_states();
+  fsm.process_event(Scrub::ScrubFinished{});
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+/**
+ * Unconditionally hands a 'MapsCompared' event to the scrub state machine.
+ * 'epoch_queued' is only logged.
+ */
+void PgScrubber::send_maps_compared(epoch_t epoch_queued)
+{
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+  auto& fsm = *m_fsm;
+  fsm.my_states();
+  fsm.process_event(Scrub::MapsCompared{});
+
+  dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// -----------------
+
+/// Forwards the query to the scrub state machine's is_reserving().
+bool PgScrubber::is_reserving() const
+{
+  const bool reserving = m_fsm->is_reserving();
+  return reserving;
+}
+
+/**
+ * Re-initializes the per-scrub bookkeeping for a scrub queued at 'epoch_queued'.
+ *
+ * Asserts (via the state machine) that no scrub is active, then records the
+ * starting epoch, re-arms the 'needs sleep' flag, latches the deep-scrub mode
+ * from the PG state and refreshes the displayed operation-mode text.
+ */
+void PgScrubber::reset_epoch(epoch_t epoch_queued)
+{
+  dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl;
+  m_fsm->assert_not_active();
+
+  m_needs_sleep = true;
+  m_epoch_start = epoch_queued;
+
+  // update_op_mode_text() reads m_is_deep, so it must be set first
+  m_is_deep = state_test(PG_STATE_DEEP_SCRUB);
+  update_op_mode_text();
+}
+
+/**
+ * Computes the priority to use when re-queuing a scrub operation.
+ *
+ * @param with_priority when 'high_priority', the result is raised to at least
+ *        the configured osd_client_op_priority
+ * @return the scrub's own priority (m_flags.priority), possibly raised
+ */
+unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
+{
+  const unsigned int base_priority = m_flags.priority;
+  if (with_priority != Scrub::scrub_prio_t::high_priority) {
+    return base_priority;
+  }
+
+  const auto client_op_priority =
+    static_cast<unsigned int>(m_pg->get_cct()->_conf->osd_client_op_priority);
+  return std::max(base_priority, client_op_priority);
+}
+
+/**
+ * Computes the priority to use when re-queuing a scrub operation, starting
+ * from a caller-suggested value.
+ *
+ * @param with_priority      when 'high_priority', the result is raised to at
+ *                           least the configured osd_client_op_priority
+ * @param suggested_priority the priority to start from
+ * @return 'suggested_priority', possibly raised
+ */
+unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+                                                unsigned int suggested_priority) const
+{
+  if (with_priority != Scrub::scrub_prio_t::high_priority) {
+    return suggested_priority;
+  }
+
+  const auto client_op_priority =
+    static_cast<unsigned int>(m_pg->cct->_conf->osd_client_op_priority);
+  return std::max(suggested_priority, client_op_priority);
+}
+
+// ///////////////////////////////////////////////////////////////////// //
+// scrub-op registration handling
+
+/// A scrub counts as registered when the saved registration stamp is non-zero.
+bool PgScrubber::is_scrub_registered() const
+{
+  const bool no_stamp = m_scrub_reg_stamp.is_zero();
+  return !no_stamp;
+}
+
+/**
+ * Registers this PG's next scrub with the OSD (Primary only; a no-op otherwise).
+ *
+ * The stamp handed to the OSD is chosen as follows:
+ *  - 'must_scrub' or 'need_auto' requested: scrub_must_stamp(), flagged 'must';
+ *  - else, PG stats are invalid and osd_scrub_invalid_stats is set: the
+ *    current time, flagged 'must';
+ *  - else: the PG's last scrub stamp.
+ *
+ * The value returned by the OSD is kept in m_scrub_reg_stamp, so that the
+ * registration can be located and removed later. Asserts that no scrub is
+ * currently registered.
+ *
+ * @param request_flags the planned-scrub flags consulted for 'must_scrub' and
+ *        'need_auto'
+ */
+void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags)
+{
+  if (!is_primary()) {
+    // normal. No warning is required.
+    return;
+  }
+
+  dout(10) << __func__ << " planned: must? " << request_flags.must_scrub << " need-auto? "
+           << request_flags.need_auto << " stamp: " << m_pg->info.history.last_scrub_stamp
+           << dendl;
+
+  ceph_assert(!is_scrub_registered());
+
+  const bool explicitly_required = request_flags.must_scrub || request_flags.need_auto;
+  const bool stats_need_refresh = !explicitly_required &&
+                                  m_pg->info.stats.stats_invalid &&
+                                  m_pg->cct->_conf->osd_scrub_invalid_stats;
+  const bool must = explicitly_required || stats_need_refresh;
+
+  utime_t reg_stamp;
+  if (explicitly_required) {
+    // Set the smallest time that isn't utime_t()
+    reg_stamp = PgScrubber::scrub_must_stamp();
+  } else if (stats_need_refresh) {
+    reg_stamp = ceph_clock_now();
+  } else {
+    reg_stamp = m_pg->info.history.last_scrub_stamp;
+  }
+
+  dout(15) << __func__ << " pg(" << m_pg_id << ") must: " << must
+           << " required:" << m_flags.required << " flags: " << request_flags
+           << " stamp: " << reg_stamp << dendl;
+
+  // per-pool overrides of the scrub intervals (0.0 when not set for the pool)
+  const auto& pool_opts = m_pg->pool.info.opts;
+  const double pool_min_interval = pool_opts.value_or(pool_opts_t::SCRUB_MIN_INTERVAL, 0.0);
+  const double pool_max_interval = pool_opts.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0);
+
+  // note the sched_time, so we can locate this scrub, and remove it later
+  m_scrub_reg_stamp = m_osds->reg_pg_scrub(m_pg->info.pgid, reg_stamp, pool_min_interval,
+                                           pool_max_interval, must);
+  dout(15) << __func__ << " pg(" << m_pg_id << ") register next scrub, scrub time "
+           << m_scrub_reg_stamp << ", must = " << (int)must << dendl;
+}
+
+/**
+ * Removes this PG's scrub registration from the OSD, if one exists, and
+ * clears the saved registration stamp.
+ */
+void PgScrubber::unreg_next_scrub()
+{
+  if (!is_scrub_registered()) {
+    return;
+  }
+
+  dout(15) << __func__ << " existing-" << m_scrub_reg_stamp << dendl;
+  m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp);
+  m_scrub_reg_stamp = utime_t{};
+}
+
+/**
+ * Records an explicitly requested scrub in 'req_flags' and re-registers the
+ * PG's next scrub accordingly.
+ *
+ * @param scrub_level shallow or deep
+ * @param scrub_type  whether this is a repair request (a repair implies deep)
+ * @param req_flags   [in/out] the planned-scrub flags to update
+ */
+void PgScrubber::scrub_requested(scrub_level_t scrub_level,
+                                 scrub_type_t scrub_type,
+                                 requested_scrub_t& req_flags)
+{
+  const bool deep_wanted = (scrub_level == scrub_level_t::deep);
+  const bool repair_wanted = (scrub_type == scrub_type_t::do_repair);
+
+  dout(10) << __func__ << (deep_wanted ? " deep " : " shallow ")
+           << (repair_wanted ? " repair-scrub " : " not-repair ")
+           << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered()
+           << dendl;
+
+  unreg_next_scrub();
+
+  req_flags.must_scrub = true;
+  req_flags.must_deep_scrub = deep_wanted || repair_wanted;
+  req_flags.must_repair = repair_wanted;
+  // User might intervene, so clear this
+  req_flags.need_auto = false;
+  req_flags.req_scrub = true;
+
+  dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl;
+
+  reg_next_scrub(req_flags);
+}
+
+/**
+ * Marks 'req_flags' as needing an automatic re-scrub ('need_auto') and
+ * re-registers the PG's next scrub with the updated flags.
+ *
+ * @param req_flags [in/out] the planned-scrub flags to update
+ */
+void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags)
+{
+  dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? "
+           << is_scrub_registered() << dendl;
+
+  // drop the current registration before changing what we register for
+  unreg_next_scrub();
+
+  req_flags.need_auto = true;
+  reg_next_scrub(req_flags);
+}
+
+/**
+ * Tries to acquire the local (this OSD's) scrub resource.
+ *
+ * @return true if the reservation object reports 'reserved'. On failure the
+ *         reservation object is discarded again and false is returned.
+ */
+bool PgScrubber::reserve_local()
+{
+  // try to create the reservation object (which translates into asking the
+  // OSD for the local scrub resource). If failing - undo it immediately
+  m_local_osd_resource.emplace(m_pg, m_osds);
+
+  if (m_local_osd_resource->is_reserved()) {
+    return true;
+  }
+
+  m_local_osd_resource.reset();
+  return false;
+}
+
+// ----------------------------------------------------------------------------
+
+/**
+ * @return true if the PG's 'last update applied' version has reached (or
+ *         passed) the version recorded in m_subset_last_update.
+ */
+bool PgScrubber::has_pg_marked_new_updates() const
+{
+  const auto applied_so_far = m_pg->recovery_state.get_last_update_applied();
+
+  dout(10) << __func__ << " recovery last: " << applied_so_far
+           << " vs. scrub's: " << m_subset_last_update << dendl;
+
+  return applied_so_far >= m_subset_last_update;
+}
+
+/// Records 'e' as the version the current chunk's updates are compared against.
+void PgScrubber::set_subset_last_update(eversion_t e)
+{
+  m_subset_last_update = e;
+
+  dout(15) << __func__ << " last-update: " << e << dendl;
+}
+
+/**
+ * Called when an update was applied. If the state machine is accepting
+ * updates and 'applied_version' has reached m_subset_last_update, an
+ * 'applied update' scrub event is queued via the OSD.
+ *
+ * @param applied_version the version of the update that was just applied
+ */
+void PgScrubber::on_applied_when_primary(const eversion_t& applied_version)
+{
+  // we are only interested in updates if we are the Primary, and in state
+  // WaitLastUpdate
+  if (!m_fsm->is_accepting_updates()) {
+    return;
+  }
+  if (!(applied_version >= m_subset_last_update)) {
+    return;
+  }
+
+  m_osds->queue_scrub_applied_update(m_pg, m_pg->is_scrub_blocking_ops());
+  dout(15) << __func__ << " update: " << applied_version
+           << " vs. required: " << m_subset_last_update << dendl;
+}
+
+/*
+ * Selects the next chunk of objects to scrub, starting at 'm_start'.
+ *
+ * Side effects:
+ *  - m_primary_scrubmap and m_received_maps are cleared;
+ *  - if the candidate range is available for scrubbing: 'm_end' is set to the
+ *    end of the selected range, and 'm_max_end' is advanced if 'm_end' passed it.
+ *
+ * Returns false if the candidate range is not currently available for
+ * scrubbing (in which case 'm_end' is left untouched), or if a debug-requested
+ * 'block' is pending (in which case 'm_end' was already updated). Returns true
+ * otherwise.
+ */
+bool PgScrubber::select_range()
+{
+  m_primary_scrubmap = ScrubMap{};
+  m_received_maps.clear();
+
+  /* get the start and end of our scrub chunk
+   *
+   * Our scrub chunk has an important restriction we're going to need to
+   * respect. We can't let head be start or end.
+   * Using a half-open interval means that if end == head,
+   * we'd scrub/lock head and the clone right next to head in different
+   * chunks which would allow us to miss clones created between
+   * scrubbing that chunk and scrubbing the chunk including head.
+   * This isn't true for any of the other clones since clones can
+   * only be created "just to the left of" head. There is one exception
+   * to this: promotion of clones which always happens to the left of the
+   * left-most clone, but promote_object checks the scrubber in that
+   * case, so it should be ok. Also, it's ok to "miss" clones at the
+   * left end of the range if we are a tier because they may legitimately
+   * not exist (see _scrub).
+   */
+
+  // the configured chunk limits, scaled down by the preemption 'divisor'.
+  // Never less than 3 objects.
+  int min_idx = std::max<int64_t>(
+    3, m_pg->get_cct()->_conf->osd_scrub_chunk_min / preemption_data.chunk_divisor());
+
+  int max_idx = std::max<int64_t>(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max /
+                                             preemption_data.chunk_divisor());
+
+  dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx
+           << " Div: " << preemption_data.chunk_divisor() << dendl;
+
+  hobject_t start = m_start;
+  hobject_t candidate_end;
+  std::vector<hobject_t> objects;
+  int ret = m_pg->get_pgbackend()->objects_list_partial(start, min_idx, max_idx, &objects,
+                                                        &candidate_end);
+  ceph_assert(ret >= 0);
+
+  if (!objects.empty()) {
+
+    // as long as the candidate end is the head of the last listed object:
+    // back off, making that last object the new (excluded) end of the range
+    hobject_t back = objects.back();
+    while (candidate_end.is_head() && candidate_end == back.get_head()) {
+      candidate_end = back;
+      objects.pop_back();
+      if (objects.empty()) {
+        // note: the two literals are concatenated - mind the separating space
+        ceph_assert(0 ==
+                    "Somehow we got more than 2 objects which "
+                    "have the same head but are not clones");
+      }
+      back = objects.back();
+    }
+
+    if (candidate_end.is_head()) {
+      ceph_assert(candidate_end != back.get_head());
+      candidate_end = candidate_end.get_object_boundary();
+    }
+
+  } else {
+    ceph_assert(candidate_end.is_max());
+  }
+
+  // is that range free for us? if not - we will be rescheduled later by whoever
+  // triggered us this time
+
+  if (!m_pg->_range_available_for_scrub(m_start, candidate_end)) {
+    // we'll be requeued by whatever made us unavailable for scrub
+    dout(10) << __func__ << ": scrub blocked somewhere in range "
+             << "[" << m_start << ", " << candidate_end << ")" << dendl;
+    return false;
+  }
+
+  m_end = candidate_end;
+  if (m_end > m_max_end)
+    m_max_end = m_end;
+
+  dout(15) << __func__ << " range selected: " << m_start << " //// " << m_end << " //// "
+           << m_max_end << dendl;
+
+  // debug: be 'blocked' if told so by the 'pg scrub_debug block' asok command
+  if (m_debug_blockrange > 0) {
+    m_debug_blockrange--;
+    return false;
+  }
+  return true;
+}
+
+/**
+ * Selects the next chunk to scrub (see select_range()), then queues either a
+ * 'chunk free' or a 'chunk busy' scrub event to report the outcome.
+ */
+void PgScrubber::select_range_n_notify()
+{
+  const bool chunk_is_free = select_range();
+
+  if (!chunk_is_free) {
+    // we will wait for the objects range to become available for scrubbing
+    dout(10) << __func__ << ": selected chunk is busy" << dendl;
+    m_osds->queue_scrub_chunk_busy(m_pg, Scrub::scrub_prio_t::low_priority);
+    return;
+  }
+
+  // the next chunk to handle is not blocked
+  dout(20) << __func__ << ": selection OK" << dendl;
+  m_osds->queue_scrub_chunk_free(m_pg, Scrub::scrub_prio_t::low_priority);
+}
+
+/**
+ * Should a write to 'soid' be blocked by the scrub of the current chunk?
+ *
+ * @return false if 'soid' is outside [m_start, m_end), if the scrub was
+ *         already preempted, or if the scrub is preemptable (in which case it
+ *         is preempted here, and the scrubbed range is released).
+ *         true otherwise.
+ */
+bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid)
+{
+  const bool outside_chunk = (soid < m_start || soid >= m_end);
+  if (outside_chunk) {
+    return false;
+  }
+
+  dout(20) << __func__ << " " << soid << " can preempt? "
+           << preemption_data.is_preemptable() << " already preempted? "
+           << preemption_data.was_preempted() << dendl;
+
+  if (preemption_data.was_preempted()) {
+    // otherwise - write requests arriving while 'already preempted' is set
+    // but 'preemptable' is not - will not be allowed to continue, and will
+    // not be requeued on time.
+    return false;
+  }
+
+  if (!preemption_data.is_preemptable()) {
+    // the scrub cannot be interrupted: the write must wait
+    return true;
+  }
+
+  dout(10) << __func__ << " " << soid << " preempted" << dendl;
+
+  // signal the preemption
+  preemption_data.do_preempt();
+  m_end = m_start;  // free the range we were scrubbing
+
+  return false;
+}
+
+/**
+ * @return true if the closed range [start, end] intersects the half-open
+ *         range [m_start, m_max_end).
+ */
+bool PgScrubber::range_intersects_scrub(const hobject_t& start, const hobject_t& end)
+{
+  // does [start, end] intersect [scrubber.start, scrubber.m_max_end)
+  if (!(start < m_max_end)) {
+    return false;
+  }
+  return end >= m_start;
+}
+
+/**
+ * Creates a 'blocked range' warning object for this PG, constructed with a
+ * 300 seconds time span.
+ */
+Scrub::BlockedRangeWarning PgScrubber::acquire_blocked_alarm()
+{
+  const ceph::timespan alarm_span{300s};
+  return std::make_unique<blocked_range_t>(m_osds, alarm_span, m_pg_id);
+}
+
+/**
+ * Arranges for the scrub to be rescheduled, possibly after a sleep.
+ *
+ * The currently held object range is released first. Then:
+ *  - if a sleep is due ('m_needs_sleep' is set and the OSD-provided sleep
+ *    time is at least 1ms): a timer callback is registered. When fired, the
+ *    callback locks the PG (bailing out if it cannot be found), re-arms
+ *    'm_needs_sleep' and requeues the scrub at low priority;
+ *  - otherwise: the scrub is requeued immediately, at high priority.
+ */
+void PgScrubber::add_delayed_scheduling()
+{
+  m_end = m_start;  // not blocking any range now
+
+  milliseconds sleep_time{0ms};
+  if (m_needs_sleep) {
+    const double sleep_ms = 1000.0 * m_osds->osd->scrub_sleep_time(m_flags.required);
+    sleep_time = milliseconds{long(sleep_ms)};
+  }
+  dout(15) << __func__ << " sleep: " << sleep_time.count() << "ms. needed? "
+           << m_needs_sleep << dendl;
+
+  if (sleep_time.count() == 0) {
+    // just a requeue
+    m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority);
+    return;
+  }
+
+  // schedule a transition for some 'sleep_time' ms in the future
+
+  m_needs_sleep = false;
+  m_sleep_started_at = ceph_clock_now();
+
+  // the following log line is used by osd-scrub-test.sh
+  dout(20) << __func__ << " scrub state is PendingTimer, sleeping" << dendl;
+
+  // the 'delayer' for crimson is different. Will be factored out.
+
+  spg_t pgid = m_pg->get_pgid();
+  auto wakeup = new LambdaContext([osds = m_osds, pgid,
+                                   scrubber = this]([[maybe_unused]] int r) mutable {
+    // the PG is returned locked (and is unlocked below)
+    PGRef locked_pg = osds->osd->lookup_lock_pg(pgid);
+    if (!locked_pg) {
+      lgeneric_subdout(g_ceph_context, osd, 10)
+        << "scrub_requeue_callback: Could not find "
+        << "PG " << pgid << " can't complete scrub requeue after sleep" << dendl;
+      return;
+    }
+
+    scrubber->m_needs_sleep = true;
+    lgeneric_dout(scrubber->get_pg_cct(), 7)
+      << "scrub_requeue_callback: slept for "
+      << ceph_clock_now() - scrubber->m_sleep_started_at << ", re-queuing scrub" << dendl;
+
+    scrubber->m_sleep_started_at = utime_t{};
+    osds->queue_for_scrub_resched(&(*locked_pg), Scrub::scrub_prio_t::low_priority);
+    locked_pg->unlock();
+  });
+
+  // the timer expects the delay in seconds
+  std::lock_guard l(m_osds->sleep_lock);
+  m_osds->sleep_timer.add_event_after(sleep_time.count() / 1000.0f, wakeup);
+}
+
+/**
+ * Looks for a log entry that touches an object in the current chunk,
+ * [m_start, m_end). The projected log is searched first, then the PG log;
+ * both are scanned in reverse order.
+ *
+ * @return the version of the first matching entry found, or a
+ *         default-constructed eversion_t if there is none.
+ */
+eversion_t PgScrubber::search_log_for_updates() const
+{
+  // does this log entry refer to an object inside the chunk being scrubbed?
+  auto touches_chunk = [this](const auto& e) -> bool {
+    return e.soid >= m_start && e.soid < m_end;
+  };
+
+  auto& projected = m_pg->projected_log.log;
+  auto in_projected = find_if(projected.crbegin(), projected.crend(), touches_chunk);
+  if (in_projected != projected.crend()) {
+    return in_projected->version;
+  }
+
+  // there was no relevant update entry in the projected log
+
+  auto& log = m_pg->recovery_state.get_pg_log().get_log().log;
+  auto in_log = find_if(log.crbegin(), log.crend(), touches_chunk);
+  if (in_log != log.crend()) {
+    return in_log->version;
+  }
+
+  return eversion_t{};
+}
+
+/**
+ * Asks every shard in the acting/recovery/backfill set (other than ourselves)
+ * to scan the current chunk and send back its scrub map. Each request is
+ * noted in m_maps_status, so that we know which maps are awaited.
+ *
+ * @param replica_can_preempt forwarded to the replicas in the request
+ */
+void PgScrubber::get_replicas_maps(bool replica_can_preempt)
+{
+  dout(10) << __func__ << " started in epoch/interval: " << m_epoch_start << "/"
+           << m_interval_start
+           << " pg same_interval_since: " << m_pg->info.history.same_interval_since
+           << dendl;
+
+  m_primary_scrubmap_pos.reset();
+
+  // ask replicas to scan and send maps
+  for (const auto& shard : m_pg->get_acting_recovery_backfill()) {
+    if (shard != m_pg_whoami) {
+      m_maps_status.mark_replica_map_request(shard);
+      _request_scrub_map(shard, m_subset_last_update, m_start, m_end, m_is_deep,
+                         replica_can_preempt);
+    }
+  }
+
+  dout(10) << __func__ << " awaiting" << m_maps_status << dendl;
+}
+
+/**
+ * @return true if the PG's 'same_interval_since' is newer than the interval
+ *         recorded in m_interval_start.
+ */
+bool PgScrubber::was_epoch_changed() const
+{
+  // for crimson we have m_pg->get_info().history.same_interval_since
+  const auto pg_interval = m_pg->get_history().same_interval_since;
+
+  dout(10) << __func__ << " epoch_start: " << m_interval_start
+           << " from pg: " << pg_interval << dendl;
+
+  return m_interval_start < pg_interval;
+}
+
+/// Notes (in the maps-status tracker) that the local scrub map is ready.
+void PgScrubber::mark_local_map_ready()
+{
+  auto& maps_tracker = m_maps_status;
+  maps_tracker.mark_local_map_ready();
+}
+
+/// Forwards the query to the maps-status tracker.
+bool PgScrubber::are_all_maps_available() const
+{
+  const auto& maps_tracker = m_maps_status;
+  return maps_tracker.are_all_maps_available();
+}
+
+/// Returns the maps-status tracker's textual dump.
+std::string PgScrubber::dump_awaited_maps() const
+{
+  const auto& maps_tracker = m_maps_status;
+  return maps_tracker.dump();
+}
+
+/**
+ * Refreshes the displayed operation-mode text (m_mode_desc):
+ * "repair" if the PG state has PG_STATE_REPAIR set; otherwise "deep-scrub" or
+ * "scrub", depending on m_is_deep.
+ */
+void PgScrubber::update_op_mode_text()
+{
+  const auto visible_repair = state_test(PG_STATE_REPAIR);
+
+  if (visible_repair) {
+    m_mode_desc = "repair";
+  } else if (m_is_deep) {
+    m_mode_desc = "deep-scrub";
+  } else {
+    m_mode_desc = "scrub";
+  }
+
+  dout(10) << __func__ << ": repair: visible: " << (visible_repair ? "true" : "false")
+           << ", internal: " << (m_is_repair ? "true" : "false")
+           << ". Displayed: " << m_mode_desc << dendl;
+}
+
+/**
+ * Sends a MOSDRepScrub message to 'replica', asking for its scrub map of
+ * the objects in the given range.
+ *
+ * @param replica          the shard to query (must not be ourselves)
+ * @param version          the version carried in the request
+ * @param start            start of the object range
+ * @param end              end of the object range
+ * @param deep             is a deep scrub requested?
+ * @param allow_preemption may the replica preempt this scrub?
+ */
+void PgScrubber::_request_scrub_map(pg_shard_t replica,
+                                    eversion_t version,
+                                    hobject_t start,
+                                    hobject_t end,
+                                    bool deep,
+                                    bool allow_preemption)
+{
+  ceph_assert(replica != m_pg_whoami);
+  dout(10) << __func__ << " scrubmap from osd." << replica
+           << (deep ? " deep" : " shallow") << dendl;
+
+  const spg_t replica_pgid(m_pg->info.pgid.pgid, replica.shard);
+  auto request =
+    new MOSDRepScrub(replica_pgid, version, get_osdmap_epoch(),
+                     m_pg->get_last_peering_reset(), start, end, deep, allow_preemption,
+                     m_flags.priority, m_pg->ops_blocked_by_scrub());
+
+  // default priority. We want the replica-scrub processed prior to any recovery
+  // or client io messages (we are holding a lock!)
+  m_osds->send_message_osd_cluster(replica.osd, request, get_osdmap_epoch());
+}
+
+/**
+ * Disposes of the current scrub store, if there is one.
+ *
+ * The store's cleanup operations are added to transaction 't'. Ownership of
+ * the store object is handed to a completion context registered on 't', so
+ * the store object is kept alive for as long as that context exists.
+ * On return, m_store is empty.
+ */
+void PgScrubber::cleanup_store(ObjectStore::Transaction* t)
+{
+  if (!m_store) {
+    return;
+  }
+
+  // a do-nothing completion, used only as the owner of the retired store
+  struct StoreKeeper : Context {
+    std::unique_ptr<Scrub::Store> kept_store;
+    explicit StoreKeeper(std::unique_ptr<Scrub::Store>&& to_keep)
+        : kept_store(std::move(to_keep))
+    {}
+    void finish(int) override {}
+  };
+
+  m_store->cleanup(t);
+  t->register_on_complete(new StoreKeeper(std::move(m_store)));
+  ceph_assert(!m_store);
+}
+
+void PgScrubber::on_init()  // scrub-session setup (see also on_replica_init())
+{
+ // going upwards from 'inactive'
+ ceph_assert(!is_scrub_active());
+
+ preemption_data.reset();
+ m_pg->publish_stats_to_osd();
+ m_interval_start = m_pg->get_history().same_interval_since;  // the interval this scrub starts in
+
+ dout(10) << __func__ << " start same_interval:" << m_interval_start << dendl;
+
+ // create a new store
+ {
+ ObjectStore::Transaction t;
+ cleanup_store(&t);  // retire the previous store (if any) as part of the same transaction
+ m_store.reset(
+ Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll));
+ m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+ }
+
+ m_start = m_pg->info.pgid.pgid.get_hobj_start();  // scrubbing starts at the first object of the PG
+ m_active = true;
+}
+
+/// Replica-side initialization: only marks the scrubber as active.
+void PgScrubber::on_replica_init()
+{
+  m_active = true;
+}
+
+/**
+ * Verifies (and, if needed, repairs) the snap-mapper entries of the clones
+ * listed in 'smap'.
+ *
+ * The map is scanned in reverse order. Whenever a head object is encountered,
+ * its SnapSet is decoded and remembered. Each clone that follows is expected
+ * to belong to that head: the snaps listed for it in the SnapSet's
+ * 'clone_snaps' are compared with what the snap mapper holds. On a mismatch
+ * (or a missing mapper entry) the mapper is fixed, an error is sent to the
+ * cluster log, and we wait for the repair transaction to be applied.
+ *
+ * Heads lacking the SnapSet attribute (or carrying an undecodable one), and
+ * clones for which no matching head or 'clone_snaps' entry is known, are skipped.
+ */
+void PgScrubber::_scan_snaps(ScrubMap& smap)
+{
+  hobject_t head;
+  SnapSet snapset;
+
+  // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings
+  // in this function
+  dout(15) << "_scan_snaps starts" << dendl;
+
+  for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) {
+
+    const hobject_t& hoid = i->first;
+    ScrubMap::object& o = i->second;
+
+    dout(20) << __func__ << " " << hoid << dendl;
+
+    ceph_assert(!hoid.is_snapdir());
+    if (hoid.is_head()) {
+      // parse the SnapSet
+      bufferlist bl;
+      auto ss_attr = o.attrs.find(SS_ATTR);
+      if (ss_attr == o.attrs.end()) {
+        continue;
+      }
+      bl.push_back(ss_attr->second);
+      auto p = bl.cbegin();
+      try {
+        decode(snapset, p);
+      } catch (...) {
+        // undecodable SnapSet: this head is skipped ('head' is not updated)
+        continue;
+      }
+      head = hoid.get_head();
+      continue;
+    }
+
+    if (hoid.snap < CEPH_MAXSNAP) {
+      // check and if necessary fix snap_mapper
+      if (hoid.get_head() != head) {
+        derr << __func__ << " no head for " << hoid << " (have " << head << ")" << dendl;
+        continue;
+      }
+      set<snapid_t> obj_snaps;
+      auto p = snapset.clone_snaps.find(hoid.snap);
+      if (p == snapset.clone_snaps.end()) {
+        derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset << dendl;
+        continue;
+      }
+      obj_snaps.insert(p->second.begin(), p->second.end());
+      set<snapid_t> cur_snaps;
+      int r = m_pg->snap_mapper.get_snaps(hoid, &cur_snaps);
+      if (r != 0 && r != -ENOENT) {
+        derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
+        ceph_abort();
+      }
+      if (r == -ENOENT || cur_snaps != obj_snaps) {
+        ObjectStore::Transaction t;
+        OSDriver::OSTransaction _t(m_pg->osdriver.get_transaction(&t));
+        if (r == 0) {
+          // the mapper has an entry, but a wrong one: remove it before re-adding
+          r = m_pg->snap_mapper.remove_oid(hoid, &_t);
+          if (r != 0) {
+            derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
+            ceph_abort();
+          }
+          m_pg->osd->clog->error()
+            << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
+            << m_pg->info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps
+            << ", oi: " << obj_snaps << "...repaired";
+        } else {
+          m_pg->osd->clog->error()
+            << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
+            << m_pg->info.pgid << " oid " << hoid << " snaps missing in mapper"
+            << ", should be: " << obj_snaps << " was " << cur_snaps << " r " << r
+            << "...repaired";
+        }
+        m_pg->snap_mapper.add_oid(hoid, obj_snaps, &_t);
+
+        // wait for repair to apply to avoid confusing other bits of the system.
+        {
+          dout(15) << __func__ << " wait on repair!" << dendl;
+
+          ceph::condition_variable my_cond;
+          ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock");
+          int e = 0;
+          // explicitly initialized: 'done' is read by the wait predicate below
+          bool done = false;
+
+          t.register_on_applied_sync(new C_SafeCond(my_lock, my_cond, &done, &e));
+
+          e = m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t));
+          if (e != 0) {
+            derr << __func__ << ": queue_transaction got " << cpp_strerror(e) << dendl;
+          } else {
+            std::unique_lock l{my_lock};
+            my_cond.wait(l, [&done] { return done; });
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * Builds (or continues building) the Primary's scrub map for the current chunk.
+ *
+ * @return the value returned by build_scrub_map_chunk(). If that is
+ *         -EINPROGRESS, a scrub 'resched' is queued (at low priority) so that
+ *         the map building is continued.
+ */
+int PgScrubber::build_primary_map_chunk()
+{
+  const epoch_t map_building_since = m_pg->get_osdmap_epoch();
+  dout(20) << __func__ << ": initiated at epoch " << map_building_since << dendl;
+
+  const auto build_result = build_scrub_map_chunk(m_primary_scrubmap, m_primary_scrubmap_pos,
+                                                  m_start, m_end, m_is_deep);
+
+  const bool more_to_do = (build_result == -EINPROGRESS);
+  if (more_to_do) {
+    // reschedule another round of asking the backend to collect the scrub data
+    m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::low_priority);
+  }
+
+  return build_result;
+}
+
+/**
+ * Replica side: builds (or continues building) the scrub map of the chunk
+ * requested by the Primary.
+ *
+ *  - -EINPROGRESS from build_scrub_map_chunk(): a replica 'resched' is queued;
+ *  - 0: the map is complete. The snap mapper is checked against it, the reply
+ *    message is prepared, the replica-side scrub state is cleared, and only
+ *    then is the reply sent to the Primary;
+ *  - any other value: an error. Logged, and the replica-side scrub state is
+ *    cleared (also asserting, if assert() is enabled).
+ *
+ * @return the value returned by build_scrub_map_chunk()
+ */
+int PgScrubber::build_replica_map_chunk()
+{
+  dout(10) << __func__ << " interval start: " << m_interval_start
+           << " current token: " << m_current_token << " epoch: " << m_epoch_start
+           << " deep: " << m_is_deep << dendl;
+
+  const auto build_result = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos,
+                                                  m_start, m_end, m_is_deep);
+
+  if (build_result == -EINPROGRESS) {
+
+    // must wait for the backend to finish. No external event source.
+    // (note: previous version used low priority here. Now switched to using the
+    // priority of the original message)
+    m_osds->queue_for_rep_scrub_resched(m_pg, m_replica_request_priority,
+                                        m_flags.priority, m_current_token);
+
+  } else if (build_result == 0) {
+
+    // finished!
+    m_cleaned_meta_map.clear_from(m_start);
+    m_cleaned_meta_map.insert(replica_scrubmap);
+    auto for_meta_scrub = clean_meta_map();
+    _scan_snaps(for_meta_scrub);
+
+    // the local map has been created. Send it to the primary.
+    // Note: once the message reaches the Primary, it may ask us for another
+    // chunk - and we better be done with the current scrub. Thus - the preparation of
+    // the reply message is separate, and we clear the scrub state before actually
+    // sending it.
+
+    auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption);
+    replica_handling_done();
+    dout(15) << __func__ << " chunk map sent " << dendl;
+    send_replica_map(reply);
+
+  } else {
+
+    // negative retval: build_scrub_map_chunk() signalled an error
+    // Pre-Pacific code ignored this option, treating it as a success.
+    // \todo Add an error flag in the returning message.
+    dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << build_result
+            << dendl;
+    replica_handling_done();
+    // only in debug mode for now:
+    assert(false && "backend error");
+  }
+
+  return build_result;
+}
+
+/**
+ * Builds the scrub map of the objects in [start, end), in steps.
+ *
+ * First call for a chunk ('pos' is empty): the objects in the range are
+ * listed. If any were found, -EINPROGRESS is returned, and the scan itself is
+ * left for the following calls. On subsequent calls the backend is asked to
+ * scan the listed objects; the function returns -EINPROGRESS whenever the
+ * backend does.
+ *
+ * @param map   the scrub map being filled
+ * @param pos   the progress tracker for this map
+ * @param start start of the object range
+ * @param end   end of the object range (exclusive)
+ * @param deep  is this a deep scrub?
+ * @return 0 when the map is complete; -EINPROGRESS if another call is
+ *         required; the (negative) listing error otherwise.
+ */
+int PgScrubber::build_scrub_map_chunk(
+  ScrubMap& map, ScrubMapBuilder& pos, hobject_t start, hobject_t end, bool deep)
+{
+  dout(10) << __func__ << " [" << start << "," << end << ") "
+           << " pos " << pos << " Deep: " << deep << dendl;
+
+  // start
+  if (pos.empty()) {
+
+    pos.deep = deep;
+    map.valid_through = m_pg->info.last_update;
+
+    // objects
+    vector<ghobject_t> rollback_obs;
+    pos.ret =
+      m_pg->get_pgbackend()->objects_list_range(start, end, &pos.ls, &rollback_obs);
+    dout(10) << __func__ << " while pos empty " << pos.ret << dendl;
+    if (pos.ret < 0) {
+      dout(5) << "objects_list_range error: " << pos.ret << dendl;
+      return pos.ret;
+    }
+
+    const bool nothing_listed = pos.ls.empty();
+    dout(10) << __func__ << " pos.ls.empty()? " << (nothing_listed ? "+" : "-") << dendl;
+    if (!nothing_listed) {
+      // there are objects to scan. That will be done in the next call(s)
+      m_pg->_scan_rollback_obs(rollback_obs);
+      pos.pos = 0;
+      return -EINPROGRESS;
+    }
+  }
+
+  // scan objects
+  while (!pos.done()) {
+
+    const int scan_ret = m_pg->get_pgbackend()->be_scan_list(map, pos);
+    dout(30) << __func__ << " BE returned " << scan_ret << dendl;
+    if (scan_ret == -EINPROGRESS) {
+      dout(20) << __func__ << " in progress" << dendl;
+      return scan_ret;
+    }
+  }
+
+  // finish
+  dout(20) << __func__ << " finishing" << dendl;
+  ceph_assert(pos.done());
+  m_pg->_repair_oinfo_oid(map);
+
+  dout(20) << __func__ << " done, got " << map.objects.size() << " items" << dendl;
+  return 0;
+}
+
+/*
+ * Process:
+ * Building a map of objects suitable for snapshot validation.
+ * The data in m_cleaned_meta_map is the left over partial items that need to
+ * be completed before they can be processed.
+ *
+ * Snapshots in maps precede the head object, which is why we are scanning backwards.
+ *
+ * The entries moved into the returned map are removed from m_cleaned_meta_map:
+ * - if the scrubbed range ends at 'max', or m_cleaned_meta_map is empty: all of
+ *   m_cleaned_meta_map is swapped into the returned map;
+ * - else, if the last entry has_snapset(): all entries are moved;
+ * - otherwise: the trailing run of entries sharing the last entry's head is
+ *   left in m_cleaned_meta_map, and only the entries preceding that run are moved.
+ */
+ScrubMap PgScrubber::clean_meta_map()
+{
+ ScrubMap for_meta_scrub;
+
+ if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) {
+ m_cleaned_meta_map.swap(for_meta_scrub);
+ } else {
+ auto iter = m_cleaned_meta_map.objects.end();
+ --iter; // not empty, see 'if' clause
+ auto begin = m_cleaned_meta_map.objects.begin();
+ if (iter->first.has_snapset()) {
+ ++iter;  // the last entry is complete: the whole map can be handed over
+ } else {
+ while (iter != begin) {
+ auto next = iter--;  // 'next' is the later entry, 'iter' the one preceding it
+ if (next->first.get_head() != iter->first.get_head()) {
+ ++iter;  // back to the first entry of the trailing (incomplete) run
+ break;
+ }
+ }
+ }
+ for_meta_scrub.objects.insert(begin, iter);  // hand over [begin, iter) ...
+ m_cleaned_meta_map.objects.erase(begin, iter);  // ... and keep only the remainder
+ }
+
+ return for_meta_scrub;
+}
+
+/**
+ * Completes (with a result of 0) all the callbacks currently registered in
+ * m_callbacks. The list is emptied before any callback is run.
+ */
+void PgScrubber::run_callbacks()
+{
+  // detach the registered callbacks first, leaving m_callbacks empty
+  std::list<Context*> pending;
+  pending.swap(m_callbacks);
+
+  for (Context* callback : pending) {
+    callback->complete(0);
+  }
+}
+
+/**
+ * Compares the collected scrub maps, then wraps up the handling of the
+ * current chunk: the next chunk will start where this one ended, registered
+ * callbacks are run, waiting operations are requeued, and a 'maps compared'
+ * scrub event is queued.
+ */
+void PgScrubber::maps_compare_n_cleanup()
+{
+  scrub_compare_maps();
+
+  // done with [m_start, m_end): the next chunk starts at the end of this one
+  m_start = m_end;
+
+  run_callbacks();
+  requeue_waiting();
+
+  m_osds->queue_scrub_maps_compared(m_pg, Scrub::scrub_prio_t::low_priority);
+}
+
+/// Gives access to the scrubber's preemption data, via its preemption_t interface.
+Scrub::preemption_t& PgScrubber::get_preemptor()
+{
+  Scrub::preemption_t& preemptor = preemption_data;
+  return preemptor;
+}
+
+/*
+ * Process note: called for the arriving "give me your map, replica!" request. Unlike
+ * the original implementation, we do not requeue the Op waiting for
+ * updates. Instead - we trigger the FSM.
+ *
+ * A request whose map epoch precedes the PG's current interval is discarded.
+ * Otherwise the replica-side scrub state is (re)initialized from the message,
+ * and a replica-scrub event is queued via the OSD.
+ */
+void PgScrubber::replica_scrub_op(OpRequestRef op)
+{
+  op->mark_started();
+  auto msg = op->get_req<MOSDRepScrub>();
+  dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch
+           << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl;
+
+  // are we still processing a previous scrub-map request without noticing that the
+  // interval changed? won't see it here, but rather at the reservation stage.
+
+  const auto interval_since = m_pg->info.history.same_interval_since;
+  if (msg->map_epoch < interval_since) {
+    dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch
+             << " < " << interval_since << dendl;
+
+    // is there a general sync issue? are we holding a stale reservation?
+    // not checking now - assuming we will actively react to interval change.
+
+    return;
+  }
+
+  // start with a clean map and a clean map-builder
+  replica_scrubmap = ScrubMap{};
+  replica_scrubmap_pos = ScrubMapBuilder{};
+
+  // the chunk to scan, as dictated by the Primary
+  m_replica_min_epoch = msg->min_epoch;
+  m_start = msg->start;
+  m_end = msg->end;
+  m_max_end = msg->end;
+  m_is_deep = msg->deep;
+  m_interval_start = interval_since;
+
+  // priorities: taken from the request (if specified there)
+  m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority
+                                                  : Scrub::scrub_prio_t::low_priority;
+  m_flags.priority = msg->priority ? msg->priority : m_pg->get_scrub_priority();
+
+  preemption_data.reset();
+  preemption_data.force_preemptability(msg->allow_preemption);
+
+  replica_scrubmap_pos.reset();
+
+  // make sure the FSM is at NotActive
+  m_fsm->assert_not_active();
+
+  m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority,
+                              m_current_token);
+}
+
+ /*
+  * Translates the planned-scrub request into the scrubber's working flags and the
+  * matching PG state bits, for a scrub that is about to start.
+  *
+  * @param request the requested-scrub flags to be applied
+  */
+ void PgScrubber::set_op_parameters(requested_scrub_t& request)
+ {
+   dout(10) << __func__ << " input: " << request << dendl;
+
+   // the epoch in which this scrub starts is remembered, and later used
+   // to discard stale messages from previous aborted scrubs.
+   m_epoch_start = m_pg->get_osdmap_epoch();
+
+   m_flags.check_repair = request.check_repair;
+   m_flags.auto_repair = request.auto_repair || request.need_auto;
+   m_flags.required = request.req_scrub || request.must_scrub;
+
+   // 'must' and 'need_auto' scrubs use the configured requested-scrub priority;
+   // all other scrubs use the PG's own scrub priority
+   const bool use_requested_priority = request.must_scrub || request.need_auto;
+   if (use_requested_priority) {
+     m_flags.priority = get_pg_cct()->_conf->osd_requested_scrub_priority;
+   } else {
+     m_flags.priority = m_pg->get_scrub_priority();
+   }
+
+   state_set(PG_STATE_SCRUBBING);
+
+   // will this be a deep scrub?
+   const bool going_deep =
+     request.must_deep_scrub || request.need_auto || request.time_for_deep;
+   if (going_deep) {
+     state_set(PG_STATE_DEEP_SCRUB);
+   }
+
+   // m_is_repair controls the scrubber's behavior: it is set both for an explicit
+   // 'must_repair' and for 'repair-on-the-go' (auto_repair).
+   // PG_STATE_REPAIR, on the other hand, is used only for status reporting (inc. the
+   // PG status as appearing in the logs), and is set only for 'must_repair'.
+   m_is_repair = request.must_repair || m_flags.auto_repair;
+   if (request.must_repair) {
+     // update_op_mode_text() is not called yet, as m_is_deep is not set yet
+     state_set(PG_STATE_REPAIR);
+   }
+
+   // the publishing here seems to be required for tests synchronization
+   m_pg->publish_stats_to_osd();
+   m_flags.deep_scrub_on_error = request.deep_scrub_on_error;
+ }
+
+ /*
+  * Analyzes the scrub maps collected for the current chunk (the primary's own map
+  * and the maps received from the other shards):
+  * - builds the per-shard map table and the 'master set' of all objects seen;
+  * - runs the backend omap checks (findings are logged as a cluster warning);
+  * - if there is more than one shard, lets the backend compare the maps, and
+  *   records the authoritative peers for each object reported by the comparison;
+  * - runs the snapshot-metadata checks and the snaps scan on the cleaned map;
+  * - flushes the scrub store (the results are discarded when repairing).
+  */
+ void PgScrubber::scrub_compare_maps()
+ {
+ dout(10) << __func__ << " has maps, analyzing" << dendl;
+
+ // construct authoritative scrub map for type-specific scrubbing
+ m_cleaned_meta_map.insert(m_primary_scrubmap);
+ map<hobject_t, pair<std::optional<uint32_t>, std::optional<uint32_t>>> missing_digest;
+
+ // per-shard table of scrub maps: our own map, plus one received map per other shard
+ map<pg_shard_t, ScrubMap*> maps;
+ maps[m_pg_whoami] = &m_primary_scrubmap;
+
+ for (const auto& i : m_pg->get_acting_recovery_backfill()) {
+ if (i == m_pg_whoami)
+ continue;
+ dout(2) << __func__ << " replica " << i << " has "
+ << m_received_maps[i].objects.size() << " items" << dendl;
+ maps[i] = &m_received_maps[i];
+ }
+
+ set<hobject_t> master_set;
+
+ // Construct master set
+ for (const auto& map : maps) {
+ for (const auto& i : map.second->objects) {
+ master_set.insert(i.first);
+ }
+ }
+
+ stringstream ss;
+ m_pg->get_pgbackend()->be_omap_checks(maps, master_set, m_omap_stats, ss);
+
+ if (!ss.str().empty()) {
+ m_osds->clog->warn(ss);
+ }
+
+ if (m_pg->recovery_state.get_acting_recovery_backfill().size() > 1) {
+
+ dout(10) << __func__ << " comparing replica scrub maps" << dendl;
+
+ // Map from object with errors to good peer
+ map<hobject_t, list<pg_shard_t>> authoritative;
+
+ dout(2) << __func__ << ": primary (" << m_pg->get_primary() << ") has "
+ << m_primary_scrubmap.objects.size() << " items" << dendl;
+
+ // reuse the stream: drop the omap-check text and clear the state flags
+ ss.str("");
+ ss.clear();
+
+ m_pg->get_pgbackend()->be_compare_scrubmaps(
+ maps, master_set, m_is_repair, m_missing, m_inconsistent,
+ authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(),
+ m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss);
+
+ if (!ss.str().empty()) {
+ m_osds->clog->error(ss);
+ }
+
+ // for each object in 'authoritative': remember the (object entry, shard) pairs of
+ // its good peers
+ for (auto& i : authoritative) {
+ list<pair<ScrubMap::object, pg_shard_t>> good_peers;
+ for (list<pg_shard_t>::const_iterator j = i.second.begin(); j != i.second.end();
+ ++j) {
+ good_peers.emplace_back(maps[*j]->objects[i.first], *j);
+ }
+ m_authoritative.emplace(i.first, good_peers);
+ }
+
+ // replace the entry in the cleaned map with the one held by the last listed
+ // good peer
+ for (auto i = authoritative.begin(); i != authoritative.end(); ++i) {
+ m_cleaned_meta_map.objects.erase(i->first);
+ m_cleaned_meta_map.objects.insert(
+ *(maps[i->second.back()]->objects.find(i->first)));
+ }
+ }
+
+ auto for_meta_scrub = clean_meta_map();
+
+ // ok, do the pg-type specific scrubbing
+
+ // (Validates consistency of the object info and snap sets)
+ scrub_snapshot_metadata(for_meta_scrub, missing_digest);
+
+ // Called here on the primary can use an authoritative map if it isn't the primary
+ _scan_snaps(for_meta_scrub);
+
+ if (!m_store->empty()) {
+
+ if (m_is_repair) {
+ dout(10) << __func__ << ": discarding scrub results" << dendl;
+ m_store->flush(nullptr);
+ } else {
+ dout(10) << __func__ << ": updating scrub object" << dendl;
+ ObjectStore::Transaction t;
+ m_store->flush(&t);
+ m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+ }
+ }
+ }
+
+ /*
+  * Builds (but does not send) the replica's scrub-map reply for the Primary.
+  *
+  * @param was_preempted whether the 'preempted' flag should be set in the reply
+  * @return the prepared message, paired with the replica's minimum epoch
+  */
+ ScrubMachineListener::MsgAndEpoch PgScrubber::prep_replica_map_msg(
+   PreemptionNoted was_preempted)
+ {
+   dout(10) << __func__ << " min epoch:" << m_replica_min_epoch << dendl;
+
+   // the reply is addressed to the Primary's shard of this PG
+   const spg_t primary_spg(m_pg->info.pgid.pgid, m_pg->get_primary().shard);
+   auto reply =
+     make_message<MOSDRepScrubMap>(primary_spg, m_replica_min_epoch, m_pg_whoami);
+
+   const bool preempted_flag = (was_preempted == PreemptionNoted::preempted);
+   reply->preempted = preempted_flag;
+
+   // the scrub map itself travels in the message's data payload
+   ::encode(replica_scrubmap, reply->get_data());
+
+   return ScrubMachineListener::MsgAndEpoch{reply, m_replica_min_epoch};
+ }
+
+ // Sends a previously prepared scrub-map reply (message + epoch, as created by
+ // prep_replica_map_msg()) to the Primary's OSD, as a cluster message.
+ void PgScrubber::send_replica_map(const MsgAndEpoch& preprepared)
+ {
+ m_pg->send_cluster_message(m_pg->get_primary().osd, preprepared.m_msg,
+ preprepared.m_epoch, false);
+ }
+
+ /*
+  * Builds a scrub-map reply marked as 'preempted', and sends it to the Primary's
+  * OSD right away.
+  */
+ void PgScrubber::send_preempted_replica()
+ {
+   const auto primary = m_pg->get_primary();
+   const spg_t primary_spg{m_pg->info.pgid.pgid, primary.shard};
+
+   auto reply =
+     make_message<MOSDRepScrubMap>(primary_spg, m_replica_min_epoch, m_pg_whoami);
+   reply->preempted = true;
+
+   // must not skip this: the map is encoded into the reply even when preempted
+   ::encode(replica_scrubmap, reply->get_data());
+
+   m_pg->send_cluster_message(m_pg->get_primary().osd, reply, m_replica_min_epoch,
+                              false);
+ }
+
+/*
+ * - if the replica lets us know it was interrupted, we mark the chunk as interrupted.
+ * The state-machine will react to that when all replica maps are received.
+ * - when all maps are received, we signal the FSM with the GotReplicas event (see
+ * scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the
+ * FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to
+ * handle.
+ */
+void PgScrubber::map_from_replica(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDRepScrubMap>();
+ dout(15) << __func__ << " " << *m << dendl;
+
+ if (m->map_epoch < m_pg->info.history.same_interval_since) {
+ dout(10) << __func__ << " discarding old from " << m->map_epoch << " < "
+ << m_pg->info.history.same_interval_since << dendl;
+ return;
+ }
+
+ auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
+
+ m_received_maps[m->from].decode(p, m_pg->info.pgid.pool());
+ dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl;
+
+ auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from);
+ if (!is_ok) {
+ // previously an unexpected map was triggering an assert. Now, as scrubs can be
+ // aborted at any time, the chances of this happening have increased, and aborting is
+ // not justified
+ dout(1) << __func__ << err_txt << " from OSD " << m->from << dendl;
+ return;
+ }
+
+ if (m->preempted) {
+ dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
+ preemption_data.do_preempt();
+ }
+
+ if (m_maps_status.are_all_maps_available()) {
+ dout(15) << __func__ << " all repl-maps available" << dendl;
+ m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops());
+ }
+}
+
+ /*
+  * Replica side: handles a scrub-reservation request sent by the Primary.
+  *
+  * A stale reservation (one made before the current interval) is dropped first.
+  * Requests that predate the current interval are not answered at all. Otherwise
+  * a GRANT or a REJECT reply is sent back over the connection the request
+  * arrived on.
+  */
+ void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
+ {
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+ auto request_ep = op->get_req<MOSDScrubReserve>()->get_map_epoch();
+
+ /*
+ * if we are currently holding a reservation, then:
+ * either (1) we, the scrubber, did not yet notice an interval change. The remembered
+ * reservation epoch is from before our interval, and we can silently discard the
+ * reservation (no message is required).
+ * or:
+ * (2) the interval hasn't changed, but the same Primary that (we think) holds the
+ * lock just sent us a new request. Note that we know it's the same Primary, as
+ * otherwise the interval would have changed.
+ * Ostensibly we can discard & redo the reservation. But then we
+ * will be temporarily releasing the OSD resource - and might not be able to grab it
+ * again. Thus, we simply treat this as a successful new request
+ * (but mark the fact that if there is a previous request from the primary to
+ * scrub a specific chunk - that request is now defunct).
+ */
+
+ if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) {
+ // we are holding a stale reservation from a past epoch
+ m_remote_osd_resource.reset();
+ dout(10) << __func__ << " stale reservation request" << dendl;
+ }
+
+ if (request_ep < m_pg->get_same_interval_since()) {
+ // will not ack stale requests
+ return;
+ }
+
+ bool granted{false};
+ if (m_remote_osd_resource.has_value()) {
+
+ dout(10) << __func__ << " already reserved." << dendl;
+
+ /*
+ * it might well be that we did not yet finish handling the latest scrub-op from
+ * our primary. This happens, for example, if 'noscrub' was set via a command, then
+ * reset. The primary in this scenario will remain in the same interval, but we do need
+ * to reset our internal state (otherwise - the first renewed 'give me your scrub map'
+ * from the primary will see us in active state, crashing the OSD).
+ */
+ advance_token();
+ granted = true;
+
+ } else if (m_pg->cct->_conf->osd_scrub_during_recovery ||
+ !m_osds->is_recovery_active()) {
+ // a new reservation is attempted only if scrubbing during recovery is allowed by
+ // the configuration, or if no recovery is active
+ m_remote_osd_resource.emplace(m_pg, m_osds, request_ep);
+ // OSD resources allocated?
+ granted = m_remote_osd_resource->is_reserved();
+ if (!granted) {
+ // just forget it
+ m_remote_osd_resource.reset();
+ dout(20) << __func__ << ": failed to reserve remotely" << dendl;
+ }
+ }
+
+ dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
+
+ // the reply carries the epoch of the request it answers
+ Message* reply = new MOSDScrubReserve(
+ spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), request_ep,
+ granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami);
+
+ m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
+ }
+
+ /*
+  * Handles a reservation GRANT arriving from 'from'. The grant is forwarded to the
+  * active reservations object; if no reservation process is active, the grant is
+  * only logged as an error.
+  */
+ void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
+ {
+   dout(10) << __func__ << " " << *op->get_req() << dendl;
+   op->mark_started();
+
+   if (!m_reservations.has_value()) {
+     derr << __func__ << ": received unsolicited reservation grant from osd " << from
+          << " (" << op << ")" << dendl;
+     return;
+   }
+
+   m_reservations->handle_reserve_grant(op, from);
+ }
+
+ /*
+  * Handles a reservation REJECT arriving from 'from'. The rejection is forwarded
+  * to the active reservations object, if there is one. Otherwise it is ignored.
+  */
+ void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
+ {
+   dout(10) << __func__ << " " << *op->get_req() << dendl;
+   op->mark_started();
+
+   if (!m_reservations.has_value()) {
+     // no active reservation process: no action is required
+     return;
+   }
+
+   m_reservations->handle_reserve_reject(op, from);
+ }
+
+ /*
+  * Handles a reservation-release message: advances the session token and drops
+  * the reservation held on behalf of the remote Primary (if any).
+  */
+ void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
+ {
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+
+ /*
+ * this specific scrub session has terminated. All incoming events carrying the old
+ * tag will be discarded.
+ */
+ advance_token();
+ m_remote_osd_resource.reset();
+ }
+
+ // If a replica-reservation process is active, asks it to discard all of its
+ // reservations (see ReplicaReservations::discard_all()). Otherwise - a no-op.
+ void PgScrubber::discard_replica_reservations()
+ {
+   dout(10) << __func__ << dendl;
+
+   if (!m_reservations.has_value()) {
+     return;
+   }
+   m_reservations->discard_all();
+ }
+
+ // Resets all three reservation holders owned by the scrubber: the reservations
+ // made at the replicas, the local one, and the one held for a remote Primary.
+ void PgScrubber::clear_scrub_reservations()
+ {
+ dout(10) << __func__ << dendl;
+ m_reservations.reset(); // the remote reservations
+ m_local_osd_resource.reset(); // the local reservation
+ m_remote_osd_resource.reset(); // we as replica reserved for a Primary
+ }
+
+ /*
+  * Sends an MOSDScrubReserve message carrying 'opcode' to every member of the
+  * acting set other than ourselves. Asserts that there are no backfill targets.
+  *
+  * @param opcode  the MOSDScrubReserve operation code to send
+  * @param op_text a description of the operation, used only for logging
+  */
+ void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text)
+ {
+   ceph_assert(m_pg->recovery_state.get_backfill_targets().empty());
+
+   std::vector<std::pair<int, Message*>> outgoing;
+   outgoing.reserve(m_pg->get_actingset().size());
+
+   const epoch_t epch = get_osdmap_epoch();
+
+   for (const auto& peer : m_pg->get_actingset()) {
+
+     if (!(peer == m_pg_whoami)) {
+       dout(10) << "scrub requesting " << op_text << " from osd." << peer
+                << " Epoch: " << epch << dendl;
+
+       // each message is addressed to the peer's own shard of the PG
+       Message* m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, peer.shard), epch,
+                                         opcode, m_pg_whoami);
+       outgoing.emplace_back(peer.osd, m);
+     }
+   }
+
+   if (outgoing.empty()) {
+     return;
+   }
+   m_osds->send_message_osd_cluster(outgoing, epch);
+ }
+
+ // Drops the replica-reservations object. Its destructor (~ReplicaReservations())
+ // sends the release messages to the replicas.
+ void PgScrubber::unreserve_replicas()
+ {
+ dout(10) << __func__ << dendl;
+ m_reservations.reset();
+ }
+
+ /*
+  * Reports - and, in repair mode, initiates the repair of - the objects found to
+  * be missing or inconsistent during the scrub.
+  *
+  * If m_authoritative is not empty, a summary is sent to the cluster log as an
+  * error. When repairing, the PG's CLEAN state is cleared, the REPAIR state is
+  * set, and repair_object() is called for every authoritative object that has an
+  * entry in m_missing and/or in m_inconsistent; m_fixed_count is increased by the
+  * number of shards handed over for repair.
+  *
+  * @return true if there were authoritative entries and we are in repair mode
+  */
+ [[nodiscard]] bool PgScrubber::scrub_process_inconsistent()
+ {
+   dout(10) << __func__ << ": checking authoritative (mode="
+            << m_mode_desc << ", auth remaining #: " << m_authoritative.size()
+            << ")" << dendl;
+
+   // authoritative only store objects which are missing or inconsistent.
+   if (!m_authoritative.empty()) {
+
+     stringstream ss;
+     ss << m_pg->info.pgid << " " << m_mode_desc << " " << m_missing.size() << " missing, "
+        << m_inconsistent.size() << " inconsistent objects";
+     dout(2) << ss.str() << dendl;
+     m_osds->clog->error(ss);
+
+     if (m_is_repair) {
+       state_clear(PG_STATE_CLEAN);
+       // we know we have a problem, so it's OK to set the user-visible flag
+       // even if we only reached here via auto-repair
+       state_set(PG_STATE_REPAIR);
+       update_op_mode_text();
+
+       for (const auto& [hobj, shrd_list] : m_authoritative) {
+
+         auto missing_entry = m_missing.find(hobj);
+
+         if (missing_entry != m_missing.end()) {
+           m_pg->repair_object(hobj, shrd_list, missing_entry->second);
+           m_fixed_count += missing_entry->second.size();
+         }
+
+         // a single lookup, mirroring the handling of m_missing above (and
+         // never inserting into the container)
+         auto inconsistent_entry = m_inconsistent.find(hobj);
+
+         if (inconsistent_entry != m_inconsistent.end()) {
+           m_pg->repair_object(hobj, shrd_list, inconsistent_entry->second);
+           m_fixed_count += inconsistent_entry->second.size();
+         }
+       }
+     }
+   }
+   return (!m_authoritative.empty() && m_is_repair);
+ }
+
+/*
+ * note: only called for the Primary.
+ */
+void PgScrubber::scrub_finish()
+{
+ dout(10) << __func__ << " before flags: " << m_flags
+ << ". repair state: " << (state_test(PG_STATE_REPAIR) ? "repair" : "no-repair")
+ << ". deep_scrub_on_error: " << m_flags.deep_scrub_on_error << dendl;
+
+ ceph_assert(m_pg->is_locked());
+
+ m_pg->m_planned_scrub = requested_scrub_t{};
+
+ // if the repair request comes from auto-repair and large number of errors,
+ // we would like to cancel auto-repair
+ if (m_is_repair && m_flags.auto_repair &&
+ m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
+
+ dout(10) << __func__ << " undoing the repair" << dendl;
+ state_clear(PG_STATE_REPAIR); // not expected to be set, anyway
+ m_is_repair = false;
+ update_op_mode_text();
+ }
+
+ bool do_auto_scrub = false;
+
+ // if a regular scrub had errors within the limit, do a deep scrub to auto repair
+ if (m_flags.deep_scrub_on_error && !m_authoritative.empty() &&
+ m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
+ ceph_assert(!m_is_deep);
+ do_auto_scrub = true;
+ dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl;
+ }
+
+ m_flags.deep_scrub_on_error = false;
+
+ // type-specific finish (can tally more errors)
+ _scrub_finish();
+
+ bool has_error = scrub_process_inconsistent();
+
+ {
+ stringstream oss;
+ oss << m_pg->info.pgid.pgid << " " << m_mode_desc << " ";
+ int total_errors = m_shallow_errors + m_deep_errors;
+ if (total_errors)
+ oss << total_errors << " errors";
+ else
+ oss << "ok";
+ if (!m_is_deep && m_pg->info.stats.stats.sum.num_deep_scrub_errors)
+ oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors
+ << " remaining deep scrub error details lost)";
+ if (m_is_repair)
+ oss << ", " << m_fixed_count << " fixed";
+ if (total_errors)
+ m_osds->clog->error(oss);
+ else
+ m_osds->clog->debug(oss);
+ }
+
+ // Since we don't know which errors were fixed, we can only clear them
+ // when every one has been fixed.
+ if (m_is_repair) {
+ if (m_fixed_count == m_shallow_errors + m_deep_errors) {
+
+ ceph_assert(m_is_deep);
+ m_shallow_errors = 0;
+ m_deep_errors = 0;
+ dout(20) << __func__ << " All may be fixed" << dendl;
+
+ } else if (has_error) {
+
+ // Deep scrub in order to get corrected error counts
+ m_pg->scrub_after_recovery = true;
+ m_pg->m_planned_scrub.req_scrub =
+ m_pg->m_planned_scrub.req_scrub || m_flags.required;
+
+ dout(20) << __func__ << " Current 'required': " << m_flags.required
+ << " Planned 'req_scrub': " << m_pg->m_planned_scrub.req_scrub << dendl;
+
+ } else if (m_shallow_errors || m_deep_errors) {
+
+ // We have errors but nothing can be fixed, so there is no repair
+ // possible.
+ state_set(PG_STATE_FAILED_REPAIR);
+ dout(10) << __func__ << " " << (m_shallow_errors + m_deep_errors)
+ << " error(s) present with no repair possible" << dendl;
+ }
+ }
+
+ {
+ // finish up
+ ObjectStore::Transaction t;
+ m_pg->recovery_state.update_stats(
+ [this](auto& history, auto& stats) {
+ dout(10) << "m_pg->recovery_state.update_stats()" << dendl;
+ utime_t now = ceph_clock_now();
+ history.last_scrub = m_pg->recovery_state.get_info().last_update;
+ history.last_scrub_stamp = now;
+ if (m_is_deep) {
+ history.last_deep_scrub = m_pg->recovery_state.get_info().last_update;
+ history.last_deep_scrub_stamp = now;
+ }
+
+ if (m_is_deep) {
+ if ((m_shallow_errors == 0) && (m_deep_errors == 0))
+ history.last_clean_scrub_stamp = now;
+ stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
+ stats.stats.sum.num_deep_scrub_errors = m_deep_errors;
+ stats.stats.sum.num_large_omap_objects = m_omap_stats.large_omap_objects;
+ stats.stats.sum.num_omap_bytes = m_omap_stats.omap_bytes;
+ stats.stats.sum.num_omap_keys = m_omap_stats.omap_keys;
+ dout(25) << "scrub_finish shard " << m_pg_whoami
+ << " num_omap_bytes = " << stats.stats.sum.num_omap_bytes
+ << " num_omap_keys = " << stats.stats.sum.num_omap_keys << dendl;
+ } else {
+ stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
+ // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
+ // because of deep-scrub errors
+ if (m_shallow_errors == 0)
+ history.last_clean_scrub_stamp = now;
+ }
+ stats.stats.sum.num_scrub_errors = stats.stats.sum.num_shallow_scrub_errors +
+ stats.stats.sum.num_deep_scrub_errors;
+ if (m_flags.check_repair) {
+ m_flags.check_repair = false;
+ if (m_pg->info.stats.stats.sum.num_scrub_errors) {
+ state_set(PG_STATE_FAILED_REPAIR);
+ dout(10) << "scrub_finish " << m_pg->info.stats.stats.sum.num_scrub_errors
+ << " error(s) still present after re-scrub" << dendl;
+ }
+ }
+ return true;
+ },
+ &t);
+ int tr = m_osds->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+ ceph_assert(tr == 0);
+
+ if (!m_pg->snap_trimq.empty()) {
+ dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
+ m_pg->snap_trimmer_scrub_complete();
+ }
+ }
+
+ if (has_error) {
+ m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery())));
+ } else {
+ m_is_repair = false;
+ state_clear(PG_STATE_REPAIR);
+ update_op_mode_text();
+ }
+
+ cleanup_on_finish();
+ if (do_auto_scrub) {
+ request_rescrubbing(m_pg->m_planned_scrub);
+ }
+
+ if (m_pg->is_active() && m_pg->is_primary()) {
+ m_pg->recovery_state.share_pg_info();
+ }
+}
+
+ /*
+  * Called to check whether the digest updates for the current chunk are done.
+  * While updates are still pending - nothing is done. Once none are pending:
+  * either the scrub is finished (if the chunk reached the end of the range), or
+  * the handling of the next chunk is queued.
+  */
+ void PgScrubber::on_digest_updates()
+ {
+   dout(10) << __func__ << " #pending: " << num_digest_updates_pending << " pending? "
+            << num_digest_updates_pending
+            << (m_end.is_max() ? " <last chunk> " : " <mid chunk> ") << dendl;
+
+   // still waiting? we will be called again when new updates arrive
+   if (num_digest_updates_pending > 0) {
+     return;
+   }
+
+   // all updates were received, and this chunk is done
+   if (!m_end.is_max()) {
+     // more chunks to go: get a new one (via "requeue")
+     preemption_data.reset();
+     m_osds->queue_scrub_next_chunk(m_pg, m_pg->is_scrub_blocking_ops());
+     return;
+   }
+
+   // that was the last chunk
+   scrub_finish();
+   m_osds->queue_scrub_is_finished(m_pg);
+ }
+
+
+/*
+ * note that the flags-set fetched from the PG (m_pg->m_planned_scrub)
+ * is cleared once scrubbing starts; Some of the values dumped here are
+ * thus transitory.
+ */
+void PgScrubber::dump(ceph::Formatter* f) const
+{
+ f->open_object_section("scrubber");
+ f->dump_stream("epoch_start") << m_interval_start;
+ f->dump_bool("active", m_active);
+ if (m_active) {
+ f->dump_stream("start") << m_start;
+ f->dump_stream("end") << m_end;
+ f->dump_stream("m_max_end") << m_max_end;
+ f->dump_stream("subset_last_update") << m_subset_last_update;
+ f->dump_bool("deep", m_is_deep);
+ f->dump_bool("must_scrub", (m_pg->m_planned_scrub.must_scrub || m_flags.required));
+ f->dump_bool("must_deep_scrub", m_pg->m_planned_scrub.must_deep_scrub);
+ f->dump_bool("must_repair", m_pg->m_planned_scrub.must_repair);
+ f->dump_bool("need_auto", m_pg->m_planned_scrub.need_auto);
+ f->dump_bool("req_scrub", m_flags.required);
+ f->dump_bool("time_for_deep", m_pg->m_planned_scrub.time_for_deep);
+ f->dump_bool("auto_repair", m_flags.auto_repair);
+ f->dump_bool("check_repair", m_flags.check_repair);
+ f->dump_bool("deep_scrub_on_error", m_flags.deep_scrub_on_error);
+ f->dump_stream("scrub_reg_stamp") << m_scrub_reg_stamp; // utime_t
+ f->dump_unsigned("priority", m_flags.priority);
+ f->dump_int("shallow_errors", m_shallow_errors);
+ f->dump_int("deep_errors", m_deep_errors);
+ f->dump_int("fixed", m_fixed_count);
+ {
+ f->open_array_section("waiting_on_whom");
+ for (const auto& p : m_maps_status.get_awaited()) {
+ f->dump_stream("shard") << p;
+ }
+ f->close_section();
+ }
+ }
+ f->close_section();
+}
+
+
+ /*
+  * Dumps a subset of the scrubber's state into a "scrub" section of the
+  * formatter, using "scrubber."-prefixed keys. Unlike dump(), the fields are
+  * written regardless of whether the scrubber is active. The output itself is
+  * marked as deprecated (see the "comment" entry).
+  */
+ void PgScrubber::handle_query_state(ceph::Formatter* f)
+ {
+ dout(10) << __func__ << dendl;
+
+ f->open_object_section("scrub");
+ f->dump_stream("scrubber.epoch_start") << m_interval_start;
+ f->dump_bool("scrubber.active", m_active);
+ f->dump_stream("scrubber.start") << m_start;
+ f->dump_stream("scrubber.end") << m_end;
+ f->dump_stream("scrubber.m_max_end") << m_max_end;
+ f->dump_stream("scrubber.m_subset_last_update") << m_subset_last_update;
+ f->dump_bool("scrubber.deep", m_is_deep);
+ {
+ // the shards from which scrub maps are still awaited
+ f->open_array_section("scrubber.waiting_on_whom");
+ for (const auto& p : m_maps_status.get_awaited()) {
+ f->dump_stream("shard") << p;
+ }
+ f->close_section();
+ }
+
+ f->dump_string("comment", "DEPRECATED - may be removed in the next release");
+
+ f->close_section();
+ }
+
+ // Compiler-generated destructor, defined out-of-line in this translation unit.
+ PgScrubber::~PgScrubber() = default;
+
+ /*
+  * Binds the scrubber to its PG: caches the PG id, the OSD services pointer and
+  * our own shard id, then creates the scrub state-machine and initiates it.
+  *
+  * @param pg the PG this scrubber serves; dereferenced here, so must not be null
+  */
+ PgScrubber::PgScrubber(PG* pg)
+ : m_pg{pg}
+ , m_pg_id{pg->pg_id}
+ , m_osds{m_pg->osd}
+ , m_pg_whoami{pg->pg_whoami}
+ , preemption_data{pg}
+ {
+ m_fsm = std::make_unique<ScrubMachine>(m_pg, this);
+ m_fsm->initiate();
+ }
+
+ // Creates the ReplicaReservations object (replacing any existing one). Its
+ // constructor sends the reservation requests to the replicas.
+ void PgScrubber::reserve_replicas()
+ {
+ dout(10) << __func__ << dendl;
+ m_reservations.emplace(m_pg, m_pg_whoami);
+ }
+
+ /*
+  * Cleanup performed at the end of a scrub (called from scrub_finish()): clears
+  * the SCRUBBING and DEEP_SCRUB state bits, releases all scrub reservations,
+  * calls requeue_waiting(), and resets the scrubber's internal state and flags.
+  * Unlike clear_pgscrub_state(), PG_STATE_REPAIR is not cleared here.
+  *
+  * note: the PG lock must be held (asserted).
+  */
+ void PgScrubber::cleanup_on_finish()
+ {
+ dout(10) << __func__ << dendl;
+ ceph_assert(m_pg->is_locked());
+
+ state_clear(PG_STATE_SCRUBBING);
+ state_clear(PG_STATE_DEEP_SCRUB);
+ m_pg->publish_stats_to_osd();
+
+ clear_scrub_reservations();
+ // NOTE(review): the stats are published a second time here, right after the
+ // reservations were cleared - confirm that both calls are needed
+ m_pg->publish_stats_to_osd();
+
+ requeue_waiting();
+
+ reset_internal_state();
+ m_flags = scrub_flags_t{};
+
+ // type-specific state clear
+ _scrub_clear_state();
+ }
+
+ // Clears the PG's scrub-related state (see clear_pgscrub_state()), then sends a
+ // FullReset event to the state-machine.
+ // uses process_event(), so must be invoked externally
+ void PgScrubber::scrub_clear_state()
+ {
+ dout(10) << __func__ << dendl;
+
+ clear_pgscrub_state();
+ m_fsm->process_event(FullReset{});
+ }
+
+/*
+ * note: does not access the state-machine
+ */
+void PgScrubber::clear_pgscrub_state()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(m_pg->is_locked());
+
+ state_clear(PG_STATE_SCRUBBING);
+ state_clear(PG_STATE_DEEP_SCRUB);
+
+ state_clear(PG_STATE_REPAIR);
+
+ clear_scrub_reservations();
+ m_pg->publish_stats_to_osd();
+
+ requeue_waiting();
+
+ reset_internal_state();
+ m_flags = scrub_flags_t{};
+
+ // type-specific state clear
+ _scrub_clear_state();
+}
+
+ /*
+  * Clears the SCRUBBING and DEEP_SCRUB state bits, resets the scrubber's internal
+  * state, and publishes the PG stats. Reservations and m_flags are not touched.
+  * Called (at least) from advance_token().
+  */
+ void PgScrubber::replica_handling_done()
+ {
+ dout(10) << __func__ << dendl;
+
+ state_clear(PG_STATE_SCRUBBING);
+ state_clear(PG_STATE_DEEP_SCRUB);
+
+ reset_internal_state();
+
+ m_pg->publish_stats_to_osd();
+ }
+
+/*
+ * note: performs run_callbacks()
+ * note: reservations-related variables are not reset here
+ */
+void PgScrubber::reset_internal_state()
+{
+ dout(10) << __func__ << dendl;
+
+ preemption_data.reset();
+ m_maps_status.reset();
+ m_received_maps.clear();
+
+ m_start = hobject_t{};
+ m_end = hobject_t{};
+ m_max_end = hobject_t{};
+ m_subset_last_update = eversion_t{};
+ m_shallow_errors = 0;
+ m_deep_errors = 0;
+ m_fixed_count = 0;
+ m_omap_stats = (const struct omap_stat_t){0};
+
+ run_callbacks();
+
+ m_inconsistent.clear();
+ m_missing.clear();
+ m_authoritative.clear();
+ num_digest_updates_pending = 0;
+ m_primary_scrubmap = ScrubMap{};
+ m_primary_scrubmap_pos.reset();
+ replica_scrubmap = ScrubMap{};
+ replica_scrubmap_pos.reset();
+ m_cleaned_meta_map = ScrubMap{};
+ m_needs_sleep = true;
+ m_sleep_started_at = utime_t{};
+
+ m_active = false;
+}
+
+ // Increments the current session token (see is_token_current() for how tokens
+ // are compared), then resets both our internal state and the state-machine.
+ // note that only applicable to the Replica:
+ void PgScrubber::advance_token()
+ {
+ dout(10) << __func__ << " was: " << m_current_token << dendl;
+ m_current_token++;
+
+ // when advance_token() is called, it is assumed that no scrubbing takes place.
+ // We will, though, verify that. And if we are actually still handling a stale request -
+ // both our internal state and the FSM state will be cleared.
+ replica_handling_done();
+ m_fsm->process_event(FullReset{});
+ }
+
+ /*
+  * Checks whether an event carrying 'received_token' matches the current scrub
+  * session token.
+  *
+  * A token of 0 is always accepted. Any other value must be equal to
+  * m_current_token; a mismatch is logged.
+  *
+  * @param received_token the token carried by the incoming event
+  * @return true if the token is 0 or equals the current token
+  */
+ bool PgScrubber::is_token_current(Scrub::act_token_t received_token)
+ {
+   if (received_token == 0 || received_token == m_current_token) {
+     return true;
+   }
+   // the logged text is parenthesized: "(<received> vs current <current>)"
+   dout(5) << __func__ << " obsolete token (" << received_token
+           << " vs current " << m_current_token << ")" << dendl;
+
+   return false;
+ }
+
+ // Forwards to the PG's get_osdmap(), returning the reference it returns.
+ const OSDMapRef& PgScrubber::get_osdmap() const
+ {
+ return m_pg->get_osdmap();
+ }
+
+ // Streams the scrubber's flags (m_flags) only. Returns the stream, to allow chaining.
+ ostream& operator<<(ostream& out, const PgScrubber& scrubber)
+ {
+ return out << scrubber.m_flags;
+ }
+
+ // Writes a short description of the scrubber - the PG id followed by the scrub
+ // flags, in the form " [ <pg id>: <flags> ] " - and returns the stream.
+ ostream& PgScrubber::show(ostream& out) const
+ {
+ return out << " [ " << m_pg_id << ": " << m_flags << " ] ";
+ }
+
+ /*
+  * Handles a debugging command. Two commands are recognized:
+  *  "block"   - sets the m_debug_blockrange flag;
+  *  "unblock" - clears the flag, and sends an Unblocked event to the state-machine.
+  * Any other command is silently ignored.
+  *
+  * @param cmd   the command name
+  * @param param logged only
+  * @param f     not used by this function
+  * @param ss    not used by this function
+  * @return always 0
+  */
+ int PgScrubber::asok_debug(std::string_view cmd,
+ std::string param,
+ Formatter* f,
+ stringstream& ss)
+ {
+ dout(10) << __func__ << " cmd: " << cmd << " param: " << param << dendl;
+
+ if (cmd == "block") {
+ // set a flag that will cause the next 'select_range' to report a blocked object
+ m_debug_blockrange = 1;
+ } else if (cmd == "unblock") {
+ // send an 'unblock' event, as if a blocked range was freed
+ m_debug_blockrange = 0;
+ m_fsm->process_event(Unblocked{});
+ }
+ return 0;
+ }
+// ///////////////////// preemption_data_t //////////////////////////////////
+
+ // Stores the PG pointer, and initializes m_left from the
+ // "osd_scrub_max_preemptions" configuration value (read as uint64_t, then
+ // narrowed to int).
+ PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg}
+ {
+ m_left = static_cast<int>(
+ m_pg->get_cct()->_conf.get_val<uint64_t>("osd_scrub_max_preemptions"));
+ }
+
+ // Under m_preemption_lock: clears the 'preemptable' and 'preempted' flags,
+ // re-reads m_left from the "osd_scrub_max_preemptions" configuration value, and
+ // sets the size divisor back to 1.
+ void PgScrubber::preemption_data_t::reset()
+ {
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+
+ m_preemptable = false;
+ m_preempted = false;
+ m_left =
+ static_cast<int>(m_pg->cct->_conf.get_val<uint64_t>("osd_scrub_max_preemptions"));
+ m_size_divisor = 1;
+ }
+
+
+// ///////////////////// ReplicaReservations //////////////////////////////////
+namespace Scrub {
+
+ // Sends a RELEASE reservation message, stamped with 'epoch', to the OSD of 'peer'
+ // (addressed to that peer's shard of the PG).
+ void ReplicaReservations::release_replica(pg_shard_t peer, epoch_t epoch)
+ {
+ auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, peer.shard), epoch,
+ MOSDScrubReserve::RELEASE, m_pg->pg_whoami);
+ m_osds->send_message_osd_cluster(peer.osd, m, epoch);
+ }
+
+ /*
+  * Starts the replica-reservation process: a REQUEST message is sent to each
+  * member of the acting set other than 'whoami', and that member is added to the
+  * list of peers we are waiting for. m_pending starts as the size of the acting
+  * set minus one.
+  *
+  * If there are no replicas at all, the 'all reservations granted' event is
+  * queued immediately.
+  *
+  * @param pg     the PG being scrubbed
+  * @param whoami our own shard id, skipped when sending the requests
+  */
+ ReplicaReservations::ReplicaReservations(PG* pg, pg_shard_t whoami)
+ : m_pg{pg}
+ , m_acting_set{pg->get_actingset()}
+ , m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
+ , m_pending{static_cast<int>(m_acting_set.size()) - 1}
+ , m_pg_info{m_pg->get_pg_info(ScrubberPasskey())}
+ {
+ epoch_t epoch = m_pg->get_osdmap_epoch();
+
+ // handle the special case of no replicas
+ if (m_pending <= 0) {
+ // just signal the scrub state-machine to continue
+ send_all_done();
+
+ } else {
+
+ for (auto p : m_acting_set) {
+ if (p == whoami)
+ continue;
+ auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, p.shard), epoch,
+ MOSDScrubReserve::REQUEST, m_pg->pg_whoami);
+ m_osds->send_message_osd_cluster(p.osd, m, epoch);
+ m_waited_for_peers.push_back(p);
+ dout(10) << __func__ << " <ReplicaReservations> reserve<-> " << p.osd << dendl;
+ }
+ }
+ }
+
+ // Queues (at low priority) the 'scrub reservations granted' event for the PG.
+ void ReplicaReservations::send_all_done()
+ {
+ m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
+ }
+
+ // Queues (at low priority) the 'scrub reservation denied' event for the PG.
+ void ReplicaReservations::send_reject()
+ {
+ m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
+ }
+
+ // Forgets all reserved and awaited peers, without sending any message to them.
+ // As both lists are emptied, the destructor will have no release messages to send.
+ void ReplicaReservations::discard_all()
+ {
+ dout(10) << __func__ << " " << m_reserved_peers << dendl;
+
+ m_had_rejections = true; // preventing late-coming responses from triggering events
+ m_reserved_peers.clear();
+ m_waited_for_peers.clear();
+ }
+
+ /*
+  * Sends a RELEASE message to every peer that has granted a reservation, and
+  * also to every peer that has not yet answered our request. Both lists are
+  * emptied.
+  */
+ ReplicaReservations::~ReplicaReservations()
+ {
+ m_had_rejections = true; // preventing late-coming responses from triggering events
+
+ // send un-reserve messages to all reserved replicas. We do not wait for answer (there
+ // wouldn't be one). Other incoming messages will be discarded on the way, by our
+ // owner.
+ epoch_t epoch = m_pg->get_osdmap_epoch();
+
+ for (auto& p : m_reserved_peers) {
+ release_replica(p, epoch);
+ }
+ m_reserved_peers.clear();
+
+ // note: the release will follow on the heels of the request. When tried otherwise,
+ // grants that followed a reject arrived after the whole scrub machine-state was
+ // reset, causing leaked reservations.
+ for (auto& p : m_waited_for_peers) {
+ release_replica(p, epoch);
+ }
+ m_waited_for_peers.clear();
+ }
+
+/**
+ * @ATTN we would not reach here if the ReplicaReservation object managed by the
+ * scrubber was reset.
+ */
+void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
+{
+ dout(10) << __func__ << " <ReplicaReservations> granted-> " << from << dendl;
+ op->mark_started();
+
+ {
+ // reduce the amount of extra release messages. Not a must, but the log is cleaner
+ auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from);
+ if (w != m_waited_for_peers.end())
+ m_waited_for_peers.erase(w);
+ }
+
+ // are we forced to reject the reservation?
+ if (m_had_rejections) {
+
+ dout(10) << " rejecting late-coming reservation from " << from << dendl;
+ release_replica(from, m_pg->get_osdmap_epoch());
+
+ } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
+ m_reserved_peers.end()) {
+
+ dout(10) << " already had osd." << from << " reserved" << dendl;
+
+ } else {
+
+ dout(10) << " osd." << from << " scrub reserve = success" << dendl;
+ m_reserved_peers.push_back(from);
+ if (--m_pending == 0) {
+ send_all_done();
+ }
+ }
+}
+
+ /*
+  * Handles a reservation rejection arriving from replica 'from'.
+  *
+  * The sender is removed from the list of awaited peers. Then:
+  * - if a failure was already noted, the rejection is only logged;
+  * - if the replica is listed as reserved, the message is only logged;
+  * - otherwise the failure is noted (suppressing any additional notifications),
+  *   and the 'reservation denied' event is queued.
+  */
+ void ReplicaReservations::handle_reserve_reject(OpRequestRef op, pg_shard_t from)
+ {
+   dout(10) << __func__ << " <ReplicaReservations> rejected-> " << from << dendl;
+   dout(10) << __func__ << " " << *op->get_req() << dendl;
+   op->mark_started();
+
+   {
+     // reduce the amount of extra release messages. Not a must, but the log is cleaner
+     auto w = std::find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from);
+     if (w != m_waited_for_peers.end())
+       m_waited_for_peers.erase(w);
+   }
+
+   if (m_had_rejections) {
+
+     // our failure was already handled when the first rejection arrived
+     dout(15) << " ignoring late-coming rejection from " << from << dendl;
+
+   } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
+              m_reserved_peers.end()) {
+
+     dout(10) << " already had osd." << from << " reserved" << dendl;
+
+   } else {
+
+     dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
+     m_had_rejections = true;  // preventing any additional notifications
+     send_reject();
+   }
+ }
+
+
+// ///////////////////// LocalReservation //////////////////////////////////
+
+/**
+ * Try to take one of the OSD's local scrub slots (inc_scrubs_local()).
+ * Success is recorded in m_holding_local_reservation; a failure is signalled
+ * only by that flag remaining unset.
+ */
+LocalReservation::LocalReservation(PG* pg, OSDService* osds)
+    : m_pg{pg}  // holding the "whole PG" for dout() sake
+    , m_osds{osds}
+{
+  if (m_osds->inc_scrubs_local()) {
+    dout(20) << __func__ << ": local OSD scrub resources reserved" << dendl;
+    m_holding_local_reservation = true;
+  } else {
+    // nothing to undo: the flag stays 'false'
+    dout(10) << __func__ << ": failed to reserve locally " << dendl;
+  }
+}
+
+/// Returns the local scrub slot to the OSD - but only if we are holding one.
+LocalReservation::~LocalReservation()
+{
+  if (!m_holding_local_reservation) {
+    return;
+  }
+
+  m_holding_local_reservation = false;
+  m_osds->dec_scrubs_local();
+}
+
+
+// ///////////////////// ReservedByRemotePrimary ///////////////////////////////
+
+/**
+ * Try to take one of the OSD's 'remote' scrub slots (inc_scrubs_remote()), noting
+ * the epoch of the request. Success is recorded in m_reserved_by_remote_primary;
+ * a failure is signalled only by that flag remaining unset.
+ */
+ReservedByRemotePrimary::ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch)
+    : m_pg{pg}, m_osds{osds}, m_reserved_at{epoch}
+{
+  if (m_osds->inc_scrubs_remote()) {
+    dout(20) << __func__ << ": scrub resources reserved at Primary request" << dendl;
+    m_reserved_by_remote_primary = true;
+  } else {
+    // nothing to undo: the flag stays 'false'
+    dout(10) << __func__ << ": failed to reserve at Primary request" << dendl;
+  }
+}
+
+/// true if the reservation was made in an epoch preceding the PG's current
+/// 'same interval since' epoch
+bool ReservedByRemotePrimary::is_stale() const
+{
+  const auto interval_start = m_pg->get_same_interval_since();
+  return m_reserved_at < interval_start;
+}
+
+/// Returns the 'remote' scrub slot to the OSD - but only if we are holding one.
+ReservedByRemotePrimary::~ReservedByRemotePrimary()
+{
+  if (!m_reserved_by_remote_primary) {
+    return;
+  }
+
+  m_reserved_by_remote_primary = false;
+  m_osds->dec_scrubs_remote();
+}
+
+// ///////////////////// MapsCollectionStatus ////////////////////////////////
+
+/**
+ * Note the arrival of a scrub map from a replica.
+ *
+ * @returns {true, ""} if we were waiting for a map from 'from' (which is then
+ *          removed from the awaited list); {false, <error text>} otherwise.
+ */
+auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from)
+  -> std::tuple<bool, std::string_view>
+{
+  auto awaited = std::find(m_maps_awaited_for.begin(), m_maps_awaited_for.end(), from);
+  if (awaited == m_maps_awaited_for.end()) {
+    // not a replica we have requested a map from
+    return std::tuple{false, " unsolicited scrub-map"sv};
+  }
+
+  // we are indeed waiting for a map from this replica
+  m_maps_awaited_for.erase(awaited);
+  return std::tuple{true, ""sv};
+}
+
+/// Back to the default-constructed state (by assigning a fresh object over this one).
+void MapsCollectionStatus::reset()
+{
+ *this = MapsCollectionStatus{};
+}
+
+/// A space-separated (and space-terminated) list of the replicas we are still
+/// waiting for, each shown as its get_osd() text.
+std::string MapsCollectionStatus::dump() const
+{
+  std::string awaited_list;
+  for (const auto& shard : m_maps_awaited_for) {
+    awaited_list += shard.get_osd() + " "s;
+  }
+  return awaited_list;
+}
+
+/// Streams the awaited replicas (by get_osd()), followed by " local " if our own
+/// map is not ready yet, all enclosed in square brackets.
+ostream& operator<<(ostream& out, const MapsCollectionStatus& sf)
+{
+  out << " [ ";
+
+  for (const auto& awaited : sf.m_maps_awaited_for) {
+    out << awaited.get_osd() << " ";
+  }
+
+  if (!sf.m_local_map_ready) {
+    out << " local ";
+  }
+
+  out << " ] ";
+  return out;
+}
+
+// ///////////////////// blocked_range_t ///////////////////////////////
+
+/**
+ * Arm a one-shot "blocked for too long" warning.
+ *
+ * A callback is registered with the OSD's sleep_timer, to be triggered 'waittime'
+ * after construction. When triggered, it reports - both in the OSD log and in the
+ * cluster log - that the PG is blocked on an object, quoting the time at which
+ * this object was created. The destructor cancels the event.
+ *
+ * @param osds      provides the timer (and its lock), the cluster log and 'whoami'
+ * @param waittime  the delay before the warning is issued
+ * @param pg_id     the PG named in the warning
+ */
+blocked_range_t::blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id)
+    : m_osds{osds}
+{
+  auto now_is = std::chrono::system_clock::now();
+  // NOTE(review): the raw 'new' presumably hands ownership to the timer - confirm
+  m_callbk = new LambdaContext([now_is, pg_id, osds]([[maybe_unused]] int r) {
+    std::time_t now_c = std::chrono::system_clock::to_time_t(now_is);
+
+    // The callback is invoked by the timer, not by the thread that created this
+    // object. std::localtime() hands back a pointer to storage shared by all
+    // threads (and may return nullptr): use the re-entrant localtime_r() with a
+    // local 'tm', and keep 'buf' a valid (empty) string should the conversion fail.
+    struct tm now_tm {};
+    char buf[50] = "";
+    if (localtime_r(&now_c, &now_tm)) {
+      strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", &now_tm);
+    }
+
+    lgeneric_subdout(g_ceph_context, osd, 10)
+      << "PgScrubber: " << pg_id << " blocked on an object for too long (since " << buf
+      << ")" << dendl;
+    osds->clog->warn() << "osd." << osds->whoami << " PgScrubber: " << pg_id
+                       << " blocked on an object for too long (since " << buf << ")";
+    return;
+  });
+
+  // the timer is manipulated under its lock
+  std::lock_guard l(m_osds->sleep_lock);
+  m_osds->sleep_timer.add_event_after(waittime, m_callbk);
+}
+
+/// Cancels the pending warning event (under the timer's lock).
+blocked_range_t::~blocked_range_t()
+{
+  std::lock_guard timer_lock(m_osds->sleep_lock);
+  m_osds->sleep_timer.cancel_event(m_callbk);
+}
+
+} // namespace Scrub
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <chrono>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "osd/PG.h"
+#include "ScrubStore.h"
+#include "scrub_machine_lstnr.h"
+#include "osd/scrubber_common.h"
+
+class Callback;
+
+namespace Scrub {
+class ScrubMachine;
+struct BuildMap;
+
+/**
+ * Reserving/freeing scrub resources at the replicas.
+ *
+ * When constructed - sends reservation requests to the acting_set.
+ * A rejection triggers a "couldn't acquire the replicas' scrub resources" event.
+ * All previous requests, whether already granted or not, are explicitly released.
+ *
+ * A note re performance: I've measured a few container alternatives for
+ * m_reserved_peers, with its specific usage pattern. Std::set is extremely slow, as
+ * expected. flat_set is only slightly better. Surprisingly - std::vector (with no
+ * sorting) is better than boost::small_vec. And for std::vector: no need to pre-reserve.
+ */
+class ReplicaReservations {
+ // whatever type PG::get_actingset() returns
+ using OrigSet = decltype(std::declval<PG>().get_actingset());
+
+ PG* m_pg;
+ OrigSet m_acting_set;
+ OSDService* m_osds;
+ /// peers that have not answered yet (a peer is removed once its grant or
+ /// rejection is handled)
+ std::vector<pg_shard_t> m_waited_for_peers;
+ /// peers that have granted our request
+ std::vector<pg_shard_t> m_reserved_peers;
+ /// set when the first rejection is handled. Later replies are then either
+ /// released (grants) or ignored (rejections)
+ bool m_had_rejections{false};
+ /// decremented for each newly-granted reservation; send_all_done() is called
+ /// when it reaches 0
+ int m_pending{-1};
+ const pg_info_t& m_pg_info;
+
+ void release_replica(pg_shard_t peer, epoch_t epoch);
+
+ void send_all_done(); ///< all reservations are granted
+
+ /// notify the scrubber that we have failed to reserve replicas' resources
+ void send_reject();
+
+ public:
+ /**
+ * quietly discard all knowledge about existing reservations. No messages
+ * are sent to peers.
+ * To be used upon interval change, as we know that the running scrub is no longer
+ * relevant, and that the replicas had reset the reservations on their side.
+ */
+ void discard_all();
+
+ ReplicaReservations(PG* pg, pg_shard_t whoami);
+
+ ~ReplicaReservations();
+
+ /// a replica has granted our reservation request
+ void handle_reserve_grant(OpRequestRef op, pg_shard_t from);
+
+ /// a replica has rejected our reservation request
+ void handle_reserve_reject(OpRequestRef op, pg_shard_t from);
+};
+
+/**
+ * wraps the local OSD scrub resource reservation in an RAII wrapper
+ *
+ * The constructor tries to reserve (OSDService::inc_scrubs_local()); the destructor
+ * releases (dec_scrubs_local()) if the reservation was indeed acquired.
+ *
+ * NOTE(review): copying is not disabled here. A copy of a reserved object would
+ * release the resource a second time when destroyed - confirm that objects of
+ * this class are never copied.
+ */
+class LocalReservation {
+ PG* m_pg;
+ OSDService* m_osds;
+ bool m_holding_local_reservation{false};
+
+ public:
+ LocalReservation(PG* pg, OSDService* osds);
+ ~LocalReservation();
+ /// did the constructor manage to reserve the local resource?
+ bool is_reserved() const { return m_holding_local_reservation; }
+};
+
+/**
+ * wraps the OSD resource we are using when reserved as a replica by a scrubbing master.
+ *
+ * The constructor tries to reserve (OSDService::inc_scrubs_remote()); the destructor
+ * releases (dec_scrubs_remote()) if the reservation was indeed acquired.
+ */
+class ReservedByRemotePrimary {
+ PG* m_pg;
+ OSDService* m_osds;
+ bool m_reserved_by_remote_primary{false};
+ const epoch_t m_reserved_at; ///< the epoch handed to the constructor
+
+ public:
+ ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch);
+ ~ReservedByRemotePrimary();
+ /// did the constructor manage to reserve the resource?
+ [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; }
+
+ /// compare the remembered reserved-at epoch to the current interval
+ /// @returns true if reserved before the PG's 'same interval since' epoch
+ [[nodiscard]] bool is_stale() const;
+};
+
+/**
+ * Once all replicas' scrub maps are received, we go on to compare the maps. That is -
+ * unless we have not yet completed building our own scrub map. MapsCollectionStatus
+ * combines the status of waiting for both the local map and the replicas, without
+ * resorting to adding dummy entries into a list.
+ */
+class MapsCollectionStatus {
+
+ bool m_local_map_ready{false};
+ /// the replicas we have requested a map from, and have not heard from yet
+ std::vector<pg_shard_t> m_maps_awaited_for;
+
+ public:
+ /// true if the local map is ready and no replica map is still awaited
+ [[nodiscard]] bool are_all_maps_available() const
+ {
+ return m_local_map_ready && m_maps_awaited_for.empty();
+ }
+
+ void mark_local_map_ready() { m_local_map_ready = true; }
+
+ /// note that we are now expecting a map from this replica
+ void mark_replica_map_request(pg_shard_t from_whom)
+ {
+ m_maps_awaited_for.push_back(from_whom);
+ }
+
+ /// @returns true if indeed waiting for this one. Otherwise: an error string
+ auto mark_arriving_map(pg_shard_t from) -> std::tuple<bool, std::string_view>;
+
+ /// a copy of the list of replicas still awaited
+ std::vector<pg_shard_t> get_awaited() const { return m_maps_awaited_for; }
+
+ /// return to the default-constructed state
+ void reset();
+
+ /// the awaited replicas, as a space-separated string
+ std::string dump() const;
+
+ friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf);
+};
+
+
+} // namespace Scrub
+
+
+/**
+ * the scrub operation flags. Primary only.
+ * Set at scrub start. Checked in multiple locations - mostly
+ * at finish.
+ */
+struct scrub_flags_t {
+
+ /// the value referred to by the single-parameter version of
+ /// PgScrubber::scrub_requeue_priority()
+ unsigned int priority{0};
+
+ /**
+ * set by queue_scrub() if either planned_scrub.auto_repair or
+ * need_auto were set.
+ * Tested at scrub end.
+ */
+ bool auto_repair{false};
+
+ /// this flag indicates that we are scrubbing post repair to verify everything is fixed
+ bool check_repair{false};
+
+ /// checked at the end of the scrub, to possibly initiate a deep-scrub
+ bool deep_scrub_on_error{false};
+
+ /**
+ * scrub must not be aborted.
+ * Set for explicitly requested scrubs, and for scrubs originated by the pairing
+ * process with the 'repair' flag set (in the RequestScrub event).
+ */
+ bool required{false};
+};
+
+ostream& operator<<(ostream& out, const scrub_flags_t& sf);
+
+
+/**
+ * The part of PG-scrubbing code that isn't state-machine wiring.
+ *
+ * Why the separation? I wish to move to a different FSM implementation. Thus I
+ * am forced to strongly decouple the state-machine implementation details from
+ * the actual scrubbing code.
+ */
+class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
+
+ public:
+ explicit PgScrubber(PG* pg);
+
+ // ------------------ the I/F exposed to the PG (ScrubPgIF) -------------
+
+ /// are we waiting for resource reservation grants from our replicas?
+ [[nodiscard]] bool is_reserving() const final;
+
+ void initiate_regular_scrub(epoch_t epoch_queued) final;
+
+ void initiate_scrub_after_repair(epoch_t epoch_queued) final;
+
+ void send_scrub_resched(epoch_t epoch_queued) final;
+
+ void active_pushes_notification(epoch_t epoch_queued) final;
+
+ void update_applied_notification(epoch_t epoch_queued) final;
+
+ void send_scrub_unblock(epoch_t epoch_queued) final;
+
+ void digest_update_notification(epoch_t epoch_queued) final;
+
+ void send_replica_maps_ready(epoch_t epoch_queued) final;
+
+ void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
+
+ void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
+
+ void send_replica_pushes_upd(epoch_t epoch_queued) final;
+ /**
+ * The PG has updated its 'applied version'. It might be that we are waiting for this
+ * information: after selecting a range of objects to scrub, we've marked the latest
+ * version of these objects in m_subset_last_update. We will not start the map building
+ * before we know that the PG has reached this version.
+ */
+ void on_applied_when_primary(const eversion_t& applied_version) final;
+
+ void send_full_reset(epoch_t epoch_queued) final;
+
+ void send_chunk_free(epoch_t epoch_queued) final;
+
+ void send_chunk_busy(epoch_t epoch_queued) final;
+
+ void send_local_map_done(epoch_t epoch_queued) final;
+
+ void send_maps_compared(epoch_t epoch_queued) final;
+
+ void send_get_next_chunk(epoch_t epoch_queued) final;
+
+ void send_scrub_is_finished(epoch_t epoch_queued) final;
+
+ /**
+ * we allow some number of preemptions of the scrub, which means we do
+ * not block. Then we start to block. Once we start blocking, we do
+ * not stop until the scrub range is completed.
+ */
+ bool write_blocked_by_scrub(const hobject_t& soid) final;
+
+ /// true if the given range intersects the scrub interval in any way
+ bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;
+
+ /**
+ * we are a replica being asked by the Primary to reserve OSD resources for
+ * scrubbing
+ */
+ void handle_scrub_reserve_request(OpRequestRef op) final;
+
+ void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
+ void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
+ void handle_scrub_reserve_release(OpRequestRef op) final;
+ void discard_replica_reservations() final;
+ void clear_scrub_reservations() final; // PG::clear... fwds to here
+ void unreserve_replicas() final;
+
+ // managing scrub op registration
+
+ void reg_next_scrub(const requested_scrub_t& request_flags) final;
+
+ void unreg_next_scrub() final;
+
+ void scrub_requested(scrub_level_t scrub_level,
+ scrub_type_t scrub_type,
+ requested_scrub_t& req_flags) final;
+
+ /**
+ * Reserve local scrub resources (managed by the OSD)
+ *
+ * Fails if OSD's local-scrubs budget was exhausted
+ * \returns were local resources reserved?
+ */
+ bool reserve_local() final;
+
+ void handle_query_state(ceph::Formatter* f) final;
+
+ void dump(ceph::Formatter* f) const override;
+
+ // used if we are a replica
+
+ void replica_scrub_op(OpRequestRef op) final;
+
+ /// the op priority, taken from the primary's request message
+ Scrub::scrub_prio_t replica_op_priority() const final
+ {
+ return m_replica_request_priority;
+ };
+
+ unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+ unsigned int suggested_priority) const final;
+ /// the version that refers to m_flags.priority
+ unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final;
+
+ /// appends to the list of pending callbacks (m_callbacks)
+ void add_callback(Context* context) final { m_callbacks.push_back(context); }
+
+ [[nodiscard]] bool are_callbacks_pending() const final // used for an assert in PG.cc
+ {
+ return !m_callbacks.empty();
+ }
+
+ /// handle a message carrying a replica map
+ void map_from_replica(OpRequestRef op) final;
+
+ void scrub_clear_state() final;
+
+ /**
+ * add to scrub statistics, but only if the soid is below the scrub start
+ *
+ * Note: this base-class version must never be reached (it asserts).
+ */
+ virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+ const hobject_t& soid) override
+ {
+ ceph_assert(false);
+ }
+
+ /**
+ * finalize the parameters of the initiated scrubbing session:
+ *
+ * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set;
+ * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set.
+ */
+ void set_op_parameters(requested_scrub_t& request) final;
+
+ void cleanup_store(ObjectStore::Transaction* t) final;
+
+ /// this base-class version reports nothing: it leaves 'res_inout' untouched
+ /// and returns 'false'
+ bool get_store_errors(const scrub_ls_arg_t& arg,
+ scrub_ls_result_t& res_inout) const override
+ {
+ return false;
+ }
+
+ int asok_debug(std::string_view cmd,
+ std::string param,
+ Formatter* f,
+ std::stringstream& ss) override;
+ int m_debug_blockrange{0};
+
+ // -------------------------------------------------------------------------------------------
+ // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener)
+
+ [[nodiscard]] bool is_primary() const final { return m_pg->recovery_state.is_primary(); }
+
+ void select_range_n_notify() final;
+
+ Scrub::BlockedRangeWarning acquire_blocked_alarm() final;
+
+ /// walk the log to find the latest update that affects our chunk
+ eversion_t search_log_for_updates() const final;
+
+ eversion_t get_last_update_applied() const final
+ {
+ return m_pg->recovery_state.get_last_update_applied();
+ }
+
+ int pending_active_pushes() const final { return m_pg->active_pushes; }
+
+ void on_init() final;
+ void on_replica_init() final;
+ void replica_handling_done() final;
+
+ /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
+ /// (thus can be called from FSM reactions)
+ void clear_pgscrub_state() final;
+
+ /*
+ * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep'
+ * is asserted - after a configuration-dependent timeout.
+ */
+ void add_delayed_scheduling() final;
+
+ void get_replicas_maps(bool replica_can_preempt) final;
+
+ void on_digest_updates() final;
+
+ ScrubMachineListener::MsgAndEpoch
+ prep_replica_map_msg(Scrub::PreemptionNoted was_preempted) final;
+
+ void send_replica_map(const ScrubMachineListener::MsgAndEpoch& preprepared) final;
+
+ void send_preempted_replica() final;
+
+ void send_remotes_reserved(epoch_t epoch_queued) final;
+ void send_reservation_failure(epoch_t epoch_queued) final;
+
+ /**
+ * does the PG have newer updates than what we (the scrubber) know?
+ */
+ [[nodiscard]] bool has_pg_marked_new_updates() const final;
+
+ void set_subset_last_update(eversion_t e) final;
+
+ void maps_compare_n_cleanup() final;
+
+ Scrub::preemption_t& get_preemptor() final;
+
+ int build_primary_map_chunk() final;
+
+ int build_replica_map_chunk() final;
+
+ void reserve_replicas() final;
+
+ [[nodiscard]] bool was_epoch_changed() const final;
+
+ void mark_local_map_ready() final;
+
+ [[nodiscard]] bool are_all_maps_available() const final;
+
+ std::string dump_awaited_maps() const final;
+
+ protected:
+ // thin forwarders to the PG's state-flags interface
+ bool state_test(uint64_t m) const { return m_pg->state_test(m); }
+ void state_set(uint64_t m) { m_pg->state_set(m); }
+ void state_clear(uint64_t m) { m_pg->state_clear(m); }
+
+ [[nodiscard]] bool is_scrub_registered() const;
+
+ /// a hook for derived classes. Does nothing here.
+ virtual void _scrub_clear_state() {}
+
+ utime_t m_scrub_reg_stamp; ///< stamp we registered for
+
+ ostream& show(ostream& out) const override;
+
+ public:
+ // -------------------------------------------------------------------------------------------
+
+ friend ostream& operator<<(ostream& out, const PgScrubber& scrubber);
+
+ static utime_t scrub_must_stamp() { return utime_t(1, 1); }
+
+ virtual ~PgScrubber(); // must be defined separately, in the .cc file
+
+ [[nodiscard]] bool is_scrub_active() const final { return m_active; }
+
+ private:
+ void reset_internal_state();
+
+ /**
+ * the current scrubbing operation is done. We should mark that fact, so that
+ * all events related to the previous operation can be discarded.
+ */
+ void advance_token();
+
+ bool is_token_current(Scrub::act_token_t received_token);
+
+ /// requeue the ops held in the PG's 'waiting_for_scrub' list
+ void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
+
+ void _scan_snaps(ScrubMap& smap);
+
+ ScrubMap clean_meta_map();
+
+ /**
+ * mark down some parameters of the initiated scrub:
+ * - the epoch when started;
+ * - the depth of the scrub requested (from the PG_STATE variable)
+ */
+ void reset_epoch(epoch_t epoch_queued);
+
+ void run_callbacks();
+
+ // ----- methods used to verify the relevance of incoming events:
+
+ /**
+ * is the incoming event still relevant, and should be processed?
+ *
+ * It isn't if:
+ * - (1) we are no longer 'actively scrubbing'; or
+ * - (2) the message is from an epoch prior to when we started the current scrub
+ * session; or
+ * - (3) the message epoch is from a previous interval; or
+ * - (4) the 'abort' configuration flags were set.
+ *
+ * For (1) & (2) - the incoming message is discarded, w/o further action.
+ *
+ * For (3): (see check_interval() for a full description) if we have not reacted yet
+ * to this specific new interval, we do now:
+ * - replica reservations are silently discarded (we count on the replicas to notice
+ * the interval change and un-reserve themselves);
+ * - the scrubbing is halted.
+ *
+ * For (4): the message will be discarded, but also:
+ * if this is the first time we've noticed the 'abort' request, we perform the abort.
+ *
+ * \returns should the incoming event be processed?
+ */
+ bool is_message_relevant(epoch_t epoch_to_verify);
+
+ /**
+ * check the 'no scrub' configuration options.
+ */
+ [[nodiscard]] bool should_abort() const;
+
+ /**
+ * Check the 'no scrub' configuration flags.
+ *
+ * Reset everything if the abort was not handled before.
+ * @returns false if the message was discarded due to abort flag.
+ */
+ [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify);
+
+ [[nodiscard]] bool check_interval(epoch_t epoch_to_verify);
+
+ epoch_t m_last_aborted{}; // last time we've noticed a request to abort
+
+ /**
+ * return true if any inconsistency/missing is repaired, false otherwise
+ */
+ [[nodiscard]] bool scrub_process_inconsistent();
+
+ void scrub_compare_maps();
+
+ bool m_needs_sleep{true}; ///< should we sleep before being rescheduled? always
+ ///< 'true', unless we just got out of a sleep period
+
+ utime_t m_sleep_started_at;
+
+
+ // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed'
+ // to guarantee un-reserving when deleted.
+ std::optional<Scrub::ReplicaReservations> m_reservations;
+ std::optional<Scrub::LocalReservation> m_local_osd_resource;
+
+ /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing
+ std::optional<Scrub::ReservedByRemotePrimary> m_remote_osd_resource;
+
+ void cleanup_on_finish(); // scrub_clear_state() as called for a Primary when
+ // Active->NotActive
+
+ /// the part that actually finalizes a scrub
+ void scrub_finish();
+
+ protected:
+ PG* const m_pg;
+
+ /**
+ * the derivative-specific scrub-finishing touches:
+ */
+ virtual void _scrub_finish() {}
+
+ /**
+ * Validate consistency of the object info and snap sets.
+ *
+ * (this base-class version does nothing)
+ */
+ virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest)
+ {}
+
+ // common code used by build_primary_map_chunk() and build_replica_map_chunk():
+ int build_scrub_map_chunk(ScrubMap& map, // primary or replica?
+ ScrubMapBuilder& pos,
+ hobject_t start,
+ hobject_t end,
+ bool deep);
+
+ std::unique_ptr<Scrub::ScrubMachine> m_fsm;
+ const spg_t m_pg_id; ///< a local copy of m_pg->pg_id
+ OSDService* const m_osds;
+ const pg_shard_t m_pg_whoami; ///< a local copy of m_pg->pg_whoami;
+
+ epoch_t m_interval_start{0}; ///< interval's 'from' of when scrubbing was first scheduled
+ /*
+ * the exact epoch when the scrubbing actually started (started here - cleared checks
+ * for no-scrub conf). Incoming events are verified against this, with stale events
+ * discarded.
+ */
+ epoch_t m_epoch_start{0}; ///< the actual epoch when scrubbing started
+
+ /**
+ * (replica) a tag identifying a specific scrub "session". Incremented whenever the
+ * Primary releases the replica scrub resources.
+ * When the scrub session is terminated (even if the interval remains unchanged, as
+ * might happen following an asok no-scrub command), stale scrub-resched messages
+ * triggered by the backend will be discarded.
+ */
+ Scrub::act_token_t m_current_token{1};
+
+ scrub_flags_t m_flags;
+
+ bool m_active{false};
+
+ eversion_t m_subset_last_update{};
+
+ std::unique_ptr<Scrub::Store> m_store;
+
+ int num_digest_updates_pending{0};
+ hobject_t m_start, m_end; ///< note: half-closed: [start,end)
+
+ /// Returns reference to current osdmap
+ const OSDMapRef& get_osdmap() const;
+
+ /// Returns epoch of current osdmap
+ epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); }
+
+ CephContext* get_pg_cct() const { return m_pg->cct; }
+
+ // collected statistics
+ int m_shallow_errors{0};
+ int m_deep_errors{0};
+ int m_fixed_count{0};
+
+ /// Maps from objects with errors to missing peers
+ HobjToShardSetMapping m_missing;
+
+ protected:
+ /**
+ * 'm_is_deep' - is the running scrub a deep one?
+ *
+ * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is
+ * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is
+ * meaningful both for the primary and the replicas, and is used as a parameter when
+ * building the scrub maps.
+ */
+ bool m_is_deep{false};
+
+ /**
+ * If set: affects the backend & scrubber-backend functions called after all
+ * scrub maps are available.
+ *
+ * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be
+ * a "user facing" status display only).
+ */
+ bool m_is_repair{false};
+
+ /**
+ * User-readable summary of the scrubber's current mode of operation. Used for
+ * both osd.*.log and the cluster log.
+ * One of:
+ * "repair"
+ * "deep-scrub",
+ * "scrub"
+ *
+ * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. for
+ * auto_repair will show as "deep-scrub" and not as "repair" (until the first error
+ * is detected).
+ */
+ std::string_view m_mode_desc;
+
+ void update_op_mode_text();
+
+private:
+
+ /**
+ * initiate a deep-scrub after the current scrub ended with errors.
+ */
+ void request_rescrubbing(requested_scrub_t& req_flags);
+
+ /*
+ * Select a range of objects to scrub.
+ *
+ * By:
+ * - setting tentative range based on conf and divisor
+ * - requesting a partial list of elements from the backend;
+ * - handling some head/clones issues
+ *
+ * The selected range is set directly into 'm_start' and 'm_end'
+ */
+ bool select_range();
+
+ std::list<Context*> m_callbacks;
+
+ /**
+ * send a replica (un)reservation request to the acting set
+ *
+ * @param opcode - one of MOSDScrubReserve::REQUEST
+ * or MOSDScrubReserve::RELEASE
+ */
+ void message_all_replicas(int32_t opcode, std::string_view op_text);
+
+ hobject_t m_max_end; ///< Largest end that may have been sent to replicas
+ ScrubMap m_primary_scrubmap;
+ ScrubMapBuilder m_primary_scrubmap_pos;
+
+ std::map<pg_shard_t, ScrubMap> m_received_maps;
+
+ /// Cleaned map pending snap metadata scrub
+ ScrubMap m_cleaned_meta_map;
+
+ void _request_scrub_map(pg_shard_t replica,
+ eversion_t version,
+ hobject_t start,
+ hobject_t end,
+ bool deep,
+ bool allow_preemption);
+
+
+ Scrub::MapsCollectionStatus m_maps_status;
+
+ // NOTE(review): initialized from a C-style compound literal, which is not
+ // standard C++ (accepted as a compiler extension) - confirm this is intended
+ omap_stat_t m_omap_stats = (const struct omap_stat_t){0};
+
+ /// Maps from objects with errors to inconsistent peers
+ HobjToShardSetMapping m_inconsistent;
+
+ /// Maps from object with errors to good peers
+ std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t>>> m_authoritative;
+
+ // ------------ members used if we are a replica
+
+ // NOTE(review): no default member initializer - confirm it is always assigned
+ // before being read
+ epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message
+
+ ScrubMapBuilder replica_scrubmap_pos;
+ ScrubMap replica_scrubmap;
+
+ /**
+ * we mark the request priority as it arrived. It influences the queuing priority
+ * when we wait for local updates
+ *
+ * NOTE(review): no default member initializer, and returned as-is by
+ * replica_op_priority() - confirm it is always assigned before being read
+ */
+ Scrub::scrub_prio_t m_replica_request_priority;
+
+ /**
+ * the 'preemption' "state-machine".
+ * Note: I was considering an orthogonal sub-machine implementation, but as
+ * the state diagram is extremely simple, the added complexity wasn't justified.
+ */
+ class preemption_data_t : public Scrub::preemption_t {
+ public:
+ preemption_data_t(PG* pg); // the PG access is used for conf access (and logs)
+
+ [[nodiscard]] bool is_preemptable() const final { return m_preemptable; }
+
+ /// mark the scrub as preempted - if preemption is currently allowed and was
+ /// not already noted.
+ /// @returns true if this call is the one that set the 'preempted' flag
+ bool do_preempt() final
+ {
+ // NOTE(review): this first check reads both flags without holding
+ // m_preemption_lock - confirm that this unlocked pre-check is acceptable
+ if (m_preempted || !m_preemptable)
+ return false;
+
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+ if (!m_preemptable)
+ return false;
+
+ m_preempted = true;
+ return true;
+ }
+
+ /// same as 'do_preempt()' but w/o checks (as once a replica
+ /// was preempted, we cannot continue)
+ void replica_preempted() { m_preempted = true; }
+
+ /// allow preemption - but only if some preemptions are still left, and we
+ /// were not already preempted
+ void enable_preemption()
+ {
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+ if (are_preemptions_left() && !m_preempted) {
+ m_preemptable = true;
+ }
+ }
+
+ /// used by a replica to set preemptability state according to the Primary's request
+ void force_preemptability(bool is_allowed)
+ {
+ // note: no need to lock for a replica
+ m_preempted = false;
+ m_preemptable = is_allowed;
+ }
+
+ /// disallow further preemption.
+ /// @returns were we preempted up to this point?
+ bool disable_and_test() final
+ {
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+ m_preemptable = false;
+ return m_preempted;
+ }
+
+ [[nodiscard]] bool was_preempted() const { return m_preempted; }
+
+ [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; }
+
+ void reset();
+
+ /// If we were preempted: clear that flag and 'consume' one of the allowed
+ /// preemptions (see adjust_left()). In both cases, 'preemptable' is set to
+ /// whether any preemptions are still left.
+ void adjust_parameters() final
+ {
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+
+ if (m_preempted) {
+ m_preempted = false;
+ m_preemptable = adjust_left();
+ } else {
+ m_preemptable = are_preemptions_left();
+ }
+ }
+
+ private:
+ PG* m_pg;
+ mutable std::mutex m_preemption_lock;
+ bool m_preemptable{false};
+ bool m_preempted{false};
+ // the number of preemptions still allowed.
+ // NOTE(review): no default member initializer - presumably set by the
+ // constructor and/or reset(); confirm
+ int m_left;
+ size_t m_size_divisor{1};
+ bool are_preemptions_left() const { return m_left > 0; }
+
+ /// if any preemptions are left: use one up, and double the chunk divisor.
+ /// @returns are there any preemptions left after this call?
+ bool adjust_left()
+ {
+ if (m_left > 0) {
+ --m_left;
+ m_size_divisor *= 2;
+ }
+ return m_left > 0;
+ }
+ };
+
+ preemption_data_t preemption_data;
+};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "scrub_machine.h"
+
+#include <chrono>
+#include <typeinfo>
+
+#include <boost/core/demangle.hpp>
+
+#include "osd/OSD.h"
+#include "osd/OpRequest.h"
+#include "ScrubStore.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout << " scrubberFSM "
+
+using namespace std::chrono;
+using namespace std::chrono_literals;
+namespace sc = boost::statechart;
+
+// Declares two convenience locals, taken from the enclosing ScrubMachine:
+//  'scrbr' - the machine's ScrubMachineListener (m_scrbr);
+//  'pg_id' - the machine's m_pg_id.
+// Both are also assigned to std::ignore - presumably to avoid 'unused variable'
+// warnings in code that uses only one of them (or neither).
+#define DECLARE_LOCALS \
+ ScrubMachineListener* scrbr = context<ScrubMachine>().m_scrbr; \
+ std::ignore = scrbr; \
+ auto pg_id = context<ScrubMachine>().m_pg_id; \
+ std::ignore = pg_id;
+
+namespace Scrub {
+
+// --------- trace/debug auxiliaries -------------------------------
+
+/// trace aid: logs (debug level 20) the name of an event, with a 'vvvv' marker
+void on_event_creation(std::string_view nm)
+{
+ dout(20) << " event: --vvvv---- " << nm << dendl;
+}
+
+/// trace aid: logs (debug level 20) the name of an event, with a '^^^^' marker
+void on_event_discard(std::string_view nm)
+{
+ dout(20) << " event: --^^^^---- " << nm << dendl;
+}
+
+/// debug aid: logs (debug level 20) the demangled type name of each of the
+/// machine's current states
+void ScrubMachine::my_states() const
+{
+  for (auto state_it = state_begin(); state_it != state_end(); ++state_it) {
+    // going through a named reference prevents a warning re side-effects in typeid()
+    const auto& current_state = *state_it;
+    dout(20) << " state: " << boost::core::demangle(typeid(current_state).name()) << dendl;
+  }
+}
+
+/// asserts that the machine is currently in the NotActive state
+void ScrubMachine::assert_not_active() const
+{
+ ceph_assert(state_cast<const NotActive*>());
+}
+
+/// is the machine currently in the ReservingReplicas state?
+bool ScrubMachine::is_reserving() const
+{
+  const auto* reserving_state = state_cast<const ReservingReplicas*>();
+  return reserving_state != nullptr;
+}
+
+/// is the machine currently in the WaitLastUpdate state?
+/// Must only be called on a Primary (asserted).
+bool ScrubMachine::is_accepting_updates() const
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  ceph_assert(scrbr->is_primary());
+
+  const auto* waiting_state = state_cast<const WaitLastUpdate*>();
+  return waiting_state != nullptr;
+}
+
+// for the rest of the code in this file - we know what PG we are dealing with:
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this->context<ScrubMachine>().m_pg)
+/// the log-line prefix used by the states: whatever t->gen_prefix() produces,
+/// followed by the " scrubberFSM" tag and t's pg_id
+template <class T> static ostream& _prefix(std::ostream* _dout, T* t)
+{
+ return t->gen_prefix(*_dout) << " scrubberFSM pg(" << t->pg_id << ") ";
+}
+
+// ////////////// the actual actions
+
+// ----------------------- NotActive -----------------------------------------
+
+/// entering NotActive: only logs the transition
+NotActive::NotActive(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> NotActive" << dendl;
+}
+
+// ----------------------- ReservingReplicas ---------------------------------
+
+/// entering ReservingReplicas: logs the transition, then asks the scrubber to
+/// reserve the replicas (reserve_replicas())
+ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> ReservingReplicas" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ scrbr->reserve_replicas();
+}
+
+/// the reservation has failed: the scrubber state is cleared, and the machine
+/// goes back to NotActive
+sc::result ReservingReplicas::react(const ReservationFailure&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl;
+
+ // the Scrubber must release all resources and abort the scrubbing
+ scrbr->clear_pgscrub_state();
+ return transit<NotActive>();
+}
+
+/**
+ * A full reset while reserving: only the transition back to NotActive is
+ * performed here.
+ * note: the event poster is handling the scrubber reset
+ */
+sc::result ReservingReplicas::react(const FullReset&)
+{
+ dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl;
+ return transit<NotActive>();
+}
+
+// ----------------------- ActiveScrubbing -----------------------------------
+
+/// entering ActiveScrubbing: logs the transition, then calls the scrubber's on_init()
+ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> ActiveScrubbing" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ scrbr->on_init();
+}
+
+/**
+ * upon exiting the Active state: the replicas' reservations are released
+ * (unreserve_replicas())
+ */
+ActiveScrubbing::~ActiveScrubbing()
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(15) << __func__ << dendl;
+ scrbr->unreserve_replicas();
+}
+
+/*
+ * The only source of an InternalError event as of now is the BuildMap state,
+ * when encountering a backend error.
+ * We kill the scrub and reset the FSM: the scrubber state is cleared, and the
+ * machine goes back to NotActive.
+ */
+sc::result ActiveScrubbing::react(const InternalError&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << __func__ << dendl;
+ scrbr->clear_pgscrub_state();
+ return transit<NotActive>();
+}
+
+/// a full reset while scrubbing: only the transition back to NotActive is
+/// performed here
+sc::result ActiveScrubbing::react(const FullReset&)
+{
+ dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl;
+ // caller takes care of clearing the scrubber & FSM states
+ return transit<NotActive>();
+}
+
+// ----------------------- RangeBlocked -----------------------------------
+
+/*
+ * Blocked. Will be released by kick_object_context_blocked() (or upon
+ * an abort)
+ *
+ * Note: we are never expected to be waiting for long for a blocked object.
+ * Unfortunately we know from experience that a bug elsewhere might result
+ * in an indefinite wait in this state, for an object that is never released.
+ * If that happens, all we can do is to issue a warning message to help
+ * with the debugging.
+ */
+RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> Act/RangeBlocked" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+
+ // arrange to have a warning message issued if we are stuck in this
+ // state for longer than some reasonable number of minutes.
+ // (the alarm object is kept as a member of this state)
+ m_timeout = scrbr->acquire_blocked_alarm();
+}
+
+// ----------------------- PendingTimer -----------------------------------
+
+/**
+ * PendingTimer: either sleeping until the timer re-activates us, or just
+ * being requeued. The scheduling itself is delegated to the scrubber.
+ */
+PendingTimer::PendingTimer(my_context ctx) : my_base(ctx)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "-- state -->> Act/PendingTimer" << dendl;
+
+  scrbr->add_delayed_scheduling();
+}
+
+// ----------------------- NewChunk -----------------------------------
+
+/**
+ * NewChunk: pick the next range of objects to scrub.
+ *
+ * Expected to hold on entry:
+ *  - the preemption data was set;
+ *  - the epoch start was updated.
+ */
+NewChunk::NewChunk(my_context ctx) : my_base(ctx)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "-- state -->> Act/NewChunk" << dendl;
+
+  auto& preemptor = scrbr->get_preemptor();
+  preemptor.adjust_parameters();
+
+  // Select the range to work on. select_range_n_notify() signals the result
+  // as one of two events:
+  //  - SelectedChunkFree, or
+  //  - ChunkIsBusy, which moves us to the Blocked state, to wait there for
+  //    the range to become available.
+  scrbr->select_range_n_notify();
+}
+
+/**
+ * The selected chunk is free: record the value returned by
+ * search_log_for_updates() as the 'subset last update', and start
+ * waiting for pushes.
+ */
+sc::result NewChunk::react(const SelectedChunkFree&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "NewChunk::react(const SelectedChunkFree&)" << dendl;
+
+  const auto latest_update = scrbr->search_log_for_updates();
+  scrbr->set_subset_last_update(latest_update);
+
+  return transit<WaitPushes>();
+}
+
+// ----------------------- WaitPushes -----------------------------------
+
+/**
+ * WaitPushes: on entry, post an ActivePushesUpd to ourselves, so that the
+ * state of the pending pushes is checked at least once.
+ */
+WaitPushes::WaitPushes(my_context ctx) : my_base(ctx)
+{
+  dout(10) << " -- state -->> Act/WaitPushes" << dendl;
+
+  post_event(ActivePushesUpd{});
+}
+
+/*
+ * ActivePushesUpd is triggered externally, by the entity that had an update
+ * regarding the pushes (and once by our own constructor).
+ * We move on to WaitLastUpdate only when no active pushes are pending.
+ */
+sc::result WaitPushes::react(const ActivePushesUpd&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "WaitPushes::react(const ActivePushesUpd&) pending_active_pushes: "
+           << scrbr->pending_active_pushes() << dendl;
+
+  if (scrbr->pending_active_pushes()) {
+    // still waiting
+    return discard_event();
+  }
+
+  // done waiting
+  return transit<WaitLastUpdate>();
+}
+
+// ----------------------- WaitLastUpdate -----------------------------------
+
+/**
+ * WaitLastUpdate: on entry, post an UpdatesApplied to ourselves, triggering
+ * an initial check in on_new_updates().
+ */
+WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx)
+{
+  dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl;
+
+  post_event(UpdatesApplied{});
+}
+
+/**
+ * In-state handling of UpdatesApplied.
+ *
+ * If the scrubber reports has_pg_marked_new_updates(), InternalAllUpdates
+ * is posted (moving us forward). Otherwise we stay put; we will be
+ * requeued by op_applied.
+ *
+ * Background (from the original note): updates are locally readable
+ * immediately. For the Primary it is a bit different: on EC (and only
+ * there) rmw operations have an additional read roundtrip, which means that
+ * on the Primary we need to wait for last_update_applied. The replica side,
+ * even on EC, is still safe, since the actual transaction will already be
+ * readable by commit time.
+ * NOTE(review): the original text stated that replicas "do need to wait for
+ * the update notifications before scrubbing", which does not follow from
+ * "locally readable immediately" - probably meant "do not need". Confirm.
+ */
+void WaitLastUpdate::on_new_updates(const UpdatesApplied&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "WaitLastUpdate::on_new_updates(const UpdatesApplied&)" << dendl;
+
+  if (!scrbr->has_pg_marked_new_updates()) {
+    // will be requeued by op_applied
+    dout(10) << "wait for EC read/modify/writes to queue" << dendl;
+    return;
+  }
+
+  post_event(InternalAllUpdates{});
+}
+
+/*
+ * All updates are accounted for: request the scrub maps from the replicas
+ * in the acting set (telling them whether they are allowed to preempt), and
+ * go build our own map.
+ */
+sc::result WaitLastUpdate::react(const InternalAllUpdates&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl;
+
+  const bool replicas_can_preempt = scrbr->get_preemptor().is_preemptable();
+  scrbr->get_replicas_maps(replicas_can_preempt);
+
+  return transit<BuildMap>();
+}
+
+// ----------------------- BuildMap -----------------------------------
+
+/**
+ * BuildMap: build (a chunk of) the Primary's scrub map.
+ *
+ * The outcome is signalled by an internally-posted event:
+ *  - IntBmPreempted  - if we were preempted;
+ *  - InternalError   - on a (negative) error code from the map builder;
+ *  - IntLocalMapDone - once the local map is ready.
+ * No event is posted when the builder returns -EINPROGRESS.
+ */
+BuildMap::BuildMap(my_context ctx) : my_base(ctx)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << " -- state -->> Act/BuildMap" << dendl;
+
+  // An epoch-change check is not needed here: every flow leading to this
+  // state had its final event verified by check_interval().
+
+  if (scrbr->get_preemptor().was_preempted()) {
+    // preempted - either directly, or by a replica
+    dout(10) << __func__ << " preempted!!!" << dendl;
+    scrbr->mark_local_map_ready();
+    post_event(IntBmPreempted{});
+    return;
+  }
+
+  const auto ret = scrbr->build_primary_map_chunk();
+
+  if (ret == -EINPROGRESS) {
+    // The backend is not done yet, and we must wait for it. No specific
+    // event is provided: build_primary_map_chunk() has already requeued us.
+    dout(20) << "waiting for the backend..." << dendl;
+    return;
+  }
+
+  if (ret < 0) {
+    dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl;
+    post_event(InternalError{});
+    return;
+  }
+
+  // success: the local map was created
+  post_event(IntLocalMapDone{});
+}
+
+/**
+ * Our own map is ready: mark it as such, and go wait for the replicas'.
+ */
+sc::result BuildMap::react(const IntLocalMapDone&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "BuildMap::react(const IntLocalMapDone&)" << dendl;
+
+  scrbr->mark_local_map_ready();
+
+  return transit<WaitReplicas>();
+}
+
+// ----------------------- DrainReplMaps -----------------------------------
+
+/**
+ * DrainReplMaps: all the maps might have arrived already. Post a GotReplicas
+ * to ourselves, to have that checked.
+ */
+DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> Act/DrainReplMaps" << dendl;
+
+  post_event(GotReplicas{});
+}
+
+/**
+ * A replica's map has arrived (or this is the initial check). As long as
+ * some maps are still awaited - keep draining. Once all are in - move to
+ * PendingTimer.
+ */
+sc::result DrainReplMaps::react(const GotReplicas&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "DrainReplMaps::react(const GotReplicas&)" << dendl;
+
+  if (!scrbr->are_all_maps_available()) {
+    dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: "
+             << scrbr->dump_awaited_maps() << dendl;
+    return discard_event();
+  }
+
+  // the preemption that brought us here will be handled by NewChunk
+  return transit<PendingTimer>();
+}
+
+// ----------------------- WaitReplicas -----------------------------------
+
+/**
+ * WaitReplicas: on entry, post a GotReplicas to ourselves, so that the
+ * maps' availability is checked at least once.
+ */
+WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> Act/WaitReplicas" << dendl;
+
+  post_event(GotReplicas{});
+}
+
+/**
+ * A replica's map has arrived (or this is the initial check).
+ *
+ * maps_compare_n_cleanup() is "futurized" (*), and we stay in this state for
+ * a while even after all the maps are in. The code that follows a
+ * successful are_all_maps_available() must therefore be guarded against
+ * being executed more than once - that is the role of
+ * 'all_maps_already_called'. Logically this is a separate state, but one
+ * too transitory and artificial to justify the cost of an actual state.
+ *
+ * (*) "futurized": in Crimson, maps_compare_n_cleanup() returns immediately
+ * after initiating the process, and the actual termination of the maps
+ * comparison etc. is signalled via an event. As the code is shared with the
+ * "classic" OSD, here too maps_compare_n_cleanup() is responsible for
+ * signalling the completion of the processing.
+ */
+sc::result WaitReplicas::react(const GotReplicas&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "WaitReplicas::react(const GotReplicas&)" << dendl;
+
+  // either handled already, or some maps are still missing
+  if (all_maps_already_called || !scrbr->are_all_maps_available()) {
+    return discard_event();
+  }
+
+  dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl;
+  all_maps_already_called = true;
+
+  // were we preempted? (note: a test&set)
+  if (scrbr->get_preemptor().disable_and_test()) {
+    dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" << dendl;
+    return transit<PendingTimer>();
+  }
+
+  // maps_compare_n_cleanup() will arrange for MapsCompared event to be sent
+  scrbr->maps_compare_n_cleanup();
+  return discard_event();
+}
+
+// ----------------------- WaitDigestUpdate -----------------------------------
+
+/**
+ * WaitDigestUpdate: DigestUpdate is usually an external event. Still, we
+ * might already have all the updates we need - so we post one to ourselves
+ * on entry, to perform an initial check.
+ */
+WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx)
+{
+  dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl;
+
+  post_event(DigestUpdate{});
+}
+
+/**
+ * Hand the digest update to the scrubber. The event itself is always
+ * discarded; any progress is made via the events sent by
+ * on_digest_updates().
+ */
+sc::result WaitDigestUpdate::react(const DigestUpdate&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "WaitDigestUpdate::react(const DigestUpdate&)" << dendl;
+
+  // on_digest_updates() either:
+  //  - does nothing (we are still waiting for updates); or
+  //  - completes the scrubbing of the current chunk, then sends one of:
+  //     - NextChunk
+  //     - ScrubFinished
+  scrbr->on_digest_updates();
+
+  return discard_event();
+}
+
+/**
+ * @param pg        the PG being scrubbed. Dereferenced here (for its pg_id),
+ *                  thus must not be null.
+ * @param pg_scrub  the scrubber, as seen by the FSM
+ */
+ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub)
+    : m_pg{pg}
+    , m_pg_id{pg->pg_id}
+    , m_scrbr{pg_scrub}
+{
+  dout(15) << "ScrubMachine created " << m_pg_id << dendl;
+}
+
+ScrubMachine::~ScrubMachine() = default;
+
+// -------- for replicas -----------------------------------------------------
+
+// ----------------------- ReplicaWaitUpdates --------------------------------
+
+/**
+ * ReplicaWaitUpdates: on entry, log the transition and notify the scrubber
+ * via on_replica_init().
+ */
+ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl;
+
+  scrbr->on_replica_init();
+}
+
+/*
+ * ReplicaPushesUpd is triggered externally, by the entity that had an update
+ * regarding the pushes. We become an ActiveReplica only when no active
+ * pushes are pending.
+ */
+sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): "
+           << scrbr->pending_active_pushes() << dendl;
+
+  if (scrbr->pending_active_pushes() != 0) {
+    // still waiting
+    return discard_event();
+  }
+
+  // done waiting
+  return transit<ActiveReplica>();
+}
+
+/**
+ * FullReset on a replica that is waiting for updates: go back to NotActive.
+ * The scrubber reset itself is handled by the poster of the event.
+ */
+sc::result ReplicaWaitUpdates::react(const FullReset&)
+{
+  dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl;
+
+  return transit<NotActive>();
+}
+
+// ----------------------- ActiveReplica -----------------------------------
+
+/**
+ * ActiveReplica: notify the scrubber via on_replica_init(), then post a
+ * SchedReplica to ourselves to start the work.
+ */
+ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "-- state -->> ActiveReplica" << dendl;
+
+  // called here too, as ReplicaWaitUpdates might have been skipped
+  scrbr->on_replica_init();
+
+  post_event(SchedReplica{});
+}
+
+/**
+ * One scheduling round of the replica's map building:
+ *  - if preempted: notify the Primary, wrap up, and go back to NotActive;
+ *  - otherwise start (or check the progress of) the map building. We remain
+ *    in this state for as long as the builder returns -EINPROGRESS, and go
+ *    back to NotActive on any other return value.
+ */
+sc::result ActiveReplica::react(const SchedReplica&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+
+  dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? "
+           << scrbr->get_preemptor().is_preemptable() << dendl;
+
+  if (scrbr->get_preemptor().was_preempted()) {
+    dout(10) << "replica scrub job preempted" << dendl;
+
+    scrbr->send_preempted_replica();
+    scrbr->replica_handling_done();
+    return transit<NotActive>();
+  }
+
+  // start or check progress of build_replica_map_chunk()
+  const auto ret_init = scrbr->build_replica_map_chunk();
+  if (ret_init == -EINPROGRESS) {
+    // not done yet
+    return discard_event();
+  }
+
+  return transit<NotActive>();
+}
+
+/**
+ * FullReset on an active replica: go back to NotActive. The scrubber reset
+ * itself is handled by the poster of the event.
+ */
+sc::result ActiveReplica::react(const FullReset&)
+{
+  dout(10) << "ActiveReplica::react(const FullReset&)" << dendl;
+
+  return transit<NotActive>();
+}
+
+} // namespace Scrub
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <string>
+
+#include <boost/statechart/custom_reaction.hpp>
+#include <boost/statechart/deferral.hpp>
+#include <boost/statechart/event.hpp>
+#include <boost/statechart/event_base.hpp>
+#include <boost/statechart/in_state_reaction.hpp>
+#include <boost/statechart/simple_state.hpp>
+#include <boost/statechart/state.hpp>
+#include <boost/statechart/state_machine.hpp>
+#include <boost/statechart/transition.hpp>
+
+#include "common/version.h"
+#include "include/Context.h"
+
+#include "scrub_machine_lstnr.h"
+#include "osd/scrubber_common.h"
+
+using namespace std::string_literals;
+
+class PG; // holding a pointer to that one - just for testing
+class PgScrubber;
+namespace Scrub {
+
+namespace sc = ::boost::statechart;
+namespace mpl = ::boost::mpl;
+
+//
+// EVENTS
+//
+
+/// hooks called by the event objects defined via MEV() (see below). 'nm' is
+/// the name of the event type.
+void on_event_creation(std::string_view nm);
+void on_event_discard(std::string_view nm);
+
+/**
+ * MEV(E): define the FSM event type 'E'.
+ *
+ * Each event type maintains a count ('actv') of its live instances:
+ *  - on_event_creation("E") is called when the count goes from 0 to 1;
+ *  - on_event_discard("E") is called when it drops back to 0.
+ *
+ * Copies must be counted too: Boost.Statechart clones (copy-constructs) an
+ * event that is handed to post_event() as a stack object, and each copy is
+ * later destructed. Without the counting copy constructor, the destructors
+ * would outnumber the counted constructions, 'actv' would drift below zero,
+ * and the two hooks would no longer be called in matching pairs.
+ *
+ * NOTE(review): 'actv' is a plain int, shared by all instances of the event
+ * type. If events of the same type can be created concurrently by multiple
+ * threads, confirm whether it should be made atomic.
+ *
+ * The two print() functions provide the event's name for logging.
+ */
+#define MEV(E)                                          \
+  struct E : sc::event<E> {                             \
+    inline static int actv{0};                          \
+    E()                                                 \
+    {                                                   \
+      if (!actv++)                                      \
+        on_event_creation(#E);                          \
+    }                                                   \
+    E(const E& other) : sc::event<E>(other)             \
+    {                                                   \
+      if (!actv++)                                      \
+        on_event_creation(#E);                          \
+    }                                                   \
+    E& operator=(const E&) = default;                   \
+    ~E()                                                \
+    {                                                   \
+      if (!--actv)                                      \
+        on_event_discard(#E);                           \
+    }                                                   \
+    void print(std::ostream* out) const { *out << #E; } \
+    std::string_view print() const { return #E; }       \
+  };
+
+/// all replicas have granted our reserve request
+MEV(RemotesReserved)
+
+/// a reservation request has failed
+MEV(ReservationFailure)
+
+/// initiate a new scrubbing session (relevant if we are a Primary)
+MEV(StartScrub)
+
+/// initiate a new scrubbing session. Only triggered at Recovery completion.
+MEV(AfterRepairScrub)
+
+/// triggered when the PG unblocked an object that was marked for scrubbing.
+/// Via the PGScrubUnblocked op
+MEV(Unblocked)
+
+MEV(InternalSchedScrub)
+
+MEV(SelectedChunkFree)
+
+MEV(ChunkIsBusy)
+
+/// Update to active_pushes. 'active_pushes' represents recovery that is
+/// in-flight to the local ObjectStore
+MEV(ActivePushesUpd)
+
+/// (Primary only) all updates are committed
+MEV(UpdatesApplied)
+
+/// the internal counterpart of UpdatesApplied
+MEV(InternalAllUpdates)
+
+/// got a map from a replica
+MEV(GotReplicas)
+
+/// internal - BuildMap preempted. Required, as detected within the ctor
+MEV(IntBmPreempted)
+
+MEV(InternalError)
+
+MEV(IntLocalMapDone)
+
+/// external. called upon success of a MODIFY op. See
+/// scrub_snapshot_metadata()
+MEV(DigestUpdate)
+
+/// (Crimson) maps_compare_n_cleanup() transactions are done
+MEV(MapsCompared)
+
+/// initiating replica scrub.
+MEV(StartReplica)
+
+/// 'start replica' when there are no pending updates
+MEV(StartReplicaNoWait)
+
+MEV(SchedReplica)
+
+/// Update to active_pushes. 'active_pushes' represents recovery that is
+/// in-flight to the local ObjectStore
+MEV(ReplicaPushesUpd)
+
+/// guarantee that the FSM is in the quiescent state (i.e. NotActive)
+MEV(FullReset)
+
+/// finished handling this chunk. Go get the next one
+MEV(NextChunk)
+
+/// all chunks handled
+MEV(ScrubFinished)
+
+
+// the top-level states of the scrubber FSM:
+
+/// the quiescent state. No active scrubbing.
+struct NotActive;
+/// securing scrub resources from replicas' OSDs
+struct ReservingReplicas;
+/// the active state for a Primary. A sub-machine.
+struct ActiveScrubbing;
+/// an active state for a replica. Waiting for all active operations to
+/// finish.
+struct ReplicaWaitUpdates;
+/// an active state for a replica.
+struct ActiveReplica;
+
+
+/**
+ * The scrubber's state machine. NotActive is its initial state.
+ * The FSM talks to the scrubber only through the ScrubMachineListener
+ * interface (m_scrbr).
+ */
+class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
+ public:
+  friend class PgScrubber;
+
+  explicit ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub);
+  ~ScrubMachine();
+
+  PG* m_pg;  ///< only used for dout messages
+  spg_t m_pg_id;
+  ScrubMachineListener* m_scrbr;
+
+  // status queries & debugging aids
+  void my_states() const;
+  void assert_not_active() const;
+  [[nodiscard]] bool is_reserving() const;
+  [[nodiscard]] bool is_accepting_updates() const;
+};
+
+/**
+ * The Scrubber's base (quiescent) state.
+ * Scrubbing is triggered by one of the following events:
+ * - (the standard scenario for a Primary): 'StartScrub'. Initiates the OSDs
+ *   resources reservation process. Will be issued by PG::scrub(), following
+ *   a queued "PGScrub" op.
+ * - 'AfterRepairScrub': a special end-of-recovery Primary scrub event. Like
+ *   StartScrub, it starts by reserving the replicas (see the note below).
+ * - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an
+ *   incoming MOSDRepScrub message.
+ *
+ * note (20.8.21): originally, AfterRepairScrub was triggering a scrub
+ * without waiting for replica resources to be acquired. But once replicas
+ * started using the resource-request to identify and tag the scrub session,
+ * this bypass cannot be supported anymore.
+ */
+struct NotActive : sc::state<NotActive, ScrubMachine> {
+  explicit NotActive(my_context ctx);
+
+  using reactions = mpl::list<
+    sc::transition<StartScrub, ReservingReplicas>,
+    // a scrubbing that was initiated at recovery completion. No longer
+    // bypasses the reservation of replicas' resources:
+    sc::transition<AfterRepairScrub, ReservingReplicas>,
+    sc::transition<StartReplica, ReplicaWaitUpdates>,
+    sc::transition<StartReplicaNoWait, ActiveReplica>>;
+};
+
+/**
+ * Waiting for the replicas to grant (or deny) our request for scrub
+ * resources.
+ */
+struct ReservingReplicas : sc::state<ReservingReplicas, ScrubMachine> {
+  explicit ReservingReplicas(my_context ctx);
+
+  using reactions = mpl::list<
+    sc::custom_reaction<FullReset>,
+    // all replicas granted our resources request
+    sc::transition<RemotesReserved, ActiveScrubbing>,
+    sc::custom_reaction<ReservationFailure>>;
+
+  sc::result react(const FullReset&);
+
+  /// at least one replica denied us the scrub resources we've requested
+  sc::result react(const ReservationFailure&);
+};
+
+
+// the sub-states of ActiveScrubbing (the "active" sub-states):
+
+/// the objects range is blocked
+struct RangeBlocked;
+/// either delaying the scrub by some time and requeuing, or just requeue
+struct PendingTimer;
+/// select a chunk to scrub, and verify its availability
+struct NewChunk;
+struct WaitPushes;
+struct WaitLastUpdate;
+struct BuildMap;
+/// a problem during BuildMap. Wait for all replicas to report, then
+/// restart.
+struct DrainReplMaps;
+/// wait for all replicas to report
+struct WaitReplicas;
+struct WaitDigestUpdate;
+
+/**
+ * The Primary's active state: a sub-machine, with PendingTimer as its
+ * initial inner state. InternalError and FullReset are handled here, i.e.
+ * for all the inner states.
+ */
+struct ActiveScrubbing : sc::state<ActiveScrubbing, ScrubMachine, PendingTimer> {
+  explicit ActiveScrubbing(my_context ctx);
+  ~ActiveScrubbing();
+
+  using reactions = mpl::list<sc::custom_reaction<InternalError>,
+                              sc::custom_reaction<FullReset>>;
+
+  sc::result react(const InternalError&);
+  sc::result react(const FullReset&);
+};
+
+/**
+ * The range to scrub is blocked. Left (for PendingTimer) on 'Unblocked'.
+ */
+struct RangeBlocked : sc::state<RangeBlocked, ActiveScrubbing> {
+  explicit RangeBlocked(my_context ctx);
+
+  using reactions = mpl::list<sc::transition<Unblocked, PendingTimer>>;
+
+  /// the "blocked for too long" alarm. Set by the constructor.
+  Scrub::BlockedRangeWarning m_timeout;
+};
+
+/**
+ * Waiting to be (re)scheduled. 'InternalSchedScrub' moves us to NewChunk.
+ */
+struct PendingTimer : sc::state<PendingTimer, ActiveScrubbing> {
+  explicit PendingTimer(my_context ctx);
+
+  using reactions = mpl::list<sc::transition<InternalSchedScrub, NewChunk>>;
+};
+
+/**
+ * Selecting the next chunk. A busy chunk sends us to RangeBlocked; a free
+ * one is handled by react(SelectedChunkFree).
+ */
+struct NewChunk : sc::state<NewChunk, ActiveScrubbing> {
+  explicit NewChunk(my_context ctx);
+
+  using reactions = mpl::list<
+    sc::transition<ChunkIsBusy, RangeBlocked>,
+    sc::custom_reaction<SelectedChunkFree>>;
+
+  sc::result react(const SelectedChunkFree&);
+};
+
+/**
+ * Initiate the update process for this chunk.
+ *
+ * Wait for 'active_pushes' to clear.
+ * 'active_pushes' represents recovery that is in-flight to the local
+ * Objectstore. Hence, scrub waits until the correct data is readable
+ * (in-flight data to the Objectstore is not readable until written to disk,
+ * termed 'applied' here).
+ */
+struct WaitPushes : sc::state<WaitPushes, ActiveScrubbing> {
+  explicit WaitPushes(my_context ctx);
+
+  using reactions = mpl::list<sc::custom_reaction<ActivePushesUpd>>;
+
+  sc::result react(const ActivePushesUpd&);
+};
+
+/**
+ * Waiting for the relevant updates to be applied.
+ * 'UpdatesApplied' is handled in-state (by on_new_updates());
+ * 'InternalAllUpdates' is handled by react().
+ */
+struct WaitLastUpdate : sc::state<WaitLastUpdate, ActiveScrubbing> {
+  explicit WaitLastUpdate(my_context ctx);
+
+  /// the in-state reaction to UpdatesApplied
+  void on_new_updates(const UpdatesApplied&);
+
+  using reactions = mpl::list<
+    sc::custom_reaction<InternalAllUpdates>,
+    sc::in_state_reaction<UpdatesApplied,
+                          WaitLastUpdate,
+                          &WaitLastUpdate::on_new_updates>>;
+
+  sc::result react(const InternalAllUpdates&);
+};
+
+/**
+ * Building the Primary's own scrub map for the chunk.
+ *
+ * Possible error scenarios:
+ * - an error reported by the backend will trigger an 'InternalError' event,
+ *   handled by our parent state;
+ * - if preempted, we switch to DrainReplMaps, where we will wait for all
+ *   replicas to send their maps before acknowledging the preemption;
+ * - an interval change will be handled by the relevant 'send-event'
+ *   functions, and will be translated into a 'FullReset' event.
+ */
+struct BuildMap : sc::state<BuildMap, ActiveScrubbing> {
+  explicit BuildMap(my_context ctx);
+
+  using reactions = mpl::list<
+    sc::transition<IntBmPreempted, DrainReplMaps>,
+    // looping, waiting for the backend to finish:
+    sc::transition<InternalSchedScrub, BuildMap>,
+    sc::custom_reaction<IntLocalMapDone>>;
+
+  sc::result react(const IntLocalMapDone&);
+};
+
+/*
+ * "Draining" the scrub-maps responses arriving from the replicas.
+ */
+struct DrainReplMaps : sc::state<DrainReplMaps, ActiveScrubbing> {
+  explicit DrainReplMaps(my_context ctx);
+
+  // GotReplicas: all replicas are accounted for
+  using reactions = mpl::list<sc::custom_reaction<GotReplicas>>;
+
+  sc::result react(const GotReplicas&);
+};
+
+/**
+ * Waiting for all the replicas' maps, and then for the maps comparison to
+ * complete ('MapsCompared').
+ */
+struct WaitReplicas : sc::state<WaitReplicas, ActiveScrubbing> {
+  explicit WaitReplicas(my_context ctx);
+
+  using reactions = mpl::list<
+    // all replicas are accounted for:
+    sc::custom_reaction<GotReplicas>,
+    sc::transition<MapsCompared, WaitDigestUpdate>,
+    // might arrive before we've reached WDU:
+    sc::deferral<DigestUpdate>>;
+
+  sc::result react(const GotReplicas&);
+
+  /// guards the "all maps are in" handling against being executed more
+  /// than once. See the comment in the react() code.
+  bool all_maps_already_called{false};
+};
+
+/**
+ * Waiting for the digest updates. Left on 'NextChunk' (for PendingTimer) or
+ * on 'ScrubFinished' (for NotActive).
+ */
+struct WaitDigestUpdate : sc::state<WaitDigestUpdate, ActiveScrubbing> {
+  explicit WaitDigestUpdate(my_context ctx);
+
+  using reactions = mpl::list<
+    sc::custom_reaction<DigestUpdate>,
+    sc::transition<NextChunk, PendingTimer>,
+    sc::transition<ScrubFinished, NotActive>>;
+
+  sc::result react(const DigestUpdate&);
+};
+
+// ----------------------------- the "replica active" states -----------------------
+
+/*
+ * (a replica state) Waiting for 'active_pushes' to complete.
+ *
+ * When in this state:
+ * - the details of the Primary's request were internalized by PgScrubber;
+ * - 'active' scrubbing is set
+ */
+struct ReplicaWaitUpdates : sc::state<ReplicaWaitUpdates, ScrubMachine> {
+  explicit ReplicaWaitUpdates(my_context ctx);
+
+  using reactions = mpl::list<
+    sc::custom_reaction<ReplicaPushesUpd>,
+    sc::custom_reaction<FullReset>>;
+
+  sc::result react(const ReplicaPushesUpd&);
+  sc::result react(const FullReset&);
+};
+
+
+/**
+ * (a replica state) Building the scrub map requested by the Primary.
+ */
+struct ActiveReplica : sc::state<ActiveReplica, ScrubMachine> {
+  explicit ActiveReplica(my_context ctx);
+
+  using reactions = mpl::list<
+    sc::custom_reaction<SchedReplica>,
+    sc::custom_reaction<FullReset>,
+    sc::transition<ScrubFinished, NotActive>>;
+
+  sc::result react(const SchedReplica&);
+  sc::result react(const FullReset&);
+};
+
+} // namespace Scrub
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+/**
+ * \file the PgScrubber interface used by the scrub FSM
+ */
+#include "common/version.h"
+#include "include/Context.h"
+
+#include "osd/osd_types.h"
+
+namespace Scrub {
+
+/// was a preemption noted? (passed to prep_replica_map_msg())
+enum class PreemptionNoted {
+  no_preemption,
+  preempted
+};
+
+/// The interface that the PgScrubber exposes into its internal
+/// preemption_data object.
+struct preemption_t {
+  virtual ~preemption_t() = default;
+
+  [[nodiscard]] virtual bool is_preemptable() const = 0;
+
+  [[nodiscard]] virtual bool was_preempted() const = 0;
+
+  virtual void adjust_parameters() = 0;
+
+  /**
+   * Try to preempt the scrub.
+   * @return 'true' (i.e. - preempted) if the scrub is preemptable, and was
+   *         not already preempted.
+   */
+  virtual bool do_preempt() = 0;
+
+  /**
+   * Disable preemptions.
+   * @return 'true' if we were already preempted
+   */
+  virtual bool disable_and_test() = 0;
+};
+
+/// An aux used when blocking on a busy object.
+/// Issues a log warning if still blocked after 'waittime'.
+///
+/// The object is meant to be held via a BlockedRangeWarning (a unique_ptr),
+/// and is made non-copyable: it has a user-provided destructor and holds raw
+/// pointers, so a copy would have that destructor run twice over the same
+/// 'm_callbk'.
+/// NOTE(review): the destructor's body is not visible here; presumably it
+/// cancels/releases 'm_callbk' - confirm.
+struct blocked_range_t {
+  blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id);
+  ~blocked_range_t();
+
+  blocked_range_t(const blocked_range_t&) = delete;
+  blocked_range_t& operator=(const blocked_range_t&) = delete;
+
+  OSDService* m_osds;
+  Context* m_callbk;
+};
+
+using BlockedRangeWarning = std::unique_ptr<blocked_range_t>;
+
+} // namespace Scrub
+
+/**
+ * The services that the scrubber (PgScrubber) provides to the scrub FSM.
+ * A pure interface: all functions are to be implemented by the scrubber.
+ */
+struct ScrubMachineListener {
+
+  /// a pre-prepared message, along with the epoch recorded when it was
+  /// prepared (see prep_replica_map_msg())
+  struct MsgAndEpoch {
+    MessageRef m_msg;
+    epoch_t m_epoch;
+  };
+
+  virtual ~ScrubMachineListener() = default;
+
+  [[nodiscard]] virtual bool is_primary() const = 0;
+
+  virtual void select_range_n_notify() = 0;
+
+  virtual Scrub::BlockedRangeWarning acquire_blocked_alarm() = 0;
+
+  /// walk the log to find the latest update that affects our chunk
+  virtual eversion_t search_log_for_updates() const = 0;
+
+  virtual eversion_t get_last_update_applied() const = 0;
+
+  virtual int pending_active_pushes() const = 0;
+
+  virtual int build_primary_map_chunk() = 0;
+
+  virtual int build_replica_map_chunk() = 0;
+
+  virtual void on_init() = 0;
+
+  virtual void on_replica_init() = 0;
+
+  virtual void replica_handling_done() = 0;
+
+  /// the version of 'scrub_clear_state()' that does not try to invoke FSM
+  /// services (and is therefore safe to call from FSM reactions)
+  virtual void clear_pgscrub_state() = 0;
+
+  /**
+   * Send an 'InternalSchedScrub' FSM event: either immediately, or - if
+   * 'm_need_sleep' is asserted - after a configuration-dependent timeout.
+   */
+  virtual void add_delayed_scheduling() = 0;
+
+  /**
+   * Ask all replicas for their scrub maps for the current chunk.
+   */
+  virtual void get_replicas_maps(bool replica_can_preempt) = 0;
+
+  virtual void on_digest_updates() = 0;
+
+  /**
+   * Prepare a MOSDRepScrubMap message carrying the requested scrub map
+   * @param was_preempted - were we preempted?
+   * @return the message, and the current value of 'm_replica_min_epoch'
+   *  (which is used when sending the message, but will be overwritten
+   *  before that).
+   */
+  [[nodiscard]] virtual MsgAndEpoch prep_replica_map_msg(
+    Scrub::PreemptionNoted was_preempted) = 0;
+
+  /**
+   * Send to the primary the pre-prepared message containing the requested
+   * map
+   */
+  virtual void send_replica_map(const MsgAndEpoch& preprepared) = 0;
+
+  /**
+   * Let the primary know that we were preempted while trying to build the
+   * requested map.
+   */
+  virtual void send_preempted_replica() = 0;
+
+  [[nodiscard]] virtual bool has_pg_marked_new_updates() const = 0;
+
+  virtual void set_subset_last_update(eversion_t e) = 0;
+
+  [[nodiscard]] virtual bool was_epoch_changed() const = 0;
+
+  virtual Scrub::preemption_t& get_preemptor() = 0;
+
+  /**
+   * A "technical" collection of the steps performed once all
+   * rep maps are available:
+   * - the maps are compared
+   * - the scrub region markers (start_ & end_) are advanced
+   * - callbacks and ops that were pending are allowed to run
+   */
+  virtual void maps_compare_n_cleanup() = 0;
+
+  /**
+   * Order the PgScrubber to initiate the process of reserving replicas'
+   * scrub resources.
+   */
+  virtual void reserve_replicas() = 0;
+
+  virtual void unreserve_replicas() = 0;
+
+  /**
+   * The FSM interface into the "are we waiting for maps, either our own or
+   * from replicas" state.
+   * The FSM can only:
+   * - mark the local map as available, and
+   * - query status
+   */
+  virtual void mark_local_map_ready() = 0;
+
+  [[nodiscard]] virtual bool are_all_maps_available() const = 0;
+
+  /// a log/debug interface
+  virtual std::string dump_awaited_maps() const = 0;
+};