From: Ronen Friedman Date: Sun, 11 Apr 2021 18:17:41 +0000 (+0300) Subject: osd/scrub: collecting scrub-related files into a separate directory X-Git-Tag: v17.1.0~809^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=97b6fe661c2a4e106d7449b74f806975fdc8f6fc;p=ceph.git osd/scrub: collecting scrub-related files into a separate directory Cleaning src/osd from scrub implementation files. Triggered by: - the matching Crimson scrub structure; - the proliferation of scrub related code files (inc. in coming PRs); scrubber_common.h, which defines the scrubber's interface, remains in src/osd. Signed-off-by: Ronen Friedman --- diff --git a/src/osd/CMakeLists.txt b/src/osd/CMakeLists.txt index 373456fc65d..82a5451804a 100644 --- a/src/osd/CMakeLists.txt +++ b/src/osd/CMakeLists.txt @@ -11,9 +11,6 @@ endif() set(osd_srcs OSD.cc - pg_scrubber.cc - scrub_machine.cc - PrimaryLogScrub.cc Watch.cc ClassHandler.cc PG.cc @@ -24,10 +21,13 @@ set(osd_srcs ECTransaction.cc PGBackend.cc OSDCap.cc + scrubber/pg_scrubber.cc + scrubber/PrimaryLogScrub.cc + scrubber/scrub_machine.cc + scrubber/ScrubStore.cc Watch.cc Session.cc SnapMapper.cc - ScrubStore.cc osd_types.cc ECUtil.cc ExtentCache.cc diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 425e27a3df1..62aa2f767a4 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -35,8 +35,8 @@ #endif #include "osd/PG.h" -#include "osd/scrub_machine.h" -#include "osd/pg_scrubber.h" +#include "osd/scrubber/scrub_machine.h" +#include "osd/scrubber/pg_scrubber.h" #include "include/types.h" #include "include/compat.h" diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 644a6a9be9a..0f992d65283 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -20,8 +20,8 @@ #include "common/config.h" #include "OSD.h" #include "OpRequest.h" -#include "ScrubStore.h" -#include "pg_scrubber.h" +#include "scrubber/ScrubStore.h" +#include "scrubber/pg_scrubber.h" #include "Session.h" #include "osd/scheduler/OpSchedulerItem.h" diff --git 
a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc index 4e8c74c487e..cca28a8941d 100644 --- a/src/osd/PGBackend.cc +++ b/src/osd/PGBackend.cc @@ -19,7 +19,7 @@ #include "common/errno.h" #include "common/scrub_types.h" #include "ReplicatedBackend.h" -#include "ScrubStore.h" +#include "scrubber/ScrubStore.h" #include "ECBackend.h" #include "PGBackend.h" #include "OSD.h" diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 6b1a3e52f45..6899c5ea6b0 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -15,51 +15,41 @@ * */ -#include - -#include -#include -#include +#include "PrimaryLogPG.h" #include -#include - -#include "PG.h" -#include "pg_scrubber.h" -#include "PrimaryLogPG.h" -#include "OSD.h" -#include "PrimaryLogScrub.h" -#include "OpRequest.h" -#include "ScrubStore.h" -#include "Session.h" -#include "objclass/objclass.h" -#include "osd/ClassHandler.h" #include "cls/cas/cls_cas_ops.h" +#include "common/EventTrace.h" #include "common/ceph_crypto.h" +#include "common/CDC.h" #include "common/config.h" #include "common/errno.h" -#include "common/scrub_types.h" -#include "common/perf_counters.h" -#include "common/CDC.h" #include "common/EventTrace.h" - -#include "messages/MOSDOp.h" +#include "common/perf_counters.h" +#include "common/scrub_types.h" +#include "include/compat.h" +#include "messages/MCommandReply.h" #include "messages/MOSDBackoff.h" -#include "messages/MOSDPGTrim.h" -#include "messages/MOSDPGScan.h" -#include "messages/MOSDRepScrub.h" +#include "messages/MOSDOp.h" #include "messages/MOSDPGBackfill.h" #include "messages/MOSDPGBackfillRemove.h" #include "messages/MOSDPGLog.h" +#include "messages/MOSDPGScan.h" +#include "messages/MOSDPGTrim.h" #include "messages/MOSDPGUpdateLogMissing.h" #include "messages/MOSDPGUpdateLogMissingReply.h" -#include "messages/MCommandReply.h" +#include "messages/MOSDRepScrub.h" #include "messages/MOSDScrubReserve.h" - -#include "include/compat.h" #include "mon/MonClient.h" +#include 
"objclass/objclass.h" +#include "osd/ClassHandler.h" +#include "osd/OpRequest.h" +#include "osd/Session.h" #include "osdc/Objecter.h" +#include "scrubber/PrimaryLogScrub.h" + +// required includes order: #include "json_spirit/json_spirit_value.h" #include "json_spirit/json_spirit_reader.h" #include "include/ceph_assert.h" // json_spirit clobbers it diff --git a/src/osd/PrimaryLogScrub.cc b/src/osd/PrimaryLogScrub.cc deleted file mode 100644 index ac4049aea93..00000000000 --- a/src/osd/PrimaryLogScrub.cc +++ /dev/null @@ -1,589 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "PrimaryLogScrub.h" - -#include "common/scrub_types.h" -#include "osd/osd_types_fmt.h" - -#include "PeeringState.h" -#include "PrimaryLogPG.h" -#include "scrub_machine.h" - -#define dout_context (m_pg->get_cct()) -#define dout_subsys ceph_subsys_osd -#undef dout_prefix -#define dout_prefix _prefix(_dout, this->m_pg) - -using std::vector; - -template static ostream& _prefix(std::ostream* _dout, T* t) -{ - return t->gen_prefix(*_dout) << " PrimaryLog scrubber pg(" << t->pg_id << ") "; -} - -using namespace Scrub; -using Scrub::ScrubMachine; - -bool PrimaryLogScrub::get_store_errors(const scrub_ls_arg_t& arg, - scrub_ls_result_t& res_inout) const -{ - if (!m_store) { - return false; - } - - if (arg.get_snapsets) { - res_inout.vals = - m_store->get_snap_errors(m_pg->get_pgid().pool(), arg.start_after, arg.max_return); - } else { - res_inout.vals = m_store->get_object_errors(m_pg->get_pgid().pool(), arg.start_after, - arg.max_return); - } - return true; -} - -void PrimaryLogScrub::_scrub_finish() -{ - auto& info = m_pg->get_pg_info(ScrubberPasskey{}); ///< a temporary alias - - dout(10) << __func__ - << " info stats: " << (info.stats.stats_invalid ? 
"invalid" : "valid") - << dendl; - - if (info.stats.stats_invalid) { - m_pl_pg->recovery_state.update_stats([=](auto& history, auto& stats) { - stats.stats = m_scrub_cstat; - stats.stats_invalid = false; - return false; - }); - - if (m_pl_pg->agent_state) - m_pl_pg->agent_choose_mode(); - } - - dout(10) << m_mode_desc << " got " << m_scrub_cstat.sum.num_objects << "/" - << info.stats.stats.sum.num_objects << " objects, " - << m_scrub_cstat.sum.num_object_clones << "/" - << info.stats.stats.sum.num_object_clones << " clones, " - << m_scrub_cstat.sum.num_objects_dirty << "/" - << info.stats.stats.sum.num_objects_dirty << " dirty, " - << m_scrub_cstat.sum.num_objects_omap << "/" - << info.stats.stats.sum.num_objects_omap << " omap, " - << m_scrub_cstat.sum.num_objects_pinned << "/" - << info.stats.stats.sum.num_objects_pinned << " pinned, " - << m_scrub_cstat.sum.num_objects_hit_set_archive << "/" - << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, " - << m_scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes - << " bytes, " << m_scrub_cstat.sum.num_objects_manifest << "/" - << info.stats.stats.sum.num_objects_manifest << " manifest objects, " - << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/" - << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes." 
- << dendl; - - if (m_scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects || - m_scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones || - (m_scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty && - !info.stats.dirty_stats_invalid) || - (m_scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap && - !info.stats.omap_stats_invalid) || - (m_scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned && - !info.stats.pin_stats_invalid) || - (m_scrub_cstat.sum.num_objects_hit_set_archive != - info.stats.stats.sum.num_objects_hit_set_archive && - !info.stats.hitset_stats_invalid) || - (m_scrub_cstat.sum.num_bytes_hit_set_archive != - info.stats.stats.sum.num_bytes_hit_set_archive && - !info.stats.hitset_bytes_stats_invalid) || - (m_scrub_cstat.sum.num_objects_manifest != - info.stats.stats.sum.num_objects_manifest && - !info.stats.manifest_stats_invalid) || - m_scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts || - m_scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) { - m_osds->clog->error() << info.pgid << " " << m_mode_desc << " : stat mismatch, got " - << m_scrub_cstat.sum.num_objects << "/" - << info.stats.stats.sum.num_objects << " objects, " - << m_scrub_cstat.sum.num_object_clones << "/" - << info.stats.stats.sum.num_object_clones << " clones, " - << m_scrub_cstat.sum.num_objects_dirty << "/" - << info.stats.stats.sum.num_objects_dirty << " dirty, " - << m_scrub_cstat.sum.num_objects_omap << "/" - << info.stats.stats.sum.num_objects_omap << " omap, " - << m_scrub_cstat.sum.num_objects_pinned << "/" - << info.stats.stats.sum.num_objects_pinned << " pinned, " - << m_scrub_cstat.sum.num_objects_hit_set_archive << "/" - << info.stats.stats.sum.num_objects_hit_set_archive - << " hit_set_archive, " << m_scrub_cstat.sum.num_whiteouts - << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, " - << m_scrub_cstat.sum.num_bytes << "/" - << 
info.stats.stats.sum.num_bytes << " bytes, " - << m_scrub_cstat.sum.num_objects_manifest << "/" - << info.stats.stats.sum.num_objects_manifest - << " manifest objects, " - << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/" - << info.stats.stats.sum.num_bytes_hit_set_archive - << " hit_set_archive bytes."; - ++m_shallow_errors; - - if (m_is_repair) { - ++m_fixed_count; - m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) { - stats.stats = m_scrub_cstat; - stats.dirty_stats_invalid = false; - stats.omap_stats_invalid = false; - stats.hitset_stats_invalid = false; - stats.hitset_bytes_stats_invalid = false; - stats.pin_stats_invalid = false; - stats.manifest_stats_invalid = false; - return false; - }); - m_pl_pg->publish_stats_to_osd(); - m_pl_pg->recovery_state.share_pg_info(); - } - } - // Clear object context cache to get repair information - if (m_is_repair) - m_pl_pg->object_contexts.clear(); -} - -static bool doing_clones(const std::optional& snapset, - const vector::reverse_iterator& curclone) -{ - return snapset && curclone != snapset->clones.rend(); -} - -void PrimaryLogScrub::log_missing(int missing, - const std::optional& head, - LogChannelRef clog, - const spg_t& pgid, - const char* func, - bool allow_incomplete_clones) -{ - ceph_assert(head); - if (allow_incomplete_clones) { - dout(20) << func << " " << m_mode_desc << " " << pgid << " " << *head << " skipped " - << missing << " clone(s) in cache tier" << dendl; - } else { - clog->info() << m_mode_desc << " " << pgid << " " << *head << " : " << missing - << " missing clone(s)"; - } -} - -int PrimaryLogScrub::process_clones_to(const std::optional& head, - const std::optional& snapset, - LogChannelRef clog, - const spg_t& pgid, - bool allow_incomplete_clones, - std::optional target, - vector::reverse_iterator* curclone, - inconsistent_snapset_wrapper& e) -{ - ceph_assert(head); - ceph_assert(snapset); - int missing_count = 0; - - // NOTE: clones are in descending order, thus 
**curclone > target test here - hobject_t next_clone(*head); - while (doing_clones(snapset, *curclone) && (!target || **curclone > *target)) { - - ++missing_count; - // it is okay to be missing one or more clones in a cache tier. - // skip higher-numbered clones in the list. - if (!allow_incomplete_clones) { - next_clone.snap = **curclone; - clog->error() << m_mode_desc << " " << pgid << " " << *head << " : expected clone " - << next_clone << " " << m_missing << " missing"; - ++m_shallow_errors; - e.set_clone_missing(next_clone.snap); - } - // Clones are descending - ++(*curclone); - } - return missing_count; -} - -/* - * Validate consistency of the object info and snap sets. - * - * We are sort of comparing 2 lists. The main loop is on objmap.objects. But - * the comparison of the objects is against multiple snapset.clones. There are - * multiple clone lists and in between lists we expect head. - * - * Example - * - * objects expected - * ======= ======= - * obj1 snap 1 head, unexpected obj1 snap 1 - * obj2 head head, match - * [SnapSet clones 6 4 2 1] - * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7 - * obj2 snap 6 obj2 snap 6, match - * obj2 snap 4 obj2 snap 4, match - * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match - * [Snapset clones 3 1] - * obj3 snap 3 obj3 snap 3 match - * obj3 snap 1 obj3 snap 1 match - * obj4 head head, match - * [Snapset clones 4] - * EOL obj4 snap 4, (expected) - */ -void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap, - const missing_map_t& missing_digest) -{ - dout(10) << __func__ << " num stat obj " << m_pl_pg->info.stats.stats.sum.num_objects - << dendl; - - auto& info = m_pl_pg->info; - const PGPool& pool = m_pl_pg->pool; - bool allow_incomplete_clones = pool.info.allow_incomplete_clones(); - - std::optional all_clones; // Unspecified snapid_t or std::nullopt - - // traverse in reverse order. 
- std::optional head; - std::optional snapset; // If initialized so will head (above) - vector::reverse_iterator curclone; // Defined only if snapset initialized - int missing = 0; - inconsistent_snapset_wrapper soid_error, head_error; - int soid_error_count = 0; - - for (auto p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) { - - const hobject_t& soid = p->first; - ceph_assert(!soid.is_snapdir()); - soid_error = inconsistent_snapset_wrapper{soid}; - object_stat_sum_t stat; - std::optional oi; - - stat.num_objects++; - - if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace) - stat.num_objects_hit_set_archive++; - - if (soid.is_snap()) { - // it's a clone - stat.num_object_clones++; - } - - // basic checks. - if (p->second.attrs.count(OI_ATTR) == 0) { - oi = std::nullopt; - m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '" - << OI_ATTR << "' attr"; - ++m_shallow_errors; - soid_error.set_info_missing(); - } else { - bufferlist bv; - bv.push_back(p->second.attrs[OI_ATTR]); - try { - oi = object_info_t(bv); - } catch (ceph::buffer::error& e) { - oi = std::nullopt; - m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid - << " : can't decode '" << OI_ATTR << "' attr " << e.what(); - ++m_shallow_errors; - soid_error.set_info_corrupted(); - soid_error.set_info_missing(); // Not available too - } - } - - if (oi) { - if (m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) != p->second.size) { - m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid - << " : on disk size (" << p->second.size - << ") does not match object info size (" << oi->size - << ") adjusted for ondisk to (" - << m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) << ")"; - soid_error.set_size_mismatch(); - ++m_shallow_errors; - } - - dout(20) << m_mode_desc << " " << soid << " " << *oi << dendl; - - // A clone num_bytes will be added later when we have snapset - if (!soid.is_snap()) { - stat.num_bytes += oi->size; 
- } - if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace) - stat.num_bytes_hit_set_archive += oi->size; - - if (oi->is_dirty()) - ++stat.num_objects_dirty; - if (oi->is_whiteout()) - ++stat.num_whiteouts; - if (oi->is_omap()) - ++stat.num_objects_omap; - if (oi->is_cache_pinned()) - ++stat.num_objects_pinned; - if (oi->has_manifest()) - ++stat.num_objects_manifest; - } - - // Check for any problems while processing clones - if (doing_clones(snapset, curclone)) { - std::optional target; - // Expecting an object with snap for current head - if (soid.has_snapset() || soid.get_head() != head->get_head()) { - - dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid << " new object " << soid - << " while processing " << *head << dendl; - - target = all_clones; - } else { - ceph_assert(soid.is_snap()); - target = soid.snap; - } - - // Log any clones we were expecting to be there up to target - // This will set missing, but will be a no-op if snap.soid == *curclone. - missing += - process_clones_to(head, snapset, m_osds->clog, info.pgid, - allow_incomplete_clones, target, &curclone, head_error); - } - - bool expected; - // Check doing_clones() again in case we ran process_clones_to() - if (doing_clones(snapset, curclone)) { - // A head would have processed all clones above - // or all greater than *curclone. 
- ceph_assert(soid.is_snap() && *curclone <= soid.snap); - - // After processing above clone snap should match the expected curclone - expected = (*curclone == soid.snap); - } else { - // If we aren't doing clones any longer, then expecting head - expected = soid.has_snapset(); - } - if (!expected) { - // If we couldn't read the head's snapset, just ignore clones - if (head && !snapset) { - m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid - << " : clone ignored due to missing snapset"; - } else { - m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid - << " : is an unexpected clone"; - } - ++m_shallow_errors; - soid_error.set_headless(); - m_store->add_snap_error(pool.id, soid_error); - ++soid_error_count; - if (head && soid.get_head() == head->get_head()) - head_error.set_clone(soid.snap); - continue; - } - - // new snapset? - if (soid.has_snapset()) { - - if (missing) { - log_missing(missing, head, m_osds->clog, info.pgid, __func__, - pool.info.allow_incomplete_clones()); - } - - // Save previous head error information - if (head && (head_error.errors || soid_error_count)) - m_store->add_snap_error(pool.id, head_error); - // Set this as a new head object - head = soid; - missing = 0; - head_error = soid_error; - soid_error_count = 0; - - dout(20) << __func__ << " " << m_mode_desc << " new head " << head << dendl; - - if (p->second.attrs.count(SS_ATTR) == 0) { - m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '" - << SS_ATTR << "' attr"; - ++m_shallow_errors; - snapset = std::nullopt; - head_error.set_snapset_missing(); - } else { - bufferlist bl; - bl.push_back(p->second.attrs[SS_ATTR]); - auto blp = bl.cbegin(); - try { - snapset = SnapSet(); // Initialize optional<> before decoding into it - decode(*snapset, blp); - head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]); - } catch (ceph::buffer::error& e) { - snapset = std::nullopt; - m_osds->clog->error() - << m_mode_desc << " " << 
info.pgid << " " << soid << " : can't decode '" << SS_ATTR - << "' attr " << e.what(); - ++m_shallow_errors; - head_error.set_snapset_corrupted(); - } - } - - if (snapset) { - // what will be next? - curclone = snapset->clones.rbegin(); - - if (!snapset->clones.empty()) { - dout(20) << " snapset " << *snapset << dendl; - if (snapset->seq == 0) { - m_osds->clog->error() - << m_mode_desc << " " << info.pgid << " " << soid << " : snaps.seq not set"; - ++m_shallow_errors; - head_error.set_snapset_error(); - } - } - } - } else { - ceph_assert(soid.is_snap()); - ceph_assert(head); - ceph_assert(snapset); - ceph_assert(soid.snap == *curclone); - - dout(20) << __func__ << " " << m_mode_desc << " matched clone " << soid << dendl; - - if (snapset->clone_size.count(soid.snap) == 0) { - m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid - << " : is missing in clone_size"; - ++m_shallow_errors; - soid_error.set_size_mismatch(); - } else { - if (oi && oi->size != snapset->clone_size[soid.snap]) { - m_osds->clog->error() - << m_mode_desc << " " << info.pgid << " " << soid << " : size " << oi->size - << " != clone_size " << snapset->clone_size[*curclone]; - ++m_shallow_errors; - soid_error.set_size_mismatch(); - } - - if (snapset->clone_overlap.count(soid.snap) == 0) { - m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid - << " : is missing in clone_overlap"; - ++m_shallow_errors; - soid_error.set_size_mismatch(); - } else { - // This checking is based on get_clone_bytes(). The first 2 asserts - // can't happen because we know we have a clone_size and - // a clone_overlap. Now we check that the interval_set won't - // cause the last assert. 
- uint64_t size = snapset->clone_size.find(soid.snap)->second; - const interval_set& overlap = - snapset->clone_overlap.find(soid.snap)->second; - bool bad_interval_set = false; - for (interval_set::const_iterator i = overlap.begin(); - i != overlap.end(); ++i) { - if (size < i.get_len()) { - bad_interval_set = true; - break; - } - size -= i.get_len(); - } - - if (bad_interval_set) { - m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid - << " : bad interval_set in clone_overlap"; - ++m_shallow_errors; - soid_error.set_size_mismatch(); - } else { - stat.num_bytes += snapset->get_clone_bytes(soid.snap); - } - } - } - - // what's next? - ++curclone; - if (soid_error.errors) { - m_store->add_snap_error(pool.id, soid_error); - ++soid_error_count; - } - } - m_scrub_cstat.add(stat); - } - - if (doing_clones(snapset, curclone)) { - dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid - << " No more objects while processing " << *head << dendl; - - missing += - process_clones_to(head, snapset, m_osds->clog, info.pgid, - allow_incomplete_clones, all_clones, &curclone, head_error); - } - - // There could be missing found by the test above or even - // before dropping out of the loop for the last head. 
- if (missing) { - log_missing(missing, head, m_osds->clog, info.pgid, __func__, - allow_incomplete_clones); - } - if (head && (head_error.errors || soid_error_count)) - m_store->add_snap_error(pool.id, head_error); - - dout(20) << __func__ << " - " << missing << " (" << missing_digest.size() << ") missing" - << dendl; - for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) { - - ceph_assert(!p->first.is_snapdir()); - dout(10) << __func__ << " recording digests for " << p->first << dendl; - - ObjectContextRef obc = m_pl_pg->get_object_context(p->first, false); - if (!obc) { - m_osds->clog->error() << info.pgid << " " << m_mode_desc - << " cannot get object context for object " << p->first; - continue; - } - if (obc->obs.oi.soid != p->first) { - m_osds->clog->error() << info.pgid << " " << m_mode_desc << " " << p->first - << " : object has a valid oi attr with a mismatched name, " - << " obc->obs.oi.soid: " << obc->obs.oi.soid; - continue; - } - PrimaryLogPG::OpContextUPtr ctx = m_pl_pg->simple_opc_create(obc); - ctx->at_version = m_pl_pg->get_next_version(); - ctx->mtime = utime_t(); // do not update mtime - if (p->second.first) { - ctx->new_obs.oi.set_data_digest(*p->second.first); - } else { - ctx->new_obs.oi.clear_data_digest(); - } - if (p->second.second) { - ctx->new_obs.oi.set_omap_digest(*p->second.second); - } else { - ctx->new_obs.oi.clear_omap_digest(); - } - m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY); - - ++num_digest_updates_pending; - ctx->register_on_success([this]() { - dout(20) << "updating scrub digest " << num_digest_updates_pending << dendl; - if (--num_digest_updates_pending <= 0) { - m_osds->queue_scrub_digest_update(m_pl_pg, m_pl_pg->is_scrub_blocking_ops()); - } - }); - - m_pl_pg->simple_opc_submit(std::move(ctx)); - } - - dout(10) << __func__ << " (" << m_mode_desc << ") finish" << dendl; -} - -PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg) : PgScrubber{pg}, m_pl_pg{pg} {} - -void 
PrimaryLogScrub::_scrub_clear_state() -{ - dout(15) << __func__ << dendl; - m_scrub_cstat = object_stat_collection_t(); -} - -void PrimaryLogScrub::stats_of_handled_objects(const object_stat_sum_t& delta_stats, - const hobject_t& soid) -{ - // We scrub objects in hobject_t order, so objects before m_start have already been - // scrubbed and their stats have already been added to the scrubber. Objects after that - // point haven't been included in the scrubber's stats accounting yet, so they will be - // included when the scrubber gets to that object. - if (is_primary() && is_scrub_active()) { - if (soid < m_start) { - - dout(20) << fmt::format("{} {} < [{},{})", __func__, soid, m_start, m_end) << dendl; - m_scrub_cstat.add(delta_stats); - - } else { - - dout(25) << fmt::format("{} {} >= [{},{})", __func__, soid, m_start, m_end) << dendl; - } - } -} diff --git a/src/osd/PrimaryLogScrub.h b/src/osd/PrimaryLogScrub.h deleted file mode 100644 index 78353d6dbb6..00000000000 --- a/src/osd/PrimaryLogScrub.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#pragma once - -// the './' includes are marked this way to affect clang-format -#include "./pg_scrubber.h" - -#include -#include -#include - -#include "debug.h" - -#include "common/errno.h" -#include "common/scrub_types.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDRepScrub.h" -#include "messages/MOSDRepScrubMap.h" -#include "messages/MOSDScrub.h" -#include "messages/MOSDScrubReserve.h" - -#include "OSD.h" -#include "scrub_machine.h" - -class PrimaryLogPG; - -/** - * The derivative of PgScrubber that is used by PrimaryLogPG. 
- */ -class PrimaryLogScrub : public PgScrubber { - public: - explicit PrimaryLogScrub(PrimaryLogPG* pg); - - void _scrub_finish() final; - - bool get_store_errors(const scrub_ls_arg_t& arg, - scrub_ls_result_t& res_inout) const final; - - void stats_of_handled_objects(const object_stat_sum_t& delta_stats, - const hobject_t& soid) final; - - private: - // we know our PG is actually a PrimaryLogPG. Let's alias the pointer to that object: - PrimaryLogPG* const m_pl_pg; - - /** - * Validate consistency of the object info and snap sets. - */ - void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) final; - - void log_missing(int missing, - const std::optional& head, - LogChannelRef clog, - const spg_t& pgid, - const char* func, - bool allow_incomplete_clones); - - int process_clones_to(const std::optional& head, - const std::optional& snapset, - LogChannelRef clog, - const spg_t& pgid, - bool allow_incomplete_clones, - std::optional target, - std::vector::reverse_iterator* curclone, - inconsistent_snapset_wrapper& snap_error); - - - // handle our part in stats collection - object_stat_collection_t m_scrub_cstat; - void _scrub_clear_state() final; // which just clears the stats -}; diff --git a/src/osd/ScrubStore.cc b/src/osd/ScrubStore.cc deleted file mode 100644 index a692a44353f..00000000000 --- a/src/osd/ScrubStore.cc +++ /dev/null @@ -1,198 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "ScrubStore.h" -#include "osd_types.h" -#include "common/scrub_types.h" -#include "include/rados/rados_types.hpp" - -using std::ostringstream; -using std::string; -using std::vector; - -using ceph::bufferlist; - -namespace { -ghobject_t make_scrub_object(const spg_t& pgid) -{ - ostringstream ss; - ss << "scrub_" << pgid; - return pgid.make_temp_ghobject(ss.str()); -} - -string first_object_key(int64_t pool) -{ - auto hoid = hobject_t(object_t(), - "", - 0, - 0x00000000, - pool, - ""); 
- hoid.build_hash_cache(); - return "SCRUB_OBJ_" + hoid.to_str(); -} - -// the object_key should be unique across pools -string to_object_key(int64_t pool, const librados::object_id_t& oid) -{ - auto hoid = hobject_t(object_t(oid.name), - oid.locator, // key - oid.snap, - 0, // hash - pool, - oid.nspace); - hoid.build_hash_cache(); - return "SCRUB_OBJ_" + hoid.to_str(); -} - -string last_object_key(int64_t pool) -{ - auto hoid = hobject_t(object_t(), - "", - 0, - 0xffffffff, - pool, - ""); - hoid.build_hash_cache(); - return "SCRUB_OBJ_" + hoid.to_str(); -} - -string first_snap_key(int64_t pool) -{ - // scrub object is per spg_t object, so we can misuse the hash (pg.seed) for - // the representing the minimal and maximum keys. and this relies on how - // hobject_t::to_str() works: hex(pool).hex(revhash). - auto hoid = hobject_t(object_t(), - "", - 0, - 0x00000000, - pool, - ""); - hoid.build_hash_cache(); - return "SCRUB_SS_" + hoid.to_str(); -} - -string to_snap_key(int64_t pool, const librados::object_id_t& oid) -{ - auto hoid = hobject_t(object_t(oid.name), - oid.locator, // key - oid.snap, - 0x77777777, // hash - pool, - oid.nspace); - hoid.build_hash_cache(); - return "SCRUB_SS_" + hoid.to_str(); -} - -string last_snap_key(int64_t pool) -{ - auto hoid = hobject_t(object_t(), - "", - 0, - 0xffffffff, - pool, - ""); - hoid.build_hash_cache(); - return "SCRUB_SS_" + hoid.to_str(); -} -} - -namespace Scrub { - -Store* -Store::create(ObjectStore* store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll) -{ - ceph_assert(store); - ceph_assert(t); - ghobject_t oid = make_scrub_object(pgid); - t->touch(coll, oid); - return new Store{coll, oid, store}; -} - -Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store) - : coll(coll), - hoid(oid), - driver(store, coll, hoid), - backend(&driver) -{} - -Store::~Store() -{ - ceph_assert(results.empty()); -} - -void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& 
e) -{ - bufferlist bl; - e.encode(bl); - results[to_object_key(pool, e.object)] = bl; -} - -void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) -{ - bufferlist bl; - e.encode(bl); - results[to_snap_key(pool, e.object)] = bl; -} - -bool Store::empty() const -{ - return results.empty(); -} - -void Store::flush(ObjectStore::Transaction* t) -{ - if (t) { - OSDriver::OSTransaction txn = driver.get_transaction(t); - backend.set_keys(results, &txn); - } - results.clear(); -} - -void Store::cleanup(ObjectStore::Transaction* t) -{ - t->remove(coll, hoid); -} - -std::vector -Store::get_snap_errors(int64_t pool, - const librados::object_id_t& start, - uint64_t max_return) const -{ - const string begin = (start.name.empty() ? - first_snap_key(pool) : to_snap_key(pool, start)); - const string end = last_snap_key(pool); - return get_errors(begin, end, max_return); -} - -std::vector -Store::get_object_errors(int64_t pool, - const librados::object_id_t& start, - uint64_t max_return) const -{ - const string begin = (start.name.empty() ? 
- first_object_key(pool) : to_object_key(pool, start)); - const string end = last_object_key(pool); - return get_errors(begin, end, max_return); -} - -std::vector -Store::get_errors(const string& begin, - const string& end, - uint64_t max_return) const -{ - vector errors; - auto next = std::make_pair(begin, bufferlist{}); - while (max_return && !backend.get_next(next.first, &next)) { - if (next.first >= end) - break; - errors.push_back(next.second); - max_return--; - } - return errors; -} - -} // namespace Scrub diff --git a/src/osd/ScrubStore.h b/src/osd/ScrubStore.h deleted file mode 100644 index 721aae09291..00000000000 --- a/src/osd/ScrubStore.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_SCRUB_RESULT_H -#define CEPH_SCRUB_RESULT_H - -#include "SnapMapper.h" // for OSDriver -#include "common/map_cacher.hpp" - -namespace librados { - struct object_id_t; -} - -struct inconsistent_obj_wrapper; -struct inconsistent_snapset_wrapper; - -namespace Scrub { - -class Store { -public: - ~Store(); - static Store* create(ObjectStore* store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll); - void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e); - void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e); - bool empty() const; - void flush(ObjectStore::Transaction *); - void cleanup(ObjectStore::Transaction *); - std::vector get_snap_errors(int64_t pool, - const librados::object_id_t& start, - uint64_t max_return) const; - std::vector get_object_errors(int64_t pool, - const librados::object_id_t& start, - uint64_t max_return) const; -private: - Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store); - std::vector get_errors(const std::string& start, const std::string& end, - uint64_t max_return) const; -private: - const coll_t coll; - const ghobject_t hoid; - // a temp object holding mappings from 
seq-id to inconsistencies found in - // scrubbing - OSDriver driver; - mutable MapCacher::MapCacher backend; - std::map results; -}; -} - -#endif // CEPH_SCRUB_RESULT_H diff --git a/src/osd/pg_scrubber.cc b/src/osd/pg_scrubber.cc deleted file mode 100644 index 12f07ca4e86..00000000000 --- a/src/osd/pg_scrubber.cc +++ /dev/null @@ -1,2392 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=2 sw=2 smarttab - -#include "./pg_scrubber.h" // the '.' notation used to affect clang-format order - -#include -#include - -#include "debug.h" - -#include "common/errno.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDRepScrub.h" -#include "messages/MOSDRepScrubMap.h" -#include "messages/MOSDScrub.h" -#include "messages/MOSDScrubReserve.h" - -#include "OSD.h" -#include "ScrubStore.h" -#include "scrub_machine.h" - -using std::list; -using std::map; -using std::pair; -using std::set; -using std::stringstream; -using std::vector; -using namespace Scrub; -using namespace std::chrono; -using namespace std::chrono_literals; -using namespace std::literals; - -#define dout_context (m_pg->get_cct()) -#define dout_subsys ceph_subsys_osd -#undef dout_prefix -#define dout_prefix _prefix(_dout, this->m_pg) - -template static ostream& _prefix(std::ostream* _dout, T* t) -{ - return t->gen_prefix(*_dout) << " scrubber pg(" << t->pg_id << ") "; -} - -ostream& operator<<(ostream& out, const scrub_flags_t& sf) -{ - if (sf.auto_repair) - out << " AUTO_REPAIR"; - if (sf.check_repair) - out << " CHECK_REPAIR"; - if (sf.deep_scrub_on_error) - out << " DEEP_SCRUB_ON_ERROR"; - if (sf.required) - out << " REQ_SCRUB"; - - return out; -} - -ostream& operator<<(ostream& out, const requested_scrub_t& sf) -{ - if (sf.must_repair) - out << " MUST_REPAIR"; - if (sf.auto_repair) - out << " planned AUTO_REPAIR"; - if (sf.check_repair) - out << " planned CHECK_REPAIR"; - if (sf.deep_scrub_on_error) - out << " planned DEEP_SCRUB_ON_ERROR"; - if (sf.must_deep_scrub) - 
out << " MUST_DEEP_SCRUB"; - if (sf.must_scrub) - out << " MUST_SCRUB"; - if (sf.time_for_deep) - out << " TIME_FOR_DEEP"; - if (sf.need_auto) - out << " NEED_AUTO"; - if (sf.req_scrub) - out << " planned REQ_SCRUB"; - - return out; -} - -/* - * if the incoming message is from a previous interval, it must mean - * PrimaryLogPG::on_change() was called when that interval ended. We can safely discard - * the stale message. - */ -bool PgScrubber::check_interval(epoch_t epoch_to_verify) -{ - return epoch_to_verify >= m_pg->get_same_interval_since(); -} - -bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify) -{ - if (!m_active) { - // not scrubbing. We can assume that the scrub was already terminated, and we - // can silently discard the incoming event. - return false; - } - - // is this a message from before we started this scrub? - if (epoch_to_verify < m_epoch_start) { - return false; - } - - // has a new interval started? - if (!check_interval(epoch_to_verify)) { - // if this is a new interval, on_change() has already terminated that - // old scrub. - return false; - } - - ceph_assert(is_primary()); - - // were we instructed to abort? - return verify_against_abort(epoch_to_verify); -} - -bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify) -{ - if (!should_abort()) { - return true; - } - - dout(10) << __func__ << " aborting. incoming epoch: " << epoch_to_verify - << " vs last-aborted: " << m_last_aborted << dendl; - - // if we were not aware of the abort before - kill the scrub. 
- if (epoch_to_verify > m_last_aborted) { - scrub_clear_state(); - m_last_aborted = std::max(epoch_to_verify, m_epoch_start); - } - return false; -} - -bool PgScrubber::should_abort() const -{ - if (m_flags.required) { - return false; // not stopping 'required' scrubs for configuration changes - } - - if (m_is_deep) { - if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) || - m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) { - dout(10) << "nodeep_scrub set, aborting" << dendl; - return true; - } - } - - if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || - m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) { - dout(10) << "noscrub set, aborting" << dendl; - return true; - } - - return false; -} - -// initiating state-machine events -------------------------------- - -/* - * a note re the checks performed before sending scrub-initiating messages: - * - * For those ('StartScrub', 'AfterRepairScrub') scrub-initiation messages that - * possibly were in the queue while the PG changed state and became unavailable for - * scrubbing: - * - * The check_interval() catches all major changes to the PG. As for the other conditions - * we may check (and see is_message_relevant() above): - * - * - we are not 'active' yet, so must not check against is_active(), and: - * - * - the 'abort' flags were just verified (when the triggering message was queued). As - * those are only modified in human speeds - they need not be queried again. - * - * Some of the considerations above are also relevant to the replica-side initiation - * ('StartReplica' & 'StartReplicaNoWait'). 
- */ - -void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued) -{ - dout(15) << __func__ << " epoch: " << epoch_queued << dendl; - // we may have lost our Primary status while the message languished in the queue - if (check_interval(epoch_queued)) { - dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued << dendl; - reset_epoch(epoch_queued); - m_fsm->my_states(); - m_fsm->process_event(StartScrub{}); - dout(10) << "scrubber event --<< StartScrub" << dendl; - } -} - -void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued) -{ - dout(15) << __func__ << " epoch: " << epoch_queued << dendl; - // we may have lost our Primary status while the message languished in the queue - if (check_interval(epoch_queued)) { - dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued << dendl; - reset_epoch(epoch_queued); - m_fsm->my_states(); - m_fsm->process_event(AfterRepairScrub{}); - dout(10) << "scrubber event --<< AfterRepairScrub" << dendl; - } -} - -void PgScrubber::send_scrub_unblock(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - if (is_message_relevant(epoch_queued)) { - m_fsm->my_states(); - m_fsm->process_event(Unblocked{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_scrub_resched(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - if (is_message_relevant(epoch_queued)) { - m_fsm->my_states(); - m_fsm->process_event(InternalSchedScrub{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued - << " token: " << token << dendl; - if (is_primary()) { - // shouldn't happen. Ignore - dout(1) << "got a replica scrub request while Primary!" 
<< dendl; - return; - } - - if (check_interval(epoch_queued) && is_token_current(token)) { - m_fsm->my_states(); - // save us some time by not waiting for updates if there are none - // to wait for. Affects the transition from NotActive into either - // ReplicaWaitUpdates or ActiveReplica. - if (pending_active_pushes()) - m_fsm->process_event(StartReplica{}); - else - m_fsm->process_event(StartReplicaNoWait{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued - << " token: " << token << dendl; - if (check_interval(epoch_queued) && is_token_current(token)) { - m_fsm->my_states(); - m_fsm->process_event(SchedReplica{}); // retest for map availability - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::active_pushes_notification(epoch_t epoch_queued) -{ - // note: Primary only - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - if (is_message_relevant(epoch_queued)) { - m_fsm->my_states(); - m_fsm->process_event(ActivePushesUpd{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::update_applied_notification(epoch_t epoch_queued) -{ - // note: Primary only - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - if (is_message_relevant(epoch_queued)) { - m_fsm->my_states(); - m_fsm->process_event(UpdatesApplied{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::digest_update_notification(epoch_t epoch_queued) -{ - // note: Primary only - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - if (is_message_relevant(epoch_queued)) { - m_fsm->my_states(); - m_fsm->process_event(DigestUpdate{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void 
PgScrubber::send_local_map_done(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - if (is_message_relevant(epoch_queued)) { - m_fsm->my_states(); - m_fsm->process_event(Scrub::IntLocalMapDone{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - if (is_message_relevant(epoch_queued)) { - m_fsm->my_states(); - m_fsm->process_event(GotReplicas{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - if (check_interval(epoch_queued)) { - m_fsm->my_states(); - m_fsm->process_event(ReplicaPushesUpd{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_remotes_reserved(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - // note: scrub is not active yet - if (check_interval(epoch_queued)) { - m_fsm->my_states(); - m_fsm->process_event(RemotesReserved{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_reservation_failure(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - if (check_interval(epoch_queued)) { // do not check for 'active'! 
- m_fsm->my_states(); - m_fsm->process_event(ReservationFailure{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_full_reset(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - - m_fsm->my_states(); - m_fsm->process_event(Scrub::FullReset{}); - - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_chunk_free(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - if (check_interval(epoch_queued)) { - m_fsm->my_states(); - m_fsm->process_event(Scrub::SelectedChunkFree{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_chunk_busy(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - if (check_interval(epoch_queued)) { - m_fsm->my_states(); - m_fsm->process_event(Scrub::ChunkIsBusy{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_get_next_chunk(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - if (is_message_relevant(epoch_queued)) { - m_fsm->my_states(); - m_fsm->process_event(Scrub::NextChunk{}); - } - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_scrub_is_finished(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - - // can't check for "active" - - m_fsm->my_states(); - m_fsm->process_event(Scrub::ScrubFinished{}); - - dout(10) << "scrubber event --<< " << __func__ << dendl; -} - -void PgScrubber::send_maps_compared(epoch_t epoch_queued) -{ - dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; - - m_fsm->my_states(); - m_fsm->process_event(Scrub::MapsCompared{}); - - dout(10) << "scrubber event --<< " << __func__ << 
dendl; -} - -// ----------------- - -bool PgScrubber::is_reserving() const -{ - return m_fsm->is_reserving(); -} - -void PgScrubber::reset_epoch(epoch_t epoch_queued) -{ - dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl; - m_fsm->assert_not_active(); - - m_epoch_start = epoch_queued; - m_needs_sleep = true; - m_is_deep = state_test(PG_STATE_DEEP_SCRUB); - update_op_mode_text(); -} - -unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const -{ - unsigned int qu_priority = m_flags.priority; - - if (with_priority == Scrub::scrub_prio_t::high_priority) { - qu_priority = - std::max(qu_priority, (unsigned int)m_pg->get_cct()->_conf->osd_client_op_priority); - } - return qu_priority; -} - -unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, - unsigned int suggested_priority) const -{ - if (with_priority == Scrub::scrub_prio_t::high_priority) { - suggested_priority = std::max(suggested_priority, - (unsigned int)m_pg->cct->_conf->osd_client_op_priority); - } - return suggested_priority; -} - -// ///////////////////////////////////////////////////////////////////// // -// scrub-op registration handling - -bool PgScrubber::is_scrub_registered() const -{ - return !m_scrub_reg_stamp.is_zero(); -} - -void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags) -{ - if (!is_primary()) { - // normal. No warning is required. - return; - } - - dout(10) << __func__ << " planned: must? " << request_flags.must_scrub << " need-auto? 
" - << request_flags.need_auto << " stamp: " << m_pg->info.history.last_scrub_stamp - << dendl; - - ceph_assert(!is_scrub_registered()); - - utime_t reg_stamp; - bool must = false; - - if (request_flags.must_scrub || request_flags.need_auto) { - // Set the smallest time that isn't utime_t() - reg_stamp = PgScrubber::scrub_must_stamp(); - must = true; - } else if (m_pg->info.stats.stats_invalid && - m_pg->cct->_conf->osd_scrub_invalid_stats) { - reg_stamp = ceph_clock_now(); - must = true; - } else { - reg_stamp = m_pg->info.history.last_scrub_stamp; - } - - dout(15) << __func__ << " pg(" << m_pg_id << ") must: " << must - << " required:" << m_flags.required << " flags: " << request_flags - << " stamp: " << reg_stamp << dendl; - - const double scrub_min_interval = - m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MIN_INTERVAL, 0.0); - const double scrub_max_interval = - m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0); - - // note the sched_time, so we can locate this scrub, and remove it later - m_scrub_reg_stamp = m_osds->reg_pg_scrub(m_pg->info.pgid, reg_stamp, scrub_min_interval, - scrub_max_interval, must); - dout(15) << __func__ << " pg(" << m_pg_id << ") register next scrub, scrub time " - << m_scrub_reg_stamp << ", must = " << (int)must << dendl; -} - -void PgScrubber::unreg_next_scrub() -{ - if (is_scrub_registered()) { - dout(15) << __func__ << " existing-" << m_scrub_reg_stamp << dendl; - m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp); - m_scrub_reg_stamp = utime_t{}; - } -} - -void PgScrubber::scrub_requested(scrub_level_t scrub_level, - scrub_type_t scrub_type, - requested_scrub_t& req_flags) -{ - dout(10) << __func__ << (scrub_level == scrub_level_t::deep ? " deep " : " shallow ") - << (scrub_type == scrub_type_t::do_repair ? 
" repair-scrub " : " not-repair ") - << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered() - << dendl; - - unreg_next_scrub(); - - req_flags.must_scrub = true; - req_flags.must_deep_scrub = - (scrub_level == scrub_level_t::deep) || (scrub_type == scrub_type_t::do_repair); - req_flags.must_repair = (scrub_type == scrub_type_t::do_repair); - // User might intervene, so clear this - req_flags.need_auto = false; - req_flags.req_scrub = true; - - dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl; - - reg_next_scrub(req_flags); -} - -void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags) -{ - dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? " - << is_scrub_registered() << dendl; - - unreg_next_scrub(); - req_flags.need_auto = true; - reg_next_scrub(req_flags); -} - -bool PgScrubber::reserve_local() -{ - // try to create the reservation object (which translates into asking the - // OSD for the local scrub resource). If failing - undo it immediately - - m_local_osd_resource.emplace(m_pg, m_osds); - if (!m_local_osd_resource->is_reserved()) { - m_local_osd_resource.reset(); - return false; - } - - return true; -} - -// ---------------------------------------------------------------------------- - -bool PgScrubber::has_pg_marked_new_updates() const -{ - auto last_applied = m_pg->recovery_state.get_last_update_applied(); - dout(10) << __func__ << " recovery last: " << last_applied - << " vs. 
scrub's: " << m_subset_last_update << dendl; - - return last_applied >= m_subset_last_update; -} - -void PgScrubber::set_subset_last_update(eversion_t e) -{ - m_subset_last_update = e; - dout(15) << __func__ << " last-update: " << e << dendl; -} - -void PgScrubber::on_applied_when_primary(const eversion_t& applied_version) -{ - // we are only interested in updates if we are the Primary, and in state - // WaitLastUpdate - if (m_fsm->is_accepting_updates() && (applied_version >= m_subset_last_update)) { - m_osds->queue_scrub_applied_update(m_pg, m_pg->is_scrub_blocking_ops()); - dout(15) << __func__ << " update: " << applied_version - << " vs. required: " << m_subset_last_update << dendl; - } -} - -/* - * The selected range is set directly into 'm_start' and 'm_end' - * setting: - * - m_subset_last_update - * - m_max_end - * - end - * - start - */ -bool PgScrubber::select_range() -{ - m_primary_scrubmap = ScrubMap{}; - m_received_maps.clear(); - - /* get the start and end of our scrub chunk - * - * Our scrub chunk has an important restriction we're going to need to - * respect. We can't let head be start or end. - * Using a half-open interval means that if end == head, - * we'd scrub/lock head and the clone right next to head in different - * chunks which would allow us to miss clones created between - * scrubbing that chunk and scrubbing the chunk including head. - * This isn't true for any of the other clones since clones can - * only be created "just to the left of" head. There is one exception - * to this: promotion of clones which always happens to the left of the - * left-most clone, but promote_object checks the scrubber in that - * case, so it should be ok. Also, it's ok to "miss" clones at the - * left end of the range if we are a tier because they may legitimately - * not exist (see _scrub). 
- */ - int min_idx = std::max( - 3, m_pg->get_cct()->_conf->osd_scrub_chunk_min / preemption_data.chunk_divisor()); - - int max_idx = std::max(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max / - preemption_data.chunk_divisor()); - - dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx - << " Div: " << preemption_data.chunk_divisor() << dendl; - - hobject_t start = m_start; - hobject_t candidate_end; - std::vector objects; - int ret = m_pg->get_pgbackend()->objects_list_partial(start, min_idx, max_idx, &objects, - &candidate_end); - ceph_assert(ret >= 0); - - if (!objects.empty()) { - - hobject_t back = objects.back(); - while (candidate_end.is_head() && candidate_end == back.get_head()) { - candidate_end = back; - objects.pop_back(); - if (objects.empty()) { - ceph_assert(0 == - "Somehow we got more than 2 objects which" - "have the same head but are not clones"); - } - back = objects.back(); - } - - if (candidate_end.is_head()) { - ceph_assert(candidate_end != back.get_head()); - candidate_end = candidate_end.get_object_boundary(); - } - - } else { - ceph_assert(candidate_end.is_max()); - } - - // is that range free for us? 
if not - we will be rescheduled later by whoever - // triggered us this time - - if (!m_pg->_range_available_for_scrub(m_start, candidate_end)) { - // we'll be requeued by whatever made us unavailable for scrub - dout(10) << __func__ << ": scrub blocked somewhere in range " - << "[" << m_start << ", " << candidate_end << ")" << dendl; - return false; - } - - m_end = candidate_end; - if (m_end > m_max_end) - m_max_end = m_end; - - dout(15) << __func__ << " range selected: " << m_start << " //// " << m_end << " //// " - << m_max_end << dendl; - - // debug: be 'blocked' if told so by the 'pg scrub_debug block' asok command - if (m_debug_blockrange > 0) { - m_debug_blockrange--; - return false; - } - return true; -} - -void PgScrubber::select_range_n_notify() -{ - if (select_range()) { - // the next chunk to handle is not blocked - dout(20) << __func__ << ": selection OK" << dendl; - m_osds->queue_scrub_chunk_free(m_pg, Scrub::scrub_prio_t::low_priority); - - } else { - // we will wait for the objects range to become available for scrubbing - dout(10) << __func__ << ": selected chunk is busy" << dendl; - m_osds->queue_scrub_chunk_busy(m_pg, Scrub::scrub_prio_t::low_priority); - } -} - -bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid) -{ - if (soid < m_start || soid >= m_end) { - return false; - } - - dout(20) << __func__ << " " << soid << " can preempt? " - << preemption_data.is_preemptable() << " already preempted? " - << preemption_data.was_preempted() << dendl; - - if (preemption_data.was_preempted()) { - // otherwise - write requests arriving while 'already preempted' is set - // but 'preemptable' is not - will not be allowed to continue, and will - // not be requeued on time. 
- return false; - } - - if (preemption_data.is_preemptable()) { - - dout(10) << __func__ << " " << soid << " preempted" << dendl; - - // signal the preemption - preemption_data.do_preempt(); - m_end = m_start; // free the range we were scrubbing - - return false; - } - return true; -} - -bool PgScrubber::range_intersects_scrub(const hobject_t& start, const hobject_t& end) -{ - // does [start, end] intersect [scrubber.start, scrubber.m_max_end) - return (start < m_max_end && end >= m_start); -} - -Scrub::BlockedRangeWarning PgScrubber::acquire_blocked_alarm() -{ - return std::make_unique(m_osds, ceph::timespan{300s}, m_pg_id); -} - -/** - * if we are required to sleep: - * arrange a callback sometimes later. - * be sure to be able to identify a stale callback. - * Otherwise: perform a requeue (i.e. - rescheduling thru the OSD queue) - * anyway. - */ -void PgScrubber::add_delayed_scheduling() -{ - m_end = m_start; // not blocking any range now - - milliseconds sleep_time{0ms}; - if (m_needs_sleep) { - double scrub_sleep = 1000.0 * m_osds->osd->scrub_sleep_time(m_flags.required); - sleep_time = milliseconds{long(scrub_sleep)}; - } - dout(15) << __func__ << " sleep: " << sleep_time.count() << "ms. needed? " - << m_needs_sleep << dendl; - - if (sleep_time.count()) { - // schedule a transition for some 'sleep_time' ms in the future - - m_needs_sleep = false; - m_sleep_started_at = ceph_clock_now(); - - // the following log line is used by osd-scrub-test.sh - dout(20) << __func__ << " scrub state is PendingTimer, sleeping" << dendl; - - // the 'delayer' for crimson is different. Will be factored out. 
- - spg_t pgid = m_pg->get_pgid(); - auto callbk = new LambdaContext([osds = m_osds, pgid, - scrbr = this]([[maybe_unused]] int r) mutable { - PGRef pg = osds->osd->lookup_lock_pg(pgid); - if (!pg) { - lgeneric_subdout(g_ceph_context, osd, 10) - << "scrub_requeue_callback: Could not find " - << "PG " << pgid << " can't complete scrub requeue after sleep" << dendl; - return; - } - scrbr->m_needs_sleep = true; - lgeneric_dout(scrbr->get_pg_cct(), 7) - << "scrub_requeue_callback: slept for " - << ceph_clock_now() - scrbr->m_sleep_started_at << ", re-queuing scrub" << dendl; - - scrbr->m_sleep_started_at = utime_t{}; - osds->queue_for_scrub_resched(&(*pg), Scrub::scrub_prio_t::low_priority); - pg->unlock(); - }); - - std::lock_guard l(m_osds->sleep_lock); - m_osds->sleep_timer.add_event_after(sleep_time.count() / 1000.0f, callbk); - - } else { - // just a requeue - m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority); - } -} - -eversion_t PgScrubber::search_log_for_updates() const -{ - auto& projected = m_pg->projected_log.log; - auto pi = find_if( - projected.crbegin(), projected.crend(), - [this](const auto& e) -> bool { return e.soid >= m_start && e.soid < m_end; }); - - if (pi != projected.crend()) - return pi->version; - - // there was no relevant update entry in the log - - auto& log = m_pg->recovery_state.get_pg_log().get_log().log; - auto p = find_if(log.crbegin(), log.crend(), [this](const auto& e) -> bool { - return e.soid >= m_start && e.soid < m_end; - }); - - if (p == log.crend()) - return eversion_t{}; - else - return p->version; -} - -void PgScrubber::get_replicas_maps(bool replica_can_preempt) -{ - dout(10) << __func__ << " started in epoch/interval: " << m_epoch_start << "/" - << m_interval_start - << " pg same_interval_since: " << m_pg->info.history.same_interval_since - << dendl; - - m_primary_scrubmap_pos.reset(); - - // ask replicas to scan and send maps - for (const auto& i : m_pg->get_acting_recovery_backfill()) { - - if (i 
== m_pg_whoami) - continue; - - m_maps_status.mark_replica_map_request(i); - _request_scrub_map(i, m_subset_last_update, m_start, m_end, m_is_deep, - replica_can_preempt); - } - - dout(10) << __func__ << " awaiting" << m_maps_status << dendl; -} - -bool PgScrubber::was_epoch_changed() const -{ - // for crimson we have m_pg->get_info().history.same_interval_since - dout(10) << __func__ << " epoch_start: " << m_interval_start - << " from pg: " << m_pg->get_history().same_interval_since << dendl; - - return m_interval_start < m_pg->get_history().same_interval_since; -} - -void PgScrubber::mark_local_map_ready() -{ - m_maps_status.mark_local_map_ready(); -} - -bool PgScrubber::are_all_maps_available() const -{ - return m_maps_status.are_all_maps_available(); -} - -std::string PgScrubber::dump_awaited_maps() const -{ - return m_maps_status.dump(); -} - -void PgScrubber::update_op_mode_text() -{ - auto visible_repair = state_test(PG_STATE_REPAIR); - m_mode_desc = (visible_repair ? "repair" : (m_is_deep ? "deep-scrub" : "scrub")); - - dout(10) << __func__ << ": repair: visible: " << (visible_repair ? "true" : "false") - << ", internal: " << (m_is_repair ? "true" : "false") - << ". Displayed: " << m_mode_desc << dendl; -} - -void PgScrubber::_request_scrub_map(pg_shard_t replica, - eversion_t version, - hobject_t start, - hobject_t end, - bool deep, - bool allow_preemption) -{ - ceph_assert(replica != m_pg_whoami); - dout(10) << __func__ << " scrubmap from osd." << replica - << (deep ? " deep" : " shallow") << dendl; - - auto repscrubop = - new MOSDRepScrub(spg_t(m_pg->info.pgid.pgid, replica.shard), version, - get_osdmap_epoch(), m_pg->get_last_peering_reset(), start, end, deep, - allow_preemption, m_flags.priority, m_pg->ops_blocked_by_scrub()); - - // default priority. We want the replica-scrub processed prior to any recovery - // or client io messages (we are holding a lock!) 
- m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch()); -} - -void PgScrubber::cleanup_store(ObjectStore::Transaction* t) -{ - if (!m_store) - return; - - struct OnComplete : Context { - std::unique_ptr store; - explicit OnComplete(std::unique_ptr&& store) : store(std::move(store)) - {} - void finish(int) override {} - }; - m_store->cleanup(t); - t->register_on_complete(new OnComplete(std::move(m_store))); - ceph_assert(!m_store); -} - -void PgScrubber::on_init() -{ - // going upwards from 'inactive' - ceph_assert(!is_scrub_active()); - - preemption_data.reset(); - m_pg->publish_stats_to_osd(); - m_interval_start = m_pg->get_history().same_interval_since; - - dout(10) << __func__ << " start same_interval:" << m_interval_start << dendl; - - // create a new store - { - ObjectStore::Transaction t; - cleanup_store(&t); - m_store.reset( - Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll)); - m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); - } - - m_start = m_pg->info.pgid.pgid.get_hobj_start(); - m_active = true; -} - -void PgScrubber::on_replica_init() -{ - m_active = true; -} - -void PgScrubber::_scan_snaps(ScrubMap& smap) -{ - hobject_t head; - SnapSet snapset; - - // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings - // in this function - dout(15) << "_scan_snaps starts" << dendl; - - for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) { - - const hobject_t& hoid = i->first; - ScrubMap::object& o = i->second; - - dout(20) << __func__ << " " << hoid << dendl; - - ceph_assert(!hoid.is_snapdir()); - if (hoid.is_head()) { - // parse the SnapSet - bufferlist bl; - if (o.attrs.find(SS_ATTR) == o.attrs.end()) { - continue; - } - bl.push_back(o.attrs[SS_ATTR]); - auto p = bl.cbegin(); - try { - decode(snapset, p); - } catch (...) 
{ - continue; - } - head = hoid.get_head(); - continue; - } - - if (hoid.snap < CEPH_MAXSNAP) { - // check and if necessary fix snap_mapper - if (hoid.get_head() != head) { - derr << __func__ << " no head for " << hoid << " (have " << head << ")" << dendl; - continue; - } - set obj_snaps; - auto p = snapset.clone_snaps.find(hoid.snap); - if (p == snapset.clone_snaps.end()) { - derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset << dendl; - continue; - } - obj_snaps.insert(p->second.begin(), p->second.end()); - set cur_snaps; - int r = m_pg->snap_mapper.get_snaps(hoid, &cur_snaps); - if (r != 0 && r != -ENOENT) { - derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl; - ceph_abort(); - } - if (r == -ENOENT || cur_snaps != obj_snaps) { - ObjectStore::Transaction t; - OSDriver::OSTransaction _t(m_pg->osdriver.get_transaction(&t)); - if (r == 0) { - r = m_pg->snap_mapper.remove_oid(hoid, &_t); - if (r != 0) { - derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl; - ceph_abort(); - } - m_pg->osd->clog->error() - << "osd." << m_pg->osd->whoami << " found snap mapper error on pg " - << m_pg->info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps - << ", oi: " << obj_snaps << "...repaired"; - } else { - m_pg->osd->clog->error() - << "osd." << m_pg->osd->whoami << " found snap mapper error on pg " - << m_pg->info.pgid << " oid " << hoid << " snaps missing in mapper" - << ", should be: " << obj_snaps << " was " << cur_snaps << " r " << r - << "...repaired"; - } - m_pg->snap_mapper.add_oid(hoid, obj_snaps, &_t); - - // wait for repair to apply to avoid confusing other bits of the system. - { - dout(15) << __func__ << " wait on repair!" 
<< dendl; - - ceph::condition_variable my_cond; - ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock"); - int e = 0; - bool done; - - t.register_on_applied_sync(new C_SafeCond(my_lock, my_cond, &done, &e)); - - e = m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t)); - if (e != 0) { - derr << __func__ << ": queue_transaction got " << cpp_strerror(e) << dendl; - } else { - std::unique_lock l{my_lock}; - my_cond.wait(l, [&done] { return done; }); - } - } - } - } - } -} - -int PgScrubber::build_primary_map_chunk() -{ - epoch_t map_building_since = m_pg->get_osdmap_epoch(); - dout(20) << __func__ << ": initiated at epoch " << map_building_since << dendl; - - auto ret = build_scrub_map_chunk(m_primary_scrubmap, m_primary_scrubmap_pos, m_start, - m_end, m_is_deep); - - if (ret == -EINPROGRESS) { - // reschedule another round of asking the backend to collect the scrub data - m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::low_priority); - } - return ret; -} - -int PgScrubber::build_replica_map_chunk() -{ - dout(10) << __func__ << " interval start: " << m_interval_start - << " current token: " << m_current_token << " epoch: " << m_epoch_start - << " deep: " << m_is_deep << dendl; - - auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end, - m_is_deep); - - switch (ret) { - - case -EINPROGRESS: - // must wait for the backend to finish. No external event source. - // (note: previous version used low priority here. Now switched to using the - // priority of the original message) - m_osds->queue_for_rep_scrub_resched(m_pg, m_replica_request_priority, - m_flags.priority, m_current_token); - break; - - case 0: { - // finished! - m_cleaned_meta_map.clear_from(m_start); - m_cleaned_meta_map.insert(replica_scrubmap); - auto for_meta_scrub = clean_meta_map(); - _scan_snaps(for_meta_scrub); - - // the local map has been created. Send it to the primary. 
- // Note: once the message reaches the Primary, it may ask us for another - // chunk - and we better be done with the current scrub. Thus - the preparation of - // the reply message is separate, and we clear the scrub state before actually - // sending it. - - auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption); - replica_handling_done(); - dout(15) << __func__ << " chunk map sent " << dendl; - send_replica_map(reply); - } break; - - default: - // negative retval: build_scrub_map_chunk() signalled an error - // Pre-Pacific code ignored this option, treating it as a success. - // \todo Add an error flag in the returning message. - dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret - << dendl; - replica_handling_done(); - // only in debug mode for now: - assert(false && "backend error"); - break; - }; - - return ret; -} - -int PgScrubber::build_scrub_map_chunk( - ScrubMap& map, ScrubMapBuilder& pos, hobject_t start, hobject_t end, bool deep) -{ - dout(10) << __func__ << " [" << start << "," << end << ") " - << " pos " << pos << " Deep: " << deep << dendl; - - // start - while (pos.empty()) { - - pos.deep = deep; - map.valid_through = m_pg->info.last_update; - - // objects - vector rollback_obs; - pos.ret = - m_pg->get_pgbackend()->objects_list_range(start, end, &pos.ls, &rollback_obs); - dout(10) << __func__ << " while pos empty " << pos.ret << dendl; - if (pos.ret < 0) { - dout(5) << "objects_list_range error: " << pos.ret << dendl; - return pos.ret; - } - dout(10) << __func__ << " pos.ls.empty()? " << (pos.ls.empty() ? 
"+" : "-") << dendl; - if (pos.ls.empty()) { - break; - } - m_pg->_scan_rollback_obs(rollback_obs); - pos.pos = 0; - return -EINPROGRESS; - } - - // scan objects - while (!pos.done()) { - - int r = m_pg->get_pgbackend()->be_scan_list(map, pos); - dout(30) << __func__ << " BE returned " << r << dendl; - if (r == -EINPROGRESS) { - dout(20) << __func__ << " in progress" << dendl; - return r; - } - } - - // finish - dout(20) << __func__ << " finishing" << dendl; - ceph_assert(pos.done()); - m_pg->_repair_oinfo_oid(map); - - dout(20) << __func__ << " done, got " << map.objects.size() << " items" << dendl; - return 0; -} - -/* - * Process: - * Building a map of objects suitable for snapshot validation. - * The data in m_cleaned_meta_map is the left over partial items that need to - * be completed before they can be processed. - * - * Snapshots in maps precede the head object, which is why we are scanning backwards. - */ -ScrubMap PgScrubber::clean_meta_map() -{ - ScrubMap for_meta_scrub; - - if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) { - m_cleaned_meta_map.swap(for_meta_scrub); - } else { - auto iter = m_cleaned_meta_map.objects.end(); - --iter; // not empty, see 'if' clause - auto begin = m_cleaned_meta_map.objects.begin(); - if (iter->first.has_snapset()) { - ++iter; - } else { - while (iter != begin) { - auto next = iter--; - if (next->first.get_head() != iter->first.get_head()) { - ++iter; - break; - } - } - } - for_meta_scrub.objects.insert(begin, iter); - m_cleaned_meta_map.objects.erase(begin, iter); - } - - return for_meta_scrub; -} - -void PgScrubber::run_callbacks() -{ - std::list to_run; - to_run.swap(m_callbacks); - - for (auto& tr : to_run) { - tr->complete(0); - } -} - -void PgScrubber::maps_compare_n_cleanup() -{ - scrub_compare_maps(); - m_start = m_end; - run_callbacks(); - requeue_waiting(); - m_osds->queue_scrub_maps_compared(m_pg, Scrub::scrub_prio_t::low_priority); -} - -Scrub::preemption_t& PgScrubber::get_preemptor() -{ - return 
preemption_data; -} - -/* - * Process note: called for the arriving "give me your map, replica!" request. Unlike - * the original implementation, we do not requeue the Op waiting for - * updates. Instead - we trigger the FSM. - */ -void PgScrubber::replica_scrub_op(OpRequestRef op) -{ - op->mark_started(); - auto msg = op->get_req(); - dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch - << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl; - - // are we still processing a previous scrub-map request without noticing that the - // interval changed? won't see it here, but rather at the reservation stage. - - if (msg->map_epoch < m_pg->info.history.same_interval_since) { - dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch - << " < " << m_pg->info.history.same_interval_since << dendl; - - // is there a general sync issue? are we holding a stale reservation? - // not checking now - assuming we will actively react to interval change. - - return; - } - - replica_scrubmap = ScrubMap{}; - replica_scrubmap_pos = ScrubMapBuilder{}; - - m_replica_min_epoch = msg->min_epoch; - m_start = msg->start; - m_end = msg->end; - m_max_end = msg->end; - m_is_deep = msg->deep; - m_interval_start = m_pg->info.history.same_interval_since; - m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority - : Scrub::scrub_prio_t::low_priority; - m_flags.priority = msg->priority ? 
msg->priority : m_pg->get_scrub_priority(); - - preemption_data.reset(); - preemption_data.force_preemptability(msg->allow_preemption); - - replica_scrubmap_pos.reset(); - - // make sure the FSM is at NotActive - m_fsm->assert_not_active(); - - m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority, - m_current_token); -} - -void PgScrubber::set_op_parameters(requested_scrub_t& request) -{ - dout(10) << __func__ << " input: " << request << dendl; - - // write down the epoch of starting a new scrub. Will be used - // to discard stale messages from previous aborted scrubs. - m_epoch_start = m_pg->get_osdmap_epoch(); - - m_flags.check_repair = request.check_repair; - m_flags.auto_repair = request.auto_repair || request.need_auto; - m_flags.required = request.req_scrub || request.must_scrub; - - m_flags.priority = (request.must_scrub || request.need_auto) - ? get_pg_cct()->_conf->osd_requested_scrub_priority - : m_pg->get_scrub_priority(); - - state_set(PG_STATE_SCRUBBING); - - // will we be deep-scrubbing? - if (request.must_deep_scrub || request.need_auto || request.time_for_deep) { - state_set(PG_STATE_DEEP_SCRUB); - } - - // m_is_repair is set for either 'must_repair' or 'repair-on-the-go' (i.e. - // deep-scrub with the auto_repair configuration flag set). m_is_repair value - // determines the scrubber behavior. - // PG_STATE_REPAIR, on the other hand, is only used for status reports (inc. the - // PG status as appearing in the logs). 
- m_is_repair = request.must_repair || m_flags.auto_repair; - if (request.must_repair) { - state_set(PG_STATE_REPAIR); - // not calling update_op_mode_text() yet, as m_is_deep not set yet - } - - // the publishing here seems to be required for tests synchronization - m_pg->publish_stats_to_osd(); - m_flags.deep_scrub_on_error = request.deep_scrub_on_error; -} - -void PgScrubber::scrub_compare_maps() -{ - dout(10) << __func__ << " has maps, analyzing" << dendl; - - // construct authoritative scrub map for type-specific scrubbing - m_cleaned_meta_map.insert(m_primary_scrubmap); - map, std::optional>> missing_digest; - - map maps; - maps[m_pg_whoami] = &m_primary_scrubmap; - - for (const auto& i : m_pg->get_acting_recovery_backfill()) { - if (i == m_pg_whoami) - continue; - dout(2) << __func__ << " replica " << i << " has " - << m_received_maps[i].objects.size() << " items" << dendl; - maps[i] = &m_received_maps[i]; - } - - set master_set; - - // Construct master set - for (const auto& map : maps) { - for (const auto& i : map.second->objects) { - master_set.insert(i.first); - } - } - - stringstream ss; - m_pg->get_pgbackend()->be_omap_checks(maps, master_set, m_omap_stats, ss); - - if (!ss.str().empty()) { - m_osds->clog->warn(ss); - } - - if (m_pg->recovery_state.get_acting_recovery_backfill().size() > 1) { - - dout(10) << __func__ << " comparing replica scrub maps" << dendl; - - // Map from object with errors to good peer - map> authoritative; - - dout(2) << __func__ << ": primary (" << m_pg->get_primary() << ") has " - << m_primary_scrubmap.objects.size() << " items" << dendl; - - ss.str(""); - ss.clear(); - - m_pg->get_pgbackend()->be_compare_scrubmaps( - maps, master_set, m_is_repair, m_missing, m_inconsistent, - authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(), - m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss); - - if (!ss.str().empty()) { - m_osds->clog->error(ss); - } - - for (auto& i : authoritative) { - list> good_peers; 
- for (list::const_iterator j = i.second.begin(); j != i.second.end(); - ++j) { - good_peers.emplace_back(maps[*j]->objects[i.first], *j); - } - m_authoritative.emplace(i.first, good_peers); - } - - for (auto i = authoritative.begin(); i != authoritative.end(); ++i) { - m_cleaned_meta_map.objects.erase(i->first); - m_cleaned_meta_map.objects.insert( - *(maps[i->second.back()]->objects.find(i->first))); - } - } - - auto for_meta_scrub = clean_meta_map(); - - // ok, do the pg-type specific scrubbing - - // (Validates consistency of the object info and snap sets) - scrub_snapshot_metadata(for_meta_scrub, missing_digest); - - // Called here on the primary can use an authoritative map if it isn't the primary - _scan_snaps(for_meta_scrub); - - if (!m_store->empty()) { - - if (m_is_repair) { - dout(10) << __func__ << ": discarding scrub results" << dendl; - m_store->flush(nullptr); - } else { - dout(10) << __func__ << ": updating scrub object" << dendl; - ObjectStore::Transaction t; - m_store->flush(&t); - m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); - } - } -} - -ScrubMachineListener::MsgAndEpoch PgScrubber::prep_replica_map_msg( - PreemptionNoted was_preempted) -{ - dout(10) << __func__ << " min epoch:" << m_replica_min_epoch << dendl; - - auto reply = - make_message(spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), - m_replica_min_epoch, m_pg_whoami); - - reply->preempted = (was_preempted == PreemptionNoted::preempted); - ::encode(replica_scrubmap, reply->get_data()); - - return ScrubMachineListener::MsgAndEpoch{reply, m_replica_min_epoch}; -} - -void PgScrubber::send_replica_map(const MsgAndEpoch& preprepared) -{ - m_pg->send_cluster_message(m_pg->get_primary().osd, preprepared.m_msg, - preprepared.m_epoch, false); -} - -void PgScrubber::send_preempted_replica() -{ - auto reply = - make_message(spg_t{m_pg->info.pgid.pgid, m_pg->get_primary().shard}, - m_replica_min_epoch, m_pg_whoami); - - reply->preempted = true; - 
::encode(replica_scrubmap, reply->get_data()); // must not skip this - m_pg->send_cluster_message(m_pg->get_primary().osd, reply, m_replica_min_epoch, false); -} - -/* - * - if the replica lets us know it was interrupted, we mark the chunk as interrupted. - * The state-machine will react to that when all replica maps are received. - * - when all maps are received, we signal the FSM with the GotReplicas event (see - * scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the - * FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to - * handle. - */ -void PgScrubber::map_from_replica(OpRequestRef op) -{ - auto m = op->get_req(); - dout(15) << __func__ << " " << *m << dendl; - - if (m->map_epoch < m_pg->info.history.same_interval_since) { - dout(10) << __func__ << " discarding old from " << m->map_epoch << " < " - << m_pg->info.history.same_interval_since << dendl; - return; - } - - auto p = const_cast(m->get_data()).cbegin(); - - m_received_maps[m->from].decode(p, m_pg->info.pgid.pool()); - dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl; - - auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from); - if (!is_ok) { - // previously an unexpected map was triggering an assert. 
Now, as scrubs can be - // aborted at any time, the chances of this happening have increased, and aborting is - // not justified - dout(1) << __func__ << err_txt << " from OSD " << m->from << dendl; - return; - } - - if (m->preempted) { - dout(10) << __func__ << " replica was preempted, setting flag" << dendl; - preemption_data.do_preempt(); - } - - if (m_maps_status.are_all_maps_available()) { - dout(15) << __func__ << " all repl-maps available" << dendl; - m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops()); - } -} - -void PgScrubber::handle_scrub_reserve_request(OpRequestRef op) -{ - dout(10) << __func__ << " " << *op->get_req() << dendl; - op->mark_started(); - auto request_ep = op->get_req()->get_map_epoch(); - - /* - * if we are currently holding a reservation, then: - * either (1) we, the scrubber, did not yet notice an interval change. The remembered - * reservation epoch is from before our interval, and we can silently discard the - * reservation (no message is required). - * or: - * (2) the interval hasn't changed, but the same Primary that (we think) holds the - * lock just sent us a new request. Note that we know it's the same Primary, as - * otherwise the interval would have changed. - * Ostensibly we can discard & redo the reservation. But then we - * will be temporarily releasing the OSD resource - and might not be able to grab it - * again. Thus, we simply treat this as a successful new request - * (but mark the fact that if there is a previous request from the primary to - * scrub a specific chunk - that request is now defunct). 
- */ - - if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) { - // we are holding a stale reservation from a past epoch - m_remote_osd_resource.reset(); - dout(10) << __func__ << " stale reservation request" << dendl; - } - - if (request_ep < m_pg->get_same_interval_since()) { - // will not ack stale requests - return; - } - - bool granted{false}; - if (m_remote_osd_resource.has_value()) { - - dout(10) << __func__ << " already reserved." << dendl; - - /* - * it might well be that we did not yet finish handling the latest scrub-op from - * our primary. This happens, for example, if 'noscrub' was set via a command, then - * reset. The primary in this scenario will remain in the same interval, but we do need - * to reset our internal state (otherwise - the first renewed 'give me your scrub map' - * from the primary will see us in active state, crashing the OSD). - */ - advance_token(); - granted = true; - - } else if (m_pg->cct->_conf->osd_scrub_during_recovery || - !m_osds->is_recovery_active()) { - m_remote_osd_resource.emplace(m_pg, m_osds, request_ep); - // OSD resources allocated? - granted = m_remote_osd_resource->is_reserved(); - if (!granted) { - // just forget it - m_remote_osd_resource.reset(); - dout(20) << __func__ << ": failed to reserve remotely" << dendl; - } - } - - dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl; - - Message* reply = new MOSDScrubReserve( - spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), request_ep, - granted ? 
MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami); - - m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection()); -} - -void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) -{ - dout(10) << __func__ << " " << *op->get_req() << dendl; - op->mark_started(); - - if (m_reservations.has_value()) { - m_reservations->handle_reserve_grant(op, from); - } else { - derr << __func__ << ": received unsolicited reservation grant from osd " << from - << " (" << op << ")" << dendl; - } -} - -void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) -{ - dout(10) << __func__ << " " << *op->get_req() << dendl; - op->mark_started(); - - if (m_reservations.has_value()) { - // there is an active reservation process. No action is required otherwise. - m_reservations->handle_reserve_reject(op, from); - } -} - -void PgScrubber::handle_scrub_reserve_release(OpRequestRef op) -{ - dout(10) << __func__ << " " << *op->get_req() << dendl; - op->mark_started(); - - /* - * this specific scrub session has terminated. All incoming events carrying the old - * tag will be discarded. 
- */ - advance_token(); - m_remote_osd_resource.reset(); -} - -void PgScrubber::discard_replica_reservations() -{ - dout(10) << __func__ << dendl; - if (m_reservations.has_value()) { - m_reservations->discard_all(); - } -} - -void PgScrubber::clear_scrub_reservations() -{ - dout(10) << __func__ << dendl; - m_reservations.reset(); // the remote reservations - m_local_osd_resource.reset(); // the local reservation - m_remote_osd_resource.reset(); // we as replica reserved for a Primary -} - -void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text) -{ - ceph_assert(m_pg->recovery_state.get_backfill_targets().empty()); - - std::vector> messages; - messages.reserve(m_pg->get_actingset().size()); - - epoch_t epch = get_osdmap_epoch(); - - for (auto& p : m_pg->get_actingset()) { - - if (p == m_pg_whoami) - continue; - - dout(10) << "scrub requesting " << op_text << " from osd." << p << " Epoch: " << epch - << dendl; - Message* m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epch, opcode, - m_pg_whoami); - messages.push_back(std::make_pair(p.osd, m)); - } - - if (!messages.empty()) { - m_osds->send_message_osd_cluster(messages, epch); - } -} - -void PgScrubber::unreserve_replicas() -{ - dout(10) << __func__ << dendl; - m_reservations.reset(); -} - -[[nodiscard]] bool PgScrubber::scrub_process_inconsistent() -{ - dout(10) << __func__ << ": checking authoritative (mode=" - << m_mode_desc << ", auth remaining #: " << m_authoritative.size() - << ")" << dendl; - - // authoritative only store objects which are missing or inconsistent. 
- if (!m_authoritative.empty()) { - - stringstream ss; - ss << m_pg->info.pgid << " " << m_mode_desc << " " << m_missing.size() << " missing, " - << m_inconsistent.size() << " inconsistent objects"; - dout(2) << ss.str() << dendl; - m_osds->clog->error(ss); - - if (m_is_repair) { - state_clear(PG_STATE_CLEAN); - // we know we have a problem, so it's OK to set the user-visible flag - // even if we only reached here via auto-repair - state_set(PG_STATE_REPAIR); - update_op_mode_text(); - - for (const auto& [hobj, shrd_list] : m_authoritative) { - - auto missing_entry = m_missing.find(hobj); - - if (missing_entry != m_missing.end()) { - m_pg->repair_object(hobj, shrd_list, missing_entry->second); - m_fixed_count += missing_entry->second.size(); - } - - if (m_inconsistent.count(hobj)) { - m_pg->repair_object(hobj, shrd_list, m_inconsistent[hobj]); - m_fixed_count += m_inconsistent[hobj].size(); - } - } - } - } - return (!m_authoritative.empty() && m_is_repair); -} - -/* - * note: only called for the Primary. - */ -void PgScrubber::scrub_finish() -{ - dout(10) << __func__ << " before flags: " << m_flags - << ". repair state: " << (state_test(PG_STATE_REPAIR) ? "repair" : "no-repair") - << ". 
deep_scrub_on_error: " << m_flags.deep_scrub_on_error << dendl; - - ceph_assert(m_pg->is_locked()); - - m_pg->m_planned_scrub = requested_scrub_t{}; - - // if the repair request comes from auto-repair and large number of errors, - // we would like to cancel auto-repair - if (m_is_repair && m_flags.auto_repair && - m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) { - - dout(10) << __func__ << " undoing the repair" << dendl; - state_clear(PG_STATE_REPAIR); // not expected to be set, anyway - m_is_repair = false; - update_op_mode_text(); - } - - bool do_auto_scrub = false; - - // if a regular scrub had errors within the limit, do a deep scrub to auto repair - if (m_flags.deep_scrub_on_error && !m_authoritative.empty() && - m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) { - ceph_assert(!m_is_deep); - do_auto_scrub = true; - dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl; - } - - m_flags.deep_scrub_on_error = false; - - // type-specific finish (can tally more errors) - _scrub_finish(); - - bool has_error = scrub_process_inconsistent(); - - { - stringstream oss; - oss << m_pg->info.pgid.pgid << " " << m_mode_desc << " "; - int total_errors = m_shallow_errors + m_deep_errors; - if (total_errors) - oss << total_errors << " errors"; - else - oss << "ok"; - if (!m_is_deep && m_pg->info.stats.stats.sum.num_deep_scrub_errors) - oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors - << " remaining deep scrub error details lost)"; - if (m_is_repair) - oss << ", " << m_fixed_count << " fixed"; - if (total_errors) - m_osds->clog->error(oss); - else - m_osds->clog->debug(oss); - } - - // Since we don't know which errors were fixed, we can only clear them - // when every one has been fixed. 
- if (m_is_repair) { - if (m_fixed_count == m_shallow_errors + m_deep_errors) { - - ceph_assert(m_is_deep); - m_shallow_errors = 0; - m_deep_errors = 0; - dout(20) << __func__ << " All may be fixed" << dendl; - - } else if (has_error) { - - // Deep scrub in order to get corrected error counts - m_pg->scrub_after_recovery = true; - m_pg->m_planned_scrub.req_scrub = - m_pg->m_planned_scrub.req_scrub || m_flags.required; - - dout(20) << __func__ << " Current 'required': " << m_flags.required - << " Planned 'req_scrub': " << m_pg->m_planned_scrub.req_scrub << dendl; - - } else if (m_shallow_errors || m_deep_errors) { - - // We have errors but nothing can be fixed, so there is no repair - // possible. - state_set(PG_STATE_FAILED_REPAIR); - dout(10) << __func__ << " " << (m_shallow_errors + m_deep_errors) - << " error(s) present with no repair possible" << dendl; - } - } - - { - // finish up - ObjectStore::Transaction t; - m_pg->recovery_state.update_stats( - [this](auto& history, auto& stats) { - dout(10) << "m_pg->recovery_state.update_stats()" << dendl; - utime_t now = ceph_clock_now(); - history.last_scrub = m_pg->recovery_state.get_info().last_update; - history.last_scrub_stamp = now; - if (m_is_deep) { - history.last_deep_scrub = m_pg->recovery_state.get_info().last_update; - history.last_deep_scrub_stamp = now; - } - - if (m_is_deep) { - if ((m_shallow_errors == 0) && (m_deep_errors == 0)) - history.last_clean_scrub_stamp = now; - stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors; - stats.stats.sum.num_deep_scrub_errors = m_deep_errors; - stats.stats.sum.num_large_omap_objects = m_omap_stats.large_omap_objects; - stats.stats.sum.num_omap_bytes = m_omap_stats.omap_bytes; - stats.stats.sum.num_omap_keys = m_omap_stats.omap_keys; - dout(25) << "scrub_finish shard " << m_pg_whoami - << " num_omap_bytes = " << stats.stats.sum.num_omap_bytes - << " num_omap_keys = " << stats.stats.sum.num_omap_keys << dendl; - } else { - 
stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors; - // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent - // because of deep-scrub errors - if (m_shallow_errors == 0) - history.last_clean_scrub_stamp = now; - } - stats.stats.sum.num_scrub_errors = stats.stats.sum.num_shallow_scrub_errors + - stats.stats.sum.num_deep_scrub_errors; - if (m_flags.check_repair) { - m_flags.check_repair = false; - if (m_pg->info.stats.stats.sum.num_scrub_errors) { - state_set(PG_STATE_FAILED_REPAIR); - dout(10) << "scrub_finish " << m_pg->info.stats.stats.sum.num_scrub_errors - << " error(s) still present after re-scrub" << dendl; - } - } - return true; - }, - &t); - int tr = m_osds->store->queue_transaction(m_pg->ch, std::move(t), nullptr); - ceph_assert(tr == 0); - - if (!m_pg->snap_trimq.empty()) { - dout(10) << "scrub finished, requeuing snap_trimmer" << dendl; - m_pg->snap_trimmer_scrub_complete(); - } - } - - if (has_error) { - m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared( - get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery()))); - } else { - m_is_repair = false; - state_clear(PG_STATE_REPAIR); - update_op_mode_text(); - } - - cleanup_on_finish(); - if (do_auto_scrub) { - request_rescrubbing(m_pg->m_planned_scrub); - } - - if (m_pg->is_active() && m_pg->is_primary()) { - m_pg->recovery_state.share_pg_info(); - } -} - -void PgScrubber::on_digest_updates() -{ - dout(10) << __func__ << " #pending: " << num_digest_updates_pending << " pending? " - << num_digest_updates_pending - << (m_end.is_max() ? " " : " ") << dendl; - - if (num_digest_updates_pending > 0) { - // do nothing for now. We will be called again when new updates arrive - return; - } - - // got all updates, and finished with this chunk. Any more? 
- if (m_end.is_max()) { - - scrub_finish(); - m_osds->queue_scrub_is_finished(m_pg); - - } else { - // go get a new chunk (via "requeue") - preemption_data.reset(); - m_osds->queue_scrub_next_chunk(m_pg, m_pg->is_scrub_blocking_ops()); - } -} - - -/* - * note that the flags-set fetched from the PG (m_pg->m_planned_scrub) - * is cleared once scrubbing starts; Some of the values dumped here are - * thus transitory. - */ -void PgScrubber::dump(ceph::Formatter* f) const -{ - f->open_object_section("scrubber"); - f->dump_stream("epoch_start") << m_interval_start; - f->dump_bool("active", m_active); - if (m_active) { - f->dump_stream("start") << m_start; - f->dump_stream("end") << m_end; - f->dump_stream("m_max_end") << m_max_end; - f->dump_stream("subset_last_update") << m_subset_last_update; - f->dump_bool("deep", m_is_deep); - f->dump_bool("must_scrub", (m_pg->m_planned_scrub.must_scrub || m_flags.required)); - f->dump_bool("must_deep_scrub", m_pg->m_planned_scrub.must_deep_scrub); - f->dump_bool("must_repair", m_pg->m_planned_scrub.must_repair); - f->dump_bool("need_auto", m_pg->m_planned_scrub.need_auto); - f->dump_bool("req_scrub", m_flags.required); - f->dump_bool("time_for_deep", m_pg->m_planned_scrub.time_for_deep); - f->dump_bool("auto_repair", m_flags.auto_repair); - f->dump_bool("check_repair", m_flags.check_repair); - f->dump_bool("deep_scrub_on_error", m_flags.deep_scrub_on_error); - f->dump_stream("scrub_reg_stamp") << m_scrub_reg_stamp; // utime_t - f->dump_unsigned("priority", m_flags.priority); - f->dump_int("shallow_errors", m_shallow_errors); - f->dump_int("deep_errors", m_deep_errors); - f->dump_int("fixed", m_fixed_count); - { - f->open_array_section("waiting_on_whom"); - for (const auto& p : m_maps_status.get_awaited()) { - f->dump_stream("shard") << p; - } - f->close_section(); - } - } - f->close_section(); -} - - -void PgScrubber::handle_query_state(ceph::Formatter* f) -{ - dout(10) << __func__ << dendl; - - f->open_object_section("scrub"); - 
f->dump_stream("scrubber.epoch_start") << m_interval_start; - f->dump_bool("scrubber.active", m_active); - f->dump_stream("scrubber.start") << m_start; - f->dump_stream("scrubber.end") << m_end; - f->dump_stream("scrubber.m_max_end") << m_max_end; - f->dump_stream("scrubber.m_subset_last_update") << m_subset_last_update; - f->dump_bool("scrubber.deep", m_is_deep); - { - f->open_array_section("scrubber.waiting_on_whom"); - for (const auto& p : m_maps_status.get_awaited()) { - f->dump_stream("shard") << p; - } - f->close_section(); - } - - f->dump_string("comment", "DEPRECATED - may be removed in the next release"); - - f->close_section(); -} - -PgScrubber::~PgScrubber() = default; - -PgScrubber::PgScrubber(PG* pg) - : m_pg{pg} - , m_pg_id{pg->pg_id} - , m_osds{m_pg->osd} - , m_pg_whoami{pg->pg_whoami} - , preemption_data{pg} -{ - m_fsm = std::make_unique(m_pg, this); - m_fsm->initiate(); -} - -void PgScrubber::reserve_replicas() -{ - dout(10) << __func__ << dendl; - m_reservations.emplace(m_pg, m_pg_whoami); -} - -void PgScrubber::cleanup_on_finish() -{ - dout(10) << __func__ << dendl; - ceph_assert(m_pg->is_locked()); - - state_clear(PG_STATE_SCRUBBING); - state_clear(PG_STATE_DEEP_SCRUB); - m_pg->publish_stats_to_osd(); - - clear_scrub_reservations(); - m_pg->publish_stats_to_osd(); - - requeue_waiting(); - - reset_internal_state(); - m_flags = scrub_flags_t{}; - - // type-specific state clear - _scrub_clear_state(); -} - -// uses process_event(), so must be invoked externally -void PgScrubber::scrub_clear_state() -{ - dout(10) << __func__ << dendl; - - clear_pgscrub_state(); - m_fsm->process_event(FullReset{}); -} - -/* - * note: does not access the state-machine - */ -void PgScrubber::clear_pgscrub_state() -{ - dout(10) << __func__ << dendl; - ceph_assert(m_pg->is_locked()); - - state_clear(PG_STATE_SCRUBBING); - state_clear(PG_STATE_DEEP_SCRUB); - - state_clear(PG_STATE_REPAIR); - - clear_scrub_reservations(); - m_pg->publish_stats_to_osd(); - - 
requeue_waiting(); - - reset_internal_state(); - m_flags = scrub_flags_t{}; - - // type-specific state clear - _scrub_clear_state(); -} - -void PgScrubber::replica_handling_done() -{ - dout(10) << __func__ << dendl; - - state_clear(PG_STATE_SCRUBBING); - state_clear(PG_STATE_DEEP_SCRUB); - - reset_internal_state(); - - m_pg->publish_stats_to_osd(); -} - -/* - * note: performs run_callbacks() - * note: reservations-related variables are not reset here - */ -void PgScrubber::reset_internal_state() -{ - dout(10) << __func__ << dendl; - - preemption_data.reset(); - m_maps_status.reset(); - m_received_maps.clear(); - - m_start = hobject_t{}; - m_end = hobject_t{}; - m_max_end = hobject_t{}; - m_subset_last_update = eversion_t{}; - m_shallow_errors = 0; - m_deep_errors = 0; - m_fixed_count = 0; - m_omap_stats = (const struct omap_stat_t){0}; - - run_callbacks(); - - m_inconsistent.clear(); - m_missing.clear(); - m_authoritative.clear(); - num_digest_updates_pending = 0; - m_primary_scrubmap = ScrubMap{}; - m_primary_scrubmap_pos.reset(); - replica_scrubmap = ScrubMap{}; - replica_scrubmap_pos.reset(); - m_cleaned_meta_map = ScrubMap{}; - m_needs_sleep = true; - m_sleep_started_at = utime_t{}; - - m_active = false; -} - -// note that only applicable to the Replica: -void PgScrubber::advance_token() -{ - dout(10) << __func__ << " was: " << m_current_token << dendl; - m_current_token++; - - // when advance_token() is called, it is assumed that no scrubbing takes place. - // We will, though, verify that. And if we are actually still handling a stale request - - // both our internal state and the FSM state will be cleared. 
- replica_handling_done(); - m_fsm->process_event(FullReset{}); -} - -bool PgScrubber::is_token_current(Scrub::act_token_t received_token) -{ - if (received_token == 0 || received_token == m_current_token) { - return true; - } - dout(5) << __func__ << " obsolete token (" << received_token - << " vs current " << m_current_token << dendl; - - return false; -} - -const OSDMapRef& PgScrubber::get_osdmap() const -{ - return m_pg->get_osdmap(); -} - -ostream& operator<<(ostream& out, const PgScrubber& scrubber) -{ - return out << scrubber.m_flags; -} - -ostream& PgScrubber::show(ostream& out) const -{ - return out << " [ " << m_pg_id << ": " << m_flags << " ] "; -} - -int PgScrubber::asok_debug(std::string_view cmd, - std::string param, - Formatter* f, - stringstream& ss) -{ - dout(10) << __func__ << " cmd: " << cmd << " param: " << param << dendl; - - if (cmd == "block") { - // set a flag that will cause the next 'select_range' to report a blocked object - m_debug_blockrange = 1; - } else if (cmd == "unblock") { - // send an 'unblock' event, as if a blocked range was freed - m_debug_blockrange = 0; - m_fsm->process_event(Unblocked{}); - } - return 0; -} -// ///////////////////// preemption_data_t ////////////////////////////////// - -PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg} -{ - m_left = static_cast( - m_pg->get_cct()->_conf.get_val("osd_scrub_max_preemptions")); -} - -void PgScrubber::preemption_data_t::reset() -{ - std::lock_guard lk{m_preemption_lock}; - - m_preemptable = false; - m_preempted = false; - m_left = - static_cast(m_pg->cct->_conf.get_val("osd_scrub_max_preemptions")); - m_size_divisor = 1; -} - - -// ///////////////////// ReplicaReservations ////////////////////////////////// -namespace Scrub { - -void ReplicaReservations::release_replica(pg_shard_t peer, epoch_t epoch) -{ - auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, peer.shard), epoch, - MOSDScrubReserve::RELEASE, m_pg->pg_whoami); - 
m_osds->send_message_osd_cluster(peer.osd, m, epoch); -} - -ReplicaReservations::ReplicaReservations(PG* pg, pg_shard_t whoami) - : m_pg{pg} - , m_acting_set{pg->get_actingset()} - , m_osds{m_pg->get_pg_osd(ScrubberPasskey())} - , m_pending{static_cast(m_acting_set.size()) - 1} - , m_pg_info{m_pg->get_pg_info(ScrubberPasskey())} -{ - epoch_t epoch = m_pg->get_osdmap_epoch(); - - // handle the special case of no replicas - if (m_pending <= 0) { - // just signal the scrub state-machine to continue - send_all_done(); - - } else { - - for (auto p : m_acting_set) { - if (p == whoami) - continue; - auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, p.shard), epoch, - MOSDScrubReserve::REQUEST, m_pg->pg_whoami); - m_osds->send_message_osd_cluster(p.osd, m, epoch); - m_waited_for_peers.push_back(p); - dout(10) << __func__ << " reserve<-> " << p.osd << dendl; - } - } -} - -void ReplicaReservations::send_all_done() -{ - m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority); -} - -void ReplicaReservations::send_reject() -{ - m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority); -} - -void ReplicaReservations::discard_all() -{ - dout(10) << __func__ << " " << m_reserved_peers << dendl; - - m_had_rejections = true; // preventing late-coming responses from triggering events - m_reserved_peers.clear(); - m_waited_for_peers.clear(); -} - -ReplicaReservations::~ReplicaReservations() -{ - m_had_rejections = true; // preventing late-coming responses from triggering events - - // send un-reserve messages to all reserved replicas. We do not wait for answer (there - // wouldn't be one). Other incoming messages will be discarded on the way, by our - // owner. - epoch_t epoch = m_pg->get_osdmap_epoch(); - - for (auto& p : m_reserved_peers) { - release_replica(p, epoch); - } - m_reserved_peers.clear(); - - // note: the release will follow on the heels of the request. 
When tried otherwise, - // grants that followed a reject arrived after the whole scrub machine-state was - // reset, causing leaked reservations. - for (auto& p : m_waited_for_peers) { - release_replica(p, epoch); - } - m_waited_for_peers.clear(); -} - -/** - * @ATTN we would not reach here if the ReplicaReservation object managed by the - * scrubber was reset. - */ -void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from) -{ - dout(10) << __func__ << " granted-> " << from << dendl; - op->mark_started(); - - { - // reduce the amount of extra release messages. Not a must, but the log is cleaner - auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from); - if (w != m_waited_for_peers.end()) - m_waited_for_peers.erase(w); - } - - // are we forced to reject the reservation? - if (m_had_rejections) { - - dout(10) << " rejecting late-coming reservation from " << from << dendl; - release_replica(from, m_pg->get_osdmap_epoch()); - - } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) != - m_reserved_peers.end()) { - - dout(10) << " already had osd." << from << " reserved" << dendl; - - } else { - - dout(10) << " osd." << from << " scrub reserve = success" << dendl; - m_reserved_peers.push_back(from); - if (--m_pending == 0) { - send_all_done(); - } - } -} - -void ReplicaReservations::handle_reserve_reject(OpRequestRef op, pg_shard_t from) -{ - dout(10) << __func__ << " rejected-> " << from << dendl; - dout(10) << __func__ << " " << *op->get_req() << dendl; - op->mark_started(); - - { - // reduce the amount of extra release messages. 
Not a must, but the log is cleaner - auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from); - if (w != m_waited_for_peers.end()) - m_waited_for_peers.erase(w); - } - - if (m_had_rejections) { - - // our failure was already handled when the first rejection arrived - dout(15) << " ignoring late-coming rejection from " << from << dendl; - - } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) != - m_reserved_peers.end()) { - - dout(10) << " already had osd." << from << " reserved" << dendl; - - } else { - - dout(10) << " osd." << from << " scrub reserve = fail" << dendl; - m_had_rejections = true; // preventing any additional notifications - send_reject(); - } -} - - -// ///////////////////// LocalReservation ////////////////////////////////// - -LocalReservation::LocalReservation(PG* pg, OSDService* osds) - : m_pg{pg} // holding the "whole PG" for dout() sake - , m_osds{osds} -{ - if (!m_osds->inc_scrubs_local()) { - dout(10) << __func__ << ": failed to reserve locally " << dendl; - // the failure is signalled by not having m_holding_local_reservation set - return; - } - - dout(20) << __func__ << ": local OSD scrub resources reserved" << dendl; - m_holding_local_reservation = true; -} - -LocalReservation::~LocalReservation() -{ - if (m_holding_local_reservation) { - m_holding_local_reservation = false; - m_osds->dec_scrubs_local(); - } -} - - -// ///////////////////// ReservedByRemotePrimary /////////////////////////////// - -ReservedByRemotePrimary::ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch) - : m_pg{pg}, m_osds{osds}, m_reserved_at{epoch} -{ - if (!m_osds->inc_scrubs_remote()) { - dout(10) << __func__ << ": failed to reserve at Primary request" << dendl; - // the failure is signalled by not having m_reserved_by_remote_primary set - return; - } - - dout(20) << __func__ << ": scrub resources reserved at Primary request" << dendl; - m_reserved_by_remote_primary = true; -} - -bool 
ReservedByRemotePrimary::is_stale() const -{ - return m_reserved_at < m_pg->get_same_interval_since(); -} - -ReservedByRemotePrimary::~ReservedByRemotePrimary() -{ - if (m_reserved_by_remote_primary) { - m_reserved_by_remote_primary = false; - m_osds->dec_scrubs_remote(); - } -} - -// ///////////////////// MapsCollectionStatus //////////////////////////////// - -auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from) - -> std::tuple -{ - auto fe = std::find(m_maps_awaited_for.begin(), m_maps_awaited_for.end(), from); - if (fe != m_maps_awaited_for.end()) { - // we are indeed waiting for a map from this replica - m_maps_awaited_for.erase(fe); - return std::tuple{true, ""sv}; - } else { - return std::tuple{false, " unsolicited scrub-map"sv}; - } -} - -void MapsCollectionStatus::reset() -{ - *this = MapsCollectionStatus{}; -} - -std::string MapsCollectionStatus::dump() const -{ - std::string all; - for (const auto& rp : m_maps_awaited_for) { - all.append(rp.get_osd() + " "s); - } - return all; -} - -ostream& operator<<(ostream& out, const MapsCollectionStatus& sf) -{ - out << " [ "; - for (const auto& rp : sf.m_maps_awaited_for) { - out << rp.get_osd() << " "; - } - if (!sf.m_local_map_ready) { - out << " local "; - } - return out << " ] "; -} - -// ///////////////////// blocked_range_t /////////////////////////////// - -blocked_range_t::blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id) - : m_osds{osds} -{ - auto now_is = std::chrono::system_clock::now(); - m_callbk = new LambdaContext([now_is, pg_id, osds]([[maybe_unused]] int r) { - std::time_t now_c = std::chrono::system_clock::to_time_t(now_is); - char buf[50]; - strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", std::localtime(&now_c)); - lgeneric_subdout(g_ceph_context, osd, 10) - << "PgScrubber: " << pg_id << " blocked on an object for too long (since " << buf - << ")" << dendl; - osds->clog->warn() << "osd." 
<< osds->whoami << " PgScrubber: " << pg_id << " blocked on an object for too long (since " << buf << ")"; - return; - }); - - std::lock_guard l(m_osds->sleep_lock); - m_osds->sleep_timer.add_event_after(waittime, m_callbk); -} - -blocked_range_t::~blocked_range_t() -{ - std::lock_guard l(m_osds->sleep_lock); - m_osds->sleep_timer.cancel_event(m_callbk); -} - -} // namespace Scrub diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h deleted file mode 100644 index e02b173a5f3..00000000000 --- a/src/osd/pg_scrubber.h +++ /dev/null @@ -1,800 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "PG.h" -#include "ScrubStore.h" -#include "scrub_machine_lstnr.h" -#include "scrubber_common.h" - -class Callback; - -namespace Scrub { -class ScrubMachine; -struct BuildMap; - -/** - * Reserving/freeing scrub resources at the replicas. - * - * When constructed - sends reservation requests to the acting_set. - * A rejection triggers a "couldn't acquire the replicas' scrub resources" event. - * All previous requests, whether already granted or not, are explicitly released. - * - * A note re performance: I've measured a few container alternatives for - * m_reserved_peers, with its specific usage pattern. Std::set is extremely slow, as - * expected. flat_set is only slightly better. Surprisingly - std::vector (with no - * sorting) is better than boost::small_vec. And for std::vector: no need to pre-reserve. 
- */ -class ReplicaReservations { - using OrigSet = decltype(std::declval().get_actingset()); - - PG* m_pg; - OrigSet m_acting_set; - OSDService* m_osds; - std::vector m_waited_for_peers; - std::vector m_reserved_peers; - bool m_had_rejections{false}; - int m_pending{-1}; - const pg_info_t& m_pg_info; - - void release_replica(pg_shard_t peer, epoch_t epoch); - - void send_all_done(); ///< all reservations are granted - - /// notify the scrubber that we have failed to reserve replicas' resources - void send_reject(); - - public: - /** - * quietly discard all knowledge about existing reservations. No messages - * are sent to peers. - * To be used upon interval change, as we know the the running scrub is no longer - * relevant, and that the replicas had reset the reservations on their side. - */ - void discard_all(); - - ReplicaReservations(PG* pg, pg_shard_t whoami); - - ~ReplicaReservations(); - - void handle_reserve_grant(OpRequestRef op, pg_shard_t from); - - void handle_reserve_reject(OpRequestRef op, pg_shard_t from); -}; - -/** - * wraps the local OSD scrub resource reservation in an RAII wrapper - */ -class LocalReservation { - PG* m_pg; - OSDService* m_osds; - bool m_holding_local_reservation{false}; - - public: - LocalReservation(PG* pg, OSDService* osds); - ~LocalReservation(); - bool is_reserved() const { return m_holding_local_reservation; } -}; - -/** - * wraps the OSD resource we are using when reserved as a replica by a scrubbing master. 
- */ -class ReservedByRemotePrimary { - PG* m_pg; - OSDService* m_osds; - bool m_reserved_by_remote_primary{false}; - const epoch_t m_reserved_at; - - public: - ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch); - ~ReservedByRemotePrimary(); - [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; } - - /// compare the remembered reserved-at epoch to the current interval - [[nodiscard]] bool is_stale() const; -}; - -/** - * Once all replicas' scrub maps are received, we go on to compare the maps. That is - - * unless we we have not yet completed building our own scrub map. MapsCollectionStatus - * combines the status of waiting for both the local map and the replicas, without - * resorting to adding dummy entries into a list. - */ -class MapsCollectionStatus { - - bool m_local_map_ready{false}; - std::vector m_maps_awaited_for; - - public: - [[nodiscard]] bool are_all_maps_available() const - { - return m_local_map_ready && m_maps_awaited_for.empty(); - } - - void mark_local_map_ready() { m_local_map_ready = true; } - - void mark_replica_map_request(pg_shard_t from_whom) - { - m_maps_awaited_for.push_back(from_whom); - } - - /// @returns true if indeed waiting for this one. Otherwise: an error string - auto mark_arriving_map(pg_shard_t from) -> std::tuple; - - std::vector get_awaited() const { return m_maps_awaited_for; } - - void reset(); - - std::string dump() const; - - friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf); -}; - - -} // namespace Scrub - - -/** - * the scrub operation flags. Primary only. - * Set at scrub start. Checked in multiple locations - mostly - * at finish. - */ -struct scrub_flags_t { - - unsigned int priority{0}; - - /** - * set by queue_scrub() if either planned_scrub.auto_repair or - * need_auto were set. - * Tested at scrub end. 
- */ - bool auto_repair{false}; - - /// this flag indicates that we are scrubbing post repair to verify everything is fixed - bool check_repair{false}; - - /// checked at the end of the scrub, to possibly initiate a deep-scrub - bool deep_scrub_on_error{false}; - - /** - * scrub must not be aborted. - * Set for explicitly requested scrubs, and for scrubs originated by the pairing - * process with the 'repair' flag set (in the RequestScrub event). - */ - bool required{false}; -}; - -ostream& operator<<(ostream& out, const scrub_flags_t& sf); - - -/** - * The part of PG-scrubbing code that isn't state-machine wiring. - * - * Why the separation? I wish to move to a different FSM implementation. Thus I - * am forced to strongly decouple the state-machine implementation details from - * the actual scrubbing code. - */ -class PgScrubber : public ScrubPgIF, public ScrubMachineListener { - - public: - explicit PgScrubber(PG* pg); - - // ------------------ the I/F exposed to the PG (ScrubPgIF) ------------- - - /// are we waiting for resource reservation grants form our replicas? - [[nodiscard]] bool is_reserving() const final; - - void initiate_regular_scrub(epoch_t epoch_queued) final; - - void initiate_scrub_after_repair(epoch_t epoch_queued) final; - - void send_scrub_resched(epoch_t epoch_queued) final; - - void active_pushes_notification(epoch_t epoch_queued) final; - - void update_applied_notification(epoch_t epoch_queued) final; - - void send_scrub_unblock(epoch_t epoch_queued) final; - - void digest_update_notification(epoch_t epoch_queued) final; - - void send_replica_maps_ready(epoch_t epoch_queued) final; - - void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final; - - void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final; - - void send_replica_pushes_upd(epoch_t epoch_queued) final; - /** - * The PG has updated its 'applied version'. 
It might be that we are waiting for this - * information: after selecting a range of objects to scrub, we've marked the latest - * version of these objects in m_subset_last_update. We will not start the map building - * before we know that the PG has reached this version. - */ - void on_applied_when_primary(const eversion_t& applied_version) final; - - void send_full_reset(epoch_t epoch_queued) final; - - void send_chunk_free(epoch_t epoch_queued) final; - - void send_chunk_busy(epoch_t epoch_queued) final; - - void send_local_map_done(epoch_t epoch_queued) final; - - void send_maps_compared(epoch_t epoch_queued) final; - - void send_get_next_chunk(epoch_t epoch_queued) final; - - void send_scrub_is_finished(epoch_t epoch_queued) final; - - /** - * we allow some number of preemptions of the scrub, which mean we do - * not block. Then we start to block. Once we start blocking, we do - * not stop until the scrub range is completed. - */ - bool write_blocked_by_scrub(const hobject_t& soid) final; - - /// true if the given range intersects the scrub interval in any way - bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final; - - /** - * we are a replica being asked by the Primary to reserve OSD resources for - * scrubbing - */ - void handle_scrub_reserve_request(OpRequestRef op) final; - - void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final; - void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final; - void handle_scrub_reserve_release(OpRequestRef op) final; - void discard_replica_reservations() final; - void clear_scrub_reservations() final; // PG::clear... 
fwds to here - void unreserve_replicas() final; - - // managing scrub op registration - - void reg_next_scrub(const requested_scrub_t& request_flags) final; - - void unreg_next_scrub() final; - - void scrub_requested(scrub_level_t scrub_level, - scrub_type_t scrub_type, - requested_scrub_t& req_flags) final; - - /** - * Reserve local scrub resources (managed by the OSD) - * - * Fails if OSD's local-scrubs budget was exhausted - * \returns were local resources reserved? - */ - bool reserve_local() final; - - void handle_query_state(ceph::Formatter* f) final; - - void dump(ceph::Formatter* f) const override; - - // used if we are a replica - - void replica_scrub_op(OpRequestRef op) final; - - /// the op priority, taken from the primary's request message - Scrub::scrub_prio_t replica_op_priority() const final - { - return m_replica_request_priority; - }; - - unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, - unsigned int suggested_priority) const final; - /// the version that refers to m_flags.priority - unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final; - - void add_callback(Context* context) final { m_callbacks.push_back(context); } - - [[nodiscard]] bool are_callbacks_pending() const final // used for an assert in PG.cc - { - return !m_callbacks.empty(); - } - - /// handle a message carrying a replica map - void map_from_replica(OpRequestRef op) final; - - void scrub_clear_state() final; - - /** - * add to scrub statistics, but only if the soid is below the scrub start - */ - virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats, - const hobject_t& soid) override - { - ceph_assert(false); - } - - /** - * finalize the parameters of the initiated scrubbing session: - * - * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set; - * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set. 
- */ - void set_op_parameters(requested_scrub_t& request) final; - - void cleanup_store(ObjectStore::Transaction* t) final; - - bool get_store_errors(const scrub_ls_arg_t& arg, - scrub_ls_result_t& res_inout) const override - { - return false; - } - - int asok_debug(std::string_view cmd, - std::string param, - Formatter* f, - std::stringstream& ss) override; - int m_debug_blockrange{0}; - - // ------------------------------------------------------------------------------------------- - // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener) - - [[nodiscard]] bool is_primary() const final { return m_pg->recovery_state.is_primary(); } - - void select_range_n_notify() final; - - Scrub::BlockedRangeWarning acquire_blocked_alarm() final; - - /// walk the log to find the latest update that affects our chunk - eversion_t search_log_for_updates() const final; - - eversion_t get_last_update_applied() const final - { - return m_pg->recovery_state.get_last_update_applied(); - } - - int pending_active_pushes() const final { return m_pg->active_pushes; } - - void on_init() final; - void on_replica_init() final; - void replica_handling_done() final; - - /// the version of 'scrub_clear_state()' that does not try to invoke FSM services - /// (thus can be called from FSM reactions) - void clear_pgscrub_state() final; - - /* - * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep' - * is asserted - after a configuration-dependent timeout. 
- */ - void add_delayed_scheduling() final; - - void get_replicas_maps(bool replica_can_preempt) final; - - void on_digest_updates() final; - - ScrubMachineListener::MsgAndEpoch - prep_replica_map_msg(Scrub::PreemptionNoted was_preempted) final; - - void send_replica_map(const ScrubMachineListener::MsgAndEpoch& preprepared) final; - - void send_preempted_replica() final; - - void send_remotes_reserved(epoch_t epoch_queued) final; - void send_reservation_failure(epoch_t epoch_queued) final; - - /** - * does the PG have newer updates than what we (the scrubber) know? - */ - [[nodiscard]] bool has_pg_marked_new_updates() const final; - - void set_subset_last_update(eversion_t e) final; - - void maps_compare_n_cleanup() final; - - Scrub::preemption_t& get_preemptor() final; - - int build_primary_map_chunk() final; - - int build_replica_map_chunk() final; - - void reserve_replicas() final; - - [[nodiscard]] bool was_epoch_changed() const final; - - void mark_local_map_ready() final; - - [[nodiscard]] bool are_all_maps_available() const final; - - std::string dump_awaited_maps() const final; - - protected: - bool state_test(uint64_t m) const { return m_pg->state_test(m); } - void state_set(uint64_t m) { m_pg->state_set(m); } - void state_clear(uint64_t m) { m_pg->state_clear(m); } - - [[nodiscard]] bool is_scrub_registered() const; - - virtual void _scrub_clear_state() {} - - utime_t m_scrub_reg_stamp; ///< stamp we registered for - - ostream& show(ostream& out) const override; - - public: - // ------------------------------------------------------------------------------------------- - - friend ostream& operator<<(ostream& out, const PgScrubber& scrubber); - - static utime_t scrub_must_stamp() { return utime_t(1, 1); } - - virtual ~PgScrubber(); // must be defined separately, in the .cc file - - [[nodiscard]] bool is_scrub_active() const final { return m_active; } - - private: - void reset_internal_state(); - - /** - * the current scrubbing operation is done. 
We should mark that fact, so that - * all events related to the previous operation can be discarded. - */ - void advance_token(); - - bool is_token_current(Scrub::act_token_t received_token); - - void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); } - - void _scan_snaps(ScrubMap& smap); - - ScrubMap clean_meta_map(); - - /** - * mark down some parameters of the initiated scrub: - * - the epoch when started; - * - the depth of the scrub requested (from the PG_STATE variable) - */ - void reset_epoch(epoch_t epoch_queued); - - void run_callbacks(); - - // ----- methods used to verify the relevance of incoming events: - - /** - * is the incoming event still relevant, and should be processed? - * - * It isn't if: - * - (1) we are no longer 'actively scrubbing'; or - * - (2) the message is from an epoch prior to when we started the current scrub - * session; or - * - (3) the message epoch is from a previous interval; or - * - (4) the 'abort' configuration flags were set. - * - * For (1) & (2) - teh incoming message is discarded, w/o further action. - * - * For (3): (see check_interval() for a full description) if we have not reacted yet - * to this specific new interval, we do now: - * - replica reservations are silently discarded (we count on the replicas to notice - * the interval change and un-reserve themselves); - * - the scrubbing is halted. - * - * For (4): the message will be discarded, but also: - * if this is the first time we've noticed the 'abort' request, we perform the abort. - * - * \returns should the incoming event be processed? - */ - bool is_message_relevant(epoch_t epoch_to_verify); - - /** - * check the 'no scrub' configuration options. - */ - [[nodiscard]] bool should_abort() const; - - /** - * Check the 'no scrub' configuration flags. - * - * Reset everything if the abort was not handled before. - * @returns false if the message was discarded due to abort flag. 
- */ - [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify); - - [[nodiscard]] bool check_interval(epoch_t epoch_to_verify); - - epoch_t m_last_aborted{}; // last time we've noticed a request to abort - - /** - * return true if any inconsistency/missing is repaired, false otherwise - */ - [[nodiscard]] bool scrub_process_inconsistent(); - - void scrub_compare_maps(); - - bool m_needs_sleep{true}; ///< should we sleep before being rescheduled? always - ///< 'true', unless we just got out of a sleep period - - utime_t m_sleep_started_at; - - - // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed' - // to guarantee un-reserving when deleted. - std::optional m_reservations; - std::optional m_local_osd_resource; - - /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing - std::optional m_remote_osd_resource; - - void cleanup_on_finish(); // scrub_clear_state() as called for a Primary when - // Active->NotActive - - /// the part that actually finalizes a scrub - void scrub_finish(); - - protected: - PG* const m_pg; - - /** - * the derivative-specific scrub-finishing touches: - */ - virtual void _scrub_finish() {} - - /** - * Validate consistency of the object info and snap sets. - */ - virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) - {} - - // common code used by build_primary_map_chunk() and build_replica_map_chunk(): - int build_scrub_map_chunk(ScrubMap& map, // primary or replica? - ScrubMapBuilder& pos, - hobject_t start, - hobject_t end, - bool deep); - - std::unique_ptr m_fsm; - const spg_t m_pg_id; ///< a local copy of m_pg->pg_id - OSDService* const m_osds; - const pg_shard_t m_pg_whoami; ///< a local copy of m_pg->pg_whoami; - - epoch_t m_interval_start{0}; ///< interval's 'from' of when scrubbing was first scheduled - /* - * the exact epoch when the scrubbing actually started (started here - cleared checks - * for no-scrub conf). 
Incoming events are verified against this, with stale events - * discarded. - */ - epoch_t m_epoch_start{0}; ///< the actual epoch when scrubbing started - - /** - * (replica) a tag identifying a specific scrub "session". Incremented whenever the - * Primary releases the replica scrub resources. - * When the scrub session is terminated (even if the interval remains unchanged, as - * might happen following an asok no-scrub command), stale scrub-resched messages - * triggered by the backend will be discarded. - */ - Scrub::act_token_t m_current_token{1}; - - scrub_flags_t m_flags; - - bool m_active{false}; - - eversion_t m_subset_last_update{}; - - std::unique_ptr m_store; - - int num_digest_updates_pending{0}; - hobject_t m_start, m_end; ///< note: half-closed: [start,end) - - /// Returns reference to current osdmap - const OSDMapRef& get_osdmap() const; - - /// Returns epoch of current osdmap - epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); } - - CephContext* get_pg_cct() const { return m_pg->cct; } - - // collected statistics - int m_shallow_errors{0}; - int m_deep_errors{0}; - int m_fixed_count{0}; - - /// Maps from objects with errors to missing peers - HobjToShardSetMapping m_missing; - - protected: - /** - * 'm_is_deep' - is the running scrub a deep one? - * - * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is - * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is - * meaningful both for the primary and the replicas, and is used as a parameter when - * building the scrub maps. - */ - bool m_is_deep{false}; - - /** - * If set: affects the backend & scrubber-backend functions called after all - * scrub maps are available. - * - * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be - * a "user facing" status display only). - */ - bool m_is_repair{false}; - - /** - * User-readable summary of the scrubber's current mode of operation. 
Used for - * both osd.*.log and the cluster log. - * One of: - * "repair" - * "deep-scrub", - * "scrub - * - * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. for - * auto_repair will show as "deep-scrub" and not as "repair" (until the first error - * is detected). - */ - std::string_view m_mode_desc; - - void update_op_mode_text(); - -private: - - /** - * initiate a deep-scrub after the current scrub ended with errors. - */ - void request_rescrubbing(requested_scrub_t& req_flags); - - /* - * Select a range of objects to scrub. - * - * By: - * - setting tentative range based on conf and divisor - * - requesting a partial list of elements from the backend; - * - handling some head/clones issues - * - * The selected range is set directly into 'm_start' and 'm_end' - */ - bool select_range(); - - std::list m_callbacks; - - /** - * send a replica (un)reservation request to the acting set - * - * @param opcode - one of MOSDScrubReserve::REQUEST - * or MOSDScrubReserve::RELEASE - */ - void message_all_replicas(int32_t opcode, std::string_view op_text); - - hobject_t m_max_end; ///< Largest end that may have been sent to replicas - ScrubMap m_primary_scrubmap; - ScrubMapBuilder m_primary_scrubmap_pos; - - std::map m_received_maps; - - /// Cleaned std::map pending snap metadata scrub - ScrubMap m_cleaned_meta_map; - - void _request_scrub_map(pg_shard_t replica, - eversion_t version, - hobject_t start, - hobject_t end, - bool deep, - bool allow_preemption); - - - Scrub::MapsCollectionStatus m_maps_status; - - omap_stat_t m_omap_stats = (const struct omap_stat_t){0}; - - /// Maps from objects with errors to inconsistent peers - HobjToShardSetMapping m_inconsistent; - - /// Maps from object with errors to good peers - std::map>> m_authoritative; - - // ------------ members used if we are a replica - - epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message - - ScrubMapBuilder replica_scrubmap_pos; - ScrubMap replica_scrubmap; - - /** - * we 
mark the request priority as it arrived. It influences the queuing priority - * when we wait for local updates - */ - Scrub::scrub_prio_t m_replica_request_priority; - - /** - * the 'preemption' "state-machine". - * Note: I was considering an orthogonal sub-machine implementation, but as - * the state diagram is extremely simple, the added complexity wasn't justified. - */ - class preemption_data_t : public Scrub::preemption_t { - public: - preemption_data_t(PG* pg); // the PG access is used for conf access (and logs) - - [[nodiscard]] bool is_preemptable() const final { return m_preemptable; } - - bool do_preempt() final - { - if (m_preempted || !m_preemptable) - return false; - - std::lock_guard lk{m_preemption_lock}; - if (!m_preemptable) - return false; - - m_preempted = true; - return true; - } - - /// same as 'do_preempt()' but w/o checks (as once a replica - /// was preempted, we cannot continue) - void replica_preempted() { m_preempted = true; } - - void enable_preemption() - { - std::lock_guard lk{m_preemption_lock}; - if (are_preemptions_left() && !m_preempted) { - m_preemptable = true; - } - } - - /// used by a replica to set preemptability state according to the Primary's request - void force_preemptability(bool is_allowed) - { - // note: no need to lock for a replica - m_preempted = false; - m_preemptable = is_allowed; - } - - bool disable_and_test() final - { - std::lock_guard lk{m_preemption_lock}; - m_preemptable = false; - return m_preempted; - } - - [[nodiscard]] bool was_preempted() const { return m_preempted; } - - [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; } - - void reset(); - - void adjust_parameters() final - { - std::lock_guard lk{m_preemption_lock}; - - if (m_preempted) { - m_preempted = false; - m_preemptable = adjust_left(); - } else { - m_preemptable = are_preemptions_left(); - } - } - - private: - PG* m_pg; - mutable std::mutex m_preemption_lock; - bool m_preemptable{false}; - bool m_preempted{false}; - int 
m_left; - size_t m_size_divisor{1}; - bool are_preemptions_left() const { return m_left > 0; } - - bool adjust_left() - { - if (m_left > 0) { - --m_left; - m_size_divisor *= 2; - } - return m_left > 0; - } - }; - - preemption_data_t preemption_data; -}; diff --git a/src/osd/scrub_machine.cc b/src/osd/scrub_machine.cc deleted file mode 100644 index edee613ffa0..00000000000 --- a/src/osd/scrub_machine.cc +++ /dev/null @@ -1,522 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "scrub_machine.h" - -#include -#include - -#include - -#include "OSD.h" -#include "OpRequest.h" -#include "ScrubStore.h" -#include "scrub_machine_lstnr.h" - -#define dout_context g_ceph_context -#define dout_subsys ceph_subsys_osd -#undef dout_prefix -#define dout_prefix *_dout << " scrubberFSM " - -using namespace std::chrono; -using namespace std::chrono_literals; -namespace sc = boost::statechart; - -#define DECLARE_LOCALS \ - ScrubMachineListener* scrbr = context().m_scrbr; \ - std::ignore = scrbr; \ - auto pg_id = context().m_pg_id; \ - std::ignore = pg_id; - -namespace Scrub { - -// --------- trace/debug auxiliaries ------------------------------- - -void on_event_creation(std::string_view nm) -{ - dout(20) << " event: --vvvv---- " << nm << dendl; -} - -void on_event_discard(std::string_view nm) -{ - dout(20) << " event: --^^^^---- " << nm << dendl; -} - -void ScrubMachine::my_states() const -{ - for (auto si = state_begin(); si != state_end(); ++si) { - const auto& siw{*si}; // prevents a warning re side-effects - dout(20) << " state: " << boost::core::demangle(typeid(siw).name()) << dendl; - } -} - -void ScrubMachine::assert_not_active() const -{ - ceph_assert(state_cast()); -} - -bool ScrubMachine::is_reserving() const -{ - return state_cast(); -} - -bool ScrubMachine::is_accepting_updates() const -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - ceph_assert(scrbr->is_primary()); - - return state_cast(); -} - -// 
for the rest of the code in this file - we know what PG we are dealing with: -#undef dout_prefix -#define dout_prefix _prefix(_dout, this->context().m_pg) -template static ostream& _prefix(std::ostream* _dout, T* t) -{ - return t->gen_prefix(*_dout) << " scrubberFSM pg(" << t->pg_id << ") "; -} - -// ////////////// the actual actions - -// ----------------------- NotActive ----------------------------------------- - -NotActive::NotActive(my_context ctx) : my_base(ctx) -{ - dout(10) << "-- state -->> NotActive" << dendl; -} - -// ----------------------- ReservingReplicas --------------------------------- - -ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx) -{ - dout(10) << "-- state -->> ReservingReplicas" << dendl; - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - scrbr->reserve_replicas(); -} - -sc::result ReservingReplicas::react(const ReservationFailure&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl; - - // the Scrubber must release all resources and abort the scrubbing - scrbr->clear_pgscrub_state(); - return transit(); -} - -/** - * note: the event poster is handling the scrubber reset - */ -sc::result ReservingReplicas::react(const FullReset&) -{ - dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl; - return transit(); -} - -// ----------------------- ActiveScrubbing ----------------------------------- - -ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx) -{ - dout(10) << "-- state -->> ActiveScrubbing" << dendl; - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - scrbr->on_init(); -} - -/** - * upon exiting the Active state - */ -ActiveScrubbing::~ActiveScrubbing() -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(15) << __func__ << dendl; - scrbr->unreserve_replicas(); -} - -/* - * The only source of an InternalError event as of now is the BuildMap state, - * when encountering a backend error. 
- * We kill the scrub and reset the FSM. - */ -sc::result ActiveScrubbing::react(const InternalError&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << __func__ << dendl; - scrbr->clear_pgscrub_state(); - return transit(); -} - -sc::result ActiveScrubbing::react(const FullReset&) -{ - dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl; - // caller takes care of clearing the scrubber & FSM states - return transit(); -} - -// ----------------------- RangeBlocked ----------------------------------- - -/* - * Blocked. Will be released by kick_object_context_blocked() (or upon - * an abort) - * - * Note: we are never expected to be waiting for long for a blocked object. - * Unfortunately we know from experience that a bug elsewhere might result - * in an indefinite wait in this state, for an object that is never released. - * If that happens, all we can do is to issue a warning message to help - * with the debugging. - */ -RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx) -{ - dout(10) << "-- state -->> Act/RangeBlocked" << dendl; - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - - // arrange to have a warning message issued if we are stuck in this - // state for longer than some reasonable number of minutes. 
- m_timeout = scrbr->acquire_blocked_alarm(); -} - -// ----------------------- PendingTimer ----------------------------------- - -/** - * Sleeping till timer reactivation - or just requeuing - */ -PendingTimer::PendingTimer(my_context ctx) : my_base(ctx) -{ - dout(10) << "-- state -->> Act/PendingTimer" << dendl; - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - - scrbr->add_delayed_scheduling(); -} - -// ----------------------- NewChunk ----------------------------------- - -/** - * Preconditions: - * - preemption data was set - * - epoch start was updated - */ -NewChunk::NewChunk(my_context ctx) : my_base(ctx) -{ - dout(10) << "-- state -->> Act/NewChunk" << dendl; - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - - scrbr->get_preemptor().adjust_parameters(); - - // choose range to work on - // select_range_n_notify() will signal either SelectedChunkFree or - // ChunkIsBusy. If 'busy', we transition to Blocked, and wait for the - // range to become available. - scrbr->select_range_n_notify(); -} - -sc::result NewChunk::react(const SelectedChunkFree&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "NewChunk::react(const SelectedChunkFree&)" << dendl; - - scrbr->set_subset_last_update(scrbr->search_log_for_updates()); - return transit(); -} - -// ----------------------- WaitPushes ----------------------------------- - -WaitPushes::WaitPushes(my_context ctx) : my_base(ctx) -{ - dout(10) << " -- state -->> Act/WaitPushes" << dendl; - post_event(ActivePushesUpd{}); -} - -/* - * Triggered externally, by the entity that had an update re pushes - */ -sc::result WaitPushes::react(const ActivePushesUpd&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "WaitPushes::react(const ActivePushesUpd&) pending_active_pushes: " - << scrbr->pending_active_pushes() << dendl; - - if (!scrbr->pending_active_pushes()) { - // done waiting - return transit(); - } - - return discard_event(); -} - -// ----------------------- WaitLastUpdate 
----------------------------------- - -WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx) -{ - dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl; - post_event(UpdatesApplied{}); -} - -/** - * Note: - * Updates are locally readable immediately. Thus, on the replicas we do need - * to wait for the update notifications before scrubbing. For the Primary it's - * a bit different: on EC (and only there) rmw operations have an additional - * read roundtrip. That means that on the Primary we need to wait for - * last_update_applied (the replica side, even on EC, is still safe - * since the actual transaction will already be readable by commit time. - */ -void WaitLastUpdate::on_new_updates(const UpdatesApplied&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "WaitLastUpdate::on_new_updates(const UpdatesApplied&)" << dendl; - - if (scrbr->has_pg_marked_new_updates()) { - post_event(InternalAllUpdates{}); - } else { - // will be requeued by op_applied - dout(10) << "wait for EC read/modify/writes to queue" << dendl; - } -} - -/* - * request maps from the replicas in the acting set - */ -sc::result WaitLastUpdate::react(const InternalAllUpdates&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl; - - scrbr->get_replicas_maps(scrbr->get_preemptor().is_preemptable()); - return transit(); -} - -// ----------------------- BuildMap ----------------------------------- - -BuildMap::BuildMap(my_context ctx) : my_base(ctx) -{ - dout(10) << " -- state -->> Act/BuildMap" << dendl; - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - - // no need to check for an epoch change, as all possible flows that brought us here have - // a check_interval() verification of their final event. - - if (scrbr->get_preemptor().was_preempted()) { - - // we were preempted, either directly or by a replica - dout(10) << __func__ << " preempted!!!" 
<< dendl; - scrbr->mark_local_map_ready(); - post_event(IntBmPreempted{}); - - } else { - - auto ret = scrbr->build_primary_map_chunk(); - - if (ret == -EINPROGRESS) { - // must wait for the backend to finish. No specific event provided. - // build_primary_map_chunk() has already requeued us. - dout(20) << "waiting for the backend..." << dendl; - - } else if (ret < 0) { - - dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl; - post_event(InternalError{}); - - } else { - - // the local map was created - post_event(IntLocalMapDone{}); - } - } -} - -sc::result BuildMap::react(const IntLocalMapDone&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "BuildMap::react(const IntLocalMapDone&)" << dendl; - - scrbr->mark_local_map_ready(); - return transit(); -} - -// ----------------------- DrainReplMaps ----------------------------------- - -DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx) -{ - dout(10) << "-- state -->> Act/DrainReplMaps" << dendl; - // we may have received all maps already. Send the event that will make us check. 
- post_event(GotReplicas{}); -} - -sc::result DrainReplMaps::react(const GotReplicas&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "DrainReplMaps::react(const GotReplicas&)" << dendl; - - if (scrbr->are_all_maps_available()) { - // NewChunk will handle the preemption that brought us to this state - return transit(); - } - - dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: " - << scrbr->dump_awaited_maps() << dendl; - return discard_event(); -} - -// ----------------------- WaitReplicas ----------------------------------- - -WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx) -{ - dout(10) << "-- state -->> Act/WaitReplicas" << dendl; - post_event(GotReplicas{}); -} - -/** - * note: now that maps_compare_n_cleanup() is "futurized"(*), and we remain in this state - * for a while even after we got all our maps, we must prevent are_all_maps_available() - * (actually - the code after the if()) from being called more than once. - * This is basically a separate state, but it's too transitory and artificial to justify - * the cost of a separate state. - - * (*) "futurized" - in Crimson, the call to maps_compare_n_cleanup() returns immediately - * after initiating the process. The actual termination of the maps comparing etc' is - * signalled via an event. As we share the code with "classic" OSD, here too - * maps_compare_n_cleanup() is responsible for signalling the completion of the - * processing. - */ -sc::result WaitReplicas::react(const GotReplicas&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "WaitReplicas::react(const GotReplicas&)" << dendl; - - if (!all_maps_already_called && scrbr->are_all_maps_available()) { - dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl; - - all_maps_already_called = true; - - // were we preempted? - if (scrbr->get_preemptor().disable_and_test()) { // a test&set - - - dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" 
<< dendl; - return transit(); - - } else { - - // maps_compare_n_cleanup() will arrange for MapsCompared event to be sent: - scrbr->maps_compare_n_cleanup(); - return discard_event(); - } - } else { - return discard_event(); - } -} - -// ----------------------- WaitDigestUpdate ----------------------------------- - -WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx) -{ - dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl; - // perform an initial check: maybe we already - // have all the updates we need: - // (note that DigestUpdate is usually an external event) - post_event(DigestUpdate{}); -} - -sc::result WaitDigestUpdate::react(const DigestUpdate&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "WaitDigestUpdate::react(const DigestUpdate&)" << dendl; - - // on_digest_updates() will either: - // - do nothing - if we are still waiting for updates, or - // - finish the scrubbing of the current chunk, and: - // - send NextChunk, or - // - send ScrubFinished - - scrbr->on_digest_updates(); - return discard_event(); -} - -ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub) - : m_pg{pg}, m_pg_id{pg->pg_id}, m_scrbr{pg_scrub} -{ - dout(15) << "ScrubMachine created " << m_pg_id << dendl; -} - -ScrubMachine::~ScrubMachine() = default; - -// -------- for replicas ----------------------------------------------------- - -// ----------------------- ReplicaWaitUpdates -------------------------------- - -ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx) -{ - dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl; - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - scrbr->on_replica_init(); -} - -/* - * Triggered externally, by the entity that had an update re pushes - */ -sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): " - << scrbr->pending_active_pushes() << dendl; - - if 
(scrbr->pending_active_pushes() == 0) { - - // done waiting - return transit(); - } - - return discard_event(); -} - -/** - * the event poster is handling the scrubber reset - */ -sc::result ReplicaWaitUpdates::react(const FullReset&) -{ - dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl; - return transit(); -} - -// ----------------------- ActiveReplica ----------------------------------- - -ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "-- state -->> ActiveReplica" << dendl; - scrbr->on_replica_init(); // as we might have skipped ReplicaWaitUpdates - post_event(SchedReplica{}); -} - -sc::result ActiveReplica::react(const SchedReplica&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? " - << scrbr->get_preemptor().is_preemptable() << dendl; - - if (scrbr->get_preemptor().was_preempted()) { - dout(10) << "replica scrub job preempted" << dendl; - - scrbr->send_preempted_replica(); - scrbr->replica_handling_done(); - return transit(); - } - - // start or check progress of build_replica_map_chunk() - auto ret_init = scrbr->build_replica_map_chunk(); - if (ret_init != -EINPROGRESS) { - return transit(); - } - - return discard_event(); -} - -/** - * the event poster is handling the scrubber reset - */ -sc::result ActiveReplica::react(const FullReset&) -{ - dout(10) << "ActiveReplica::react(const FullReset&)" << dendl; - return transit(); -} - -} // namespace Scrub diff --git a/src/osd/scrub_machine.h b/src/osd/scrub_machine.h deleted file mode 100644 index 998bc5fe9c4..00000000000 --- a/src/osd/scrub_machine.h +++ /dev/null @@ -1,346 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common/version.h" -#include 
"include/Context.h" - -#include "scrub_machine_lstnr.h" -#include "scrubber_common.h" - -using namespace std::string_literals; - -class PG; // holding a pointer to that one - just for testing -class PgScrubber; -namespace Scrub { - -namespace sc = ::boost::statechart; -namespace mpl = ::boost::mpl; - -// -// EVENTS -// - -void on_event_creation(std::string_view nm); -void on_event_discard(std::string_view nm); - -#define MEV(E) \ - struct E : sc::event { \ - inline static int actv{0}; \ - E() \ - { \ - if (!actv++) \ - on_event_creation(#E); \ - } \ - ~E() \ - { \ - if (!--actv) \ - on_event_discard(#E); \ - } \ - void print(std::ostream* out) const { *out << #E; } \ - std::string_view print() const { return #E; } \ - }; - -MEV(RemotesReserved) ///< all replicas have granted our reserve request - -MEV(ReservationFailure) ///< a reservation request has failed - -MEV(StartScrub) ///< initiate a new scrubbing session (relevant if we are a Primary) - -MEV(AfterRepairScrub) ///< initiate a new scrubbing session. Only triggered at Recovery - ///< completion. - -MEV(Unblocked) ///< triggered when the PG unblocked an object that was marked for - ///< scrubbing. Via the PGScrubUnblocked op - -MEV(InternalSchedScrub) - -MEV(SelectedChunkFree) - -MEV(ChunkIsBusy) - -MEV(ActivePushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery - ///< that is in-flight to the local ObjectStore - -MEV(UpdatesApplied) ///< (Primary only) all updates are committed - -MEV(InternalAllUpdates) ///< the internal counterpart of UpdatesApplied - -MEV(GotReplicas) ///< got a map from a replica - -MEV(IntBmPreempted) ///< internal - BuildMap preempted. Required, as detected within the - ///< ctor - -MEV(InternalError) - -MEV(IntLocalMapDone) - -MEV(DigestUpdate) ///< external. called upon success of a MODIFY op. 
See - ///< scrub_snapshot_metadata() - -MEV(MapsCompared) ///< (Crimson) maps_compare_n_cleanup() transactions are done - -MEV(StartReplica) ///< initiating replica scrub. - -MEV(StartReplicaNoWait) ///< 'start replica' when there are no pending updates - -MEV(SchedReplica) - -MEV(ReplicaPushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery - ///< that is in-flight to the local ObjectStore - -MEV(FullReset) ///< guarantee that the FSM is in the quiescent state (i.e. NotActive) - -MEV(NextChunk) ///< finished handling this chunk. Go get the next one - -MEV(ScrubFinished) ///< all chunks handled - - -struct NotActive; ///< the quiescent state. No active scrubbing. -struct ReservingReplicas; ///< securing scrub resources from replicas' OSDs -struct ActiveScrubbing; ///< the active state for a Primary. A sub-machine. -struct ReplicaWaitUpdates; ///< an active state for a replica. Waiting for all active - ///< operations to finish. -struct ActiveReplica; ///< an active state for a replica. - - -class ScrubMachine : public sc::state_machine { - public: - friend class PgScrubber; - - public: - explicit ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub); - ~ScrubMachine(); - - PG* m_pg; // only used for dout messages - spg_t m_pg_id; - ScrubMachineListener* m_scrbr; - - void my_states() const; - void assert_not_active() const; - [[nodiscard]] bool is_reserving() const; - [[nodiscard]] bool is_accepting_updates() const; -}; - -/** - * The Scrubber's base (quiescent) state. - * Scrubbing is triggered by one of the following events: - * - (standard scenario for a Primary): 'StartScrub'. Initiates the OSDs resources - * reservation process. Will be issued by PG::scrub(), following a - * queued "PGScrub" op. - * - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is - * not required to reserve resources. - * - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming - * MOSDRepScrub message. 
- * - * note (20.8.21): originally, AfterRepairScrub was triggering a scrub without waiting - * for replica resources to be acquired. But once replicas started using the - * resource-request to identify and tag the scrub session, this bypass cannot be - * supported anymore. - */ -struct NotActive : sc::state { - explicit NotActive(my_context ctx); - - using reactions = mpl::list, - // a scrubbing that was initiated at recovery completion, - // and requires no resource reservations: - sc::transition, - sc::transition, - sc::transition>; -}; - -struct ReservingReplicas : sc::state { - - explicit ReservingReplicas(my_context ctx); - using reactions = mpl::list, - // all replicas granted our resources request - sc::transition, - sc::custom_reaction>; - - sc::result react(const FullReset&); - - /// at least one replica denied us the scrub resources we've requested - sc::result react(const ReservationFailure&); -}; - - -// the "active" sub-states - -struct RangeBlocked; ///< the objects range is blocked -struct PendingTimer; ///< either delaying the scrub by some time and requeuing, or just - ///< requeue -struct NewChunk; ///< select a chunk to scrub, and verify its availability -struct WaitPushes; -struct WaitLastUpdate; -struct BuildMap; -struct DrainReplMaps; ///< a problem during BuildMap. Wait for all replicas to report, - ///< then restart. 
-struct WaitReplicas; ///< wait for all replicas to report -struct WaitDigestUpdate; - -struct ActiveScrubbing : sc::state { - - explicit ActiveScrubbing(my_context ctx); - ~ActiveScrubbing(); - - using reactions = mpl::list< - sc::custom_reaction, - sc::custom_reaction>; - - sc::result react(const FullReset&); - sc::result react(const InternalError&); -}; - -struct RangeBlocked : sc::state { - explicit RangeBlocked(my_context ctx); - using reactions = mpl::list>; - - Scrub::BlockedRangeWarning m_timeout; -}; - -struct PendingTimer : sc::state { - - explicit PendingTimer(my_context ctx); - - using reactions = mpl::list>; -}; - -struct NewChunk : sc::state { - - explicit NewChunk(my_context ctx); - - using reactions = mpl::list, - sc::custom_reaction>; - - sc::result react(const SelectedChunkFree&); -}; - -/** - * initiate the update process for this chunk - * - * Wait fo 'active_pushes' to clear. - * 'active_pushes' represents recovery that is in-flight to the local Objectstore, hence - * scrub waits until the correct data is readable (in-flight data to the Objectstore is - * not readable until written to disk, termed 'applied' here) - */ -struct WaitPushes : sc::state { - - explicit WaitPushes(my_context ctx); - - using reactions = mpl::list>; - - sc::result react(const ActivePushesUpd&); -}; - -struct WaitLastUpdate : sc::state { - - explicit WaitLastUpdate(my_context ctx); - - void on_new_updates(const UpdatesApplied&); - - using reactions = mpl::list, - sc::in_state_reaction>; - - sc::result react(const InternalAllUpdates&); -}; - -struct BuildMap : sc::state { - explicit BuildMap(my_context ctx); - - // possible error scenarios: - // - an error reported by the backend will trigger an 'InternalError' event, - // handled by our parent state; - // - if preempted, we switch to DrainReplMaps, where we will wait for all - // replicas to send their maps before acknowledging the preemption; - // - an interval change will be handled by the relevant 'send-event' 
functions, - // and will translated into a 'FullReset' event. - using reactions = - mpl::list, - sc::transition, // looping, waiting - // for the backend to - // finish - sc::custom_reaction>; - - sc::result react(const IntLocalMapDone&); -}; - -/* - * "drain" scrub-maps responses from replicas - */ -struct DrainReplMaps : sc::state { - explicit DrainReplMaps(my_context ctx); - - using reactions = - mpl::list // all replicas are accounted for - >; - - sc::result react(const GotReplicas&); -}; - -struct WaitReplicas : sc::state { - explicit WaitReplicas(my_context ctx); - - using reactions = - mpl::list, // all replicas are accounted for - sc::transition, - sc::deferral // might arrive before we've reached WDU - >; - - sc::result react(const GotReplicas&); - - bool all_maps_already_called{false}; // see comment in react code -}; - -struct WaitDigestUpdate : sc::state { - explicit WaitDigestUpdate(my_context ctx); - - using reactions = mpl::list, - sc::transition, - sc::transition>; - sc::result react(const DigestUpdate&); -}; - -// ----------------------------- the "replica active" states ----------------------- - -/* - * Waiting for 'active_pushes' to complete - * - * When in this state: - * - the details of the Primary's request were internalized by PgScrubber; - * - 'active' scrubbing is set - */ -struct ReplicaWaitUpdates : sc::state { - explicit ReplicaWaitUpdates(my_context ctx); - using reactions = - mpl::list, sc::custom_reaction>; - - sc::result react(const ReplicaPushesUpd&); - sc::result react(const FullReset&); -}; - - -struct ActiveReplica : sc::state { - explicit ActiveReplica(my_context ctx); - using reactions = mpl::list, - sc::custom_reaction, - sc::transition>; - - sc::result react(const SchedReplica&); - sc::result react(const FullReset&); -}; - -} // namespace Scrub diff --git a/src/osd/scrub_machine_lstnr.h b/src/osd/scrub_machine_lstnr.h deleted file mode 100644 index 91dee910af2..00000000000 --- a/src/osd/scrub_machine_lstnr.h +++ /dev/null @@ 
-1,164 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#pragma once -/** - * \file the PgScrubber interface used by the scrub FSM - */ -#include "common/version.h" -#include "include/Context.h" - -#include "osd_types.h" - -namespace Scrub { - -enum class PreemptionNoted { no_preemption, preempted }; - -/// the interface exposed by the PgScrubber into its internal -/// preemption_data object -struct preemption_t { - - virtual ~preemption_t() = default; - - [[nodiscard]] virtual bool is_preemptable() const = 0; - - [[nodiscard]] virtual bool was_preempted() const = 0; - - virtual void adjust_parameters() = 0; - - /** - * Try to preempt the scrub. - * 'true' (i.e. - preempted) if: - * preemptable && not already preempted - */ - virtual bool do_preempt() = 0; - - /** - * disables preemptions. - * Returns 'true' if we were already preempted - */ - virtual bool disable_and_test() = 0; -}; - -/// an aux used when blocking on a busy object. -/// Issues a log warning if still blocked after 'waittime'. 
-struct blocked_range_t { - blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id); - ~blocked_range_t(); - - OSDService* m_osds; - Context* m_callbk; -}; - -using BlockedRangeWarning = std::unique_ptr; - -} // namespace Scrub - -struct ScrubMachineListener { - - struct MsgAndEpoch { - MessageRef m_msg; - epoch_t m_epoch; - }; - - virtual ~ScrubMachineListener() = default; - - [[nodiscard]] virtual bool is_primary() const = 0; - - virtual void select_range_n_notify() = 0; - - virtual Scrub::BlockedRangeWarning acquire_blocked_alarm() = 0; - - /// walk the log to find the latest update that affects our chunk - virtual eversion_t search_log_for_updates() const = 0; - - virtual eversion_t get_last_update_applied() const = 0; - - virtual int pending_active_pushes() const = 0; - - virtual int build_primary_map_chunk() = 0; - - virtual int build_replica_map_chunk() = 0; - - virtual void on_init() = 0; - - virtual void on_replica_init() = 0; - - virtual void replica_handling_done() = 0; - - /// the version of 'scrub_clear_state()' that does not try to invoke FSM services - /// (thus can be called from FSM reactions) - virtual void clear_pgscrub_state() = 0; - - /* - * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep' - * is asserted - after a configuration-dependent timeout. - */ - virtual void add_delayed_scheduling() = 0; - - /** - * Ask all replicas for their scrub maps for the current chunk. - */ - virtual void get_replicas_maps(bool replica_can_preempt) = 0; - - virtual void on_digest_updates() = 0; - - /** - * Prepare a MOSDRepScrubMap message carrying the requested scrub map - * @param was_preempted - were we preempted? - * @return the message, and the current value of 'm_replica_min_epoch' (which is - * used when sending the message, but will be overwritten before that). 
- */ - [[nodiscard]] virtual MsgAndEpoch prep_replica_map_msg( - Scrub::PreemptionNoted was_preempted) = 0; - - /** - * Send to the primary the pre-prepared message containing the requested map - */ - virtual void send_replica_map(const MsgAndEpoch& preprepared) = 0; - - /** - * Let the primary know that we were preempted while trying to build the - * requested map. - */ - virtual void send_preempted_replica() = 0; - - [[nodiscard]] virtual bool has_pg_marked_new_updates() const = 0; - - virtual void set_subset_last_update(eversion_t e) = 0; - - [[nodiscard]] virtual bool was_epoch_changed() const = 0; - - virtual Scrub::preemption_t& get_preemptor() = 0; - - /** - * a "technical" collection of the steps performed once all - * rep maps are available: - * - the maps are compared - * - the scrub region markers (start_ & end_) are advanced - * - callbacks and ops that were pending are allowed to run - */ - virtual void maps_compare_n_cleanup() = 0; - - /** - * order the PgScrubber to initiate the process of reserving replicas' scrub - * resources. - */ - virtual void reserve_replicas() = 0; - - virtual void unreserve_replicas() = 0; - - /** - * the FSM interface into the "are we waiting for maps, either our own or from - * replicas" state. 
- * The FSM can only: - * - mark the local map as available, and - * - query status - */ - virtual void mark_local_map_ready() = 0; - - [[nodiscard]] virtual bool are_all_maps_available() const = 0; - - /// a log/debug interface - virtual std::string dump_awaited_maps() const = 0; -}; diff --git a/src/osd/scrubber/PrimaryLogScrub.cc b/src/osd/scrubber/PrimaryLogScrub.cc new file mode 100644 index 00000000000..2be7b56f61b --- /dev/null +++ b/src/osd/scrubber/PrimaryLogScrub.cc @@ -0,0 +1,589 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "PrimaryLogScrub.h" + +#include "common/scrub_types.h" +#include "osd/osd_types_fmt.h" + +#include "osd/PeeringState.h" +#include "osd/PrimaryLogPG.h" +#include "scrub_machine.h" + +#define dout_context (m_pg->get_cct()) +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, this->m_pg) + +using std::vector; + +template static ostream& _prefix(std::ostream* _dout, T* t) +{ + return t->gen_prefix(*_dout) << " PrimaryLog scrubber pg(" << t->pg_id << ") "; +} + +using namespace Scrub; +using Scrub::ScrubMachine; + +bool PrimaryLogScrub::get_store_errors(const scrub_ls_arg_t& arg, + scrub_ls_result_t& res_inout) const +{ + if (!m_store) { + return false; + } + + if (arg.get_snapsets) { + res_inout.vals = + m_store->get_snap_errors(m_pg->get_pgid().pool(), arg.start_after, arg.max_return); + } else { + res_inout.vals = m_store->get_object_errors(m_pg->get_pgid().pool(), arg.start_after, + arg.max_return); + } + return true; +} + +void PrimaryLogScrub::_scrub_finish() +{ + auto& info = m_pg->get_pg_info(ScrubberPasskey{}); ///< a temporary alias + + dout(10) << __func__ + << " info stats: " << (info.stats.stats_invalid ? 
"invalid" : "valid") + << dendl; + + if (info.stats.stats_invalid) { + m_pl_pg->recovery_state.update_stats([=](auto& history, auto& stats) { + stats.stats = m_scrub_cstat; + stats.stats_invalid = false; + return false; + }); + + if (m_pl_pg->agent_state) + m_pl_pg->agent_choose_mode(); + } + + dout(10) << m_mode_desc << " got " << m_scrub_cstat.sum.num_objects << "/" + << info.stats.stats.sum.num_objects << " objects, " + << m_scrub_cstat.sum.num_object_clones << "/" + << info.stats.stats.sum.num_object_clones << " clones, " + << m_scrub_cstat.sum.num_objects_dirty << "/" + << info.stats.stats.sum.num_objects_dirty << " dirty, " + << m_scrub_cstat.sum.num_objects_omap << "/" + << info.stats.stats.sum.num_objects_omap << " omap, " + << m_scrub_cstat.sum.num_objects_pinned << "/" + << info.stats.stats.sum.num_objects_pinned << " pinned, " + << m_scrub_cstat.sum.num_objects_hit_set_archive << "/" + << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, " + << m_scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes + << " bytes, " << m_scrub_cstat.sum.num_objects_manifest << "/" + << info.stats.stats.sum.num_objects_manifest << " manifest objects, " + << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/" + << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes." 
+ << dendl; + + if (m_scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects || + m_scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones || + (m_scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty && + !info.stats.dirty_stats_invalid) || + (m_scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap && + !info.stats.omap_stats_invalid) || + (m_scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned && + !info.stats.pin_stats_invalid) || + (m_scrub_cstat.sum.num_objects_hit_set_archive != + info.stats.stats.sum.num_objects_hit_set_archive && + !info.stats.hitset_stats_invalid) || + (m_scrub_cstat.sum.num_bytes_hit_set_archive != + info.stats.stats.sum.num_bytes_hit_set_archive && + !info.stats.hitset_bytes_stats_invalid) || + (m_scrub_cstat.sum.num_objects_manifest != + info.stats.stats.sum.num_objects_manifest && + !info.stats.manifest_stats_invalid) || + m_scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts || + m_scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) { + m_osds->clog->error() << info.pgid << " " << m_mode_desc << " : stat mismatch, got " + << m_scrub_cstat.sum.num_objects << "/" + << info.stats.stats.sum.num_objects << " objects, " + << m_scrub_cstat.sum.num_object_clones << "/" + << info.stats.stats.sum.num_object_clones << " clones, " + << m_scrub_cstat.sum.num_objects_dirty << "/" + << info.stats.stats.sum.num_objects_dirty << " dirty, " + << m_scrub_cstat.sum.num_objects_omap << "/" + << info.stats.stats.sum.num_objects_omap << " omap, " + << m_scrub_cstat.sum.num_objects_pinned << "/" + << info.stats.stats.sum.num_objects_pinned << " pinned, " + << m_scrub_cstat.sum.num_objects_hit_set_archive << "/" + << info.stats.stats.sum.num_objects_hit_set_archive + << " hit_set_archive, " << m_scrub_cstat.sum.num_whiteouts + << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, " + << m_scrub_cstat.sum.num_bytes << "/" + << 
info.stats.stats.sum.num_bytes << " bytes, " + << m_scrub_cstat.sum.num_objects_manifest << "/" + << info.stats.stats.sum.num_objects_manifest + << " manifest objects, " + << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/" + << info.stats.stats.sum.num_bytes_hit_set_archive + << " hit_set_archive bytes."; + ++m_shallow_errors; + + if (m_is_repair) { + ++m_fixed_count; + m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) { + stats.stats = m_scrub_cstat; + stats.dirty_stats_invalid = false; + stats.omap_stats_invalid = false; + stats.hitset_stats_invalid = false; + stats.hitset_bytes_stats_invalid = false; + stats.pin_stats_invalid = false; + stats.manifest_stats_invalid = false; + return false; + }); + m_pl_pg->publish_stats_to_osd(); + m_pl_pg->recovery_state.share_pg_info(); + } + } + // Clear object context cache to get repair information + if (m_is_repair) + m_pl_pg->object_contexts.clear(); +} + +static bool doing_clones(const std::optional& snapset, + const vector::reverse_iterator& curclone) +{ + return snapset && curclone != snapset->clones.rend(); +} + +void PrimaryLogScrub::log_missing(int missing, + const std::optional& head, + LogChannelRef clog, + const spg_t& pgid, + const char* func, + bool allow_incomplete_clones) +{ + ceph_assert(head); + if (allow_incomplete_clones) { + dout(20) << func << " " << m_mode_desc << " " << pgid << " " << *head << " skipped " + << missing << " clone(s) in cache tier" << dendl; + } else { + clog->info() << m_mode_desc << " " << pgid << " " << *head << " : " << missing + << " missing clone(s)"; + } +} + +int PrimaryLogScrub::process_clones_to(const std::optional& head, + const std::optional& snapset, + LogChannelRef clog, + const spg_t& pgid, + bool allow_incomplete_clones, + std::optional target, + vector::reverse_iterator* curclone, + inconsistent_snapset_wrapper& e) +{ + ceph_assert(head); + ceph_assert(snapset); + int missing_count = 0; + + // NOTE: clones are in descending order, thus 
**curclone > target test here + hobject_t next_clone(*head); + while (doing_clones(snapset, *curclone) && (!target || **curclone > *target)) { + + ++missing_count; + // it is okay to be missing one or more clones in a cache tier. + // skip higher-numbered clones in the list. + if (!allow_incomplete_clones) { + next_clone.snap = **curclone; + clog->error() << m_mode_desc << " " << pgid << " " << *head << " : expected clone " + << next_clone << " " << missing_count << " missing"; + ++m_shallow_errors; + e.set_clone_missing(next_clone.snap); + } + // Clones are descending + ++(*curclone); + } + return missing_count; +} + +/* + * Validate consistency of the object info and snap sets. + * + * We are sort of comparing 2 lists. The main loop is on objmap.objects. But + * the comparison of the objects is against multiple snapset.clones. There are + * multiple clone lists and in between lists we expect head. + * + * Example + * + * objects expected + * ======= ======= + * obj1 snap 1 head, unexpected obj1 snap 1 + * obj2 head head, match + * [SnapSet clones 6 4 2 1] + * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7 + * obj2 snap 6 obj2 snap 6, match + * obj2 snap 4 obj2 snap 4, match + * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match + * [Snapset clones 3 1] + * obj3 snap 3 obj3 snap 3 match + * obj3 snap 1 obj3 snap 1 match + * obj4 head head, match + * [Snapset clones 4] + * EOL obj4 snap 4, (expected) + */ +void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap, + const missing_map_t& missing_digest) +{ + dout(10) << __func__ << " num stat obj " << m_pl_pg->info.stats.stats.sum.num_objects + << dendl; + + auto& info = m_pl_pg->info; + const PGPool& pool = m_pl_pg->pool; + bool allow_incomplete_clones = pool.info.allow_incomplete_clones(); + + std::optional all_clones; // Unspecified snapid_t or std::nullopt + + // traverse in reverse order. 
+ std::optional head; + std::optional snapset; // If initialized so will head (above) + vector::reverse_iterator curclone; // Defined only if snapset initialized + int missing = 0; + inconsistent_snapset_wrapper soid_error, head_error; + int soid_error_count = 0; + + for (auto p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) { + + const hobject_t& soid = p->first; + ceph_assert(!soid.is_snapdir()); + soid_error = inconsistent_snapset_wrapper{soid}; + object_stat_sum_t stat; + std::optional oi; + + stat.num_objects++; + + if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace) + stat.num_objects_hit_set_archive++; + + if (soid.is_snap()) { + // it's a clone + stat.num_object_clones++; + } + + // basic checks. + if (p->second.attrs.count(OI_ATTR) == 0) { + oi = std::nullopt; + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '" + << OI_ATTR << "' attr"; + ++m_shallow_errors; + soid_error.set_info_missing(); + } else { + bufferlist bv; + bv.push_back(p->second.attrs[OI_ATTR]); + try { + oi = object_info_t(bv); + } catch (ceph::buffer::error& e) { + oi = std::nullopt; + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : can't decode '" << OI_ATTR << "' attr " << e.what(); + ++m_shallow_errors; + soid_error.set_info_corrupted(); + soid_error.set_info_missing(); // Not available too + } + } + + if (oi) { + if (m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) != p->second.size) { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : on disk size (" << p->second.size + << ") does not match object info size (" << oi->size + << ") adjusted for ondisk to (" + << m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) << ")"; + soid_error.set_size_mismatch(); + ++m_shallow_errors; + } + + dout(20) << m_mode_desc << " " << soid << " " << *oi << dendl; + + // A clone num_bytes will be added later when we have snapset + if (!soid.is_snap()) { + stat.num_bytes += oi->size; 
+ } + if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace) + stat.num_bytes_hit_set_archive += oi->size; + + if (oi->is_dirty()) + ++stat.num_objects_dirty; + if (oi->is_whiteout()) + ++stat.num_whiteouts; + if (oi->is_omap()) + ++stat.num_objects_omap; + if (oi->is_cache_pinned()) + ++stat.num_objects_pinned; + if (oi->has_manifest()) + ++stat.num_objects_manifest; + } + + // Check for any problems while processing clones + if (doing_clones(snapset, curclone)) { + std::optional target; + // Expecting an object with snap for current head + if (soid.has_snapset() || soid.get_head() != head->get_head()) { + + dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid << " new object " << soid + << " while processing " << *head << dendl; + + target = all_clones; + } else { + ceph_assert(soid.is_snap()); + target = soid.snap; + } + + // Log any clones we were expecting to be there up to target + // This will set missing, but will be a no-op if soid.snap == *curclone. + missing += + process_clones_to(head, snapset, m_osds->clog, info.pgid, + allow_incomplete_clones, target, &curclone, head_error); + } + + bool expected; + // Check doing_clones() again in case we ran process_clones_to() + if (doing_clones(snapset, curclone)) { + // A head would have processed all clones above + // or all greater than *curclone. 
+ ceph_assert(soid.is_snap() && *curclone <= soid.snap); + + // After processing above clone snap should match the expected curclone + expected = (*curclone == soid.snap); + } else { + // If we aren't doing clones any longer, then expecting head + expected = soid.has_snapset(); + } + if (!expected) { + // If we couldn't read the head's snapset, just ignore clones + if (head && !snapset) { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : clone ignored due to missing snapset"; + } else { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : is an unexpected clone"; + } + ++m_shallow_errors; + soid_error.set_headless(); + m_store->add_snap_error(pool.id, soid_error); + ++soid_error_count; + if (head && soid.get_head() == head->get_head()) + head_error.set_clone(soid.snap); + continue; + } + + // new snapset? + if (soid.has_snapset()) { + + if (missing) { + log_missing(missing, head, m_osds->clog, info.pgid, __func__, + pool.info.allow_incomplete_clones()); + } + + // Save previous head error information + if (head && (head_error.errors || soid_error_count)) + m_store->add_snap_error(pool.id, head_error); + // Set this as a new head object + head = soid; + missing = 0; + head_error = soid_error; + soid_error_count = 0; + + dout(20) << __func__ << " " << m_mode_desc << " new head " << head << dendl; + + if (p->second.attrs.count(SS_ATTR) == 0) { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '" + << SS_ATTR << "' attr"; + ++m_shallow_errors; + snapset = std::nullopt; + head_error.set_snapset_missing(); + } else { + bufferlist bl; + bl.push_back(p->second.attrs[SS_ATTR]); + auto blp = bl.cbegin(); + try { + snapset = SnapSet(); // Initialize optional<> before decoding into it + decode(*snapset, blp); + head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]); + } catch (ceph::buffer::error& e) { + snapset = std::nullopt; + m_osds->clog->error() + << m_mode_desc << " " << 
info.pgid << " " << soid << " : can't decode '" << SS_ATTR + << "' attr " << e.what(); + ++m_shallow_errors; + head_error.set_snapset_corrupted(); + } + } + + if (snapset) { + // what will be next? + curclone = snapset->clones.rbegin(); + + if (!snapset->clones.empty()) { + dout(20) << " snapset " << *snapset << dendl; + if (snapset->seq == 0) { + m_osds->clog->error() + << m_mode_desc << " " << info.pgid << " " << soid << " : snaps.seq not set"; + ++m_shallow_errors; + head_error.set_snapset_error(); + } + } + } + } else { + ceph_assert(soid.is_snap()); + ceph_assert(head); + ceph_assert(snapset); + ceph_assert(soid.snap == *curclone); + + dout(20) << __func__ << " " << m_mode_desc << " matched clone " << soid << dendl; + + if (snapset->clone_size.count(soid.snap) == 0) { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : is missing in clone_size"; + ++m_shallow_errors; + soid_error.set_size_mismatch(); + } else { + if (oi && oi->size != snapset->clone_size[soid.snap]) { + m_osds->clog->error() + << m_mode_desc << " " << info.pgid << " " << soid << " : size " << oi->size + << " != clone_size " << snapset->clone_size[*curclone]; + ++m_shallow_errors; + soid_error.set_size_mismatch(); + } + + if (snapset->clone_overlap.count(soid.snap) == 0) { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : is missing in clone_overlap"; + ++m_shallow_errors; + soid_error.set_size_mismatch(); + } else { + // This checking is based on get_clone_bytes(). The first 2 asserts + // can't happen because we know we have a clone_size and + // a clone_overlap. Now we check that the interval_set won't + // cause the last assert. 
+ uint64_t size = snapset->clone_size.find(soid.snap)->second; + const interval_set& overlap = + snapset->clone_overlap.find(soid.snap)->second; + bool bad_interval_set = false; + for (interval_set::const_iterator i = overlap.begin(); + i != overlap.end(); ++i) { + if (size < i.get_len()) { + bad_interval_set = true; + break; + } + size -= i.get_len(); + } + + if (bad_interval_set) { + m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid + << " : bad interval_set in clone_overlap"; + ++m_shallow_errors; + soid_error.set_size_mismatch(); + } else { + stat.num_bytes += snapset->get_clone_bytes(soid.snap); + } + } + } + + // what's next? + ++curclone; + if (soid_error.errors) { + m_store->add_snap_error(pool.id, soid_error); + ++soid_error_count; + } + } + m_scrub_cstat.add(stat); + } + + if (doing_clones(snapset, curclone)) { + dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid + << " No more objects while processing " << *head << dendl; + + missing += + process_clones_to(head, snapset, m_osds->clog, info.pgid, + allow_incomplete_clones, all_clones, &curclone, head_error); + } + + // There could be missing found by the test above or even + // before dropping out of the loop for the last head. 
+ if (missing) { + log_missing(missing, head, m_osds->clog, info.pgid, __func__, + allow_incomplete_clones); + } + if (head && (head_error.errors || soid_error_count)) + m_store->add_snap_error(pool.id, head_error); + + dout(20) << __func__ << " - " << missing << " (" << missing_digest.size() << ") missing" + << dendl; + for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) { + + ceph_assert(!p->first.is_snapdir()); + dout(10) << __func__ << " recording digests for " << p->first << dendl; + + ObjectContextRef obc = m_pl_pg->get_object_context(p->first, false); + if (!obc) { + m_osds->clog->error() << info.pgid << " " << m_mode_desc + << " cannot get object context for object " << p->first; + continue; + } + if (obc->obs.oi.soid != p->first) { + m_osds->clog->error() << info.pgid << " " << m_mode_desc << " " << p->first + << " : object has a valid oi attr with a mismatched name, " + << " obc->obs.oi.soid: " << obc->obs.oi.soid; + continue; + } + PrimaryLogPG::OpContextUPtr ctx = m_pl_pg->simple_opc_create(obc); + ctx->at_version = m_pl_pg->get_next_version(); + ctx->mtime = utime_t(); // do not update mtime + if (p->second.first) { + ctx->new_obs.oi.set_data_digest(*p->second.first); + } else { + ctx->new_obs.oi.clear_data_digest(); + } + if (p->second.second) { + ctx->new_obs.oi.set_omap_digest(*p->second.second); + } else { + ctx->new_obs.oi.clear_omap_digest(); + } + m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY); + + ++num_digest_updates_pending; + ctx->register_on_success([this]() { + dout(20) << "updating scrub digest " << num_digest_updates_pending << dendl; + if (--num_digest_updates_pending <= 0) { + m_osds->queue_scrub_digest_update(m_pl_pg, m_pl_pg->is_scrub_blocking_ops()); + } + }); + + m_pl_pg->simple_opc_submit(std::move(ctx)); + } + + dout(10) << __func__ << " (" << m_mode_desc << ") finish" << dendl; +} + +PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg) : PgScrubber{pg}, m_pl_pg{pg} {} + +void 
PrimaryLogScrub::_scrub_clear_state() +{ + dout(15) << __func__ << dendl; + m_scrub_cstat = object_stat_collection_t(); +} + +void PrimaryLogScrub::stats_of_handled_objects(const object_stat_sum_t& delta_stats, + const hobject_t& soid) +{ + // We scrub objects in hobject_t order, so objects before m_start have already been + // scrubbed and their stats have already been added to the scrubber. Objects after that + // point haven't been included in the scrubber's stats accounting yet, so they will be + // included when the scrubber gets to that object. + if (is_primary() && is_scrub_active()) { + if (soid < m_start) { + + dout(20) << fmt::format("{} {} < [{},{})", __func__, soid, m_start, m_end) << dendl; + m_scrub_cstat.add(delta_stats); + + } else { + + dout(25) << fmt::format("{} {} >= [{},{})", __func__, soid, m_start, m_end) << dendl; + } + } +} diff --git a/src/osd/scrubber/PrimaryLogScrub.h b/src/osd/scrubber/PrimaryLogScrub.h new file mode 100644 index 00000000000..9ea889b44ae --- /dev/null +++ b/src/osd/scrubber/PrimaryLogScrub.h @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +// the './' includes are marked this way to affect clang-format +#include "./pg_scrubber.h" + +#include +#include +#include + +#include "debug.h" + +#include "common/errno.h" +#include "common/scrub_types.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDRepScrub.h" +#include "messages/MOSDRepScrubMap.h" +#include "messages/MOSDScrub.h" +#include "messages/MOSDScrubReserve.h" + +#include "osd/OSD.h" +#include "scrub_machine.h" + +class PrimaryLogPG; + +/** + * The derivative of PgScrubber that is used by PrimaryLogPG. 
+ */ +class PrimaryLogScrub : public PgScrubber { + public: + explicit PrimaryLogScrub(PrimaryLogPG* pg); + + void _scrub_finish() final; + + bool get_store_errors(const scrub_ls_arg_t& arg, + scrub_ls_result_t& res_inout) const final; + + void stats_of_handled_objects(const object_stat_sum_t& delta_stats, + const hobject_t& soid) final; + + private: + // we know our PG is actually a PrimaryLogPG. Let's alias the pointer to that object: + PrimaryLogPG* const m_pl_pg; + + /** + * Validate consistency of the object info and snap sets. + */ + void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) final; + + void log_missing(int missing, + const std::optional& head, + LogChannelRef clog, + const spg_t& pgid, + const char* func, + bool allow_incomplete_clones); + + int process_clones_to(const std::optional& head, + const std::optional& snapset, + LogChannelRef clog, + const spg_t& pgid, + bool allow_incomplete_clones, + std::optional target, + std::vector::reverse_iterator* curclone, + inconsistent_snapset_wrapper& snap_error); + + + // handle our part in stats collection + object_stat_collection_t m_scrub_cstat; + void _scrub_clear_state() final; // which just clears the stats +}; diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc new file mode 100644 index 00000000000..1787b3d8875 --- /dev/null +++ b/src/osd/scrubber/ScrubStore.cc @@ -0,0 +1,198 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ScrubStore.h" +#include "osd/osd_types.h" +#include "common/scrub_types.h" +#include "include/rados/rados_types.hpp" + +using std::ostringstream; +using std::string; +using std::vector; + +using ceph::bufferlist; + +namespace { +ghobject_t make_scrub_object(const spg_t& pgid) +{ + ostringstream ss; + ss << "scrub_" << pgid; + return pgid.make_temp_ghobject(ss.str()); +} + +string first_object_key(int64_t pool) +{ + auto hoid = hobject_t(object_t(), + "", + 0, + 
0x00000000, + pool, + ""); + hoid.build_hash_cache(); + return "SCRUB_OBJ_" + hoid.to_str(); +} + +// the object_key should be unique across pools +string to_object_key(int64_t pool, const librados::object_id_t& oid) +{ + auto hoid = hobject_t(object_t(oid.name), + oid.locator, // key + oid.snap, + 0, // hash + pool, + oid.nspace); + hoid.build_hash_cache(); + return "SCRUB_OBJ_" + hoid.to_str(); +} + +string last_object_key(int64_t pool) +{ + auto hoid = hobject_t(object_t(), + "", + 0, + 0xffffffff, + pool, + ""); + hoid.build_hash_cache(); + return "SCRUB_OBJ_" + hoid.to_str(); +} + +string first_snap_key(int64_t pool) +{ + // scrub object is per spg_t object, so we can misuse the hash (pg.seed) for + // representing the minimal and maximum keys. This relies on how + // hobject_t::to_str() works: hex(pool).hex(revhash). + auto hoid = hobject_t(object_t(), + "", + 0, + 0x00000000, + pool, + ""); + hoid.build_hash_cache(); + return "SCRUB_SS_" + hoid.to_str(); +} + +string to_snap_key(int64_t pool, const librados::object_id_t& oid) +{ + auto hoid = hobject_t(object_t(oid.name), + oid.locator, // key + oid.snap, + 0x77777777, // hash + pool, + oid.nspace); + hoid.build_hash_cache(); + return "SCRUB_SS_" + hoid.to_str(); +} + +string last_snap_key(int64_t pool) +{ + auto hoid = hobject_t(object_t(), + "", + 0, + 0xffffffff, + pool, + ""); + hoid.build_hash_cache(); + return "SCRUB_SS_" + hoid.to_str(); +} +} + +namespace Scrub { + +Store* +Store::create(ObjectStore* store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll) +{ + ceph_assert(store); + ceph_assert(t); + ghobject_t oid = make_scrub_object(pgid); + t->touch(coll, oid); + return new Store{coll, oid, store}; +} + +Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store) + : coll(coll), + hoid(oid), + driver(store, coll, hoid), + backend(&driver) +{} + +Store::~Store() +{ + ceph_assert(results.empty()); +} + +void Store::add_object_error(int64_t pool, const 
inconsistent_obj_wrapper& e) +{ + bufferlist bl; + e.encode(bl); + results[to_object_key(pool, e.object)] = bl; +} + +void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) +{ + bufferlist bl; + e.encode(bl); + results[to_snap_key(pool, e.object)] = bl; +} + +bool Store::empty() const +{ + return results.empty(); +} + +void Store::flush(ObjectStore::Transaction* t) +{ + if (t) { + OSDriver::OSTransaction txn = driver.get_transaction(t); + backend.set_keys(results, &txn); + } + results.clear(); +} + +void Store::cleanup(ObjectStore::Transaction* t) +{ + t->remove(coll, hoid); +} + +std::vector +Store::get_snap_errors(int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const +{ + const string begin = (start.name.empty() ? + first_snap_key(pool) : to_snap_key(pool, start)); + const string end = last_snap_key(pool); + return get_errors(begin, end, max_return); +} + +std::vector +Store::get_object_errors(int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const +{ + const string begin = (start.name.empty() ? 
+ first_object_key(pool) : to_object_key(pool, start)); + const string end = last_object_key(pool); + return get_errors(begin, end, max_return); +} + +std::vector +Store::get_errors(const string& begin, + const string& end, + uint64_t max_return) const +{ + vector errors; + auto next = std::make_pair(begin, bufferlist{}); + while (max_return && !backend.get_next(next.first, &next)) { + if (next.first >= end) + break; + errors.push_back(next.second); + max_return--; + } + return errors; +} + +} // namespace Scrub diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h new file mode 100644 index 00000000000..57cd0e852d5 --- /dev/null +++ b/src/osd/scrubber/ScrubStore.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_SCRUB_RESULT_H +#define CEPH_SCRUB_RESULT_H + +#include "osd/SnapMapper.h" // for OSDriver +#include "common/map_cacher.hpp" + +namespace librados { + struct object_id_t; +} + +struct inconsistent_obj_wrapper; +struct inconsistent_snapset_wrapper; + +namespace Scrub { + +class Store { +public: + ~Store(); + static Store* create(ObjectStore* store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); + void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e); + void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e); + bool empty() const; + void flush(ObjectStore::Transaction *); + void cleanup(ObjectStore::Transaction *); + std::vector get_snap_errors(int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const; + std::vector get_object_errors(int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const; +private: + Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store); + std::vector get_errors(const std::string& start, const std::string& end, + uint64_t max_return) const; +private: + const coll_t coll; + const ghobject_t hoid; + // a temp object 
holding mappings from seq-id to inconsistencies found in + // scrubbing + OSDriver driver; + mutable MapCacher::MapCacher backend; + std::map results; +}; +} + +#endif // CEPH_SCRUB_RESULT_H diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc new file mode 100644 index 00000000000..a9405ad82af --- /dev/null +++ b/src/osd/scrubber/pg_scrubber.cc @@ -0,0 +1,2392 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=2 sw=2 smarttab + +#include "./pg_scrubber.h" // the '.' notation used to affect clang-format order + +#include +#include + +#include "debug.h" + +#include "common/errno.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDRepScrub.h" +#include "messages/MOSDRepScrubMap.h" +#include "messages/MOSDScrub.h" +#include "messages/MOSDScrubReserve.h" + +#include "osd/OSD.h" +#include "ScrubStore.h" +#include "scrub_machine.h" + +using std::list; +using std::map; +using std::pair; +using std::set; +using std::stringstream; +using std::vector; +using namespace Scrub; +using namespace std::chrono; +using namespace std::chrono_literals; +using namespace std::literals; + +#define dout_context (m_pg->get_cct()) +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout, this->m_pg) + +template static ostream& _prefix(std::ostream* _dout, T* t) +{ + return t->gen_prefix(*_dout) << " scrubber pg(" << t->pg_id << ") "; +} + +ostream& operator<<(ostream& out, const scrub_flags_t& sf) +{ + if (sf.auto_repair) + out << " AUTO_REPAIR"; + if (sf.check_repair) + out << " CHECK_REPAIR"; + if (sf.deep_scrub_on_error) + out << " DEEP_SCRUB_ON_ERROR"; + if (sf.required) + out << " REQ_SCRUB"; + + return out; +} + +ostream& operator<<(ostream& out, const requested_scrub_t& sf) +{ + if (sf.must_repair) + out << " MUST_REPAIR"; + if (sf.auto_repair) + out << " planned AUTO_REPAIR"; + if (sf.check_repair) + out << " planned CHECK_REPAIR"; + if (sf.deep_scrub_on_error) + out << " planned 
DEEP_SCRUB_ON_ERROR"; + if (sf.must_deep_scrub) + out << " MUST_DEEP_SCRUB"; + if (sf.must_scrub) + out << " MUST_SCRUB"; + if (sf.time_for_deep) + out << " TIME_FOR_DEEP"; + if (sf.need_auto) + out << " NEED_AUTO"; + if (sf.req_scrub) + out << " planned REQ_SCRUB"; + + return out; +} + +/* + * if the incoming message is from a previous interval, it must mean + * PrimaryLogPG::on_change() was called when that interval ended. We can safely discard + * the stale message. + */ +bool PgScrubber::check_interval(epoch_t epoch_to_verify) +{ + return epoch_to_verify >= m_pg->get_same_interval_since(); +} + +bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify) +{ + if (!m_active) { + // not scrubbing. We can assume that the scrub was already terminated, and we + // can silently discard the incoming event. + return false; + } + + // is this a message from before we started this scrub? + if (epoch_to_verify < m_epoch_start) { + return false; + } + + // has a new interval started? + if (!check_interval(epoch_to_verify)) { + // if this is a new interval, on_change() has already terminated that + // old scrub. + return false; + } + + ceph_assert(is_primary()); + + // were we instructed to abort? + return verify_against_abort(epoch_to_verify); +} + +bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify) +{ + if (!should_abort()) { + return true; + } + + dout(10) << __func__ << " aborting. incoming epoch: " << epoch_to_verify + << " vs last-aborted: " << m_last_aborted << dendl; + + // if we were not aware of the abort before - kill the scrub. 
+ if (epoch_to_verify > m_last_aborted) { + scrub_clear_state(); + m_last_aborted = std::max(epoch_to_verify, m_epoch_start); + } + return false; +} + +bool PgScrubber::should_abort() const +{ + if (m_flags.required) { + return false; // not stopping 'required' scrubs for configuration changes + } + + if (m_is_deep) { + if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) || + m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) { + dout(10) << "nodeep_scrub set, aborting" << dendl; + return true; + } + } + + if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || + m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) { + dout(10) << "noscrub set, aborting" << dendl; + return true; + } + + return false; +} + +// initiating state-machine events -------------------------------- + +/* + * a note re the checks performed before sending scrub-initiating messages: + * + * For those ('StartScrub', 'AfterRepairScrub') scrub-initiation messages that + * possibly were in the queue while the PG changed state and became unavailable for + * scrubbing: + * + * The check_interval() catches all major changes to the PG. As for the other conditions + * we may check (and see is_message_relevant() above): + * + * - we are not 'active' yet, so must not check against is_active(), and: + * + * - the 'abort' flags were just verified (when the triggering message was queued). As + * those are only modified in human speeds - they need not be queried again. + * + * Some of the considerations above are also relevant to the replica-side initiation + * ('StartReplica' & 'StartReplicaNoWait'). 
+ */ + +void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued) +{ + dout(15) << __func__ << " epoch: " << epoch_queued << dendl; + // we may have lost our Primary status while the message languished in the queue + if (check_interval(epoch_queued)) { + dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued << dendl; + reset_epoch(epoch_queued); + m_fsm->my_states(); + m_fsm->process_event(StartScrub{}); + dout(10) << "scrubber event --<< StartScrub" << dendl; + } +} + +void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued) +{ + dout(15) << __func__ << " epoch: " << epoch_queued << dendl; + // we may have lost our Primary status while the message languished in the queue + if (check_interval(epoch_queued)) { + dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued << dendl; + reset_epoch(epoch_queued); + m_fsm->my_states(); + m_fsm->process_event(AfterRepairScrub{}); + dout(10) << "scrubber event --<< AfterRepairScrub" << dendl; + } +} + +void PgScrubber::send_scrub_unblock(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(Unblocked{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_scrub_resched(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(InternalSchedScrub{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued + << " token: " << token << dendl; + if (is_primary()) { + // shouldn't happen. Ignore + dout(1) << "got a replica scrub request while Primary!" 
<< dendl; + return; + } + + if (check_interval(epoch_queued) && is_token_current(token)) { + m_fsm->my_states(); + // save us some time by not waiting for updates if there are none + // to wait for. Affects the transition from NotActive into either + // ReplicaWaitUpdates or ActiveReplica. + if (pending_active_pushes()) + m_fsm->process_event(StartReplica{}); + else + m_fsm->process_event(StartReplicaNoWait{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued + << " token: " << token << dendl; + if (check_interval(epoch_queued) && is_token_current(token)) { + m_fsm->my_states(); + m_fsm->process_event(SchedReplica{}); // retest for map availability + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::active_pushes_notification(epoch_t epoch_queued) +{ + // note: Primary only + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(ActivePushesUpd{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::update_applied_notification(epoch_t epoch_queued) +{ + // note: Primary only + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(UpdatesApplied{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::digest_update_notification(epoch_t epoch_queued) +{ + // note: Primary only + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(DigestUpdate{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void 
PgScrubber::send_local_map_done(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(Scrub::IntLocalMapDone{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(GotReplicas{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(ReplicaPushesUpd{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_remotes_reserved(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + // note: scrub is not active yet + if (check_interval(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(RemotesReserved{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_reservation_failure(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval(epoch_queued)) { // do not check for 'active'! 
+ m_fsm->my_states(); + m_fsm->process_event(ReservationFailure{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_full_reset(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + + m_fsm->my_states(); + m_fsm->process_event(Scrub::FullReset{}); + + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_chunk_free(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(Scrub::SelectedChunkFree{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_chunk_busy(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(Scrub::ChunkIsBusy{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_get_next_chunk(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(Scrub::NextChunk{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_scrub_is_finished(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + + // can't check for "active" + + m_fsm->my_states(); + m_fsm->process_event(Scrub::ScrubFinished{}); + + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_maps_compared(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + + m_fsm->my_states(); + m_fsm->process_event(Scrub::MapsCompared{}); + + dout(10) << "scrubber event --<< " << __func__ << 
dendl; +} + +// ----------------- + +bool PgScrubber::is_reserving() const +{ + return m_fsm->is_reserving(); +} + +void PgScrubber::reset_epoch(epoch_t epoch_queued) +{ + dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl; + m_fsm->assert_not_active(); + + m_epoch_start = epoch_queued; + m_needs_sleep = true; + m_is_deep = state_test(PG_STATE_DEEP_SCRUB); + update_op_mode_text(); +} + +unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const +{ + unsigned int qu_priority = m_flags.priority; + + if (with_priority == Scrub::scrub_prio_t::high_priority) { + qu_priority = + std::max(qu_priority, (unsigned int)m_pg->get_cct()->_conf->osd_client_op_priority); + } + return qu_priority; +} + +unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, + unsigned int suggested_priority) const +{ + if (with_priority == Scrub::scrub_prio_t::high_priority) { + suggested_priority = std::max(suggested_priority, + (unsigned int)m_pg->cct->_conf->osd_client_op_priority); + } + return suggested_priority; +} + +// ///////////////////////////////////////////////////////////////////// // +// scrub-op registration handling + +bool PgScrubber::is_scrub_registered() const +{ + return !m_scrub_reg_stamp.is_zero(); +} + +void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags) +{ + if (!is_primary()) { + // normal. No warning is required. + return; + } + + dout(10) << __func__ << " planned: must? " << request_flags.must_scrub << " need-auto? 
" + << request_flags.need_auto << " stamp: " << m_pg->info.history.last_scrub_stamp + << dendl; + + ceph_assert(!is_scrub_registered()); + + utime_t reg_stamp; + bool must = false; + + if (request_flags.must_scrub || request_flags.need_auto) { + // Set the smallest time that isn't utime_t() + reg_stamp = PgScrubber::scrub_must_stamp(); + must = true; + } else if (m_pg->info.stats.stats_invalid && + m_pg->cct->_conf->osd_scrub_invalid_stats) { + reg_stamp = ceph_clock_now(); + must = true; + } else { + reg_stamp = m_pg->info.history.last_scrub_stamp; + } + + dout(15) << __func__ << " pg(" << m_pg_id << ") must: " << must + << " required:" << m_flags.required << " flags: " << request_flags + << " stamp: " << reg_stamp << dendl; + + const double scrub_min_interval = + m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MIN_INTERVAL, 0.0); + const double scrub_max_interval = + m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0); + + // note the sched_time, so we can locate this scrub, and remove it later + m_scrub_reg_stamp = m_osds->reg_pg_scrub(m_pg->info.pgid, reg_stamp, scrub_min_interval, + scrub_max_interval, must); + dout(15) << __func__ << " pg(" << m_pg_id << ") register next scrub, scrub time " + << m_scrub_reg_stamp << ", must = " << (int)must << dendl; +} + +void PgScrubber::unreg_next_scrub() +{ + if (is_scrub_registered()) { + dout(15) << __func__ << " existing-" << m_scrub_reg_stamp << dendl; + m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp); + m_scrub_reg_stamp = utime_t{}; + } +} + +void PgScrubber::scrub_requested(scrub_level_t scrub_level, + scrub_type_t scrub_type, + requested_scrub_t& req_flags) +{ + dout(10) << __func__ << (scrub_level == scrub_level_t::deep ? " deep " : " shallow ") + << (scrub_type == scrub_type_t::do_repair ? 
" repair-scrub " : " not-repair ") + << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered() + << dendl; + + unreg_next_scrub(); + + req_flags.must_scrub = true; + req_flags.must_deep_scrub = + (scrub_level == scrub_level_t::deep) || (scrub_type == scrub_type_t::do_repair); + req_flags.must_repair = (scrub_type == scrub_type_t::do_repair); + // User might intervene, so clear this + req_flags.need_auto = false; + req_flags.req_scrub = true; + + dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl; + + reg_next_scrub(req_flags); +} + +void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags) +{ + dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? " + << is_scrub_registered() << dendl; + + unreg_next_scrub(); + req_flags.need_auto = true; + reg_next_scrub(req_flags); +} + +bool PgScrubber::reserve_local() +{ + // try to create the reservation object (which translates into asking the + // OSD for the local scrub resource). If failing - undo it immediately + + m_local_osd_resource.emplace(m_pg, m_osds); + if (!m_local_osd_resource->is_reserved()) { + m_local_osd_resource.reset(); + return false; + } + + return true; +} + +// ---------------------------------------------------------------------------- + +bool PgScrubber::has_pg_marked_new_updates() const +{ + auto last_applied = m_pg->recovery_state.get_last_update_applied(); + dout(10) << __func__ << " recovery last: " << last_applied + << " vs. 
scrub's: " << m_subset_last_update << dendl; + + return last_applied >= m_subset_last_update; +} + +void PgScrubber::set_subset_last_update(eversion_t e) +{ + m_subset_last_update = e; + dout(15) << __func__ << " last-update: " << e << dendl; +} + +void PgScrubber::on_applied_when_primary(const eversion_t& applied_version) +{ + // we are only interested in updates if we are the Primary, and in state + // WaitLastUpdate + if (m_fsm->is_accepting_updates() && (applied_version >= m_subset_last_update)) { + m_osds->queue_scrub_applied_update(m_pg, m_pg->is_scrub_blocking_ops()); + dout(15) << __func__ << " update: " << applied_version + << " vs. required: " << m_subset_last_update << dendl; + } +} + +/* + * The selected range is set directly into 'm_start' and 'm_end' + * setting: + * - m_subset_last_update + * - m_max_end + * - end + * - start + */ +bool PgScrubber::select_range() +{ + m_primary_scrubmap = ScrubMap{}; + m_received_maps.clear(); + + /* get the start and end of our scrub chunk + * + * Our scrub chunk has an important restriction we're going to need to + * respect. We can't let head be start or end. + * Using a half-open interval means that if end == head, + * we'd scrub/lock head and the clone right next to head in different + * chunks which would allow us to miss clones created between + * scrubbing that chunk and scrubbing the chunk including head. + * This isn't true for any of the other clones since clones can + * only be created "just to the left of" head. There is one exception + * to this: promotion of clones which always happens to the left of the + * left-most clone, but promote_object checks the scrubber in that + * case, so it should be ok. Also, it's ok to "miss" clones at the + * left end of the range if we are a tier because they may legitimately + * not exist (see _scrub). 
+ */ + int min_idx = std::max( + 3, m_pg->get_cct()->_conf->osd_scrub_chunk_min / preemption_data.chunk_divisor()); + + int max_idx = std::max(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max / + preemption_data.chunk_divisor()); + + dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx + << " Div: " << preemption_data.chunk_divisor() << dendl; + + hobject_t start = m_start; + hobject_t candidate_end; + std::vector objects; + int ret = m_pg->get_pgbackend()->objects_list_partial(start, min_idx, max_idx, &objects, + &candidate_end); + ceph_assert(ret >= 0); + + if (!objects.empty()) { + + hobject_t back = objects.back(); + while (candidate_end.is_head() && candidate_end == back.get_head()) { + candidate_end = back; + objects.pop_back(); + if (objects.empty()) { + ceph_assert(0 == + "Somehow we got more than 2 objects which" + "have the same head but are not clones"); + } + back = objects.back(); + } + + if (candidate_end.is_head()) { + ceph_assert(candidate_end != back.get_head()); + candidate_end = candidate_end.get_object_boundary(); + } + + } else { + ceph_assert(candidate_end.is_max()); + } + + // is that range free for us? 
if not - we will be rescheduled later by whoever + // triggered us this time + + if (!m_pg->_range_available_for_scrub(m_start, candidate_end)) { + // we'll be requeued by whatever made us unavailable for scrub + dout(10) << __func__ << ": scrub blocked somewhere in range " + << "[" << m_start << ", " << candidate_end << ")" << dendl; + return false; + } + + m_end = candidate_end; + if (m_end > m_max_end) + m_max_end = m_end; + + dout(15) << __func__ << " range selected: " << m_start << " //// " << m_end << " //// " + << m_max_end << dendl; + + // debug: be 'blocked' if told so by the 'pg scrub_debug block' asok command + if (m_debug_blockrange > 0) { + m_debug_blockrange--; + return false; + } + return true; +} + +void PgScrubber::select_range_n_notify() +{ + if (select_range()) { + // the next chunk to handle is not blocked + dout(20) << __func__ << ": selection OK" << dendl; + m_osds->queue_scrub_chunk_free(m_pg, Scrub::scrub_prio_t::low_priority); + + } else { + // we will wait for the objects range to become available for scrubbing + dout(10) << __func__ << ": selected chunk is busy" << dendl; + m_osds->queue_scrub_chunk_busy(m_pg, Scrub::scrub_prio_t::low_priority); + } +} + +bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid) +{ + if (soid < m_start || soid >= m_end) { + return false; + } + + dout(20) << __func__ << " " << soid << " can preempt? " + << preemption_data.is_preemptable() << " already preempted? " + << preemption_data.was_preempted() << dendl; + + if (preemption_data.was_preempted()) { + // otherwise - write requests arriving while 'already preempted' is set + // but 'preemptable' is not - will not be allowed to continue, and will + // not be requeued on time. 
+ return false; + } + + if (preemption_data.is_preemptable()) { + + dout(10) << __func__ << " " << soid << " preempted" << dendl; + + // signal the preemption + preemption_data.do_preempt(); + m_end = m_start; // free the range we were scrubbing + + return false; + } + return true; +} + +bool PgScrubber::range_intersects_scrub(const hobject_t& start, const hobject_t& end) +{ + // does [start, end] intersect [scrubber.start, scrubber.m_max_end) + return (start < m_max_end && end >= m_start); +} + +Scrub::BlockedRangeWarning PgScrubber::acquire_blocked_alarm() +{ + return std::make_unique(m_osds, ceph::timespan{300s}, m_pg_id); +} + +/** + * if we are required to sleep: + * arrange a callback sometimes later. + * be sure to be able to identify a stale callback. + * Otherwise: perform a requeue (i.e. - rescheduling thru the OSD queue) + * anyway. + */ +void PgScrubber::add_delayed_scheduling() +{ + m_end = m_start; // not blocking any range now + + milliseconds sleep_time{0ms}; + if (m_needs_sleep) { + double scrub_sleep = 1000.0 * m_osds->osd->scrub_sleep_time(m_flags.required); + sleep_time = milliseconds{long(scrub_sleep)}; + } + dout(15) << __func__ << " sleep: " << sleep_time.count() << "ms. needed? " + << m_needs_sleep << dendl; + + if (sleep_time.count()) { + // schedule a transition for some 'sleep_time' ms in the future + + m_needs_sleep = false; + m_sleep_started_at = ceph_clock_now(); + + // the following log line is used by osd-scrub-test.sh + dout(20) << __func__ << " scrub state is PendingTimer, sleeping" << dendl; + + // the 'delayer' for crimson is different. Will be factored out. 
+ + spg_t pgid = m_pg->get_pgid(); + auto callbk = new LambdaContext([osds = m_osds, pgid, + scrbr = this]([[maybe_unused]] int r) mutable { + PGRef pg = osds->osd->lookup_lock_pg(pgid); + if (!pg) { + lgeneric_subdout(g_ceph_context, osd, 10) + << "scrub_requeue_callback: Could not find " + << "PG " << pgid << " can't complete scrub requeue after sleep" << dendl; + return; + } + scrbr->m_needs_sleep = true; + lgeneric_dout(scrbr->get_pg_cct(), 7) + << "scrub_requeue_callback: slept for " + << ceph_clock_now() - scrbr->m_sleep_started_at << ", re-queuing scrub" << dendl; + + scrbr->m_sleep_started_at = utime_t{}; + osds->queue_for_scrub_resched(&(*pg), Scrub::scrub_prio_t::low_priority); + pg->unlock(); + }); + + std::lock_guard l(m_osds->sleep_lock); + m_osds->sleep_timer.add_event_after(sleep_time.count() / 1000.0f, callbk); + + } else { + // just a requeue + m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority); + } +} + +eversion_t PgScrubber::search_log_for_updates() const +{ + auto& projected = m_pg->projected_log.log; + auto pi = find_if( + projected.crbegin(), projected.crend(), + [this](const auto& e) -> bool { return e.soid >= m_start && e.soid < m_end; }); + + if (pi != projected.crend()) + return pi->version; + + // there was no relevant update entry in the log + + auto& log = m_pg->recovery_state.get_pg_log().get_log().log; + auto p = find_if(log.crbegin(), log.crend(), [this](const auto& e) -> bool { + return e.soid >= m_start && e.soid < m_end; + }); + + if (p == log.crend()) + return eversion_t{}; + else + return p->version; +} + +void PgScrubber::get_replicas_maps(bool replica_can_preempt) +{ + dout(10) << __func__ << " started in epoch/interval: " << m_epoch_start << "/" + << m_interval_start + << " pg same_interval_since: " << m_pg->info.history.same_interval_since + << dendl; + + m_primary_scrubmap_pos.reset(); + + // ask replicas to scan and send maps + for (const auto& i : m_pg->get_acting_recovery_backfill()) { + + if (i 
== m_pg_whoami) + continue; + + m_maps_status.mark_replica_map_request(i); + _request_scrub_map(i, m_subset_last_update, m_start, m_end, m_is_deep, + replica_can_preempt); + } + + dout(10) << __func__ << " awaiting" << m_maps_status << dendl; +} + +bool PgScrubber::was_epoch_changed() const +{ + // for crimson we have m_pg->get_info().history.same_interval_since + dout(10) << __func__ << " epoch_start: " << m_interval_start + << " from pg: " << m_pg->get_history().same_interval_since << dendl; + + return m_interval_start < m_pg->get_history().same_interval_since; +} + +void PgScrubber::mark_local_map_ready() +{ + m_maps_status.mark_local_map_ready(); +} + +bool PgScrubber::are_all_maps_available() const +{ + return m_maps_status.are_all_maps_available(); +} + +std::string PgScrubber::dump_awaited_maps() const +{ + return m_maps_status.dump(); +} + +void PgScrubber::update_op_mode_text() +{ + auto visible_repair = state_test(PG_STATE_REPAIR); + m_mode_desc = (visible_repair ? "repair" : (m_is_deep ? "deep-scrub" : "scrub")); + + dout(10) << __func__ << ": repair: visible: " << (visible_repair ? "true" : "false") + << ", internal: " << (m_is_repair ? "true" : "false") + << ". Displayed: " << m_mode_desc << dendl; +} + +void PgScrubber::_request_scrub_map(pg_shard_t replica, + eversion_t version, + hobject_t start, + hobject_t end, + bool deep, + bool allow_preemption) +{ + ceph_assert(replica != m_pg_whoami); + dout(10) << __func__ << " scrubmap from osd." << replica + << (deep ? " deep" : " shallow") << dendl; + + auto repscrubop = + new MOSDRepScrub(spg_t(m_pg->info.pgid.pgid, replica.shard), version, + get_osdmap_epoch(), m_pg->get_last_peering_reset(), start, end, deep, + allow_preemption, m_flags.priority, m_pg->ops_blocked_by_scrub()); + + // default priority. We want the replica-scrub processed prior to any recovery + // or client io messages (we are holding a lock!) 
+ m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch()); +} + +void PgScrubber::cleanup_store(ObjectStore::Transaction* t) +{ + if (!m_store) + return; + + struct OnComplete : Context { + std::unique_ptr store; + explicit OnComplete(std::unique_ptr&& store) : store(std::move(store)) + {} + void finish(int) override {} + }; + m_store->cleanup(t); + t->register_on_complete(new OnComplete(std::move(m_store))); + ceph_assert(!m_store); +} + +void PgScrubber::on_init() +{ + // going upwards from 'inactive' + ceph_assert(!is_scrub_active()); + + preemption_data.reset(); + m_pg->publish_stats_to_osd(); + m_interval_start = m_pg->get_history().same_interval_since; + + dout(10) << __func__ << " start same_interval:" << m_interval_start << dendl; + + // create a new store + { + ObjectStore::Transaction t; + cleanup_store(&t); + m_store.reset( + Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll)); + m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); + } + + m_start = m_pg->info.pgid.pgid.get_hobj_start(); + m_active = true; +} + +void PgScrubber::on_replica_init() +{ + m_active = true; +} + +void PgScrubber::_scan_snaps(ScrubMap& smap) +{ + hobject_t head; + SnapSet snapset; + + // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings + // in this function + dout(15) << "_scan_snaps starts" << dendl; + + for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) { + + const hobject_t& hoid = i->first; + ScrubMap::object& o = i->second; + + dout(20) << __func__ << " " << hoid << dendl; + + ceph_assert(!hoid.is_snapdir()); + if (hoid.is_head()) { + // parse the SnapSet + bufferlist bl; + if (o.attrs.find(SS_ATTR) == o.attrs.end()) { + continue; + } + bl.push_back(o.attrs[SS_ATTR]); + auto p = bl.cbegin(); + try { + decode(snapset, p); + } catch (...) 
{ + continue; + } + head = hoid.get_head(); + continue; + } + + if (hoid.snap < CEPH_MAXSNAP) { + // check and if necessary fix snap_mapper + if (hoid.get_head() != head) { + derr << __func__ << " no head for " << hoid << " (have " << head << ")" << dendl; + continue; + } + set obj_snaps; + auto p = snapset.clone_snaps.find(hoid.snap); + if (p == snapset.clone_snaps.end()) { + derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset << dendl; + continue; + } + obj_snaps.insert(p->second.begin(), p->second.end()); + set cur_snaps; + int r = m_pg->snap_mapper.get_snaps(hoid, &cur_snaps); + if (r != 0 && r != -ENOENT) { + derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl; + ceph_abort(); + } + if (r == -ENOENT || cur_snaps != obj_snaps) { + ObjectStore::Transaction t; + OSDriver::OSTransaction _t(m_pg->osdriver.get_transaction(&t)); + if (r == 0) { + r = m_pg->snap_mapper.remove_oid(hoid, &_t); + if (r != 0) { + derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl; + ceph_abort(); + } + m_pg->osd->clog->error() + << "osd." << m_pg->osd->whoami << " found snap mapper error on pg " + << m_pg->info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps + << ", oi: " << obj_snaps << "...repaired"; + } else { + m_pg->osd->clog->error() + << "osd." << m_pg->osd->whoami << " found snap mapper error on pg " + << m_pg->info.pgid << " oid " << hoid << " snaps missing in mapper" + << ", should be: " << obj_snaps << " was " << cur_snaps << " r " << r + << "...repaired"; + } + m_pg->snap_mapper.add_oid(hoid, obj_snaps, &_t); + + // wait for repair to apply to avoid confusing other bits of the system. + { + dout(15) << __func__ << " wait on repair!" 
<< dendl; + + ceph::condition_variable my_cond; + ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock"); + int e = 0; + bool done; + + t.register_on_applied_sync(new C_SafeCond(my_lock, my_cond, &done, &e)); + + e = m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t)); + if (e != 0) { + derr << __func__ << ": queue_transaction got " << cpp_strerror(e) << dendl; + } else { + std::unique_lock l{my_lock}; + my_cond.wait(l, [&done] { return done; }); + } + } + } + } + } +} + +int PgScrubber::build_primary_map_chunk() +{ + epoch_t map_building_since = m_pg->get_osdmap_epoch(); + dout(20) << __func__ << ": initiated at epoch " << map_building_since << dendl; + + auto ret = build_scrub_map_chunk(m_primary_scrubmap, m_primary_scrubmap_pos, m_start, + m_end, m_is_deep); + + if (ret == -EINPROGRESS) { + // reschedule another round of asking the backend to collect the scrub data + m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::low_priority); + } + return ret; +} + +int PgScrubber::build_replica_map_chunk() +{ + dout(10) << __func__ << " interval start: " << m_interval_start + << " current token: " << m_current_token << " epoch: " << m_epoch_start + << " deep: " << m_is_deep << dendl; + + auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end, + m_is_deep); + + switch (ret) { + + case -EINPROGRESS: + // must wait for the backend to finish. No external event source. + // (note: previous version used low priority here. Now switched to using the + // priority of the original message) + m_osds->queue_for_rep_scrub_resched(m_pg, m_replica_request_priority, + m_flags.priority, m_current_token); + break; + + case 0: { + // finished! + m_cleaned_meta_map.clear_from(m_start); + m_cleaned_meta_map.insert(replica_scrubmap); + auto for_meta_scrub = clean_meta_map(); + _scan_snaps(for_meta_scrub); + + // the local map has been created. Send it to the primary. 
+ // Note: once the message reaches the Primary, it may ask us for another + // chunk - and we better be done with the current scrub. Thus - the preparation of + // the reply message is separate, and we clear the scrub state before actually + // sending it. + + auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption); + replica_handling_done(); + dout(15) << __func__ << " chunk map sent " << dendl; + send_replica_map(reply); + } break; + + default: + // negative retval: build_scrub_map_chunk() signalled an error + // Pre-Pacific code ignored this option, treating it as a success. + // \todo Add an error flag in the returning message. + dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret + << dendl; + replica_handling_done(); + // only in debug mode for now: + assert(false && "backend error"); + break; + }; + + return ret; +} + +int PgScrubber::build_scrub_map_chunk( + ScrubMap& map, ScrubMapBuilder& pos, hobject_t start, hobject_t end, bool deep) +{ + dout(10) << __func__ << " [" << start << "," << end << ") " + << " pos " << pos << " Deep: " << deep << dendl; + + // start + while (pos.empty()) { + + pos.deep = deep; + map.valid_through = m_pg->info.last_update; + + // objects + vector rollback_obs; + pos.ret = + m_pg->get_pgbackend()->objects_list_range(start, end, &pos.ls, &rollback_obs); + dout(10) << __func__ << " while pos empty " << pos.ret << dendl; + if (pos.ret < 0) { + dout(5) << "objects_list_range error: " << pos.ret << dendl; + return pos.ret; + } + dout(10) << __func__ << " pos.ls.empty()? " << (pos.ls.empty() ? 
"+" : "-") << dendl; + if (pos.ls.empty()) { + break; + } + m_pg->_scan_rollback_obs(rollback_obs); + pos.pos = 0; + return -EINPROGRESS; + } + + // scan objects + while (!pos.done()) { + + int r = m_pg->get_pgbackend()->be_scan_list(map, pos); + dout(30) << __func__ << " BE returned " << r << dendl; + if (r == -EINPROGRESS) { + dout(20) << __func__ << " in progress" << dendl; + return r; + } + } + + // finish + dout(20) << __func__ << " finishing" << dendl; + ceph_assert(pos.done()); + m_pg->_repair_oinfo_oid(map); + + dout(20) << __func__ << " done, got " << map.objects.size() << " items" << dendl; + return 0; +} + +/* + * Process: + * Building a map of objects suitable for snapshot validation. + * The data in m_cleaned_meta_map is the left over partial items that need to + * be completed before they can be processed. + * + * Snapshots in maps precede the head object, which is why we are scanning backwards. + */ +ScrubMap PgScrubber::clean_meta_map() +{ + ScrubMap for_meta_scrub; + + if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) { + m_cleaned_meta_map.swap(for_meta_scrub); + } else { + auto iter = m_cleaned_meta_map.objects.end(); + --iter; // not empty, see 'if' clause + auto begin = m_cleaned_meta_map.objects.begin(); + if (iter->first.has_snapset()) { + ++iter; + } else { + while (iter != begin) { + auto next = iter--; + if (next->first.get_head() != iter->first.get_head()) { + ++iter; + break; + } + } + } + for_meta_scrub.objects.insert(begin, iter); + m_cleaned_meta_map.objects.erase(begin, iter); + } + + return for_meta_scrub; +} + +void PgScrubber::run_callbacks() +{ + std::list to_run; + to_run.swap(m_callbacks); + + for (auto& tr : to_run) { + tr->complete(0); + } +} + +void PgScrubber::maps_compare_n_cleanup() +{ + scrub_compare_maps(); + m_start = m_end; + run_callbacks(); + requeue_waiting(); + m_osds->queue_scrub_maps_compared(m_pg, Scrub::scrub_prio_t::low_priority); +} + +Scrub::preemption_t& PgScrubber::get_preemptor() +{ + return 
preemption_data; +} + +/* + * Process note: called for the arriving "give me your map, replica!" request. Unlike + * the original implementation, we do not requeue the Op waiting for + * updates. Instead - we trigger the FSM. + */ +void PgScrubber::replica_scrub_op(OpRequestRef op) +{ + op->mark_started(); + auto msg = op->get_req(); + dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch + << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl; + + // are we still processing a previous scrub-map request without noticing that the + // interval changed? won't see it here, but rather at the reservation stage. + + if (msg->map_epoch < m_pg->info.history.same_interval_since) { + dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch + << " < " << m_pg->info.history.same_interval_since << dendl; + + // is there a general sync issue? are we holding a stale reservation? + // not checking now - assuming we will actively react to interval change. + + return; + } + + replica_scrubmap = ScrubMap{}; + replica_scrubmap_pos = ScrubMapBuilder{}; + + m_replica_min_epoch = msg->min_epoch; + m_start = msg->start; + m_end = msg->end; + m_max_end = msg->end; + m_is_deep = msg->deep; + m_interval_start = m_pg->info.history.same_interval_since; + m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority + : Scrub::scrub_prio_t::low_priority; + m_flags.priority = msg->priority ? 
msg->priority : m_pg->get_scrub_priority(); + + preemption_data.reset(); + preemption_data.force_preemptability(msg->allow_preemption); + + replica_scrubmap_pos.reset(); + + // make sure the FSM is at NotActive + m_fsm->assert_not_active(); + + m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority, + m_current_token); +} + +void PgScrubber::set_op_parameters(requested_scrub_t& request) +{ + dout(10) << __func__ << " input: " << request << dendl; + + // write down the epoch of starting a new scrub. Will be used + // to discard stale messages from previous aborted scrubs. + m_epoch_start = m_pg->get_osdmap_epoch(); + + m_flags.check_repair = request.check_repair; + m_flags.auto_repair = request.auto_repair || request.need_auto; + m_flags.required = request.req_scrub || request.must_scrub; + + m_flags.priority = (request.must_scrub || request.need_auto) + ? get_pg_cct()->_conf->osd_requested_scrub_priority + : m_pg->get_scrub_priority(); + + state_set(PG_STATE_SCRUBBING); + + // will we be deep-scrubbing? + if (request.must_deep_scrub || request.need_auto || request.time_for_deep) { + state_set(PG_STATE_DEEP_SCRUB); + } + + // m_is_repair is set for either 'must_repair' or 'repair-on-the-go' (i.e. + // deep-scrub with the auto_repair configuration flag set). m_is_repair value + // determines the scrubber behavior. + // PG_STATE_REPAIR, on the other hand, is only used for status reports (inc. the + // PG status as appearing in the logs). 
+ m_is_repair = request.must_repair || m_flags.auto_repair; + if (request.must_repair) { + state_set(PG_STATE_REPAIR); + // not calling update_op_mode_text() yet, as m_is_deep not set yet + } + + // the publishing here seems to be required for tests synchronization + m_pg->publish_stats_to_osd(); + m_flags.deep_scrub_on_error = request.deep_scrub_on_error; +} + +void PgScrubber::scrub_compare_maps() +{ + dout(10) << __func__ << " has maps, analyzing" << dendl; + + // construct authoritative scrub map for type-specific scrubbing + m_cleaned_meta_map.insert(m_primary_scrubmap); + map, std::optional>> missing_digest; + + map maps; + maps[m_pg_whoami] = &m_primary_scrubmap; + + for (const auto& i : m_pg->get_acting_recovery_backfill()) { + if (i == m_pg_whoami) + continue; + dout(2) << __func__ << " replica " << i << " has " + << m_received_maps[i].objects.size() << " items" << dendl; + maps[i] = &m_received_maps[i]; + } + + set master_set; + + // Construct master set + for (const auto& map : maps) { + for (const auto& i : map.second->objects) { + master_set.insert(i.first); + } + } + + stringstream ss; + m_pg->get_pgbackend()->be_omap_checks(maps, master_set, m_omap_stats, ss); + + if (!ss.str().empty()) { + m_osds->clog->warn(ss); + } + + if (m_pg->recovery_state.get_acting_recovery_backfill().size() > 1) { + + dout(10) << __func__ << " comparing replica scrub maps" << dendl; + + // Map from object with errors to good peer + map> authoritative; + + dout(2) << __func__ << ": primary (" << m_pg->get_primary() << ") has " + << m_primary_scrubmap.objects.size() << " items" << dendl; + + ss.str(""); + ss.clear(); + + m_pg->get_pgbackend()->be_compare_scrubmaps( + maps, master_set, m_is_repair, m_missing, m_inconsistent, + authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(), + m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss); + + if (!ss.str().empty()) { + m_osds->clog->error(ss); + } + + for (auto& i : authoritative) { + list> good_peers; 
+ for (list::const_iterator j = i.second.begin(); j != i.second.end(); + ++j) { + good_peers.emplace_back(maps[*j]->objects[i.first], *j); + } + m_authoritative.emplace(i.first, good_peers); + } + + for (auto i = authoritative.begin(); i != authoritative.end(); ++i) { + m_cleaned_meta_map.objects.erase(i->first); + m_cleaned_meta_map.objects.insert( + *(maps[i->second.back()]->objects.find(i->first))); + } + } + + auto for_meta_scrub = clean_meta_map(); + + // ok, do the pg-type specific scrubbing + + // (Validates consistency of the object info and snap sets) + scrub_snapshot_metadata(for_meta_scrub, missing_digest); + + // Called here on the primary can use an authoritative map if it isn't the primary + _scan_snaps(for_meta_scrub); + + if (!m_store->empty()) { + + if (m_is_repair) { + dout(10) << __func__ << ": discarding scrub results" << dendl; + m_store->flush(nullptr); + } else { + dout(10) << __func__ << ": updating scrub object" << dendl; + ObjectStore::Transaction t; + m_store->flush(&t); + m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); + } + } +} + +ScrubMachineListener::MsgAndEpoch PgScrubber::prep_replica_map_msg( + PreemptionNoted was_preempted) +{ + dout(10) << __func__ << " min epoch:" << m_replica_min_epoch << dendl; + + auto reply = + make_message(spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), + m_replica_min_epoch, m_pg_whoami); + + reply->preempted = (was_preempted == PreemptionNoted::preempted); + ::encode(replica_scrubmap, reply->get_data()); + + return ScrubMachineListener::MsgAndEpoch{reply, m_replica_min_epoch}; +} + +void PgScrubber::send_replica_map(const MsgAndEpoch& preprepared) +{ + m_pg->send_cluster_message(m_pg->get_primary().osd, preprepared.m_msg, + preprepared.m_epoch, false); +} + +void PgScrubber::send_preempted_replica() +{ + auto reply = + make_message(spg_t{m_pg->info.pgid.pgid, m_pg->get_primary().shard}, + m_replica_min_epoch, m_pg_whoami); + + reply->preempted = true; + 
::encode(replica_scrubmap, reply->get_data()); // must not skip this + m_pg->send_cluster_message(m_pg->get_primary().osd, reply, m_replica_min_epoch, false); +} + +/* + * - if the replica lets us know it was interrupted, we mark the chunk as interrupted. + * The state-machine will react to that when all replica maps are received. + * - when all maps are received, we signal the FSM with the GotReplicas event (see + * scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the + * FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to + * handle. + */ +void PgScrubber::map_from_replica(OpRequestRef op) +{ + auto m = op->get_req(); + dout(15) << __func__ << " " << *m << dendl; + + if (m->map_epoch < m_pg->info.history.same_interval_since) { + dout(10) << __func__ << " discarding old from " << m->map_epoch << " < " + << m_pg->info.history.same_interval_since << dendl; + return; + } + + auto p = const_cast(m->get_data()).cbegin(); + + m_received_maps[m->from].decode(p, m_pg->info.pgid.pool()); + dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl; + + auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from); + if (!is_ok) { + // previously an unexpected map was triggering an assert. 
Now, as scrubs can be + // aborted at any time, the chances of this happening have increased, and aborting is + // not justified + dout(1) << __func__ << err_txt << " from OSD " << m->from << dendl; + return; + } + + if (m->preempted) { + dout(10) << __func__ << " replica was preempted, setting flag" << dendl; + preemption_data.do_preempt(); + } + + if (m_maps_status.are_all_maps_available()) { + dout(15) << __func__ << " all repl-maps available" << dendl; + m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops()); + } +} + +void PgScrubber::handle_scrub_reserve_request(OpRequestRef op) +{ + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + auto request_ep = op->get_req()->get_map_epoch(); + + /* + * if we are currently holding a reservation, then: + * either (1) we, the scrubber, did not yet notice an interval change. The remembered + * reservation epoch is from before our interval, and we can silently discard the + * reservation (no message is required). + * or: + * (2) the interval hasn't changed, but the same Primary that (we think) holds the + * lock just sent us a new request. Note that we know it's the same Primary, as + * otherwise the interval would have changed. + * Ostensibly we can discard & redo the reservation. But then we + * will be temporarily releasing the OSD resource - and might not be able to grab it + * again. Thus, we simply treat this as a successful new request + * (but mark the fact that if there is a previous request from the primary to + * scrub a specific chunk - that request is now defunct). 
+ */ + + if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) { + // we are holding a stale reservation from a past epoch + m_remote_osd_resource.reset(); + dout(10) << __func__ << " stale reservation request" << dendl; + } + + if (request_ep < m_pg->get_same_interval_since()) { + // will not ack stale requests + return; + } + + bool granted{false}; + if (m_remote_osd_resource.has_value()) { + + dout(10) << __func__ << " already reserved." << dendl; + + /* + * it might well be that we did not yet finish handling the latest scrub-op from + * our primary. This happens, for example, if 'noscrub' was set via a command, then + * reset. The primary in this scenario will remain in the same interval, but we do need + * to reset our internal state (otherwise - the first renewed 'give me your scrub map' + * from the primary will see us in active state, crashing the OSD). + */ + advance_token(); + granted = true; + + } else if (m_pg->cct->_conf->osd_scrub_during_recovery || + !m_osds->is_recovery_active()) { + m_remote_osd_resource.emplace(m_pg, m_osds, request_ep); + // OSD resources allocated? + granted = m_remote_osd_resource->is_reserved(); + if (!granted) { + // just forget it + m_remote_osd_resource.reset(); + dout(20) << __func__ << ": failed to reserve remotely" << dendl; + } + } + + dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl; + + Message* reply = new MOSDScrubReserve( + spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), request_ep, + granted ? 
MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami); + + m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection()); +} + +void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) +{ + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + + if (m_reservations.has_value()) { + m_reservations->handle_reserve_grant(op, from); + } else { + derr << __func__ << ": received unsolicited reservation grant from osd " << from + << " (" << op << ")" << dendl; + } +} + +void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) +{ + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + + if (m_reservations.has_value()) { + // there is an active reservation process. No action is required otherwise. + m_reservations->handle_reserve_reject(op, from); + } +} + +void PgScrubber::handle_scrub_reserve_release(OpRequestRef op) +{ + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + + /* + * this specific scrub session has terminated. All incoming events carrying the old + * tag will be discarded. 
+ */ + advance_token(); + m_remote_osd_resource.reset(); +} + +void PgScrubber::discard_replica_reservations() +{ + dout(10) << __func__ << dendl; + if (m_reservations.has_value()) { + m_reservations->discard_all(); + } +} + +void PgScrubber::clear_scrub_reservations() +{ + dout(10) << __func__ << dendl; + m_reservations.reset(); // the remote reservations + m_local_osd_resource.reset(); // the local reservation + m_remote_osd_resource.reset(); // we as replica reserved for a Primary +} + +void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text) +{ + ceph_assert(m_pg->recovery_state.get_backfill_targets().empty()); + + std::vector> messages; + messages.reserve(m_pg->get_actingset().size()); + + epoch_t epch = get_osdmap_epoch(); + + for (auto& p : m_pg->get_actingset()) { + + if (p == m_pg_whoami) + continue; + + dout(10) << "scrub requesting " << op_text << " from osd." << p << " Epoch: " << epch + << dendl; + Message* m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epch, opcode, + m_pg_whoami); + messages.push_back(std::make_pair(p.osd, m)); + } + + if (!messages.empty()) { + m_osds->send_message_osd_cluster(messages, epch); + } +} + +void PgScrubber::unreserve_replicas() +{ + dout(10) << __func__ << dendl; + m_reservations.reset(); +} + +[[nodiscard]] bool PgScrubber::scrub_process_inconsistent() +{ + dout(10) << __func__ << ": checking authoritative (mode=" + << m_mode_desc << ", auth remaining #: " << m_authoritative.size() + << ")" << dendl; + + // authoritative only store objects which are missing or inconsistent. 
+ if (!m_authoritative.empty()) { + + stringstream ss; + ss << m_pg->info.pgid << " " << m_mode_desc << " " << m_missing.size() << " missing, " + << m_inconsistent.size() << " inconsistent objects"; + dout(2) << ss.str() << dendl; + m_osds->clog->error(ss); + + if (m_is_repair) { + state_clear(PG_STATE_CLEAN); + // we know we have a problem, so it's OK to set the user-visible flag + // even if we only reached here via auto-repair + state_set(PG_STATE_REPAIR); + update_op_mode_text(); + + for (const auto& [hobj, shrd_list] : m_authoritative) { + + auto missing_entry = m_missing.find(hobj); + + if (missing_entry != m_missing.end()) { + m_pg->repair_object(hobj, shrd_list, missing_entry->second); + m_fixed_count += missing_entry->second.size(); + } + + if (m_inconsistent.count(hobj)) { + m_pg->repair_object(hobj, shrd_list, m_inconsistent[hobj]); + m_fixed_count += m_inconsistent[hobj].size(); + } + } + } + } + return (!m_authoritative.empty() && m_is_repair); +} + +/* + * note: only called for the Primary. + */ +void PgScrubber::scrub_finish() +{ + dout(10) << __func__ << " before flags: " << m_flags + << ". repair state: " << (state_test(PG_STATE_REPAIR) ? "repair" : "no-repair") + << ". 
deep_scrub_on_error: " << m_flags.deep_scrub_on_error << dendl; + + ceph_assert(m_pg->is_locked()); + + m_pg->m_planned_scrub = requested_scrub_t{}; + + // if the repair request comes from auto-repair and large number of errors, + // we would like to cancel auto-repair + if (m_is_repair && m_flags.auto_repair && + m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) { + + dout(10) << __func__ << " undoing the repair" << dendl; + state_clear(PG_STATE_REPAIR); // not expected to be set, anyway + m_is_repair = false; + update_op_mode_text(); + } + + bool do_auto_scrub = false; + + // if a regular scrub had errors within the limit, do a deep scrub to auto repair + if (m_flags.deep_scrub_on_error && !m_authoritative.empty() && + m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) { + ceph_assert(!m_is_deep); + do_auto_scrub = true; + dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl; + } + + m_flags.deep_scrub_on_error = false; + + // type-specific finish (can tally more errors) + _scrub_finish(); + + bool has_error = scrub_process_inconsistent(); + + { + stringstream oss; + oss << m_pg->info.pgid.pgid << " " << m_mode_desc << " "; + int total_errors = m_shallow_errors + m_deep_errors; + if (total_errors) + oss << total_errors << " errors"; + else + oss << "ok"; + if (!m_is_deep && m_pg->info.stats.stats.sum.num_deep_scrub_errors) + oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors + << " remaining deep scrub error details lost)"; + if (m_is_repair) + oss << ", " << m_fixed_count << " fixed"; + if (total_errors) + m_osds->clog->error(oss); + else + m_osds->clog->debug(oss); + } + + // Since we don't know which errors were fixed, we can only clear them + // when every one has been fixed. 
+ if (m_is_repair) { + if (m_fixed_count == m_shallow_errors + m_deep_errors) { + + ceph_assert(m_is_deep); + m_shallow_errors = 0; + m_deep_errors = 0; + dout(20) << __func__ << " All may be fixed" << dendl; + + } else if (has_error) { + + // Deep scrub in order to get corrected error counts + m_pg->scrub_after_recovery = true; + m_pg->m_planned_scrub.req_scrub = + m_pg->m_planned_scrub.req_scrub || m_flags.required; + + dout(20) << __func__ << " Current 'required': " << m_flags.required + << " Planned 'req_scrub': " << m_pg->m_planned_scrub.req_scrub << dendl; + + } else if (m_shallow_errors || m_deep_errors) { + + // We have errors but nothing can be fixed, so there is no repair + // possible. + state_set(PG_STATE_FAILED_REPAIR); + dout(10) << __func__ << " " << (m_shallow_errors + m_deep_errors) + << " error(s) present with no repair possible" << dendl; + } + } + + { + // finish up + ObjectStore::Transaction t; + m_pg->recovery_state.update_stats( + [this](auto& history, auto& stats) { + dout(10) << "m_pg->recovery_state.update_stats()" << dendl; + utime_t now = ceph_clock_now(); + history.last_scrub = m_pg->recovery_state.get_info().last_update; + history.last_scrub_stamp = now; + if (m_is_deep) { + history.last_deep_scrub = m_pg->recovery_state.get_info().last_update; + history.last_deep_scrub_stamp = now; + } + + if (m_is_deep) { + if ((m_shallow_errors == 0) && (m_deep_errors == 0)) + history.last_clean_scrub_stamp = now; + stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors; + stats.stats.sum.num_deep_scrub_errors = m_deep_errors; + stats.stats.sum.num_large_omap_objects = m_omap_stats.large_omap_objects; + stats.stats.sum.num_omap_bytes = m_omap_stats.omap_bytes; + stats.stats.sum.num_omap_keys = m_omap_stats.omap_keys; + dout(25) << "scrub_finish shard " << m_pg_whoami + << " num_omap_bytes = " << stats.stats.sum.num_omap_bytes + << " num_omap_keys = " << stats.stats.sum.num_omap_keys << dendl; + } else { + 
stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors; + // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent + // because of deep-scrub errors + if (m_shallow_errors == 0) + history.last_clean_scrub_stamp = now; + } + stats.stats.sum.num_scrub_errors = stats.stats.sum.num_shallow_scrub_errors + + stats.stats.sum.num_deep_scrub_errors; + if (m_flags.check_repair) { + m_flags.check_repair = false; + if (m_pg->info.stats.stats.sum.num_scrub_errors) { + state_set(PG_STATE_FAILED_REPAIR); + dout(10) << "scrub_finish " << m_pg->info.stats.stats.sum.num_scrub_errors + << " error(s) still present after re-scrub" << dendl; + } + } + return true; + }, + &t); + int tr = m_osds->store->queue_transaction(m_pg->ch, std::move(t), nullptr); + ceph_assert(tr == 0); + + if (!m_pg->snap_trimq.empty()) { + dout(10) << "scrub finished, requeuing snap_trimmer" << dendl; + m_pg->snap_trimmer_scrub_complete(); + } + } + + if (has_error) { + m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared( + get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery()))); + } else { + m_is_repair = false; + state_clear(PG_STATE_REPAIR); + update_op_mode_text(); + } + + cleanup_on_finish(); + if (do_auto_scrub) { + request_rescrubbing(m_pg->m_planned_scrub); + } + + if (m_pg->is_active() && m_pg->is_primary()) { + m_pg->recovery_state.share_pg_info(); + } +} + +void PgScrubber::on_digest_updates() +{ + dout(10) << __func__ << " #pending: " << num_digest_updates_pending << " pending? " + << num_digest_updates_pending + << (m_end.is_max() ? " " : " ") << dendl; + + if (num_digest_updates_pending > 0) { + // do nothing for now. We will be called again when new updates arrive + return; + } + + // got all updates, and finished with this chunk. Any more? 
+ if (m_end.is_max()) { + + scrub_finish(); + m_osds->queue_scrub_is_finished(m_pg); + + } else { + // go get a new chunk (via "requeue") + preemption_data.reset(); + m_osds->queue_scrub_next_chunk(m_pg, m_pg->is_scrub_blocking_ops()); + } +} + + +/* + * note that the flags-set fetched from the PG (m_pg->m_planned_scrub) + * is cleared once scrubbing starts; Some of the values dumped here are + * thus transitory. + */ +void PgScrubber::dump(ceph::Formatter* f) const +{ + f->open_object_section("scrubber"); + f->dump_stream("epoch_start") << m_interval_start; + f->dump_bool("active", m_active); + if (m_active) { + f->dump_stream("start") << m_start; + f->dump_stream("end") << m_end; + f->dump_stream("m_max_end") << m_max_end; + f->dump_stream("subset_last_update") << m_subset_last_update; + f->dump_bool("deep", m_is_deep); + f->dump_bool("must_scrub", (m_pg->m_planned_scrub.must_scrub || m_flags.required)); + f->dump_bool("must_deep_scrub", m_pg->m_planned_scrub.must_deep_scrub); + f->dump_bool("must_repair", m_pg->m_planned_scrub.must_repair); + f->dump_bool("need_auto", m_pg->m_planned_scrub.need_auto); + f->dump_bool("req_scrub", m_flags.required); + f->dump_bool("time_for_deep", m_pg->m_planned_scrub.time_for_deep); + f->dump_bool("auto_repair", m_flags.auto_repair); + f->dump_bool("check_repair", m_flags.check_repair); + f->dump_bool("deep_scrub_on_error", m_flags.deep_scrub_on_error); + f->dump_stream("scrub_reg_stamp") << m_scrub_reg_stamp; // utime_t + f->dump_unsigned("priority", m_flags.priority); + f->dump_int("shallow_errors", m_shallow_errors); + f->dump_int("deep_errors", m_deep_errors); + f->dump_int("fixed", m_fixed_count); + { + f->open_array_section("waiting_on_whom"); + for (const auto& p : m_maps_status.get_awaited()) { + f->dump_stream("shard") << p; + } + f->close_section(); + } + } + f->close_section(); +} + + +void PgScrubber::handle_query_state(ceph::Formatter* f) +{ + dout(10) << __func__ << dendl; + + f->open_object_section("scrub"); + 
f->dump_stream("scrubber.epoch_start") << m_interval_start; + f->dump_bool("scrubber.active", m_active); + f->dump_stream("scrubber.start") << m_start; + f->dump_stream("scrubber.end") << m_end; + f->dump_stream("scrubber.m_max_end") << m_max_end; + f->dump_stream("scrubber.m_subset_last_update") << m_subset_last_update; + f->dump_bool("scrubber.deep", m_is_deep); + { + f->open_array_section("scrubber.waiting_on_whom"); + for (const auto& p : m_maps_status.get_awaited()) { + f->dump_stream("shard") << p; + } + f->close_section(); + } + + f->dump_string("comment", "DEPRECATED - may be removed in the next release"); + + f->close_section(); +} + +PgScrubber::~PgScrubber() = default; + +PgScrubber::PgScrubber(PG* pg) + : m_pg{pg} + , m_pg_id{pg->pg_id} + , m_osds{m_pg->osd} + , m_pg_whoami{pg->pg_whoami} + , preemption_data{pg} +{ + m_fsm = std::make_unique(m_pg, this); + m_fsm->initiate(); +} + +void PgScrubber::reserve_replicas() +{ + dout(10) << __func__ << dendl; + m_reservations.emplace(m_pg, m_pg_whoami); +} + +void PgScrubber::cleanup_on_finish() +{ + dout(10) << __func__ << dendl; + ceph_assert(m_pg->is_locked()); + + state_clear(PG_STATE_SCRUBBING); + state_clear(PG_STATE_DEEP_SCRUB); + m_pg->publish_stats_to_osd(); + + clear_scrub_reservations(); + m_pg->publish_stats_to_osd(); + + requeue_waiting(); + + reset_internal_state(); + m_flags = scrub_flags_t{}; + + // type-specific state clear + _scrub_clear_state(); +} + +// uses process_event(), so must be invoked externally +void PgScrubber::scrub_clear_state() +{ + dout(10) << __func__ << dendl; + + clear_pgscrub_state(); + m_fsm->process_event(FullReset{}); +} + +/* + * note: does not access the state-machine + */ +void PgScrubber::clear_pgscrub_state() +{ + dout(10) << __func__ << dendl; + ceph_assert(m_pg->is_locked()); + + state_clear(PG_STATE_SCRUBBING); + state_clear(PG_STATE_DEEP_SCRUB); + + state_clear(PG_STATE_REPAIR); + + clear_scrub_reservations(); + m_pg->publish_stats_to_osd(); + + 
requeue_waiting(); + + reset_internal_state(); + m_flags = scrub_flags_t{}; + + // type-specific state clear + _scrub_clear_state(); +} + +void PgScrubber::replica_handling_done() +{ + dout(10) << __func__ << dendl; + + state_clear(PG_STATE_SCRUBBING); + state_clear(PG_STATE_DEEP_SCRUB); + + reset_internal_state(); + + m_pg->publish_stats_to_osd(); +} + +/* + * note: performs run_callbacks() + * note: reservations-related variables are not reset here + */ +void PgScrubber::reset_internal_state() +{ + dout(10) << __func__ << dendl; + + preemption_data.reset(); + m_maps_status.reset(); + m_received_maps.clear(); + + m_start = hobject_t{}; + m_end = hobject_t{}; + m_max_end = hobject_t{}; + m_subset_last_update = eversion_t{}; + m_shallow_errors = 0; + m_deep_errors = 0; + m_fixed_count = 0; + m_omap_stats = (const struct omap_stat_t){0}; + + run_callbacks(); + + m_inconsistent.clear(); + m_missing.clear(); + m_authoritative.clear(); + num_digest_updates_pending = 0; + m_primary_scrubmap = ScrubMap{}; + m_primary_scrubmap_pos.reset(); + replica_scrubmap = ScrubMap{}; + replica_scrubmap_pos.reset(); + m_cleaned_meta_map = ScrubMap{}; + m_needs_sleep = true; + m_sleep_started_at = utime_t{}; + + m_active = false; +} + +// note that only applicable to the Replica: +void PgScrubber::advance_token() +{ + dout(10) << __func__ << " was: " << m_current_token << dendl; + m_current_token++; + + // when advance_token() is called, it is assumed that no scrubbing takes place. + // We will, though, verify that. And if we are actually still handling a stale request - + // both our internal state and the FSM state will be cleared. 
+ replica_handling_done(); + m_fsm->process_event(FullReset{}); +} + +bool PgScrubber::is_token_current(Scrub::act_token_t received_token) +{ + if (received_token == 0 || received_token == m_current_token) { + return true; + } + dout(5) << __func__ << " obsolete token (" << received_token + << " vs current " << m_current_token << dendl; + + return false; +} + +const OSDMapRef& PgScrubber::get_osdmap() const +{ + return m_pg->get_osdmap(); +} + +ostream& operator<<(ostream& out, const PgScrubber& scrubber) +{ + return out << scrubber.m_flags; +} + +ostream& PgScrubber::show(ostream& out) const +{ + return out << " [ " << m_pg_id << ": " << m_flags << " ] "; +} + +int PgScrubber::asok_debug(std::string_view cmd, + std::string param, + Formatter* f, + stringstream& ss) +{ + dout(10) << __func__ << " cmd: " << cmd << " param: " << param << dendl; + + if (cmd == "block") { + // set a flag that will cause the next 'select_range' to report a blocked object + m_debug_blockrange = 1; + } else if (cmd == "unblock") { + // send an 'unblock' event, as if a blocked range was freed + m_debug_blockrange = 0; + m_fsm->process_event(Unblocked{}); + } + return 0; +} +// ///////////////////// preemption_data_t ////////////////////////////////// + +PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg} +{ + m_left = static_cast( + m_pg->get_cct()->_conf.get_val("osd_scrub_max_preemptions")); +} + +void PgScrubber::preemption_data_t::reset() +{ + std::lock_guard lk{m_preemption_lock}; + + m_preemptable = false; + m_preempted = false; + m_left = + static_cast(m_pg->cct->_conf.get_val("osd_scrub_max_preemptions")); + m_size_divisor = 1; +} + + +// ///////////////////// ReplicaReservations ////////////////////////////////// +namespace Scrub { + +void ReplicaReservations::release_replica(pg_shard_t peer, epoch_t epoch) +{ + auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, peer.shard), epoch, + MOSDScrubReserve::RELEASE, m_pg->pg_whoami); + 
m_osds->send_message_osd_cluster(peer.osd, m, epoch); +} + +ReplicaReservations::ReplicaReservations(PG* pg, pg_shard_t whoami) + : m_pg{pg} + , m_acting_set{pg->get_actingset()} + , m_osds{m_pg->get_pg_osd(ScrubberPasskey())} + , m_pending{static_cast(m_acting_set.size()) - 1} + , m_pg_info{m_pg->get_pg_info(ScrubberPasskey())} +{ + epoch_t epoch = m_pg->get_osdmap_epoch(); + + // handle the special case of no replicas + if (m_pending <= 0) { + // just signal the scrub state-machine to continue + send_all_done(); + + } else { + + for (auto p : m_acting_set) { + if (p == whoami) + continue; + auto m = new MOSDScrubReserve(spg_t(m_pg_info.pgid.pgid, p.shard), epoch, + MOSDScrubReserve::REQUEST, m_pg->pg_whoami); + m_osds->send_message_osd_cluster(p.osd, m, epoch); + m_waited_for_peers.push_back(p); + dout(10) << __func__ << " reserve<-> " << p.osd << dendl; + } + } +} + +void ReplicaReservations::send_all_done() +{ + m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority); +} + +void ReplicaReservations::send_reject() +{ + m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority); +} + +void ReplicaReservations::discard_all() +{ + dout(10) << __func__ << " " << m_reserved_peers << dendl; + + m_had_rejections = true; // preventing late-coming responses from triggering events + m_reserved_peers.clear(); + m_waited_for_peers.clear(); +} + +ReplicaReservations::~ReplicaReservations() +{ + m_had_rejections = true; // preventing late-coming responses from triggering events + + // send un-reserve messages to all reserved replicas. We do not wait for answer (there + // wouldn't be one). Other incoming messages will be discarded on the way, by our + // owner. + epoch_t epoch = m_pg->get_osdmap_epoch(); + + for (auto& p : m_reserved_peers) { + release_replica(p, epoch); + } + m_reserved_peers.clear(); + + // note: the release will follow on the heels of the request. 
When tried otherwise, + // grants that followed a reject arrived after the whole scrub machine-state was + // reset, causing leaked reservations. + for (auto& p : m_waited_for_peers) { + release_replica(p, epoch); + } + m_waited_for_peers.clear(); +} + +/** + * @ATTN we would not reach here if the ReplicaReservation object managed by the + * scrubber was reset. + */ +void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from) +{ + dout(10) << __func__ << " granted-> " << from << dendl; + op->mark_started(); + + { + // reduce the amount of extra release messages. Not a must, but the log is cleaner + auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from); + if (w != m_waited_for_peers.end()) + m_waited_for_peers.erase(w); + } + + // are we forced to reject the reservation? + if (m_had_rejections) { + + dout(10) << " rejecting late-coming reservation from " << from << dendl; + release_replica(from, m_pg->get_osdmap_epoch()); + + } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) != + m_reserved_peers.end()) { + + dout(10) << " already had osd." << from << " reserved" << dendl; + + } else { + + dout(10) << " osd." << from << " scrub reserve = success" << dendl; + m_reserved_peers.push_back(from); + if (--m_pending == 0) { + send_all_done(); + } + } +} + +void ReplicaReservations::handle_reserve_reject(OpRequestRef op, pg_shard_t from) +{ + dout(10) << __func__ << " rejected-> " << from << dendl; + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + + { + // reduce the amount of extra release messages. 
Not a must, but the log is cleaner + auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from); + if (w != m_waited_for_peers.end()) + m_waited_for_peers.erase(w); + } + + if (m_had_rejections) { + + // our failure was already handled when the first rejection arrived + dout(15) << " ignoring late-coming rejection from " << from << dendl; + + } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) != + m_reserved_peers.end()) { + + dout(10) << " already had osd." << from << " reserved" << dendl; + + } else { + + dout(10) << " osd." << from << " scrub reserve = fail" << dendl; + m_had_rejections = true; // preventing any additional notifications + send_reject(); + } +} + + +// ///////////////////// LocalReservation ////////////////////////////////// + +LocalReservation::LocalReservation(PG* pg, OSDService* osds) + : m_pg{pg} // holding the "whole PG" for dout() sake + , m_osds{osds} +{ + if (!m_osds->inc_scrubs_local()) { + dout(10) << __func__ << ": failed to reserve locally " << dendl; + // the failure is signalled by not having m_holding_local_reservation set + return; + } + + dout(20) << __func__ << ": local OSD scrub resources reserved" << dendl; + m_holding_local_reservation = true; +} + +LocalReservation::~LocalReservation() +{ + if (m_holding_local_reservation) { + m_holding_local_reservation = false; + m_osds->dec_scrubs_local(); + } +} + + +// ///////////////////// ReservedByRemotePrimary /////////////////////////////// + +ReservedByRemotePrimary::ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch) + : m_pg{pg}, m_osds{osds}, m_reserved_at{epoch} +{ + if (!m_osds->inc_scrubs_remote()) { + dout(10) << __func__ << ": failed to reserve at Primary request" << dendl; + // the failure is signalled by not having m_reserved_by_remote_primary set + return; + } + + dout(20) << __func__ << ": scrub resources reserved at Primary request" << dendl; + m_reserved_by_remote_primary = true; +} + +bool 
ReservedByRemotePrimary::is_stale() const +{ + return m_reserved_at < m_pg->get_same_interval_since(); +} + +ReservedByRemotePrimary::~ReservedByRemotePrimary() +{ + if (m_reserved_by_remote_primary) { + m_reserved_by_remote_primary = false; + m_osds->dec_scrubs_remote(); + } +} + +// ///////////////////// MapsCollectionStatus //////////////////////////////// + +auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from) + -> std::tuple +{ + auto fe = std::find(m_maps_awaited_for.begin(), m_maps_awaited_for.end(), from); + if (fe != m_maps_awaited_for.end()) { + // we are indeed waiting for a map from this replica + m_maps_awaited_for.erase(fe); + return std::tuple{true, ""sv}; + } else { + return std::tuple{false, " unsolicited scrub-map"sv}; + } +} + +void MapsCollectionStatus::reset() +{ + *this = MapsCollectionStatus{}; +} + +std::string MapsCollectionStatus::dump() const +{ + std::string all; + for (const auto& rp : m_maps_awaited_for) { + all.append(rp.get_osd() + " "s); + } + return all; +} + +ostream& operator<<(ostream& out, const MapsCollectionStatus& sf) +{ + out << " [ "; + for (const auto& rp : sf.m_maps_awaited_for) { + out << rp.get_osd() << " "; + } + if (!sf.m_local_map_ready) { + out << " local "; + } + return out << " ] "; +} + +// ///////////////////// blocked_range_t /////////////////////////////// + +blocked_range_t::blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id) + : m_osds{osds} +{ + auto now_is = std::chrono::system_clock::now(); + m_callbk = new LambdaContext([now_is, pg_id, osds]([[maybe_unused]] int r) { + std::time_t now_c = std::chrono::system_clock::to_time_t(now_is); + char buf[50]; + strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", std::localtime(&now_c)); + lgeneric_subdout(g_ceph_context, osd, 10) + << "PgScrubber: " << pg_id << " blocked on an object for too long (since " << buf + << ")" << dendl; + osds->clog->warn() << "osd." 
<< osds->whoami << " PgScrubber: " << pg_id << " blocked on an object for too long (since " << buf << ")"; + return; + }); + + std::lock_guard l(m_osds->sleep_lock); + m_osds->sleep_timer.add_event_after(waittime, m_callbk); +} + +blocked_range_t::~blocked_range_t() +{ + std::lock_guard l(m_osds->sleep_lock); + m_osds->sleep_timer.cancel_event(m_callbk); +} + +} // namespace Scrub diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h new file mode 100644 index 00000000000..c08279efb61 --- /dev/null +++ b/src/osd/scrubber/pg_scrubber.h @@ -0,0 +1,800 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "osd/PG.h" +#include "ScrubStore.h" +#include "scrub_machine_lstnr.h" +#include "osd/scrubber_common.h" + +class Callback; + +namespace Scrub { +class ScrubMachine; +struct BuildMap; + +/** + * Reserving/freeing scrub resources at the replicas. + * + * When constructed - sends reservation requests to the acting_set. + * A rejection triggers a "couldn't acquire the replicas' scrub resources" event. + * All previous requests, whether already granted or not, are explicitly released. + * + * A note re performance: I've measured a few container alternatives for + * m_reserved_peers, with its specific usage pattern. Std::set is extremely slow, as + * expected. flat_set is only slightly better. Surprisingly - std::vector (with no + * sorting) is better than boost::small_vec. And for std::vector: no need to pre-reserve. 
+ */ +class ReplicaReservations { + using OrigSet = decltype(std::declval().get_actingset()); + + PG* m_pg; + OrigSet m_acting_set; + OSDService* m_osds; + std::vector m_waited_for_peers; + std::vector m_reserved_peers; + bool m_had_rejections{false}; + int m_pending{-1}; + const pg_info_t& m_pg_info; + + void release_replica(pg_shard_t peer, epoch_t epoch); + + void send_all_done(); ///< all reservations are granted + + /// notify the scrubber that we have failed to reserve replicas' resources + void send_reject(); + + public: + /** + * quietly discard all knowledge about existing reservations. No messages + * are sent to peers. + * To be used upon interval change, as we know the running scrub is no longer + * relevant, and that the replicas had reset the reservations on their side. + */ + void discard_all(); + + ReplicaReservations(PG* pg, pg_shard_t whoami); + + ~ReplicaReservations(); + + void handle_reserve_grant(OpRequestRef op, pg_shard_t from); + + void handle_reserve_reject(OpRequestRef op, pg_shard_t from); +}; + +/** + * wraps the local OSD scrub resource reservation in an RAII wrapper + */ +class LocalReservation { + PG* m_pg; + OSDService* m_osds; + bool m_holding_local_reservation{false}; + + public: + LocalReservation(PG* pg, OSDService* osds); + ~LocalReservation(); + bool is_reserved() const { return m_holding_local_reservation; } +}; + +/** + * wraps the OSD resource we are using when reserved as a replica by a scrubbing master.
+ */ +class ReservedByRemotePrimary { + PG* m_pg; + OSDService* m_osds; + bool m_reserved_by_remote_primary{false}; + const epoch_t m_reserved_at; + + public: + ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch); + ~ReservedByRemotePrimary(); + [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; } + + /// compare the remembered reserved-at epoch to the current interval + [[nodiscard]] bool is_stale() const; +}; + +/** + * Once all replicas' scrub maps are received, we go on to compare the maps. That is - + * unless we have not yet completed building our own scrub map. MapsCollectionStatus + * combines the status of waiting for both the local map and the replicas, without + * resorting to adding dummy entries into a list. + */ +class MapsCollectionStatus { + + bool m_local_map_ready{false}; + std::vector m_maps_awaited_for; + + public: + [[nodiscard]] bool are_all_maps_available() const + { + return m_local_map_ready && m_maps_awaited_for.empty(); + } + + void mark_local_map_ready() { m_local_map_ready = true; } + + void mark_replica_map_request(pg_shard_t from_whom) + { + m_maps_awaited_for.push_back(from_whom); + } + + /// @returns true if indeed waiting for this one. Otherwise: an error string + auto mark_arriving_map(pg_shard_t from) -> std::tuple; + + std::vector get_awaited() const { return m_maps_awaited_for; } + + void reset(); + + std::string dump() const; + + friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf); +}; + + +} // namespace Scrub + + +/** + * the scrub operation flags. Primary only. + * Set at scrub start. Checked in multiple locations - mostly + * at finish. + */ +struct scrub_flags_t { + + unsigned int priority{0}; + + /** + * set by queue_scrub() if either planned_scrub.auto_repair or + * need_auto were set. + * Tested at scrub end.
+ */ + bool auto_repair{false}; + + /// this flag indicates that we are scrubbing post repair to verify everything is fixed + bool check_repair{false}; + + /// checked at the end of the scrub, to possibly initiate a deep-scrub + bool deep_scrub_on_error{false}; + + /** + * scrub must not be aborted. + * Set for explicitly requested scrubs, and for scrubs originated by the pairing + * process with the 'repair' flag set (in the RequestScrub event). + */ + bool required{false}; +}; + +ostream& operator<<(ostream& out, const scrub_flags_t& sf); + + +/** + * The part of PG-scrubbing code that isn't state-machine wiring. + * + * Why the separation? I wish to move to a different FSM implementation. Thus I + * am forced to strongly decouple the state-machine implementation details from + * the actual scrubbing code. + */ +class PgScrubber : public ScrubPgIF, public ScrubMachineListener { + + public: + explicit PgScrubber(PG* pg); + + // ------------------ the I/F exposed to the PG (ScrubPgIF) ------------- + + /// are we waiting for resource reservation grants from our replicas? + [[nodiscard]] bool is_reserving() const final; + + void initiate_regular_scrub(epoch_t epoch_queued) final; + + void initiate_scrub_after_repair(epoch_t epoch_queued) final; + + void send_scrub_resched(epoch_t epoch_queued) final; + + void active_pushes_notification(epoch_t epoch_queued) final; + + void update_applied_notification(epoch_t epoch_queued) final; + + void send_scrub_unblock(epoch_t epoch_queued) final; + + void digest_update_notification(epoch_t epoch_queued) final; + + void send_replica_maps_ready(epoch_t epoch_queued) final; + + void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final; + + void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final; + + void send_replica_pushes_upd(epoch_t epoch_queued) final; + /** + * The PG has updated its 'applied version'.
It might be that we are waiting for this + * information: after selecting a range of objects to scrub, we've marked the latest + * version of these objects in m_subset_last_update. We will not start the map building + * before we know that the PG has reached this version. + */ + void on_applied_when_primary(const eversion_t& applied_version) final; + + void send_full_reset(epoch_t epoch_queued) final; + + void send_chunk_free(epoch_t epoch_queued) final; + + void send_chunk_busy(epoch_t epoch_queued) final; + + void send_local_map_done(epoch_t epoch_queued) final; + + void send_maps_compared(epoch_t epoch_queued) final; + + void send_get_next_chunk(epoch_t epoch_queued) final; + + void send_scrub_is_finished(epoch_t epoch_queued) final; + + /** + * we allow some number of preemptions of the scrub, which mean we do + * not block. Then we start to block. Once we start blocking, we do + * not stop until the scrub range is completed. + */ + bool write_blocked_by_scrub(const hobject_t& soid) final; + + /// true if the given range intersects the scrub interval in any way + bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final; + + /** + * we are a replica being asked by the Primary to reserve OSD resources for + * scrubbing + */ + void handle_scrub_reserve_request(OpRequestRef op) final; + + void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final; + void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final; + void handle_scrub_reserve_release(OpRequestRef op) final; + void discard_replica_reservations() final; + void clear_scrub_reservations() final; // PG::clear... 
fwds to here + void unreserve_replicas() final; + + // managing scrub op registration + + void reg_next_scrub(const requested_scrub_t& request_flags) final; + + void unreg_next_scrub() final; + + void scrub_requested(scrub_level_t scrub_level, + scrub_type_t scrub_type, + requested_scrub_t& req_flags) final; + + /** + * Reserve local scrub resources (managed by the OSD) + * + * Fails if OSD's local-scrubs budget was exhausted + * \returns were local resources reserved? + */ + bool reserve_local() final; + + void handle_query_state(ceph::Formatter* f) final; + + void dump(ceph::Formatter* f) const override; + + // used if we are a replica + + void replica_scrub_op(OpRequestRef op) final; + + /// the op priority, taken from the primary's request message + Scrub::scrub_prio_t replica_op_priority() const final + { + return m_replica_request_priority; + }; + + unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, + unsigned int suggested_priority) const final; + /// the version that refers to m_flags.priority + unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final; + + void add_callback(Context* context) final { m_callbacks.push_back(context); } + + [[nodiscard]] bool are_callbacks_pending() const final // used for an assert in PG.cc + { + return !m_callbacks.empty(); + } + + /// handle a message carrying a replica map + void map_from_replica(OpRequestRef op) final; + + void scrub_clear_state() final; + + /** + * add to scrub statistics, but only if the soid is below the scrub start + */ + virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats, + const hobject_t& soid) override + { + ceph_assert(false); + } + + /** + * finalize the parameters of the initiated scrubbing session: + * + * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set; + * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set. 
+ */ + void set_op_parameters(requested_scrub_t& request) final; + + void cleanup_store(ObjectStore::Transaction* t) final; + + bool get_store_errors(const scrub_ls_arg_t& arg, + scrub_ls_result_t& res_inout) const override + { + return false; + } + + int asok_debug(std::string_view cmd, + std::string param, + Formatter* f, + std::stringstream& ss) override; + int m_debug_blockrange{0}; + + // ------------------------------------------------------------------------------------------- + // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener) + + [[nodiscard]] bool is_primary() const final { return m_pg->recovery_state.is_primary(); } + + void select_range_n_notify() final; + + Scrub::BlockedRangeWarning acquire_blocked_alarm() final; + + /// walk the log to find the latest update that affects our chunk + eversion_t search_log_for_updates() const final; + + eversion_t get_last_update_applied() const final + { + return m_pg->recovery_state.get_last_update_applied(); + } + + int pending_active_pushes() const final { return m_pg->active_pushes; } + + void on_init() final; + void on_replica_init() final; + void replica_handling_done() final; + + /// the version of 'scrub_clear_state()' that does not try to invoke FSM services + /// (thus can be called from FSM reactions) + void clear_pgscrub_state() final; + + /* + * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep' + * is asserted - after a configuration-dependent timeout. 
+ */ + void add_delayed_scheduling() final; + + void get_replicas_maps(bool replica_can_preempt) final; + + void on_digest_updates() final; + + ScrubMachineListener::MsgAndEpoch + prep_replica_map_msg(Scrub::PreemptionNoted was_preempted) final; + + void send_replica_map(const ScrubMachineListener::MsgAndEpoch& preprepared) final; + + void send_preempted_replica() final; + + void send_remotes_reserved(epoch_t epoch_queued) final; + void send_reservation_failure(epoch_t epoch_queued) final; + + /** + * does the PG have newer updates than what we (the scrubber) know? + */ + [[nodiscard]] bool has_pg_marked_new_updates() const final; + + void set_subset_last_update(eversion_t e) final; + + void maps_compare_n_cleanup() final; + + Scrub::preemption_t& get_preemptor() final; + + int build_primary_map_chunk() final; + + int build_replica_map_chunk() final; + + void reserve_replicas() final; + + [[nodiscard]] bool was_epoch_changed() const final; + + void mark_local_map_ready() final; + + [[nodiscard]] bool are_all_maps_available() const final; + + std::string dump_awaited_maps() const final; + + protected: + bool state_test(uint64_t m) const { return m_pg->state_test(m); } + void state_set(uint64_t m) { m_pg->state_set(m); } + void state_clear(uint64_t m) { m_pg->state_clear(m); } + + [[nodiscard]] bool is_scrub_registered() const; + + virtual void _scrub_clear_state() {} + + utime_t m_scrub_reg_stamp; ///< stamp we registered for + + ostream& show(ostream& out) const override; + + public: + // ------------------------------------------------------------------------------------------- + + friend ostream& operator<<(ostream& out, const PgScrubber& scrubber); + + static utime_t scrub_must_stamp() { return utime_t(1, 1); } + + virtual ~PgScrubber(); // must be defined separately, in the .cc file + + [[nodiscard]] bool is_scrub_active() const final { return m_active; } + + private: + void reset_internal_state(); + + /** + * the current scrubbing operation is done. 
We should mark that fact, so that + * all events related to the previous operation can be discarded. + */ + void advance_token(); + + bool is_token_current(Scrub::act_token_t received_token); + + void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); } + + void _scan_snaps(ScrubMap& smap); + + ScrubMap clean_meta_map(); + + /** + * mark down some parameters of the initiated scrub: + * - the epoch when started; + * - the depth of the scrub requested (from the PG_STATE variable) + */ + void reset_epoch(epoch_t epoch_queued); + + void run_callbacks(); + + // ----- methods used to verify the relevance of incoming events: + + /** + * is the incoming event still relevant, and should be processed? + * + * It isn't if: + * - (1) we are no longer 'actively scrubbing'; or + * - (2) the message is from an epoch prior to when we started the current scrub + * session; or + * - (3) the message epoch is from a previous interval; or + * - (4) the 'abort' configuration flags were set. + * + * For (1) & (2) - the incoming message is discarded, w/o further action. + * + * For (3): (see check_interval() for a full description) if we have not reacted yet + * to this specific new interval, we do now: + * - replica reservations are silently discarded (we count on the replicas to notice + * the interval change and un-reserve themselves); + * - the scrubbing is halted. + * + * For (4): the message will be discarded, but also: + * if this is the first time we've noticed the 'abort' request, we perform the abort. + * + * \returns should the incoming event be processed? + */ + bool is_message_relevant(epoch_t epoch_to_verify); + + /** + * check the 'no scrub' configuration options. + */ + [[nodiscard]] bool should_abort() const; + + /** + * Check the 'no scrub' configuration flags. + * + * Reset everything if the abort was not handled before. + * @returns false if the message was discarded due to abort flag.
+ */ + [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify); + + [[nodiscard]] bool check_interval(epoch_t epoch_to_verify); + + epoch_t m_last_aborted{}; // last time we've noticed a request to abort + + /** + * return true if any inconsistency/missing is repaired, false otherwise + */ + [[nodiscard]] bool scrub_process_inconsistent(); + + void scrub_compare_maps(); + + bool m_needs_sleep{true}; ///< should we sleep before being rescheduled? always + ///< 'true', unless we just got out of a sleep period + + utime_t m_sleep_started_at; + + + // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed' + // to guarantee un-reserving when deleted. + std::optional m_reservations; + std::optional m_local_osd_resource; + + /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing + std::optional m_remote_osd_resource; + + void cleanup_on_finish(); // scrub_clear_state() as called for a Primary when + // Active->NotActive + + /// the part that actually finalizes a scrub + void scrub_finish(); + + protected: + PG* const m_pg; + + /** + * the derivative-specific scrub-finishing touches: + */ + virtual void _scrub_finish() {} + + /** + * Validate consistency of the object info and snap sets. + */ + virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) + {} + + // common code used by build_primary_map_chunk() and build_replica_map_chunk(): + int build_scrub_map_chunk(ScrubMap& map, // primary or replica? + ScrubMapBuilder& pos, + hobject_t start, + hobject_t end, + bool deep); + + std::unique_ptr m_fsm; + const spg_t m_pg_id; ///< a local copy of m_pg->pg_id + OSDService* const m_osds; + const pg_shard_t m_pg_whoami; ///< a local copy of m_pg->pg_whoami; + + epoch_t m_interval_start{0}; ///< interval's 'from' of when scrubbing was first scheduled + /* + * the exact epoch when the scrubbing actually started (started here - cleared checks + * for no-scrub conf). 
Incoming events are verified against this, with stale events + * discarded. + */ + epoch_t m_epoch_start{0}; ///< the actual epoch when scrubbing started + + /** + * (replica) a tag identifying a specific scrub "session". Incremented whenever the + * Primary releases the replica scrub resources. + * When the scrub session is terminated (even if the interval remains unchanged, as + * might happen following an asok no-scrub command), stale scrub-resched messages + * triggered by the backend will be discarded. + */ + Scrub::act_token_t m_current_token{1}; + + scrub_flags_t m_flags; + + bool m_active{false}; + + eversion_t m_subset_last_update{}; + + std::unique_ptr m_store; + + int num_digest_updates_pending{0}; + hobject_t m_start, m_end; ///< note: half-closed: [start,end) + + /// Returns reference to current osdmap + const OSDMapRef& get_osdmap() const; + + /// Returns epoch of current osdmap + epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); } + + CephContext* get_pg_cct() const { return m_pg->cct; } + + // collected statistics + int m_shallow_errors{0}; + int m_deep_errors{0}; + int m_fixed_count{0}; + + /// Maps from objects with errors to missing peers + HobjToShardSetMapping m_missing; + + protected: + /** + * 'm_is_deep' - is the running scrub a deep one? + * + * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is + * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is + * meaningful both for the primary and the replicas, and is used as a parameter when + * building the scrub maps. + */ + bool m_is_deep{false}; + + /** + * If set: affects the backend & scrubber-backend functions called after all + * scrub maps are available. + * + * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be + * a "user facing" status display only). + */ + bool m_is_repair{false}; + + /** + * User-readable summary of the scrubber's current mode of operation. 
Used for + * both osd.*.log and the cluster log. + * One of: + * "repair" + * "deep-scrub", + * "scrub" + * + * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. + * auto_repair will show as "deep-scrub" and not as "repair" (until the first error + * is detected). + */ + std::string_view m_mode_desc; + + void update_op_mode_text(); + +private: + + /** + * initiate a deep-scrub after the current scrub ended with errors. + */ + void request_rescrubbing(requested_scrub_t& req_flags); + + /* + * Select a range of objects to scrub. + * + * By: + * - setting tentative range based on conf and divisor + * - requesting a partial list of elements from the backend; + * - handling some head/clones issues + * + * The selected range is set directly into 'm_start' and 'm_end' + */ + bool select_range(); + + std::list m_callbacks; + + /** + * send a replica (un)reservation request to the acting set + * + * @param opcode - one of MOSDScrubReserve::REQUEST + * or MOSDScrubReserve::RELEASE + */ + void message_all_replicas(int32_t opcode, std::string_view op_text); + + hobject_t m_max_end; ///< Largest end that may have been sent to replicas + ScrubMap m_primary_scrubmap; + ScrubMapBuilder m_primary_scrubmap_pos; + + std::map m_received_maps; + + /// Cleaned std::map pending snap metadata scrub + ScrubMap m_cleaned_meta_map; + + void _request_scrub_map(pg_shard_t replica, + eversion_t version, + hobject_t start, + hobject_t end, + bool deep, + bool allow_preemption); + + + Scrub::MapsCollectionStatus m_maps_status; + + omap_stat_t m_omap_stats = (const struct omap_stat_t){0}; + + /// Maps from objects with errors to inconsistent peers + HobjToShardSetMapping m_inconsistent; + + /// Maps from object with errors to good peers + std::map>> m_authoritative; + + // ------------ members used if we are a replica + + epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message + + ScrubMapBuilder replica_scrubmap_pos; + ScrubMap replica_scrubmap; + + /** + * we
mark the request priority as it arrived. It influences the queuing priority + * when we wait for local updates + */ + Scrub::scrub_prio_t m_replica_request_priority; + + /** + * the 'preemption' "state-machine". + * Note: I was considering an orthogonal sub-machine implementation, but as + * the state diagram is extremely simple, the added complexity wasn't justified. + */ + class preemption_data_t : public Scrub::preemption_t { + public: + preemption_data_t(PG* pg); // the PG access is used for conf access (and logs) + + [[nodiscard]] bool is_preemptable() const final { return m_preemptable; } + + bool do_preempt() final + { + if (m_preempted || !m_preemptable) + return false; + + std::lock_guard lk{m_preemption_lock}; + if (!m_preemptable) + return false; + + m_preempted = true; + return true; + } + + /// same as 'do_preempt()' but w/o checks (as once a replica + /// was preempted, we cannot continue) + void replica_preempted() { m_preempted = true; } + + void enable_preemption() + { + std::lock_guard lk{m_preemption_lock}; + if (are_preemptions_left() && !m_preempted) { + m_preemptable = true; + } + } + + /// used by a replica to set preemptability state according to the Primary's request + void force_preemptability(bool is_allowed) + { + // note: no need to lock for a replica + m_preempted = false; + m_preemptable = is_allowed; + } + + bool disable_and_test() final + { + std::lock_guard lk{m_preemption_lock}; + m_preemptable = false; + return m_preempted; + } + + [[nodiscard]] bool was_preempted() const { return m_preempted; } + + [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; } + + void reset(); + + void adjust_parameters() final + { + std::lock_guard lk{m_preemption_lock}; + + if (m_preempted) { + m_preempted = false; + m_preemptable = adjust_left(); + } else { + m_preemptable = are_preemptions_left(); + } + } + + private: + PG* m_pg; + mutable std::mutex m_preemption_lock; + bool m_preemptable{false}; + bool m_preempted{false}; + int 
m_left; + size_t m_size_divisor{1}; + bool are_preemptions_left() const { return m_left > 0; } + + bool adjust_left() + { + if (m_left > 0) { + --m_left; + m_size_divisor *= 2; + } + return m_left > 0; + } + }; + + preemption_data_t preemption_data; +}; diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc new file mode 100644 index 00000000000..41e3cd1f162 --- /dev/null +++ b/src/osd/scrubber/scrub_machine.cc @@ -0,0 +1,521 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "scrub_machine.h" + +#include +#include + +#include + +#include "osd/OSD.h" +#include "osd/OpRequest.h" +#include "ScrubStore.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout << " scrubberFSM " + +using namespace std::chrono; +using namespace std::chrono_literals; +namespace sc = boost::statechart; + +#define DECLARE_LOCALS \ + ScrubMachineListener* scrbr = context().m_scrbr; \ + std::ignore = scrbr; \ + auto pg_id = context().m_pg_id; \ + std::ignore = pg_id; + +namespace Scrub { + +// --------- trace/debug auxiliaries ------------------------------- + +void on_event_creation(std::string_view nm) +{ + dout(20) << " event: --vvvv---- " << nm << dendl; +} + +void on_event_discard(std::string_view nm) +{ + dout(20) << " event: --^^^^---- " << nm << dendl; +} + +void ScrubMachine::my_states() const +{ + for (auto si = state_begin(); si != state_end(); ++si) { + const auto& siw{*si}; // prevents a warning re side-effects + dout(20) << " state: " << boost::core::demangle(typeid(siw).name()) << dendl; + } +} + +void ScrubMachine::assert_not_active() const +{ + ceph_assert(state_cast()); +} + +bool ScrubMachine::is_reserving() const +{ + return state_cast(); +} + +bool ScrubMachine::is_accepting_updates() const +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + ceph_assert(scrbr->is_primary()); + + return state_cast(); +} + +// for 
the rest of the code in this file - we know what PG we are dealing with: +#undef dout_prefix +#define dout_prefix _prefix(_dout, this->context().m_pg) +template static ostream& _prefix(std::ostream* _dout, T* t) +{ + return t->gen_prefix(*_dout) << " scrubberFSM pg(" << t->pg_id << ") "; +} + +// ////////////// the actual actions + +// ----------------------- NotActive ----------------------------------------- + +NotActive::NotActive(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> NotActive" << dendl; +} + +// ----------------------- ReservingReplicas --------------------------------- + +ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> ReservingReplicas" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + scrbr->reserve_replicas(); +} + +sc::result ReservingReplicas::react(const ReservationFailure&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl; + + // the Scrubber must release all resources and abort the scrubbing + scrbr->clear_pgscrub_state(); + return transit(); +} + +/** + * note: the event poster is handling the scrubber reset + */ +sc::result ReservingReplicas::react(const FullReset&) +{ + dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl; + return transit(); +} + +// ----------------------- ActiveScrubbing ----------------------------------- + +ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> ActiveScrubbing" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + scrbr->on_init(); +} + +/** + * upon exiting the Active state + */ +ActiveScrubbing::~ActiveScrubbing() +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(15) << __func__ << dendl; + scrbr->unreserve_replicas(); +} + +/* + * The only source of an InternalError event as of now is the BuildMap state, + * when encountering a backend error. 
+ * We kill the scrub and reset the FSM. + */ +sc::result ActiveScrubbing::react(const InternalError&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << __func__ << dendl; + scrbr->clear_pgscrub_state(); + return transit(); +} + +sc::result ActiveScrubbing::react(const FullReset&) +{ + dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl; + // caller takes care of clearing the scrubber & FSM states + return transit(); +} + +// ----------------------- RangeBlocked ----------------------------------- + +/* + * Blocked. Will be released by kick_object_context_blocked() (or upon + * an abort) + * + * Note: we are never expected to be waiting for long for a blocked object. + * Unfortunately we know from experience that a bug elsewhere might result + * in an indefinite wait in this state, for an object that is never released. + * If that happens, all we can do is to issue a warning message to help + * with the debugging. + */ +RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> Act/RangeBlocked" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + + // arrange to have a warning message issued if we are stuck in this + // state for longer than some reasonable number of minutes. 
+ m_timeout = scrbr->acquire_blocked_alarm(); +} + +// ----------------------- PendingTimer ----------------------------------- + +/** + * Sleeping till timer reactivation - or just requeuing + */ +PendingTimer::PendingTimer(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> Act/PendingTimer" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + + scrbr->add_delayed_scheduling(); +} + +// ----------------------- NewChunk ----------------------------------- + +/** + * Preconditions: + * - preemption data was set + * - epoch start was updated + */ +NewChunk::NewChunk(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> Act/NewChunk" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + + scrbr->get_preemptor().adjust_parameters(); + + // choose range to work on + // select_range_n_notify() will signal either SelectedChunkFree or + // ChunkIsBusy. If 'busy', we transition to Blocked, and wait for the + // range to become available. + scrbr->select_range_n_notify(); +} + +sc::result NewChunk::react(const SelectedChunkFree&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "NewChunk::react(const SelectedChunkFree&)" << dendl; + + scrbr->set_subset_last_update(scrbr->search_log_for_updates()); + return transit(); +} + +// ----------------------- WaitPushes ----------------------------------- + +WaitPushes::WaitPushes(my_context ctx) : my_base(ctx) +{ + dout(10) << " -- state -->> Act/WaitPushes" << dendl; + post_event(ActivePushesUpd{}); +} + +/* + * Triggered externally, by the entity that had an update re pushes + */ +sc::result WaitPushes::react(const ActivePushesUpd&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "WaitPushes::react(const ActivePushesUpd&) pending_active_pushes: " + << scrbr->pending_active_pushes() << dendl; + + if (!scrbr->pending_active_pushes()) { + // done waiting + return transit(); + } + + return discard_event(); +} + +// ----------------------- WaitLastUpdate 
----------------------------------- + +WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx) +{ + dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl; + post_event(UpdatesApplied{}); +} + +/** + * Note: + * Updates are locally readable immediately. Thus, on the replicas we do need + * to wait for the update notifications before scrubbing. For the Primary it's + * a bit different: on EC (and only there) rmw operations have an additional + * read roundtrip. That means that on the Primary we need to wait for + * last_update_applied (the replica side, even on EC, is still safe + * since the actual transaction will already be readable by commit time). + */ +void WaitLastUpdate::on_new_updates(const UpdatesApplied&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "WaitLastUpdate::on_new_updates(const UpdatesApplied&)" << dendl; + + if (scrbr->has_pg_marked_new_updates()) { + post_event(InternalAllUpdates{}); + } else { + // will be requeued by op_applied + dout(10) << "wait for EC read/modify/writes to queue" << dendl; + } +} + +/* + * request maps from the replicas in the acting set + */ +sc::result WaitLastUpdate::react(const InternalAllUpdates&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl; + + scrbr->get_replicas_maps(scrbr->get_preemptor().is_preemptable()); + return transit(); +} + +// ----------------------- BuildMap ----------------------------------- + +BuildMap::BuildMap(my_context ctx) : my_base(ctx) +{ + dout(10) << " -- state -->> Act/BuildMap" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + + // no need to check for an epoch change, as all possible flows that brought us here have + // a check_interval() verification of their final event. + + if (scrbr->get_preemptor().was_preempted()) { + + // we were preempted, either directly or by a replica + dout(10) << __func__ << " preempted!!!"
<< dendl; + scrbr->mark_local_map_ready(); + post_event(IntBmPreempted{}); + + } else { + + auto ret = scrbr->build_primary_map_chunk(); + + if (ret == -EINPROGRESS) { + // must wait for the backend to finish. No specific event provided. + // build_primary_map_chunk() has already requeued us. + dout(20) << "waiting for the backend..." << dendl; + + } else if (ret < 0) { + + dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl; + post_event(InternalError{}); + + } else { + + // the local map was created + post_event(IntLocalMapDone{}); + } + } +} + +sc::result BuildMap::react(const IntLocalMapDone&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "BuildMap::react(const IntLocalMapDone&)" << dendl; + + scrbr->mark_local_map_ready(); + return transit(); +} + +// ----------------------- DrainReplMaps ----------------------------------- + +DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> Act/DrainReplMaps" << dendl; + // we may have received all maps already. Send the event that will make us check. 
+ post_event(GotReplicas{}); +} + +sc::result DrainReplMaps::react(const GotReplicas&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "DrainReplMaps::react(const GotReplicas&)" << dendl; + + if (scrbr->are_all_maps_available()) { + // NewChunk will handle the preemption that brought us to this state + return transit(); + } + + dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: " + << scrbr->dump_awaited_maps() << dendl; + return discard_event(); +} + +// ----------------------- WaitReplicas ----------------------------------- + +WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> Act/WaitReplicas" << dendl; + post_event(GotReplicas{}); +} + +/** + * note: now that maps_compare_n_cleanup() is "futurized"(*), and we remain in this state + * for a while even after we got all our maps, we must prevent are_all_maps_available() + * (actually - the code after the if()) from being called more than once. + * This is basically a separate state, but it's too transitory and artificial to justify + * the cost of a separate state. + + * (*) "futurized" - in Crimson, the call to maps_compare_n_cleanup() returns immediately + * after initiating the process. The actual termination of the maps comparing etc' is + * signalled via an event. As we share the code with "classic" OSD, here too + * maps_compare_n_cleanup() is responsible for signalling the completion of the + * processing. + */ +sc::result WaitReplicas::react(const GotReplicas&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "WaitReplicas::react(const GotReplicas&)" << dendl; + + if (!all_maps_already_called && scrbr->are_all_maps_available()) { + dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl; + + all_maps_already_called = true; + + // were we preempted? + if (scrbr->get_preemptor().disable_and_test()) { // a test&set + + + dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" 
<< dendl; + return transit(); + + } else { + + // maps_compare_n_cleanup() will arrange for MapsCompared event to be sent: + scrbr->maps_compare_n_cleanup(); + return discard_event(); + } + } else { + return discard_event(); + } +} + +// ----------------------- WaitDigestUpdate ----------------------------------- + +WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl; + // perform an initial check: maybe we already + // have all the updates we need: + // (note that DigestUpdate is usually an external event) + post_event(DigestUpdate{}); +} + +sc::result WaitDigestUpdate::react(const DigestUpdate&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "WaitDigestUpdate::react(const DigestUpdate&)" << dendl; + + // on_digest_updates() will either: + // - do nothing - if we are still waiting for updates, or + // - finish the scrubbing of the current chunk, and: + // - send NextChunk, or + // - send ScrubFinished + + scrbr->on_digest_updates(); + return discard_event(); +} + +ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub) + : m_pg{pg}, m_pg_id{pg->pg_id}, m_scrbr{pg_scrub} +{ + dout(15) << "ScrubMachine created " << m_pg_id << dendl; +} + +ScrubMachine::~ScrubMachine() = default; + +// -------- for replicas ----------------------------------------------------- + +// ----------------------- ReplicaWaitUpdates -------------------------------- + +ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx) +{ + dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + scrbr->on_replica_init(); +} + +/* + * Triggered externally, by the entity that had an update re pushes + */ +sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): " + << scrbr->pending_active_pushes() << dendl; + + if 
(scrbr->pending_active_pushes() == 0) { + + // done waiting + return transit(); + } + + return discard_event(); +} + +/** + * the event poster is handling the scrubber reset + */ +sc::result ReplicaWaitUpdates::react(const FullReset&) +{ + dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl; + return transit(); +} + +// ----------------------- ActiveReplica ----------------------------------- + +ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "-- state -->> ActiveReplica" << dendl; + scrbr->on_replica_init(); // as we might have skipped ReplicaWaitUpdates + post_event(SchedReplica{}); +} + +sc::result ActiveReplica::react(const SchedReplica&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? " + << scrbr->get_preemptor().is_preemptable() << dendl; + + if (scrbr->get_preemptor().was_preempted()) { + dout(10) << "replica scrub job preempted" << dendl; + + scrbr->send_preempted_replica(); + scrbr->replica_handling_done(); + return transit(); + } + + // start or check progress of build_replica_map_chunk() + auto ret_init = scrbr->build_replica_map_chunk(); + if (ret_init != -EINPROGRESS) { + return transit(); + } + + return discard_event(); +} + +/** + * the event poster is handling the scrubber reset + */ +sc::result ActiveReplica::react(const FullReset&) +{ + dout(10) << "ActiveReplica::react(const FullReset&)" << dendl; + return transit(); +} + +} // namespace Scrub diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h new file mode 100644 index 00000000000..7f187005609 --- /dev/null +++ b/src/osd/scrubber/scrub_machine.h @@ -0,0 +1,346 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include 
"common/version.h" +#include "include/Context.h" + +#include "scrub_machine_lstnr.h" +#include "osd/scrubber_common.h" + +using namespace std::string_literals; + +class PG; // holding a pointer to that one - just for testing +class PgScrubber; +namespace Scrub { + +namespace sc = ::boost::statechart; +namespace mpl = ::boost::mpl; + +// +// EVENTS +// + +void on_event_creation(std::string_view nm); +void on_event_discard(std::string_view nm); + +#define MEV(E) \ + struct E : sc::event { \ + inline static int actv{0}; \ + E() \ + { \ + if (!actv++) \ + on_event_creation(#E); \ + } \ + ~E() \ + { \ + if (!--actv) \ + on_event_discard(#E); \ + } \ + void print(std::ostream* out) const { *out << #E; } \ + std::string_view print() const { return #E; } \ + }; + +MEV(RemotesReserved) ///< all replicas have granted our reserve request + +MEV(ReservationFailure) ///< a reservation request has failed + +MEV(StartScrub) ///< initiate a new scrubbing session (relevant if we are a Primary) + +MEV(AfterRepairScrub) ///< initiate a new scrubbing session. Only triggered at Recovery + ///< completion. + +MEV(Unblocked) ///< triggered when the PG unblocked an object that was marked for + ///< scrubbing. Via the PGScrubUnblocked op + +MEV(InternalSchedScrub) + +MEV(SelectedChunkFree) + +MEV(ChunkIsBusy) + +MEV(ActivePushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery + ///< that is in-flight to the local ObjectStore + +MEV(UpdatesApplied) ///< (Primary only) all updates are committed + +MEV(InternalAllUpdates) ///< the internal counterpart of UpdatesApplied + +MEV(GotReplicas) ///< got a map from a replica + +MEV(IntBmPreempted) ///< internal - BuildMap preempted. Required, as detected within the + ///< ctor + +MEV(InternalError) + +MEV(IntLocalMapDone) + +MEV(DigestUpdate) ///< external. called upon success of a MODIFY op. 
See + ///< scrub_snapshot_metadata() + +MEV(MapsCompared) ///< (Crimson) maps_compare_n_cleanup() transactions are done + +MEV(StartReplica) ///< initiating replica scrub. + +MEV(StartReplicaNoWait) ///< 'start replica' when there are no pending updates + +MEV(SchedReplica) + +MEV(ReplicaPushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery + ///< that is in-flight to the local ObjectStore + +MEV(FullReset) ///< guarantee that the FSM is in the quiescent state (i.e. NotActive) + +MEV(NextChunk) ///< finished handling this chunk. Go get the next one + +MEV(ScrubFinished) ///< all chunks handled + + +struct NotActive; ///< the quiescent state. No active scrubbing. +struct ReservingReplicas; ///< securing scrub resources from replicas' OSDs +struct ActiveScrubbing; ///< the active state for a Primary. A sub-machine. +struct ReplicaWaitUpdates; ///< an active state for a replica. Waiting for all active + ///< operations to finish. +struct ActiveReplica; ///< an active state for a replica. + + +class ScrubMachine : public sc::state_machine { + public: + friend class PgScrubber; + + public: + explicit ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub); + ~ScrubMachine(); + + PG* m_pg; // only used for dout messages + spg_t m_pg_id; + ScrubMachineListener* m_scrbr; + + void my_states() const; + void assert_not_active() const; + [[nodiscard]] bool is_reserving() const; + [[nodiscard]] bool is_accepting_updates() const; +}; + +/** + * The Scrubber's base (quiescent) state. + * Scrubbing is triggered by one of the following events: + * - (standard scenario for a Primary): 'StartScrub'. Initiates the OSDs resources + * reservation process. Will be issued by PG::scrub(), following a + * queued "PGScrub" op. + * - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is + * not required to reserve resources. + * - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming + * MOSDRepScrub message. 
+ * + * note (20.8.21): originally, AfterRepairScrub was triggering a scrub without waiting + * for replica resources to be acquired. But once replicas started using the + * resource-request to identify and tag the scrub session, this bypass cannot be + * supported anymore. + */ +struct NotActive : sc::state { + explicit NotActive(my_context ctx); + + using reactions = mpl::list, + // a scrubbing that was initiated at recovery completion, + // and requires no resource reservations: + sc::transition, + sc::transition, + sc::transition>; +}; + +struct ReservingReplicas : sc::state { + + explicit ReservingReplicas(my_context ctx); + using reactions = mpl::list, + // all replicas granted our resources request + sc::transition, + sc::custom_reaction>; + + sc::result react(const FullReset&); + + /// at least one replica denied us the scrub resources we've requested + sc::result react(const ReservationFailure&); +}; + + +// the "active" sub-states + +struct RangeBlocked; ///< the objects range is blocked +struct PendingTimer; ///< either delaying the scrub by some time and requeuing, or just + ///< requeue +struct NewChunk; ///< select a chunk to scrub, and verify its availability +struct WaitPushes; +struct WaitLastUpdate; +struct BuildMap; +struct DrainReplMaps; ///< a problem during BuildMap. Wait for all replicas to report, + ///< then restart. 
+struct WaitReplicas; ///< wait for all replicas to report +struct WaitDigestUpdate; + +struct ActiveScrubbing : sc::state { + + explicit ActiveScrubbing(my_context ctx); + ~ActiveScrubbing(); + + using reactions = mpl::list< + sc::custom_reaction, + sc::custom_reaction>; + + sc::result react(const FullReset&); + sc::result react(const InternalError&); +}; + +struct RangeBlocked : sc::state { + explicit RangeBlocked(my_context ctx); + using reactions = mpl::list>; + + Scrub::BlockedRangeWarning m_timeout; +}; + +struct PendingTimer : sc::state { + + explicit PendingTimer(my_context ctx); + + using reactions = mpl::list>; +}; + +struct NewChunk : sc::state { + + explicit NewChunk(my_context ctx); + + using reactions = mpl::list, + sc::custom_reaction>; + + sc::result react(const SelectedChunkFree&); +}; + +/** + * initiate the update process for this chunk + * + * Wait for 'active_pushes' to clear. + * 'active_pushes' represents recovery that is in-flight to the local ObjectStore, hence + * scrub waits until the correct data is readable (in-flight data to the ObjectStore is + * not readable until written to disk, termed 'applied' here) + */ +struct WaitPushes : sc::state { + + explicit WaitPushes(my_context ctx); + + using reactions = mpl::list>; + + sc::result react(const ActivePushesUpd&); +}; + +struct WaitLastUpdate : sc::state { + + explicit WaitLastUpdate(my_context ctx); + + void on_new_updates(const UpdatesApplied&); + + using reactions = mpl::list, + sc::in_state_reaction>; + + sc::result react(const InternalAllUpdates&); +}; + +struct BuildMap : sc::state { + explicit BuildMap(my_context ctx); + + // possible error scenarios: + // - an error reported by the backend will trigger an 'InternalError' event, + // handled by our parent state; + // - if preempted, we switch to DrainReplMaps, where we will wait for all + // replicas to send their maps before acknowledging the preemption; + // - an interval change will be handled by the relevant 'send-event'
functions, + // and will be translated into a 'FullReset' event. + using reactions = + mpl::list, + sc::transition, // looping, waiting + // for the backend to + // finish + sc::custom_reaction>; + + sc::result react(const IntLocalMapDone&); +}; + +/* + * "drain" scrub-maps responses from replicas + */ +struct DrainReplMaps : sc::state { + explicit DrainReplMaps(my_context ctx); + + using reactions = + mpl::list // all replicas are accounted for + >; + + sc::result react(const GotReplicas&); +}; + +struct WaitReplicas : sc::state { + explicit WaitReplicas(my_context ctx); + + using reactions = + mpl::list, // all replicas are accounted for + sc::transition, + sc::deferral // might arrive before we've reached WDU + >; + + sc::result react(const GotReplicas&); + + bool all_maps_already_called{false}; // see comment in react code +}; + +struct WaitDigestUpdate : sc::state { + explicit WaitDigestUpdate(my_context ctx); + + using reactions = mpl::list, + sc::transition, + sc::transition>; + sc::result react(const DigestUpdate&); +}; + +// ----------------------------- the "replica active" states ----------------------- + +/* + * Waiting for 'active_pushes' to complete + * + * When in this state: + * - the details of the Primary's request were internalized by PgScrubber; + * - 'active' scrubbing is set + */ +struct ReplicaWaitUpdates : sc::state { + explicit ReplicaWaitUpdates(my_context ctx); + using reactions = + mpl::list, sc::custom_reaction>; + + sc::result react(const ReplicaPushesUpd&); + sc::result react(const FullReset&); +}; + + +struct ActiveReplica : sc::state { + explicit ActiveReplica(my_context ctx); + using reactions = mpl::list, + sc::custom_reaction, + sc::transition>; + + sc::result react(const SchedReplica&); + sc::result react(const FullReset&); +}; + +} // namespace Scrub diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h new file mode 100644 index 00000000000..25bd080fbca --- /dev/null +++
b/src/osd/scrubber/scrub_machine_lstnr.h @@ -0,0 +1,164 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once +/** + * \file the PgScrubber interface used by the scrub FSM + */ +#include "common/version.h" +#include "include/Context.h" + +#include "osd/osd_types.h" + +namespace Scrub { + +enum class PreemptionNoted { no_preemption, preempted }; + +/// the interface exposed by the PgScrubber into its internal +/// preemption_data object +struct preemption_t { + + virtual ~preemption_t() = default; + + [[nodiscard]] virtual bool is_preemptable() const = 0; + + [[nodiscard]] virtual bool was_preempted() const = 0; + + virtual void adjust_parameters() = 0; + + /** + * Try to preempt the scrub. + * 'true' (i.e. - preempted) if: + * preemptable && not already preempted + */ + virtual bool do_preempt() = 0; + + /** + * disables preemptions. + * Returns 'true' if we were already preempted + */ + virtual bool disable_and_test() = 0; +}; + +/// an aux used when blocking on a busy object. +/// Issues a log warning if still blocked after 'waittime'. 
+struct blocked_range_t { + blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id); + ~blocked_range_t(); + + OSDService* m_osds; + Context* m_callbk; +}; + +using BlockedRangeWarning = std::unique_ptr; + +} // namespace Scrub + +struct ScrubMachineListener { + + struct MsgAndEpoch { + MessageRef m_msg; + epoch_t m_epoch; + }; + + virtual ~ScrubMachineListener() = default; + + [[nodiscard]] virtual bool is_primary() const = 0; + + virtual void select_range_n_notify() = 0; + + virtual Scrub::BlockedRangeWarning acquire_blocked_alarm() = 0; + + /// walk the log to find the latest update that affects our chunk + virtual eversion_t search_log_for_updates() const = 0; + + virtual eversion_t get_last_update_applied() const = 0; + + virtual int pending_active_pushes() const = 0; + + virtual int build_primary_map_chunk() = 0; + + virtual int build_replica_map_chunk() = 0; + + virtual void on_init() = 0; + + virtual void on_replica_init() = 0; + + virtual void replica_handling_done() = 0; + + /// the version of 'scrub_clear_state()' that does not try to invoke FSM services + /// (thus can be called from FSM reactions) + virtual void clear_pgscrub_state() = 0; + + /* + * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep' + * is asserted - after a configuration-dependent timeout. + */ + virtual void add_delayed_scheduling() = 0; + + /** + * Ask all replicas for their scrub maps for the current chunk. + */ + virtual void get_replicas_maps(bool replica_can_preempt) = 0; + + virtual void on_digest_updates() = 0; + + /** + * Prepare a MOSDRepScrubMap message carrying the requested scrub map + * @param was_preempted - were we preempted? + * @return the message, and the current value of 'm_replica_min_epoch' (which is + * used when sending the message, but will be overwritten before that). 
+ */ + [[nodiscard]] virtual MsgAndEpoch prep_replica_map_msg( + Scrub::PreemptionNoted was_preempted) = 0; + + /** + * Send to the primary the pre-prepared message containing the requested map + */ + virtual void send_replica_map(const MsgAndEpoch& preprepared) = 0; + + /** + * Let the primary know that we were preempted while trying to build the + * requested map. + */ + virtual void send_preempted_replica() = 0; + + [[nodiscard]] virtual bool has_pg_marked_new_updates() const = 0; + + virtual void set_subset_last_update(eversion_t e) = 0; + + [[nodiscard]] virtual bool was_epoch_changed() const = 0; + + virtual Scrub::preemption_t& get_preemptor() = 0; + + /** + * a "technical" collection of the steps performed once all + * rep maps are available: + * - the maps are compared + * - the scrub region markers (start_ & end_) are advanced + * - callbacks and ops that were pending are allowed to run + */ + virtual void maps_compare_n_cleanup() = 0; + + /** + * order the PgScrubber to initiate the process of reserving replicas' scrub + * resources. + */ + virtual void reserve_replicas() = 0; + + virtual void unreserve_replicas() = 0; + + /** + * the FSM interface into the "are we waiting for maps, either our own or from + * replicas" state. + * The FSM can only: + * - mark the local map as available, and + * - query status + */ + virtual void mark_local_map_ready() = 0; + + [[nodiscard]] virtual bool are_all_maps_available() const = 0; + + /// a log/debug interface + virtual std::string dump_awaited_maps() const = 0; +};