From daf848fa5afcf4ad86388eade472d2c3a4873826 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 23 Sep 2024 23:09:51 -0500 Subject: [PATCH] osd/scrub: separate shallow vs deep errors storage The ScrubStore now holds two ScrubStore::at_level_t objects, one for the shallow errors and one for the deep errors. The shallow errors DB is recreated at the start of every scrub, while the deep errors DB is only recreated at the start of a deep scrub. When queried by the operator for known scrub errors, the ScrubStore will return the union of the errors from both DBs. Signed-off-by: Ronen Friedman --- src/osd/scrubber/ScrubStore.cc | 285 +++++++++++++++++++++++++++----- src/osd/scrubber/ScrubStore.h | 42 ++++- src/osd/scrubber/pg_scrubber.cc | 2 +- 3 files changed, 285 insertions(+), 44 deletions(-) diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index 033ea6b24df..9c680da0de1 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -99,15 +99,30 @@ Store::Store( { ceph_assert(t); - const auto err_obj = pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid)); - t->touch(coll, err_obj); - errors_db.emplace(pgid, err_obj, OSDriver{&object_store, coll, err_obj}); + // shallow errors DB object + const auto sh_err_obj = + pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid)); + t->touch(coll, sh_err_obj); + shallow_db.emplace( + pgid, sh_err_obj, OSDriver{&object_store, coll, sh_err_obj}); + + // and the DB for deep errors + const auto dp_err_obj = + pgid.make_temp_ghobject(fmt::format("deep_scrub_{}", pgid)); + t->touch(coll, dp_err_obj); + deep_db.emplace(pgid, dp_err_obj, OSDriver{&object_store, coll, dp_err_obj}); + + dout(20) << fmt::format( + "created Scrub::Store for pg[{}], shallow: {}, deep: {}", + pgid, sh_err_obj, dp_err_obj) + << dendl; } Store::~Store() { - ceph_assert(!errors_db || errors_db->results.empty()); + ceph_assert(!shallow_db || shallow_db->results.empty()); + ceph_assert(!deep_db || deep_db->results.empty()); } @@ -127,12 +142,49 @@ void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) add_object_error(pool, e); } +namespace { + +inconsistent_obj_wrapper create_filtered_copy( + const inconsistent_obj_wrapper& obj, + uint64_t obj_err_mask, + uint64_t shard_err_mask) +{ + inconsistent_obj_wrapper dup = obj; + dup.errors &= obj_err_mask; + for (auto& [shard, si] : dup.shards) { + si.errors &= shard_err_mask; + } + return dup; +} + +} // namespace + + void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) { const auto key = to_object_key(pool, e.object); - bufferlist bl; - e.encode(bl); - errors_db->results[key] = bl; + dout(20) << fmt::format( + "adding error for object {} ({}). Errors: {} ({}/{}) wr:{}", + e.object, key, librados::err_t{e.errors}, + librados::err_t{e.errors & librados::err_t::SHALLOW_ERRORS}, + librados::err_t{e.errors & librados::err_t::DEEP_ERRORS}, e) + << dendl; + + // divide the errors & shard errors into shallow and deep. 
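  // Illustrative only, not part of the patch itself: a minimal sketch of the
  // split performed by create_filtered_copy() below, using made-up bit values
  // in place of the real librados error flags:
  //
  //   uint64_t combined     = 0b0110;   // one "shallow" bit, one "deep" bit
  //   uint64_t shallow_mask = 0b0011;
  //   uint64_t deep_mask    = 0b1100;
  //   assert((combined & shallow_mask) == 0b0010);  // what lands in shallow_db
  //   assert((combined & deep_mask)    == 0b0100);  // what lands in deep_db
  //
  // As each error bit belongs to one of the two masks, OR-ing the two stored
  // copies back together reproduces the original error set.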
+ { + bufferlist bl; + create_filtered_copy( + e, librados::obj_err_t::SHALLOW_ERRORS, librados::err_t::SHALLOW_ERRORS) + .encode(bl); + shallow_db->results[key] = bl; + } + { + bufferlist bl; + create_filtered_copy( + e, librados::obj_err_t::DEEP_ERRORS, librados::err_t::DEEP_ERRORS) + .encode(bl); + deep_db->results[key] = bl; + } } @@ -144,23 +196,29 @@ void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e) void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) { - errors_db->results[to_snap_key(pool, e.object)] = e.encode(); + // note: snap errors are only placed in the shallow store + shallow_db->results[to_snap_key(pool, e.object)] = e.encode(); } bool Store::is_empty() const { - return !errors_db || errors_db->results.empty(); + return (!shallow_db || shallow_db->results.empty()) && + (!deep_db || deep_db->results.empty()); } void Store::flush(ObjectStore::Transaction* t) { if (t) { - auto txn = errors_db->driver.get_transaction(t); - errors_db->backend.set_keys(errors_db->results, &txn); + auto txn = shallow_db->driver.get_transaction(t); + shallow_db->backend.set_keys(shallow_db->results, &txn); + txn = deep_db->driver.get_transaction(t); + deep_db->backend.set_keys(deep_db->results, &txn); } - errors_db->results.clear(); + + shallow_db->results.clear(); + deep_db->results.clear(); } @@ -184,18 +242,23 @@ void Store::clear_level_db( void Store::reinit( ObjectStore::Transaction* t, - [[maybe_unused]] scrub_level_t level) + scrub_level_t level) { + // Note: only one caller, and it creates the transaction passed to reinit(). + // No need to assert on 't' dout(20) << fmt::format( "re-initializing the Scrub::Store (for {} scrub)", (level == scrub_level_t::deep ? "deep" : "shallow")) << dendl; - // Note: only one caller, and it creates the transaction passed to reinit(). - // No need to assert on 't' - - if (errors_db) { - clear_level_db(t, *errors_db, "scrub"); + // always clear the known shallow errors DB (as both shallow and deep scrubs + // would recreate it) + if (shallow_db) { + clear_level_db(t, *shallow_db, "shallow"); + } + // only a deep scrub recreates the deep errors DB + if (level == scrub_level_t::deep && deep_db) { + clear_level_db(t, *deep_db, "deep"); } } @@ -204,8 +267,10 @@ void Store::cleanup(ObjectStore::Transaction* t) { dout(20) << "discarding error DBs" << dendl; ceph_assert(t); - if (errors_db) - t->remove(coll, errors_db->errors_hoid); + if (shallow_db) + t->remove(coll, shallow_db->errors_hoid); + if (deep_db) + t->remove(coll, deep_db->errors_hoid); } @@ -214,42 +279,180 @@ std::vector Store::get_snap_errors( const librados::object_id_t& start, uint64_t max_return) const { - const string begin = (start.name.empty() ? - first_snap_key(pool) : to_snap_key(pool, start)); + vector errors; + const string begin = + (start.name.empty() ? 
          first_snap_key(pool) : to_snap_key(pool, start));
   const string end = last_snap_key(pool);
-  return get_errors(begin, end, max_return);
+
+  // the snap errors are stored only in the shallow store
+  ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(begin);
+
+  while (max_return-- && latest_sh.has_value() && latest_sh->last_key < end) {
+    errors.push_back(latest_sh->data);
+    latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key);
+  }
+
+  return errors;
 }
 
-std::vector<bufferlist>
-Store::get_object_errors(int64_t pool,
-                         const librados::object_id_t& start,
-                         uint64_t max_return) const
+
+std::vector<bufferlist> Store::get_object_errors(
+    int64_t pool,
+    const librados::object_id_t& start,
+    uint64_t max_return) const
 {
-  const string begin = (start.name.empty() ?
-                        first_object_key(pool) : to_object_key(pool, start));
+  const string begin =
+      (start.name.empty() ? first_object_key(pool)
+                          : to_object_key(pool, start));
   const string end = last_object_key(pool);
+  dout(20) << fmt::format("fetching errors, from {} to {}", begin, end)
+           << dendl;
   return get_errors(begin, end, max_return);
 }
 
-std::vector<bufferlist>
-Store::get_errors(const string& begin,
-                  const string& end,
-                  uint64_t max_return) const
+
+inline void decode(
+    librados::inconsistent_obj_t& obj,
+    ceph::buffer::list::const_iterator& bp)
 {
+  reinterpret_cast<inconsistent_obj_wrapper&>(obj).decode(bp);
+}
+
+
+inconsistent_obj_wrapper decode_wrapper(
+    hobject_t obj,
+    ceph::buffer::list::const_iterator bp)
+{
+  inconsistent_obj_wrapper iow{obj};
+  iow.decode(bp);
+  return iow;
+}
+
+
+void Store::collect_specific_store(
+    MapCacher::MapCacher<std::string, ceph::buffer::list>& backend,
+    Store::ExpCacherPosData& latest,
+    std::vector<bufferlist>& errors,
+    std::string_view end_key,
+    uint64_t max_return) const
+{
+  while (max_return-- && latest.has_value() &&
+         latest.value().last_key < end_key) {
+    errors.push_back(latest->data);
+    latest = backend.get_1st_after_key(latest->last_key);
+  }
+}
+
+
+bufferlist Store::merge_encoded_error_wrappers(
+    hobject_t obj,
+    ExpCacherPosData& latest_sh,
+    ExpCacherPosData& latest_dp) const
+{
+  // decode both error wrappers
+  auto sh_wrap = decode_wrapper(obj, latest_sh->data.cbegin());
+  auto dp_wrap = decode_wrapper(obj, latest_dp->data.cbegin());
+  dout(20) << fmt::format(
+                  "merging errors {}. Shallow: {}-({}), Deep: {}-({})",
+                  sh_wrap.object, sh_wrap.errors, dp_wrap.errors, sh_wrap,
+                  dp_wrap)
+           << dendl;
+
+  // merge the object errors (a simple OR of the two error bit-sets)
+  sh_wrap.errors |= dp_wrap.errors;
+
+  // merge the two shard error maps
+  for (const auto& [shard, si] : dp_wrap.shards) {
+    dout(20) << fmt::format(
+                    "shard {} dp-errors: {} sh-errors:{}", shard, si.errors,
+                    sh_wrap.shards[shard].errors)
+             << dendl;
+    // note: we may be creating the shallow shard entry here. This is OK
+    sh_wrap.shards[shard].errors |= si.errors;
+  }
+
+  return sh_wrap.encode();
+}
+
+
+// a better way to implement get_errors(): use two generators, one for each
+// store, and sort-merge the results. Almost like a merge-sort, but with equal
+// keys combined. 'todo' once 'ranges' are really working.
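
// A rough sketch (illustrative only, not part of this change) of the 'two
// generators' idea mentioned above. It assumes each store could be exposed as
// a sorted range of (key, encoded-error) pairs; equal keys are combined by a
// caller-supplied function, and everything else is a plain two-way merge:
//
//   template <typename Range, typename Combine>
//   std::vector<ceph::buffer::list> merge_sorted(
//       const Range& shallow, const Range& deep, Combine&& combine)
//   {
//     std::vector<ceph::buffer::list> out;
//     auto s = std::begin(shallow);
//     auto d = std::begin(deep);
//     while (s != std::end(shallow) && d != std::end(deep)) {
//       if (s->first == d->first) {
//         out.push_back(combine(s->second, d->second));  // same object in both
//         ++s;
//         ++d;
//       } else if (s->first < d->first) {
//         out.push_back((s++)->second);
//       } else {
//         out.push_back((d++)->second);
//       }
//     }
//     for (; s != std::end(shallow); ++s) out.push_back(s->second);
//     for (; d != std::end(deep); ++d) out.push_back(d->second);
//     return out;
//   }
//
// get_errors() below implements the same merge imperatively, with
// MapCacher::get_1st_after_key() acting as the two generators and
// merge_encoded_error_wrappers() as the combine step.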
+ +std::vector Store::get_errors( + const std::string& from_key, + const std::string& end_key, + uint64_t max_return) const +{ + // merge the input from the two sorted DBs into 'errors' (until + // enough errors are collected) vector errors; - if (!errors_db) - return errors; + dout(20) << fmt::format("getting errors from {} to {}", from_key, end_key) + << dendl; - auto next = std::make_pair(begin, bufferlist{}); - while (max_return && !errors_db->backend.get_next(next.first, &next)) { - if (next.first >= end) + ceph_assert(shallow_db); + ceph_assert(deep_db); + ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(from_key); + ExpCacherPosData latest_dp = deep_db->backend.get_1st_after_key(from_key); + + while (max_return) { + dout(20) << fmt::format( + "n:{} latest_sh: {}, latest_dp: {}", max_return, + (latest_sh ? latest_sh->last_key : "(none)"), + (latest_dp ? latest_dp->last_key : "(none)")) + << dendl; + + // keys not smaller than end_key are not interesting + if (latest_sh.has_value() && latest_sh->last_key >= end_key) { + latest_sh = tl::unexpected(-EINVAL); + } + if (latest_dp.has_value() && latest_dp->last_key >= end_key) { + latest_dp = tl::unexpected(-EINVAL); + } + + if (!latest_sh && !latest_dp) { + // both stores are exhausted + break; + } + if (!latest_sh.has_value()) { + // continue with the deep store + dout(10) << fmt::format("collecting from deep store") << dendl; + collect_specific_store( + deep_db->backend, latest_dp, errors, end_key, max_return); break; - errors.push_back(next.second); + } + if (!latest_dp.has_value()) { + // continue with the shallow store + dout(10) << fmt::format("collecting from shallow store") << dendl; + collect_specific_store( + shallow_db->backend, latest_sh, errors, end_key, max_return); + break; + } + + // we have results from both stores. Select the one with a lower key. + // If the keys are equal, combine the errors. + if (latest_sh->last_key == latest_dp->last_key) { + auto bl = merge_encoded_error_wrappers( + shallow_db->errors_hoid.hobj, latest_sh, latest_dp); + errors.push_back(bl); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key); + + } else if (latest_sh->last_key < latest_dp->last_key) { + dout(20) << fmt::format("shallow store element ({})", latest_sh->last_key) + << dendl; + errors.push_back(latest_sh->data); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + } else { + dout(20) << fmt::format("deep store element ({})", latest_dp->last_key) + << dendl; + errors.push_back(latest_dp->data); + latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key); + } max_return--; } dout(10) << fmt::format("{} errors reported", errors.size()) << dendl; return errors; } - -} // namespace Scrub +} // namespace Scrub diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 9eb77ab667d..8a30e8daf85 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -16,6 +16,28 @@ class PgScrubber; namespace Scrub { +/** + * Storing errors detected during scrubbing. + * + * From both functional and internal perspectives, the store is a pair of key-value + * databases: one maps objects to shallow errors detected during their scrubbing, + * and other stores deep errors. + * Note that the first store is updated in both shallow and in deep scrubs. The + * second - only while deep scrubbing. 
+ *
+ * The DBs can be consulted by the operator, when trying to list 'errors known
+ * at this point in time'. Whenever a scrub starts - the relevant entries in the
+ * DBs are removed. Specifically - the shallow errors DB is recreated each scrub,
+ * while the deep errors DB is recreated only when a deep scrub starts.
+ *
+ * When queried - the data from both DBs is merged for each named object, and
+ * returned to the operator.
+ *
+ * Implementation:
+ * Each of the two DBs is implemented as OMAP entries of a single, uniquely named,
+ * object. Both DBs are cached using the general KV Cache mechanism.
+ */
+
 class Store {
  public:
   ~Store();
@@ -114,14 +136,21 @@ class Store {
    * allocations; and 'mutable', as the caching mechanism is used in const
    * methods)
    */
-  mutable std::optional<at_level_t> errors_db;
-  // not yet: mutable std::optional<at_level_t> deep_db;
+  mutable std::optional<at_level_t> shallow_db;
+  mutable std::optional<at_level_t> deep_db;
 
   std::vector<bufferlist> get_errors(
     const std::string& start,
     const std::string& end,
     uint64_t max_return) const;
 
+  void collect_specific_store(
+    MapCacher::MapCacher<std::string, ceph::buffer::list>& backend,
+    ExpCacherPosData& latest,
+    std::vector<bufferlist>& errors,
+    std::string_view end_key,
+    uint64_t max_return) const;
+
   /**
    * Clear the DB of errors at a specific scrub level by performing an
    * omap_clear() on the DB object, and resetting the MapCacher.
@@ -131,5 +160,14 @@ class Store {
     at_level_t& db,
     std::string_view db_name);
 
+  /**
+   * merge the two error wrappers - fetched from both DBs for the same object.
+   * Specifically, the object errors are or'ed, and so are the per-shard
+   * entries.
+   */
+  bufferlist merge_encoded_error_wrappers(
+    hobject_t obj,
+    ExpCacherPosData& latest_sh,
+    ExpCacherPosData& latest_dp) const;
 };
 }  // namespace Scrub
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 81093666f91..594ffb15e2b 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -1209,7 +1209,7 @@ void PgScrubber::reinit_scrub_store()
   // actual Object Store objects).
   // 1. The ScrubStore object itself.
   // 2,3. The two special hobjects in the coll (the PG data) holding the last
-  // scrub's results. <>
+  // scrub's results.
   //
   // The Store object can be deleted and recreated, as a way to guarantee
   // no junk is left. We won't do it here, but we will clear the at_level_t
-- 
2.39.5
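
A short usage sketch of the lifecycle described in the new ScrubStore.h comment
block (illustrative only, not part of the patch; 'store', 't' and 'pool_id' are
assumed to be a Store instance, a transaction prepared by the caller - as in
PgScrubber::reinit_scrub_store() - and a pool id):

  // before a shallow scrub: only the shallow-errors DB is wiped, so deep
  // errors found by the last deep scrub stay visible to the operator
  store.reinit(&t, scrub_level_t::shallow);

  // before a deep scrub: both DBs are wiped
  store.reinit(&t, scrub_level_t::deep);

  // listing known errors merges both DBs per object (see get_errors());
  // an empty 'start' object means "from the first key of the pool"
  auto errs = store.get_object_errors(pool_id, librados::object_id_t{}, 100);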