From: Bill Scales
Date: Thu, 3 Apr 2025 11:15:31 +0000 (+0100)
Subject: osd: EC Optimizations: Backfill changes for partial writes
X-Git-Tag: v20.3.0~30^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=babb801aa8603b971a5a8ea401d44f95116e108a;p=ceph.git

osd: EC Optimizations: Backfill changes for partial writes

Optimized EC pools support partial writes that do not update every shard.
Consequently, shards that are not updated can have out-of-date version
numbers. The primary shard's object_info_t is always updated and tracks the
expected version of each shard.

To avoid unnecessary backfill work, changes are required to use this extra
data in object_info_t when comparing version numbers, to work out whether a
shard is missing updates or just didn't participate in recent partial writes.

See comments in src/osd/recovery_types.h

Signed-off-by: Bill Scales
---

diff --git a/src/crimson/osd/CMakeLists.txt b/src/crimson/osd/CMakeLists.txt
index a3b6b47d4d44a..ff78904c8faf0 100644
--- a/src/crimson/osd/CMakeLists.txt
+++ b/src/crimson/osd/CMakeLists.txt
@@ -55,7 +55,6 @@ add_executable(crimson-osd
   ${PROJECT_SOURCE_DIR}/src/osd/MissingLoc.cc
   ${PROJECT_SOURCE_DIR}/src/osd/PGLog.cc
   ${PROJECT_SOURCE_DIR}/src/osd/SnapMapper.cc
-  ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc
   ${PROJECT_SOURCE_DIR}/src/osd/osd_perf_counters.cc
   ${PROJECT_SOURCE_DIR}/src/mgr/OSDPerfMetricTypes.cc
   watch.cc
diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h
index de1cda434b85c..f98efa5bea669 100644
--- a/src/crimson/osd/backfill_facades.h
+++ b/src/crimson/osd/backfill_facades.h
@@ -62,6 +62,11 @@ struct PeeringFacade final : BackfillState::PeeringFacade {
     const std::vector<pg_shard_t> &peers) override {
     return peering_state.prepare_backfill_for_missing(soid, v, peers);
   }
+
+  const pg_pool_t& get_pool() const override {
+    return peering_state.get_pgpool().info;
+  }
+
   PeeringFacade(PeeringState& peering_state)
     : peering_state(peering_state) {
   }
diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc
index 7fea4b7d58423..38373929b96b3 100644
--- a/src/crimson/osd/backfill_state.cc
+++ b/src/crimson/osd/backfill_state.cc
@@ -85,7 +85,7 @@ BackfillState::Initial::react(const BackfillState::Triggered& evt)
   ceph_assert(backfill_state().last_backfill_started == \
               peering_state().earliest_backfill());
   ceph_assert(peering_state().is_backfilling());
-  // initialize BackfillIntervals
+  // initialize ReplicaBackfillIntervals
   for (const auto& bt : peering_state().get_backfill_targets()) {
     backfill_state().peer_backfill_info[bt].reset(
       peering_state().get_peer_last_backfill(bt));
@@ -134,13 +134,52 @@ void BackfillState::Enqueuing::maybe_update_range()
     if (e.is_update()) {
       DEBUGDPP("maybe_update_range(lambda): {} updated to ver {}",
                pg(), e.soid, e.version);
-      primary_bi.objects.erase(e.soid);
-      primary_bi.objects.insert(std::make_pair(e.soid,
-                                               e.version));
+      if (e.written_shards.empty()) {
+        // Log entry updates all shards, replace all entries for e.soid
+        primary_bi.objects.erase(e.soid);
+        primary_bi.objects.insert(
+          std::make_pair(e.soid,
+                         std::make_pair(shard_id_t::NO_SHARD,
+                                        e.version)));
+      } else {
+        // Update backfill interval for shards modified by log entry
+        std::map<shard_id_t, eversion_t> versions;
+        // Create map from existing entries in backfill interval
+        const auto & [begin, end] = primary_bi.objects.equal_range(e.soid);
+        for (const auto & entry : std::ranges::subrange(begin, end)) {
+          const auto & [shard, version] = entry.second;
+          versions[shard] = version;
+        }
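A note on the algorithm in the hunk above (it continues below): each object in
a PrimaryBackfillInterval is tracked as a per-shard version map, and folding a
pg_log entry into it is a small, self-contained update. The sketch below
mirrors that logic under simplified stand-in types (int shards with -1 playing
the role of shard_id_t::NO_SHARD, unsigned versions); it is an illustration,
not code from the patch:

  #include <map>
  #include <set>

  using Shard = int;                  // stand-in for shard_id_t
  constexpr Shard NO_SHARD = -1;      // stand-in for shard_id_t::NO_SHARD
  using Version = unsigned;           // stand-in for eversion_t

  // Fold one log entry into the per-shard version map of a single object.
  // An empty written_shards set means the write touched every shard.
  void apply_update(std::map<Shard, Version>& versions,
                    const std::set<Shard>& backfill_targets,
                    const std::set<Shard>& written_shards,
                    Version prior_version, Version version)
  {
    if (written_shards.empty()) {
      versions.clear();               // full write: one NO_SHARD entry
      versions[NO_SHARD] = version;
      return;
    }
    bool uses_default = false;
    for (Shard s : backfill_targets) {
      if (written_shards.contains(s)) {
        versions.erase(s);            // written shard advances to e.version
        uses_default = true;
      } else if (!versions.contains(s)) {
        versions[s] = prior_version;  // unwritten shard stays behind
      }                               // else: keep existing version
    }
    if (uses_default) {
      versions[NO_SHARD] = version;   // default entry covers written shards
    } else {
      versions.erase(NO_SHARD);       // every target has an explicit entry
    }
  }

Keeping a single NO_SHARD entry plus per-shard exceptions means the common
full-stripe-write case still costs one multimap entry per object.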
+ // Update entries in map that are modified by log entry + bool uses_default = false; + for (const auto & shard : peering_state().get_backfill_targets()) { + if (e.is_written_shard(shard.shard)) { + versions.erase(shard.shard); + uses_default = true; + } else { + if (!versions.contains(shard.shard)) { + versions[shard.shard] = e.prior_version; + } + //Else: keep existing version + } + } + if (uses_default) { + versions[shard_id_t::NO_SHARD] = e.version; + } else { + versions.erase(shard_id_t::NO_SHARD); + } + // Erase and recreate backfill interval for e.soid using map + primary_bi.objects.erase(e.soid); + for (auto & [shard, version] : versions) { + primary_bi.objects.insert( + std::make_pair(e.soid, + std::make_pair(shard, version))); + } + } } else if (e.is_delete()) { DEBUGDPP("maybe_update_range(lambda): {} removed", pg(), e.soid); - primary_bi.objects.erase(e.soid); + primary_bi.objects.erase(e.soid); // Erase all entries for e.soid } } }; @@ -168,8 +207,8 @@ void BackfillState::Enqueuing::trim_backfill_infos() /* static */ bool BackfillState::Enqueuing::all_enqueued( const PeeringFacade& peering_state, - const BackfillInterval& backfill_info, - const std::map& peer_backfill_info) + const PrimaryBackfillInterval& backfill_info, + const std::map& peer_backfill_info) { const bool all_local_enqueued = \ backfill_info.extends_to_end() && backfill_info.empty(); @@ -184,7 +223,8 @@ void BackfillState::Enqueuing::trim_backfill_infos() } hobject_t BackfillState::Enqueuing::earliest_peer_backfill( - const std::map& peer_backfill_info) const + const std::map& peer_backfill_info) const { hobject_t e = hobject_t::get_max(); for (const pg_shard_t& bt : peering_state().get_backfill_targets()) { @@ -196,8 +236,8 @@ hobject_t BackfillState::Enqueuing::earliest_peer_backfill( } bool BackfillState::Enqueuing::should_rescan_replicas( - const std::map& peer_backfill_info, - const BackfillInterval& backfill_info) const + const std::map& peer_backfill_info, + const PrimaryBackfillInterval& backfill_info) const { const auto& targets = peering_state().get_backfill_targets(); return std::any_of(std::begin(targets), std::end(targets), @@ -208,8 +248,8 @@ bool BackfillState::Enqueuing::should_rescan_replicas( } bool BackfillState::Enqueuing::should_rescan_primary( - const std::map& peer_backfill_info, - const BackfillInterval& backfill_info) const + const std::map& peer_backfill_info, + const PrimaryBackfillInterval& backfill_info) const { return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) && !backfill_info.extends_to_end() && backfill_info.empty(); @@ -218,7 +258,7 @@ bool BackfillState::Enqueuing::should_rescan_primary( void BackfillState::Enqueuing::trim_backfilled_object_from_intervals( BackfillState::Enqueuing::result_t&& result, hobject_t& last_backfill_started, - std::map& peer_backfill_info) + std::map& peer_backfill_info) { std::for_each(std::begin(result.pbi_targets), std::end(result.pbi_targets), [&peer_backfill_info] (const auto& bt) { @@ -257,13 +297,28 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check) result_t result { {}, primary_bi.begin }; std::map>> backfills; + std::map versions; + auto it = primary_bi.objects.begin(); + const hobject_t& hoid = it->first; + eversion_t obj_v; + while (it != primary_bi.objects.end() && it->first == hoid) { + obj_v = std::max(obj_v, it->second.second); + versions[it->second.first] = it->second.second; + ++it; + } + for (const auto& bt : peering_state().get_backfill_targets()) { const auto& peer_bi = 
backfill_state().peer_backfill_info.at(bt); // Find all check peers that have the wrong version - if (const eversion_t& obj_v = primary_bi.objects.begin()->second; - check == primary_bi.begin && check == peer_bi.begin) { - if (peer_bi.objects.begin()->second != obj_v) { + if (check == primary_bi.begin && check == peer_bi.begin) { + eversion_t replicaobj_v; + if (versions.contains(bt.shard)) { + replicaobj_v = versions.at(bt.shard); + } else { + replicaobj_v = versions.at(shard_id_t::NO_SHARD); + } + if (peer_bi.objects.begin()->second != replicaobj_v) { std::ignore = backfill_state().progress_tracker->enqueue_push( primary_bi.begin); auto &[v, peers] = backfills[primary_bi.begin]; @@ -298,8 +353,9 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check) } bool BackfillState::Enqueuing::Enqueuing::all_emptied( - const BackfillInterval& local_backfill_info, - const std::map& peer_backfill_info) const + const PrimaryBackfillInterval& local_backfill_info, + const std::map& peer_backfill_info) const { const auto& targets = peering_state().get_backfill_targets(); const auto replicas_emptied = @@ -459,8 +515,8 @@ BackfillState::PrimaryScanning::react(ObjectPushed evt) // -- ReplicasScanning bool BackfillState::ReplicasScanning::replica_needs_scan( - const BackfillInterval& replica_backfill_info, - const BackfillInterval& local_backfill_info) + const ReplicaBackfillInterval& replica_backfill_info, + const PrimaryBackfillInterval& local_backfill_info) { return replica_backfill_info.empty() && \ replica_backfill_info.begin <= local_backfill_info.begin && \ diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index 75129d3974501..c295dd9c460c3 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -15,6 +15,7 @@ #include "osd/recovery_types.h" #include "osd/PGLog.h" +#include "osd/PeeringState.h" namespace crimson::osd { @@ -27,16 +28,16 @@ struct BackfillState { // events comes first struct PrimaryScanned : sc::event { - BackfillInterval result; - PrimaryScanned(BackfillInterval&& result) + PrimaryBackfillInterval result; + PrimaryScanned(PrimaryBackfillInterval&& result) : result(std::move(result)) { } }; struct ReplicaScanned : sc::event { pg_shard_t from; - BackfillInterval result; - ReplicaScanned(pg_shard_t from, BackfillInterval&& result) + ReplicaBackfillInterval result; + ReplicaScanned(pg_shard_t from, ReplicaBackfillInterval&& result) : from(std::move(from)), result(std::move(result)) { } @@ -166,8 +167,8 @@ public: // completed yet. static bool all_enqueued( const PeeringFacade& peering_state, - const BackfillInterval& backfill_info, - const std::map& peer_backfill_info); + const PrimaryBackfillInterval& backfill_info, + const std::map& peer_backfill_info); private: void maybe_update_range(); @@ -176,25 +177,27 @@ public: // these methods take BackfillIntervals instead of extracting them from // the state to emphasize the relationships across the main loop. 
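The essence of the update_on_peers() change above: a backfill target is now
compared against the version expected for its specific shard rather than
against a single object version. A stand-in sketch of that comparison (same
simplified types as earlier; hypothetical helpers, not the patch's code):

  #include <map>

  using Shard = int;
  constexpr Shard NO_SHARD = -1;
  using Version = unsigned;

  // Version the primary expects a given shard to hold. The invariant from
  // recovery_types.h guarantees a NO_SHARD entry exists whenever a backfill
  // target has no explicit entry of its own.
  Version expected_version(const std::map<Shard, Version>& versions,
                           Shard shard)
  {
    auto it = versions.find(shard);
    return (it != versions.end()) ? it->second : versions.at(NO_SHARD);
  }

  // A target only needs a push when its scanned version differs from the
  // expected one; a shard that merely skipped partial writes compares equal
  // and is left alone.
  bool needs_push(const std::map<Shard, Version>& versions,
                  Shard shard, Version scanned)
  {
    return scanned != expected_version(versions, shard);
  }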
bool all_emptied( - const BackfillInterval& local_backfill_info, - const std::map& peer_backfill_info) const; + const PrimaryBackfillInterval& local_backfill_info, + const std::map& peer_backfill_info) const; hobject_t earliest_peer_backfill( - const std::map& peer_backfill_info) const; + const std::map& peer_backfill_info) const; bool should_rescan_replicas( - const std::map& peer_backfill_info, - const BackfillInterval& backfill_info) const; + const std::map& peer_backfill_info, + const PrimaryBackfillInterval& backfill_info) const; // indicate whether a particular acting primary needs to scanned again // to process next piece of the hobject_t's namespace. // the logic is per analogy to replica_needs_scan(). See comments there. bool should_rescan_primary( - const std::map& peer_backfill_info, - const BackfillInterval& backfill_info) const; + const std::map& peer_backfill_info, + const PrimaryBackfillInterval& backfill_info) const; // the result_t is intermediary between {remove,update}_on_peers() and - // updating BackfillIntervals in trim_backfilled_object_from_intervals. - // This step is important because it affects the main loop's condition, - // and thus deserves to be exposed instead of being called deeply from - // {remove,update}_on_peers(). + // updating ReplicaBackfillIntervals in + // trim_backfilled_object_from_intervals. This step is important + // because it affects the main loop's condition, and thus deserves to be + // exposed instead of being called deeply from {remove,update}_on_peers(). struct [[nodiscard]] result_t { std::set pbi_targets; hobject_t new_last_backfill_started; @@ -202,7 +205,7 @@ public: void trim_backfilled_object_from_intervals( result_t&&, hobject_t& last_backfill_started, - std::map& peer_backfill_info); + std::map& peer_backfill_info); result_t remove_on_peers(const hobject_t& check); result_t update_on_peers(const hobject_t& check); }; @@ -242,12 +245,12 @@ public: sc::result react(Triggered); // indicate whether a particular peer should be scanned to retrieve - // BackfillInterval for new range of hobject_t namespace. + // ReplicaBackfillInterval for new range of hobject_t namespace. // true when bi.objects is exhausted, replica bi's end is not MAX, // and primary bi'begin is further than the replica's one. 
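Spelled out, the condition documented in the comment above amounts to the
following stand-in sketch (std::string in place of hobject_t; the real
predicate, declared next, takes the two backfill intervals):

  #include <string>

  // True when the replica's interval is drained (objects exhausted), the
  // interval does not already extend to the end of the namespace, and the
  // primary's scan position has caught up with the replica's.
  bool replica_needs_scan_sketch(bool replica_objects_empty,
                                 bool replica_extends_to_end,
                                 const std::string& replica_begin,
                                 const std::string& primary_begin)
  {
    return replica_objects_empty &&
           replica_begin <= primary_begin &&
           !replica_extends_to_end;
  }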
static bool replica_needs_scan( - const BackfillInterval& replica_backfill_info, - const BackfillInterval& local_backfill_info); + const ReplicaBackfillInterval& replica_backfill_info, + const PrimaryBackfillInterval& local_backfill_info); private: std::set waiting_on_backfill; @@ -339,8 +342,8 @@ private: backfill_suspend_state.should_go_enqueuing = true; } hobject_t last_backfill_started; - BackfillInterval backfill_info; - std::map peer_backfill_info; + PrimaryBackfillInterval backfill_info; + std::map peer_backfill_info; BackfillMachine backfill_machine; std::unique_ptr progress_tracker; size_t replicas_in_backfill = 0; @@ -408,6 +411,7 @@ struct BackfillState::PeeringFacade { const hobject_t &soid, const eversion_t &v, const std::vector &peers) = 0; + virtual const pg_pool_t& get_pool() const = 0; virtual ~PeeringFacade() {} }; diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index f219411acda9d..39afa0383ac39 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -508,11 +508,12 @@ void PGRecovery::request_primary_scan( { logger().debug("{}", __func__); using crimson::common::local_conf; - std::ignore = pg->get_recovery_backend()->scan_for_backfill( + std::ignore = pg->get_recovery_backend()->scan_for_backfill_primary( begin, local_conf()->osd_backfill_scan_min, - local_conf()->osd_backfill_scan_max - ).then_interruptible([this] (BackfillInterval bi) { + local_conf()->osd_backfill_scan_max, + pg->get_peering_state().get_backfill_targets() + ).then_interruptible([this] (PrimaryBackfillInterval bi) { logger().debug("request_primary_scan:{}", __func__); using BackfillState = crimson::osd::BackfillState; backfill_state->process_event( diff --git a/src/crimson/osd/recovery_backend.cc b/src/crimson/osd/recovery_backend.cc index b0dbcfba9839f..81174e2ba471f 100644 --- a/src/crimson/osd/recovery_backend.cc +++ b/src/crimson/osd/recovery_backend.cc @@ -222,17 +222,81 @@ RecoveryBackend::handle_backfill_remove( pg.get_collection_ref(), std::move(t)).or_terminate()); } -RecoveryBackend::interruptible_future -RecoveryBackend::scan_for_backfill( +RecoveryBackend::interruptible_future +RecoveryBackend::scan_for_backfill_primary( + const hobject_t start, + [[maybe_unused]] const std::int64_t min, + const std::int64_t max, + const std::set &backfill_targets) +{ + LOG_PREFIX(RecoveryBackend::scan_for_backfill_primary); + DEBUGDPP("starting from {}", pg, start); + auto version_map = seastar::make_lw_shared>>(); + auto&& [objects, next] = co_await backend->list_objects(start, max); + co_await interruptor::parallel_for_each(objects, + seastar::coroutine::lambda([FNAME, this, version_map, backfill_targets] + (const hobject_t& object) -> interruptible_future<> { + DEBUGDPP("querying obj:{}", pg, object); + auto obc_manager = pg.obc_loader.get_obc_manager(object); + co_await pg.obc_loader.load_and_lock( + obc_manager, RWState::RWREAD + ).handle_error_interruptible( + crimson::ct_error::assert_all("unexpected error") + ); + + if (obc_manager.get_obc()->obs.exists) { + auto version = obc_manager.get_obc()->obs.oi.version; + auto shard_versions = obc_manager.get_obc()->obs.oi.shard_versions; + if (shard_versions.empty()) { + version_map->emplace(object, std::make_pair(shard_id_t::NO_SHARD, + version)); + } else { + bool added_default = false; + for (auto & shard: backfill_targets) { + if (shard_versions.contains(shard.shard)) { + version = shard_versions.at(shard.shard); + version_map->emplace(object, std::make_pair(shard.shard, version)); + } else if 
(!added_default) { + version_map->emplace(object, std::make_pair(shard_id_t::NO_SHARD, + version)); + added_default = true; + } + } + } + DEBUGDPP("found: {} {}", pg, + object, version); + co_return; + } else { + // if the object does not exist here, it must have been removed + // between the collection_list_partial and here. This can happen + // for the first item in the range, which is usually last_backfill. + co_return; + } + })); + PrimaryBackfillInterval bi; + bi.begin = std::move(start); + bi.end = std::move(next); + bi.objects = std::move(*version_map); + DEBUGDPP("{} PrimaryBackfillInterval filled, leaving, {}", + "scan_for_backfill_primary", + pg, bi); + co_return std::move(bi); +} + +RecoveryBackend::interruptible_future +RecoveryBackend::scan_for_backfill_replica( const hobject_t start, [[maybe_unused]] const std::int64_t min, const std::int64_t max) { - LOG_PREFIX(RecoveryBackend::scan_for_backfill); + LOG_PREFIX(RecoveryBackend::scan_for_backfill_replica); DEBUGDPP("starting from {}", pg, start); - auto version_map = seastar::make_lw_shared>(); + auto version_map = seastar::make_lw_shared>(); auto&& [objects, next] = co_await backend->list_objects(start, max); - co_await interruptor::parallel_for_each(objects, seastar::coroutine::lambda([FNAME, this, version_map] + co_await interruptor::parallel_for_each(objects, + seastar::coroutine::lambda([FNAME, this, version_map] (const hobject_t& object) -> interruptible_future<> { DEBUGDPP("querying obj:{}", pg, object); auto obc_manager = pg.obc_loader.get_obc_manager(object); @@ -255,12 +319,12 @@ RecoveryBackend::scan_for_backfill( co_return; } })); - BackfillInterval bi; + ReplicaBackfillInterval bi; bi.begin = std::move(start); bi.end = std::move(next); bi.objects = std::move(*version_map); - DEBUGDPP("{} BackfillInterval filled, leaving, {}", - "scan_for_backfill", + DEBUGDPP("{} ReplicaBackfillInterval filled, leaving, {}", + "scan_for_backfill_replica", pg, bi); co_return std::move(bi); } @@ -283,7 +347,7 @@ RecoveryBackend::handle_scan_get_digest( PeeringState::BackfillTooFull()); return seastar::now(); } - return scan_for_backfill( + return scan_for_backfill_replica( std::move(m.begin), crimson::common::local_conf().get_val("osd_backfill_scan_min"), crimson::common::local_conf().get_val("osd_backfill_scan_max") @@ -312,7 +376,7 @@ RecoveryBackend::handle_scan_digest( // Check that from is in backfill_targets vector ceph_assert(pg.is_backfill_target(m.from)); - BackfillInterval bi; + ReplicaBackfillInterval bi; bi.begin = m.begin; bi.end = m.end; { diff --git a/src/crimson/osd/recovery_backend.h b/src/crimson/osd/recovery_backend.h index 818e85f67b1d9..adffc0b9ac42e 100644 --- a/src/crimson/osd/recovery_backend.h +++ b/src/crimson/osd/recovery_backend.h @@ -89,7 +89,13 @@ public: const hobject_t& soid, eversion_t need) = 0; - interruptible_future scan_for_backfill( + interruptible_future scan_for_backfill_primary( + const hobject_t from, + std::int64_t min, + std::int64_t max, + const std::set &backfill_targets); + + interruptible_future scan_for_backfill_replica( const hobject_t from, std::int64_t min, std::int64_t max); diff --git a/src/osd/CMakeLists.txt b/src/osd/CMakeLists.txt index e7f579f38410c..5d302209e16ee 100644 --- a/src/osd/CMakeLists.txt +++ b/src/osd/CMakeLists.txt @@ -38,7 +38,6 @@ set(osd_srcs scheduler/mClockScheduler.cc PeeringState.cc PGStateUtils.cc - recovery_types.cc MissingLoc.cc osd_perf_counters.cc ECCommonL.cc diff --git a/src/osd/PG.h b/src/osd/PG.h index d27d8196ad346..785614f1b3575 100644 --- 
a/src/osd/PG.h +++ b/src/osd/PG.h @@ -867,8 +867,8 @@ protected: std::set probe_targets; protected: - BackfillInterval backfill_info; - std::map peer_backfill_info; + PrimaryBackfillInterval backfill_info; + std::map peer_backfill_info; bool backfill_reserving; // The primary's num_bytes and local num_bytes for this pg, only valid diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 1340ec88679a7..b1d88bcf4220a 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -4508,11 +4508,11 @@ void PrimaryLogPG::do_scan( return; } - BackfillInterval bi; + ReplicaBackfillInterval bi; bi.begin = m->begin; // No need to flush, there won't be any in progress writes occuring // past m->begin - scan_range( + scan_range_replica( cct->_conf->osd_backfill_scan_min, cct->_conf->osd_backfill_scan_max, &bi, @@ -4534,7 +4534,7 @@ void PrimaryLogPG::do_scan( // Check that from is in backfill_targets vector ceph_assert(is_backfill_target(from)); - BackfillInterval& bi = peer_backfill_info[from]; + ReplicaBackfillInterval& bi = peer_backfill_info[from]; bi.begin = m->begin; bi.end = m->end; auto p = m->get_data().cbegin(); @@ -13873,7 +13873,7 @@ bool PrimaryLogPG::all_peer_done() const for (const pg_shard_t& bt : get_backfill_targets()) { const auto piter = peer_backfill_info.find(bt); ceph_assert(piter != peer_backfill_info.end()); - const BackfillInterval& pbi = piter->second; + const ReplicaBackfillInterval& pbi = piter->second; // See if peer has more to process if (!pbi.extends_to_end() || !pbi.empty()) return false; @@ -13986,7 +13986,7 @@ uint64_t PrimaryLogPG::recover_backfill( i != get_backfill_targets().end(); ++i) { pg_shard_t bt = *i; - BackfillInterval& pbi = peer_backfill_info[bt]; + ReplicaBackfillInterval& pbi = peer_backfill_info[bt]; dout(20) << " peer shard " << bt << " backfill " << pbi << dendl; if (pbi.begin <= backfill_info.begin && @@ -14033,7 +14033,7 @@ uint64_t PrimaryLogPG::recover_backfill( i != get_backfill_targets().end(); ++i) { pg_shard_t bt = *i; - BackfillInterval& pbi = peer_backfill_info[bt]; + ReplicaBackfillInterval& pbi = peer_backfill_info[bt]; if (pbi.begin == check) check_targets.insert(bt); } @@ -14045,7 +14045,7 @@ uint64_t PrimaryLogPG::recover_backfill( i != check_targets.end(); ++i) { pg_shard_t bt = *i; - BackfillInterval& pbi = peer_backfill_info[bt]; + ReplicaBackfillInterval& pbi = peer_backfill_info[bt]; ceph_assert(pbi.begin == check); to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt)); @@ -14059,17 +14059,31 @@ uint64_t PrimaryLogPG::recover_backfill( // and we can't increment ops without requeueing ourself // for recovery. 
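In the else branch that follows, recover_backfill() must now unpack what may
be several multimap entries for the object at the head of the primary
interval. A self-contained sketch of that unpacking (std::string standing in
for hobject_t; illustrative only, not the patch's code):

  #include <algorithm>
  #include <map>
  #include <string>
  #include <utility>

  using Shard = int;
  constexpr Shard NO_SHARD = -1;
  using Version = unsigned;
  using Objects = std::multimap<std::string, std::pair<Shard, Version>>;

  // Gather the per-shard versions recorded for the first object of a
  // non-empty interval; obj_v ends up as the newest of those versions.
  std::pair<Version, std::map<Shard, Version>>
  head_versions(const Objects& objects)
  {
    auto it = objects.begin();          // assumes !objects.empty()
    const std::string hoid = it->first;
    Version obj_v = 0;
    std::map<Shard, Version> versions;
    while (it != objects.end() && it->first == hoid) {
      obj_v = std::max(obj_v, it->second.second);
      versions[it->second.first] = it->second.second;
      ++it;
    }
    return {obj_v, versions};
  }

obj_v, the newest recorded version, stands in for the old single object
version in the rest of the function, while the per-shard map drives the
per-target comparison.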
} else { - eversion_t& obj_v = backfill_info.objects.begin()->second; - + // Unpack versions for the object being backfilled + auto it = backfill_info.objects.begin(); + const hobject_t& hoid = it->first; + eversion_t obj_v; + std::map versions; + while (it != backfill_info.objects.end() && it->first == hoid) { + obj_v = std::max(obj_v, it->second.second); + versions[it->second.first] = it->second.second; + ++it; + } vector need_ver_targs, missing_targs, keep_ver_targs, skip_targs; for (set::const_iterator i = get_backfill_targets().begin(); i != get_backfill_targets().end(); ++i) { pg_shard_t bt = *i; - BackfillInterval& pbi = peer_backfill_info[bt]; + ReplicaBackfillInterval& pbi = peer_backfill_info[bt]; // Find all check peers that have the wrong version if (check == backfill_info.begin && check == pbi.begin) { - if (pbi.objects.begin()->second != obj_v) { + eversion_t replicaobj_v; + if (versions.contains(bt.shard)) { + replicaobj_v = versions.at(bt.shard); + } else { + replicaobj_v = versions.at(shard_id_t::NO_SHARD); + } + if (pbi.objects.begin()->second != replicaobj_v) { need_ver_targs.push_back(bt); } else { keep_ver_targs.push_back(bt); @@ -14141,7 +14155,7 @@ uint64_t PrimaryLogPG::recover_backfill( i != check_targets.end(); ++i) { pg_shard_t bt = *i; - BackfillInterval& pbi = peer_backfill_info[bt]; + ReplicaBackfillInterval& pbi = peer_backfill_info[bt]; pbi.pop_front(); } } @@ -14309,17 +14323,18 @@ int PrimaryLogPG::prep_backfill_object_push( } void PrimaryLogPG::update_range( - BackfillInterval *bi, + PrimaryBackfillInterval *bi, ThreadPool::TPHandle &handle) { int local_min = cct->_conf->osd_backfill_scan_min; int local_max = cct->_conf->osd_backfill_scan_max; + const std::set& backfill_targets = get_backfill_targets(); if (bi->version < info.log_tail) { dout(10) << __func__<< ": bi is old, rescanning local backfill_info" << dendl; bi->version = info.last_update; - scan_range(local_min, local_max, bi, handle); + scan_range_primary(local_min, local_max, bi, handle, backfill_targets); } if (bi->version >= projected_last_update) { @@ -14350,14 +14365,48 @@ void PrimaryLogPG::update_range( if (e.is_update()) { dout(10) << __func__ << ": " << e.soid << " updated to version " << e.version << dendl; - bi->objects.erase(e.soid); - bi->objects.insert( - make_pair( - e.soid, - e.version)); + if (e.written_shards.empty()) { + // Log entry updates all shards, replace all entries for e.soid + bi->objects.erase(e.soid); + bi->objects.insert(make_pair(e.soid, + make_pair(shard_id_t::NO_SHARD, + e.version))); + } else { + // Update backfill interval for shards modified by log entry + std::map versions; + // Create map from existing entries in backfill entry + const auto & [begin, end] = bi->objects.equal_range(e.soid); + for (const auto & entry : std::ranges::subrange(begin, end)) { + const auto & [shard, version] = entry.second; + versions[shard] = version; + } + // Update entries in map that are modified by log entry + bool uses_default = false; + for (const auto & shard : backfill_targets) { + if (e.is_written_shard(shard.shard)) { + versions.erase(shard.shard); + uses_default = true; + } else { + if (!versions.contains(shard.shard)) { + versions[shard.shard] = e.prior_version; + } + //Else: keep existing version + } + } + if (uses_default) { + versions[shard_id_t::NO_SHARD] = e.version; + } else { + versions.erase(shard_id_t::NO_SHARD); + } + // Erase and recreate backfill interval for e.soid using map + bi->objects.erase(e.soid); + for (auto & [shard, version] : versions) { + 
bi->objects.insert(make_pair(e.soid, make_pair(shard, version))); + } + } } else if (e.is_delete()) { dout(10) << __func__ << ": " << e.soid << " removed" << dendl; - bi->objects.erase(e.soid); + bi->objects.erase(e.soid); // Erase all entries for e.soid } } }; @@ -14367,16 +14416,18 @@ void PrimaryLogPG::update_range( projected_log.scan_log_after(bi->version, func); bi->version = projected_last_update; } else { - ceph_abort_msg("scan_range should have raised bi->version past log_tail"); + ceph_abort_msg("scan_range_primary should have raised bi->version past log_tail"); } } -void PrimaryLogPG::scan_range( - int min, int max, BackfillInterval *bi, - ThreadPool::TPHandle &handle) +void PrimaryLogPG::scan_range_primary( + int min, int max, PrimaryBackfillInterval *bi, + ThreadPool::TPHandle &handle, + const std::set &backfill_targets) { ceph_assert(is_locked()); - dout(10) << "scan_range from " << bi->begin << dendl; + dout(10) << "scan_range_primary from " << bi->begin << + " backfill_targets " << backfill_targets << dendl; bi->clear_objects(); vector ls; @@ -14388,9 +14439,13 @@ void PrimaryLogPG::scan_range( for (vector::iterator p = ls.begin(); p != ls.end(); ++p) { handle.reset_tp_timeout(); - ObjectContextRef obc; - if (is_primary()) - obc = object_contexts.lookup(*p); + + ceph_assert(is_primary()); + + eversion_t version; + std::map shard_versions; + ObjectContextRef obc = object_contexts.lookup(*p); + if (obc) { if (!obc->obs.exists) { /* If the object does not exist here, it must have been removed @@ -14399,8 +14454,8 @@ void PrimaryLogPG::scan_range( */ continue; } - bi->objects[*p] = obc->obs.oi.version; - dout(20) << " " << *p << " " << obc->obs.oi.version << dendl; + version = obc->obs.oi.version; + shard_versions = obc->obs.oi.shard_versions; } else { bufferlist bl; int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl); @@ -14413,12 +14468,64 @@ void PrimaryLogPG::scan_range( ceph_assert(r >= 0); object_info_t oi(bl); - bi->objects[*p] = oi.version; - dout(20) << " " << *p << " " << oi.version << dendl; + version = oi.version; + shard_versions = oi.shard_versions; + } + dout(20) << " " << *p << " " << version << dendl; + if (shard_versions.empty()) { + bi->objects.insert(make_pair(*p, std::make_pair(shard_id_t::NO_SHARD, + version))); + } else { + bool added_default = false; + for (auto & shard: backfill_targets) { + if (shard_versions.contains(shard.shard)) { + version = shard_versions.at(shard.shard); + bi->objects.insert(make_pair(*p, std::make_pair(shard.shard, + version))); + } else if (!added_default) { + bi->objects.insert(make_pair(*p, std::make_pair(shard_id_t::NO_SHARD, + version))); + added_default = true; + } + } } } } +void PrimaryLogPG::scan_range_replica( + int min, int max, ReplicaBackfillInterval *bi, + ThreadPool::TPHandle &handle) +{ + ceph_assert(is_locked()); + dout(10) << "scan_range_replica from " << bi->begin << dendl; + bi->clear_objects(); + + vector ls; + ls.reserve(max); + int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end); + ceph_assert(r >= 0); + dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl; + dout(20) << ls << dendl; + + for (vector::iterator p = ls.begin(); p != ls.end(); ++p) { + handle.reset_tp_timeout(); + + ceph_assert(!is_primary()); + bufferlist bl; + int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl); + /* If the object does not exist here, it must have been removed + * between the collection_list_partial and here. 
This can happen
+   * for the first item in the range, which is usually last_backfill.
+   */
+    if (r == -ENOENT)
+      continue;
+
+    ceph_assert(r >= 0);
+    object_info_t oi(bl);
+    bi->objects[*p] = oi.version;
+    dout(20) << " " << *p << " " << oi.version << dendl;
+  }
+}

 /** check_local
  *
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index 94c0b13b5141d..aafd618b754de 100644
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -1138,7 +1138,7 @@ protected:
   }
   {
     f->open_array_section("peer_backfill_info");
-    for (std::map<pg_shard_t, BackfillInterval>::const_iterator pbi =
+    for (std::map<pg_shard_t, ReplicaBackfillInterval>::const_iterator pbi =
            peer_backfill_info.begin();
          pbi != peer_backfill_info.end(); ++pbi) {
       f->dump_stream("osd") << pbi->first;
@@ -1327,14 +1327,20 @@ protected:
    * @bi.begin first item should be >= this value
    * @bi [out] resulting std::map of objects to eversion_t's
    */
-  void scan_range(
-    int min, int max, BackfillInterval *bi,
+  void scan_range_replica(
+    int min, int max, ReplicaBackfillInterval *bi,
     ThreadPool::TPHandle &handle
     );
+
+  void scan_range_primary(
+    int min, int max, PrimaryBackfillInterval *bi,
+    ThreadPool::TPHandle &handle,
+    const std::set<pg_shard_t> &backfill_targets
+    );
+
   /// Update a hash range to reflect changes since the last scan
   void update_range(
-    BackfillInterval *bi,        ///< [in,out] interval to update
+    PrimaryBackfillInterval *bi, ///< [in,out] interval to update
     ThreadPool::TPHandle &handle ///< [in] tp handle
     );
diff --git a/src/osd/recovery_types.cc b/src/osd/recovery_types.cc
deleted file mode 100644
index 70ba1fd797ede..0000000000000
--- a/src/osd/recovery_types.cc
+++ /dev/null
@@ -1,16 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "recovery_types.h"
-
-std::ostream& operator<<(std::ostream& out, const BackfillInterval& bi)
-{
-  out << "BackfillInfo(" << bi.begin << "-" << bi.end
-      << " " << bi.objects.size() << " objects";
-  if (!bi.objects.empty())
-    out << " " << bi.objects;
-  out << ")";
-  return out;
-}
-
-
diff --git a/src/osd/recovery_types.h b/src/osd/recovery_types.h
index cffb2b7ea4c06..159620949f028 100644
--- a/src/osd/recovery_types.h
+++ b/src/osd/recovery_types.h
@@ -15,18 +15,54 @@
 * Possible states:
 * 1) begin == end == hobject_t() indicates the interval is unpopulated
 * 2) Else, objects contains all objects in [begin, end)
+ *
+ * ReplicaBackfillInterval
+ *
+ * Stores a map of hobject_t to eversion_t to track the version number of
+ * the objects being backfilled in an interval for one specific shard
+ *
+ * PrimaryBackfillInterval
+ *
+ * Stores a multimap of hobject_t to pair<shard_id_t, eversion_t>.
+ *
+ * Only shards that are backfill targets will be tracked. For replicated and
+ * non-optimized EC pools there is one entry per hobject_t and shard_id_t will
+ * be NO_SHARD.
+ *
+ * For optimized EC pools partial writes mean it is possible that different
+ * shards have different eversions, hence there may be multiple entries per
+ * hobject_t. To conserve memory it is permitted to have an entry for NO_SHARD
+ * and additional entries for the same hobject for specific shards. In this
+ * case shards that are not specifically listed are expected to be at the
+ * eversion for the NO_SHARD entry.
+ *
+ * Example: EC pool with 4+2 profile
+ *
+ * test:head, <NO_SHARD, 1'23>
+ * test:head, <1, 1'20>
+ *
+ * Shards 0 and 2-5 are expected to be at version 1'23, shard 1 has skipped
+ * recent updates and is expected to be at version 1'20
 */
-struct BackfillInterval {
+
+template <typename T>
+class BackfillInterval {
+public:
   // info about a backfill interval on a peer
   eversion_t version; /// version at which the scan occurred
-  std::map<hobject_t, eversion_t> objects;
   hobject_t begin;
   hobject_t end;
+  T objects;
+
+  virtual ~BackfillInterval() = default;
+  BackfillInterval() = default;
+  BackfillInterval(const BackfillInterval&) = default;
+  BackfillInterval(BackfillInterval&&) = default;
+  BackfillInterval& operator=(const BackfillInterval&) = default;
+  BackfillInterval& operator=(BackfillInterval&&) = default;

   /// clear content
-  void clear() {
-    *this = BackfillInterval();
-  }
+  virtual void clear() = 0;

   /// clear objects std::list only
   void clear_objects() {
@@ -60,10 +96,60 @@
   /// Adjusts begin to the first object
   void trim() {
-    if (!objects.empty())
+    if (!objects.empty()) {
       begin = objects.begin()->first;
-    else
+    } else {
       begin = end;
+    }
+  }
+
+  /// drop first entry, and adjust @begin accordingly
+  virtual void pop_front() = 0;
+
+  /// dump
+  virtual void dump(ceph::Formatter *f) const = 0;
+};
+
+class PrimaryBackfillInterval: public BackfillInterval<
+  std::multimap<hobject_t, std::pair<shard_id_t, eversion_t>>> {
+public:
+
+  /// clear content
+  void clear() override {
+    *this = PrimaryBackfillInterval();
+  }
+
+  /// drop first entry, and adjust @begin accordingly
+  void pop_front() override {
+    ceph_assert(!objects.empty());
+    // Use erase(key) to erase all entries for key
+    objects.erase(objects.begin()->first);
+    trim();
+  }
+
+  /// dump
+  void dump(ceph::Formatter *f) const override {
+    f->dump_stream("begin") << begin;
+    f->dump_stream("end") << end;
+    f->open_array_section("objects");
+    for (const auto& [hoid, shard_version] : objects) {
+      const auto& [shard, version] = shard_version;
+      f->open_object_section("object");
+      f->dump_stream("object") << hoid;
+      f->dump_stream("shard") << shard;
+      f->dump_stream("version") << version;
+      f->close_section();
+    }
+    f->close_section();
+  }
+};
+
+class ReplicaBackfillInterval: public BackfillInterval<
+  std::map<hobject_t, eversion_t>> {
+public:
+  /// clear content
+  void clear() override {
+    *this = ReplicaBackfillInterval();
   }

   /// drop first entry, and adjust @begin accordingly
@@ -74,25 +160,32 @@
   }

   /// dump
-  void dump(ceph::Formatter *f) const {
+  void dump(ceph::Formatter *f) const override {
     f->dump_stream("begin") << begin;
     f->dump_stream("end") << end;
     f->open_array_section("objects");
-    for (std::map<hobject_t, eversion_t>::const_iterator i =
-           objects.begin();
-         i != objects.end();
-         ++i) {
+    for (const auto& [hoid, version] : objects) {
       f->open_object_section("object");
-      f->dump_stream("object") << i->first;
-      f->dump_stream("version") << i->second;
+      f->dump_stream("object") << hoid;
+      f->dump_stream("version") << version;
       f->close_section();
     }
     f->close_section();
   }
 };

-std::ostream &operator<<(std::ostream &out, const BackfillInterval &bi);
+template <typename T> std::ostream& operator<<(std::ostream& out,
+                                               const BackfillInterval<T>& bi)
+{
+  out << "BackfillInfo(" << bi.begin << "-" << bi.end << " ";
+  if (!bi.objects.empty()) {
+    out << bi.objects.size() << " objects " << bi.objects;
+  }
+  out << ")";
+  return out;
+}

 #if FMT_VERSION >= 90000
-template <> struct fmt::formatter<BackfillInterval> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<PrimaryBackfillInterval> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<ReplicaBackfillInterval> : fmt::ostream_formatter {};
 #endif
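As a usage illustration of the classes above, the 4+2 example from the header
comment would be represented like this (assumes the Ceph tree headers, so not
a standalone program; 1'23 meaning eversion_t(1, 23)):

  #include <utility>
  #include "osd/recovery_types.h"

  void example(const hobject_t& obj)
  {
    PrimaryBackfillInterval bi;
    bi.objects.emplace(obj, std::make_pair(shard_id_t::NO_SHARD,
                                           eversion_t(1, 23)));
    bi.objects.emplace(obj, std::make_pair(shard_id_t(1),
                                           eversion_t(1, 20)));
    // Shard 1 is expected at 1'20; every other shard at the NO_SHARD
    // version 1'23. pop_front() drops *both* entries for obj, because
    // erase(key) on the multimap removes the whole equal_range.
    bi.pop_front();
  }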
diff --git a/src/test/crimson/CMakeLists.txt b/src/test/crimson/CMakeLists.txt index 3456514b5fda9..de4f4b4869214 100644 --- a/src/test/crimson/CMakeLists.txt +++ b/src/test/crimson/CMakeLists.txt @@ -2,11 +2,10 @@ add_executable(unittest-crimson-backfill test_backfill.cc ${PROJECT_SOURCE_DIR}/src/auth/Crypto.cc - ${PROJECT_SOURCE_DIR}/src/crimson/osd/backfill_state.cc - ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc) + ${PROJECT_SOURCE_DIR}/src/crimson/osd/backfill_state.cc) add_ceph_unittest(unittest-crimson-backfill --memory 256M --smp 1) -target_link_libraries(unittest-crimson-backfill crimson GTest::Main) +target_link_libraries(unittest-crimson-backfill crimson GTest::Main Boost::MPL) add_executable(unittest-seastar-buffer test_buffer.cc) diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc index 117024a48d4b0..39d6700d4a07e 100644 --- a/src/test/crimson/test_backfill.cc +++ b/src/test/crimson/test_backfill.cc @@ -14,6 +14,7 @@ #include #include + #include #include @@ -67,10 +68,31 @@ struct FakeStore { : hobject_t::get_max(); } + // Permit rhs (reference) objects to be the same version or 1 version older + bool looks_like(const FakeStore& rhs) const { + if (std::size(objs) != std::size(rhs.objs)) { + return false; + } + for (auto &[obj, version] : objs) { + if (!rhs.objs.contains(obj)) { + return false; + } + auto version_r = rhs.objs.at(obj); + if ((version.epoch != version_r.epoch) || + ((version.version != version_r.version) && + (version.version != version_r.version + 1))) + { + return false; + } + } + return true; + } + bool operator==(const FakeStore& rhs) const { return std::size(objs) == std::size(rhs.objs) && \ std::equal(std::begin(objs), std::end(objs), std::begin(rhs.objs)); } + bool operator!=(const FakeStore& rhs) const { return !(*this == rhs); } @@ -92,6 +114,7 @@ struct FakePrimary { eversion_t projected_last_update; eversion_t log_tail; PGLog pg_log; + pg_pool_t pool; PGLog::IndexedLog projected_log; FakePrimary(FakeStore&& store) @@ -184,7 +207,7 @@ public: const bool all_replica_match = std::all_of( std::begin(backfill_targets), std::end(backfill_targets), [&reference] (const auto kv) { - return kv.second.store == reference; + return kv.second.store.looks_like(reference); }); return backfill_source.store == reference && all_replica_match; } @@ -244,6 +267,10 @@ struct BackfillFixture::PeeringFacade return backfill_source.pg_log; } + const pg_pool_t& get_pool() const override { + return backfill_source.pool; + } + void scan_log_after(eversion_t, scan_log_func_t) const override { /* NOP */ } @@ -304,7 +331,7 @@ void BackfillFixture::request_replica_scan( const hobject_t& begin, const hobject_t& end) { - BackfillInterval bi; + ReplicaBackfillInterval bi; bi.end = backfill_targets.at(target).store.list(begin, [&bi](auto kv) { bi.objects.insert(std::move(kv)); }); @@ -317,9 +344,35 @@ void BackfillFixture::request_replica_scan( void BackfillFixture::request_primary_scan( const hobject_t& begin) { - BackfillInterval bi; + PrimaryBackfillInterval bi; bi.end = backfill_source.store.list(begin, [&bi](auto kv) { - bi.objects.insert(std::move(kv)); + auto && [hoid,version] = kv; + eversion_t version_zero; + eversion_t version_next = eversion_t(version.epoch, version.version + 1); + switch (std::rand() % 4) { + case 0: + // All shards at same version (Replica, EC, optimized EC after full-stripe write) + bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version))); + break; + case 1: + // Optimized EC partial write - 
Shard 3 at an earlier version + bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(3), version_zero))); + bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version))); + break; + case 2: + // Optimized EC partial write - Shard 1 and 2 at an earlier version + bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version_next))); + bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(1), version))); + bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(2), version))); + break; + case 3: + // Optimized EC partial write - Shard 1, 2 and 3 at different earlier versions + bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version_next))); + bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(1), version))); + bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(2), version))); + bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(3), version_zero))); + break; + } }); bi.begin = begin; bi.version = backfill_source.last_update; @@ -381,8 +434,10 @@ struct BackfillFixtureBuilder { BackfillFixtureBuilder&& add_target(FakeStore::objs_t objs) && { const auto new_osd_num = std::size(backfill_targets); + const auto new_shard_id = shard_id_t(1 + new_osd_num); const auto [ _, inserted ] = backfill_targets.emplace( - new_osd_num, FakeReplica{ FakeStore{std::move(objs)} }); + pg_shard_t(new_osd_num, new_shard_id), + FakeReplica{ FakeStore{std::move(objs)} }); ceph_assert(inserted); return std::move(*this); } @@ -438,6 +493,27 @@ TEST(backfill, one_empty_replica) EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); } +TEST(backfill, one_same_one_empty_replica) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + reference_store.objs + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + TEST(backfill, two_empty_replicas) { const auto reference_store = FakeStore{ { @@ -459,6 +535,34 @@ TEST(backfill, two_empty_replicas) EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); } +TEST(backfill, one_behind_one_empty_replica) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {8, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 250} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 247} }, + //"1:0256710c:::rbd_data.1018ac3e755.00000000000000b1:head", deleted + }}; + const auto behind_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {8, 234} }, + //"1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head" missing + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {8, 165} }, + { "1:0256710c:::rbd_data.1018ac3e755.00000000000000b1:head", {8, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + behind_store.objs + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.next_till_done(); 
+ + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + TEST(backfill, cancel_resume_middle_of_primaryscan) { const auto reference_store = FakeStore{ {