From 136d96ce2067476a2004b8c3290788ee316fa5e3 Mon Sep 17 00:00:00 2001 From: Bill Scales <156200352+bill-scales@users.noreply.github.com> Date: Thu, 6 Mar 2025 09:47:17 +0000 Subject: [PATCH] osd: EC Optimizations: Partial write changes to add_next_event add_next_event is used during peering to process log entries that a shard is missing to build up a list of missing objects. With EC optimized pools and partial writes not every update modifies every shard. The log entry contains details of which shards were modified and this can be used to work out whether a missing entry needs to be created/updated. Signed-off-by: Bill Scales --- src/osd/PGLog.cc | 8 ++++++-- src/osd/PGLog.h | 23 +++++++++++++++++------ src/osd/PeeringState.cc | 14 ++++++++++---- src/osd/PeeringState.h | 2 +- src/osd/osd_types.h | 15 ++++++++++++--- src/test/osd/TestPGLog.cc | 32 ++++++++++++++++---------------- src/test/osd/types.cc | 22 +++++++++++----------- 7 files changed, 73 insertions(+), 43 deletions(-) diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc index 5831cef8c2b..6f9cadda707 100644 --- a/src/osd/PGLog.cc +++ b/src/osd/PGLog.cc @@ -369,8 +369,10 @@ void PGLog::rewind_divergent_log(eversion_t newhead, } void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd, - pg_info_t &info, LogEntryHandler *rollbacker, - bool &dirty_info, bool &dirty_big_info) + pg_info_t &info, const pg_pool_t &pool, pg_shard_t toosd, + LogEntryHandler *rollbacker, + bool &dirty_info, bool &dirty_big_info, + bool ec_optimizations_enabled) { dout(10) << "merge_log " << olog << " from osd." << fromosd << " into " << log << dendl; @@ -476,6 +478,8 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd, &log, missing, rollbacker, + pool, + toosd.shard, this); _merge_divergent_entries( diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h index a6434098d2d..d668874e881 100644 --- a/src/osd/PGLog.h +++ b/src/osd/PGLog.h @@ -796,8 +796,8 @@ public: missing.add(oid, need, have, is_delete); } - void missing_add_next_entry(const pg_log_entry_t& e) { - missing.add_next_event(e); + void missing_add_next_entry(const pg_log_entry_t& e, const pg_pool_t &pool, shard_id_t shard) { + missing.add_next_event(e, pool, shard); } //////////////////// get or std::set log //////////////////// @@ -1287,8 +1287,12 @@ public: void merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t from, - pg_info_t &info, LogEntryHandler *rollbacker, - bool &dirty_info, bool &dirty_big_info); + pg_info_t &info, + const pg_pool_t &pool, + pg_shard_t to, + LogEntryHandler *rollbacker, + bool &dirty_info, bool &dirty_big_info, + bool ec_optimizations_enabled); template static bool append_log_entries_update_missing( @@ -1298,6 +1302,8 @@ public: IndexedLog *log, missing_type &missing, LogEntryHandler *rollbacker, + const pg_pool_t &pool, + shard_id_t shard, const DoutPrefixProvider *dpp) { bool invalidate_stats = false; if (log && !entries.empty()) { @@ -1312,12 +1318,12 @@ public: if (p->soid <= last_backfill && !p->is_error()) { if (missing.may_include_deletes) { - missing.add_next_event(*p); + missing.add_next_event(*p, pool, shard); } else { if (p->is_delete()) { missing.rm(p->soid, p->version); } else { - missing.add_next_event(*p); + missing.add_next_event(*p, pool, shard); } if (rollbacker) { // hack to match PG::mark_all_unfound_lost @@ -1335,7 +1341,10 @@ public: bool append_new_log_entries( const hobject_t &last_backfill, const mempool::osd_pglog::list &entries, + pg_info_t *info, LogEntryHandler *rollbacker, + const pg_pool_t &pool, + shard_id_t shard, bool ec_optimizations_enabled) { bool invalidate_stats = append_log_entries_update_missing( last_backfill, @@ -1344,6 +1353,8 @@ public: &log, missing, rollbacker, + pool, + shard, this); if (!entries.empty()) { mark_writeout_from(entries.begin()->version); diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 89f6037406e..09f2b45f0fe 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -3003,7 +3003,7 @@ void PeeringState::activate( if (perform_deletes_during_peering() && p->is_delete()) { pm.rm(p->soid, p->version); } else { - pm.add_next_event(*p); + pm.add_next_event(*p, pool.info, peer.shard); } } } @@ -3149,8 +3149,9 @@ void PeeringState::merge_log( { PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; pg_log.merge_log( - oinfo, std::move(olog), from, info, rollbacker.get(), - dirty_info, dirty_big_info); + oinfo, std::move(olog), from, info, pool.info, pg_whoami, + rollbacker.get(), dirty_info, dirty_big_info, + pool.info.allows_ecoptimizations()); } void PeeringState::rewind_divergent_log( @@ -4229,7 +4230,10 @@ bool PeeringState::append_log_entries_update_missing( pg_log.append_new_log_entries( info.last_backfill, entries, + &info, rollbacker.get(), + pool.info, + pg_whoami.shard, pool.info.allows_ecoptimizations()); if (pg_committed_to && entries.rbegin()->soid > info.last_backfill) { @@ -4288,6 +4292,8 @@ void PeeringState::merge_new_log_entries( NULL, pmissing, NULL, + pool.info, + peer.shard, dpp); pinfo.last_update = info.last_update; pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats; @@ -4542,7 +4548,7 @@ void PeeringState::pre_submit_op( continue; requires_missing_loc = true; for (auto &&entry: logv) { - peer_missing[i].add_next_event(entry); + peer_missing[i].add_next_event(entry, pool.info, i.shard); } } diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h index 1a7323b4127..a28c09b5276 100644 --- a/src/osd/PeeringState.h +++ b/src/osd/PeeringState.h @@ -1982,7 +1982,7 @@ public: /// Update missing set to reflect e (TODOSAM: not sure why this is needed) void add_local_next_event(const pg_log_entry_t& e) { - pg_log.missing_add_next_entry(e); + pg_log.missing_add_next_entry(e, pool.info, pg_whoami.shard); } /// Update log trim boundary diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index be34ad0f717..f6e10c67553 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -5091,10 +5091,11 @@ public: * this needs to be called in log order as we extend the log. it * assumes missing is accurate up through the previous log entry. */ - void add_next_event(const pg_log_entry_t& e) { + void add_next_event(const pg_log_entry_t& e, const pg_pool_t &pool, shard_id_t shard) { std::map::iterator missing_it; missing_it = missing.find(e.soid); bool is_missing_divergent_item = missing_it != missing.end(); + bool skipped = false; if (e.prior_version == eversion_t() || e.is_clone()) { // new object. if (is_missing_divergent_item) { // use iterator @@ -5102,6 +5103,9 @@ public: // .have = nil missing_it->second = item(e.version, eversion_t(), e.is_delete()); missing_it->second.clean_regions.mark_fully_dirty(); + } else if (pool.is_nonprimary_shard(shard) && !e.is_written_shard(shard)) { + // new object, partial write and not already missing - skip + skipped = true; } else { // create new element in missing map // .have = nil @@ -5117,6 +5121,9 @@ public: missing_it->second.clean_regions.mark_fully_dirty(); else missing_it->second.clean_regions.merge(e.clean_regions); + } else if (pool.is_nonprimary_shard(shard) && !e.is_written_shard(shard)) { + // existing object, partial write and not already missing - skip + skipped = true; } else { // not missing, we must have prior_version (if any) ceph_assert(!is_missing_divergent_item); @@ -5126,8 +5133,10 @@ public: else missing[e.soid].clean_regions = e.clean_regions; } - rmissing[e.version.version] = e.soid; - tracker.changed(e.soid); + if (!skipped) { + rmissing[e.version.version] = e.soid; + tracker.changed(e.soid); + } } void revise_need(hobject_t oid, eversion_t need, bool is_delete) { diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc index 5b85e0f9d2e..6aa7190b974 100644 --- a/src/test/osd/TestPGLog.cc +++ b/src/test/osd/TestPGLog.cc @@ -294,7 +294,7 @@ public: bool dirty_big_info = false; merge_log( oinfo, std::move(olog), pg_shard_t(1, shard_id_t(0)), info, - &h, dirty_info, dirty_big_info); + pg_pool_t(), pg_shard_t(), &h, dirty_info, dirty_big_info, false); ASSERT_EQ(info.last_update, oinfo.last_update); verify_missing(tcase, missing); @@ -328,7 +328,7 @@ public: if (i->is_delete() && tcase.deletes_during_peering) { omissing.rm(i->soid, i->version); } else { - omissing.add_next_event(*i); + omissing.add_next_event(*i, pg_pool_t(), shard_id_t()); } } } @@ -613,7 +613,7 @@ TEST_F(PGLogTest, merge_old_entry) { { pg_log_entry_t &ne = log.log.front(); ne.op = pg_log_entry_t::MODIFY; - missing.add_next_event(ne); + missing.add_next_event(ne, pg_pool_t(), shard_id_t()); pg_log_entry_t oe; oe.mark_unrollbackable(); oe.version = eversion_t(1,1); @@ -692,7 +692,7 @@ TEST_F(PGLogTest, merge_old_entry) { oe.version = eversion_t(2,1); oe.op = pg_log_entry_t::MODIFY; - missing.add_next_event(oe); + missing.add_next_event(oe, pg_pool_t(), shard_id_t()); missing.flush(); EXPECT_FALSE(is_dirty()); @@ -897,8 +897,8 @@ TEST_F(PGLogTest, merge_log) { EXPECT_FALSE(dirty_big_info); TestHandler h(remove_snap); - merge_log(oinfo, std::move(olog), fromosd, info, &h, - dirty_info, dirty_big_info); + merge_log(oinfo, std::move(olog), fromosd, info, pg_pool_t(), pg_shard_t(), + &h, dirty_info, dirty_big_info, false); EXPECT_FALSE(missing.have_missing()); EXPECT_EQ(0U, log.log.size()); @@ -947,8 +947,8 @@ TEST_F(PGLogTest, merge_log) { EXPECT_FALSE(dirty_big_info); TestHandler h(remove_snap); - merge_log(oinfo, std::move(olog), fromosd, info, &h, - dirty_info, dirty_big_info); + merge_log(oinfo, std::move(olog), fromosd, info, pg_pool_t(), pg_shard_t(), + &h,dirty_info, dirty_big_info, false); EXPECT_FALSE(missing.have_missing()); EXPECT_EQ(0U, log.log.size()); @@ -1052,8 +1052,8 @@ TEST_F(PGLogTest, merge_log) { EXPECT_FALSE(dirty_big_info); TestHandler h(remove_snap); - merge_log(oinfo, std::move(olog), fromosd, info, &h, - dirty_info, dirty_big_info); + merge_log(oinfo, std::move(olog), fromosd, info, pg_pool_t(), pg_shard_t(), + &h, dirty_info, dirty_big_info, false); EXPECT_FALSE(missing.have_missing()); EXPECT_EQ(3U, log.log.size()); @@ -1161,8 +1161,8 @@ TEST_F(PGLogTest, merge_log) { EXPECT_FALSE(dirty_big_info); TestHandler h(remove_snap); - merge_log(oinfo, std::move(olog), fromosd, info, &h, - dirty_info, dirty_big_info); + merge_log(oinfo, std::move(olog), fromosd, info, pg_pool_t(), pg_shard_t(), + &h, dirty_info, dirty_big_info, false); /* When the divergent entry is a DELETE and the authoritative entry is a MODIFY, the object will be added to missing : it is @@ -1280,8 +1280,8 @@ TEST_F(PGLogTest, merge_log) { TestHandler h(remove_snap); missing.may_include_deletes = false; - merge_log(oinfo, std::move(olog), fromosd, info, &h, - dirty_info, dirty_big_info); + merge_log(oinfo, std::move(olog), fromosd, info, pg_pool_t(), pg_shard_t(), + &h, dirty_info, dirty_big_info, false); /* When the divergent entry is a DELETE and the authoritative entry is a MODIFY, the object will be added to missing : it is @@ -1382,8 +1382,8 @@ TEST_F(PGLogTest, merge_log) { TestHandler h(remove_snap); missing.may_include_deletes = false; - merge_log(oinfo, std::move(olog), fromosd, info, &h, - dirty_info, dirty_big_info); + merge_log(oinfo, std::move(olog), fromosd, info, pg_pool_t(), pg_shard_t(), + &h, dirty_info, dirty_big_info, false); EXPECT_FALSE(missing.have_missing()); EXPECT_EQ(2U, log.log.size()); diff --git a/src/test/osd/types.cc b/src/test/osd/types.cc index 1fd56a272a8..a0da3cae2c2 100644 --- a/src/test/osd/types.cc +++ b/src/test/osd/types.cc @@ -1154,7 +1154,7 @@ TEST(pg_missing_t, add_next_event) EXPECT_TRUE(e.object_is_indexed()); EXPECT_TRUE(e.reqid_is_indexed()); EXPECT_FALSE(missing.is_missing(oid)); - missing.add_next_event(e); + missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have); EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version)); @@ -1162,7 +1162,7 @@ TEST(pg_missing_t, add_next_event) EXPECT_EQ(1U, missing.get_rmissing().size()); // adding the same object replaces the previous one - missing.add_next_event(e); + missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_EQ(1U, missing.num_missing()); EXPECT_EQ(1U, missing.get_rmissing().size()); @@ -1179,7 +1179,7 @@ TEST(pg_missing_t, add_next_event) EXPECT_TRUE(e.object_is_indexed()); EXPECT_FALSE(e.reqid_is_indexed()); EXPECT_FALSE(missing.is_missing(oid)); - missing.add_next_event(e); + missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have); EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version)); @@ -1187,7 +1187,7 @@ TEST(pg_missing_t, add_next_event) EXPECT_EQ(1U, missing.get_rmissing().size()); // adding the same object replaces the previous one - missing.add_next_event(e); + missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_EQ(1U, missing.num_missing()); EXPECT_EQ(1U, missing.get_rmissing().size()); @@ -1204,7 +1204,7 @@ TEST(pg_missing_t, add_next_event) EXPECT_TRUE(e.object_is_indexed()); EXPECT_TRUE(e.reqid_is_indexed()); EXPECT_FALSE(missing.is_missing(oid)); - missing.add_next_event(e); + missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have); EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version)); @@ -1213,7 +1213,7 @@ TEST(pg_missing_t, add_next_event) // adding the same object with a different version e.prior_version = prior_version; - missing.add_next_event(e); + missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_EQ(1U, missing.num_missing()); @@ -1230,7 +1230,7 @@ TEST(pg_missing_t, add_next_event) EXPECT_TRUE(e.object_is_indexed()); EXPECT_TRUE(e.reqid_is_indexed()); EXPECT_FALSE(missing.is_missing(oid)); - missing.add_next_event(e); + missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_EQ(prior_version, missing.get_items().at(oid).have); EXPECT_EQ(version, missing.get_items().at(oid).need); @@ -1249,12 +1249,12 @@ TEST(pg_missing_t, add_next_event) EXPECT_TRUE(e.object_is_indexed()); EXPECT_TRUE(e.reqid_is_indexed()); EXPECT_FALSE(missing.is_missing(oid)); - missing.add_next_event(e); + missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); e.op = pg_log_entry_t::DELETE; EXPECT_TRUE(e.is_delete()); - missing.add_next_event(e); + missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_TRUE(missing.get_items().at(oid).is_delete()); EXPECT_EQ(prior_version, missing.get_items().at(oid).have); @@ -1274,14 +1274,14 @@ TEST(pg_missing_t, add_next_event) EXPECT_TRUE(e.object_is_indexed()); EXPECT_TRUE(e.reqid_is_indexed()); EXPECT_FALSE(missing.is_missing(oid)); - missing.add_next_event(e); + missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_FALSE(missing.get_items().at(oid).is_delete()); e.op = pg_log_entry_t::LOST_DELETE; e.version.version++; EXPECT_TRUE(e.is_delete()); - missing.add_next_event(e); + missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_TRUE(missing.get_items().at(oid).is_delete()); EXPECT_EQ(prior_version, missing.get_items().at(oid).have); -- 2.39.5