From 3ce975379f3c79829b7aae46f187bcf58053fe9f Mon Sep 17 00:00:00 2001 From: Bill Scales Date: Wed, 26 Mar 2025 13:04:46 +0000 Subject: [PATCH] osd: EC Optimizations: PG log changes for partial logs Optimized EC pools will not add a log entry for shards that are not modified by a partial write. This means the shard will have a partial copy of the log. There are several asserts in PGLog that assume that the log is contiguous, these need to be relaxed when it is an optimized EC pool (other pools retain the full strength asserts). During peering the primary may provide a complete log to a non-primary shard to merge into its log. This merge can skip log entries for partial writes that do not update the shard. Signed-off-by: Bill Scales --- src/osd/PGLog.cc | 12 +++++++++--- src/osd/PGLog.h | 35 +++++++++++++++++++++++++++++------ src/osd/PeeringState.cc | 5 +++-- src/test/osd/TestPGLog.cc | 24 ++++++++++++------------ 4 files changed, 53 insertions(+), 23 deletions(-) diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc index 6f9cadda70714..07e3f30b6c820 100644 --- a/src/osd/PGLog.cc +++ b/src/osd/PGLog.cc @@ -224,7 +224,8 @@ void PGLog::proc_replica_log( pg_info_t &oinfo, const pg_log_t &olog, pg_missing_t& omissing, - pg_shard_t from) const + pg_shard_t from, + bool ec_optimizations_enabled) const { dout(10) << "proc_replica_log for osd." << from << ": " << oinfo << " " << olog << " " << omissing << dendl; @@ -301,6 +302,7 @@ void PGLog::proc_replica_log( olog.get_can_rollback_to(), omissing, 0, + ec_optimizations_enabled, this); if (lu < oinfo.last_update) { @@ -333,7 +335,8 @@ void PGLog::proc_replica_log( */ void PGLog::rewind_divergent_log(eversion_t newhead, pg_info_t &info, LogEntryHandler *rollbacker, - bool &dirty_info, bool &dirty_big_info) + bool &dirty_info, bool &dirty_big_info, + bool ec_optimizations_enabled) { dout(10) << "rewind_divergent_log truncate divergent future " << newhead << dendl; @@ -362,6 +365,7 @@ void PGLog::rewind_divergent_log(eversion_t newhead, original_crt, missing, rollbacker, + ec_optimizations_enabled, this); dirty_info = true; @@ -430,7 +434,8 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd, // do we have divergent entries to throw out? if (olog.head < log.head) { - rewind_divergent_log(olog.head, info, rollbacker, dirty_info, dirty_big_info); + rewind_divergent_log(olog.head, info, rollbacker, + dirty_info, dirty_big_info, ec_optimizations_enabled); changed = true; } @@ -489,6 +494,7 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd, original_crt, missing, rollbacker, + ec_optimizations_enabled, this); info.last_update = log.head = olog.head; diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h index d668874e88182..455fc44641850 100644 --- a/src/osd/PGLog.h +++ b/src/osd/PGLog.h @@ -972,7 +972,8 @@ public: void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog, - pg_missing_t& omissing, pg_shard_t from) const; + pg_missing_t& omissing, pg_shard_t from, + bool ec_optimizations_enabled) const; void set_missing_may_contain_deletes() { missing.may_include_deletes = true; @@ -1022,6 +1023,7 @@ protected: eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input InedexedLog missing_type &missing, ///< [in,out] missing to adjust, use LogEntryHandler *rollbacker, ///< [in] optional rollbacker object + bool ec_optimizations_enabled, ///< [in] relax asserts for allow_ec_optimzations pools const DoutPrefixProvider *dpp ///< [in] logging provider ) { ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid @@ -1062,7 +1064,13 @@ protected: // in increasing order of version ceph_assert(i->version > last); // prior_version correct (unless it is an ERROR entry) - ceph_assert(i->prior_version == last || i->is_error()); + if (ec_optimizations_enabled) { + // With partial writes prior_verson may be > last because of + // skipped log entries + ceph_assert(i->prior_version >= last || i->is_error()); + } else { + ceph_assert(i->prior_version == last || i->is_error()); + } } if (i->is_error()) { ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl; @@ -1103,8 +1111,15 @@ protected: // ensure missing has been updated appropriately if (objiter->second->is_update() || (missing.may_include_deletes && objiter->second->is_delete())) { - ceph_assert(missing.is_missing(hoid) && - missing.get_items().at(hoid).need == objiter->second->version); + if (ec_optimizations_enabled) { + // relax the assert for partial writes - missing may be newer than the + // most recent log entry + ceph_assert(missing.is_missing(hoid) && + missing.get_items().at(hoid).need >= objiter->second->version); + } else { + ceph_assert(missing.is_missing(hoid) && + missing.get_items().at(hoid).need == objiter->second->version); + } } else { ceph_assert(!missing.is_missing(hoid)); } @@ -1235,6 +1250,7 @@ protected: eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input IndexedLog missing_type &omissing, ///< [in,out] missing to adjust, use LogEntryHandler *rollbacker, ///< [in] optional rollbacker object + bool ec_optimizations_enabled, ///< [in] relax asserts for allow_ec_optimzations pools const DoutPrefixProvider *dpp ///< [in] logging provider ) { std::map > split; @@ -1248,6 +1264,7 @@ protected: olog_can_rollback_to, omissing, rollbacker, + ec_optimizations_enabled, dpp); } } @@ -1271,6 +1288,7 @@ protected: log.get_can_rollback_to(), missing, rollbacker, + false, // not allow_ec_optimizations pool this); } @@ -1282,7 +1300,8 @@ public: pg_info_t &info, LogEntryHandler *rollbacker, bool &dirty_info, - bool &dirty_big_info); + bool &dirty_big_info, + bool ec_optimizations_enabled); void merge_log(pg_info_t &oinfo, pg_log_t&& olog, @@ -1313,7 +1332,11 @@ public: invalidate_stats = invalidate_stats || !p->is_error(); if (log) { ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl; - log->add(*p); + // Skip the log entry if it is a partial write that did not involve + // this shard + if (!pool.is_nonprimary_shard(shard) || p->is_written_shard(shard)) { + log->add(*p); + } } if (p->soid <= last_backfill && !p->is_error()) { diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 483fe12bd3d84..0f49351d69a1f 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -3186,7 +3186,8 @@ void PeeringState::rewind_divergent_log( { PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; pg_log.rewind_divergent_log( - newhead, info, rollbacker.get(), dirty_info, dirty_big_info); + newhead, info, rollbacker.get(), dirty_info, dirty_big_info, + pool.info.allows_ecoptimizations()); } @@ -3257,7 +3258,7 @@ void PeeringState::proc_replica_log( psdout(10) << "proc_replica_log for osd." << from << ": " << oinfo << " " << olog << " " << omissing << dendl; - pg_log.proc_replica_log(oinfo, olog, omissing, from); + pg_log.proc_replica_log(oinfo, olog, omissing, from, pool.info.allows_ecoptimizations()); peer_info[from] = oinfo; update_peer_info(from, oinfo); diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc index 6aa7190b97482..ef6b8758dd358 100644 --- a/src/test/osd/TestPGLog.cc +++ b/src/test/osd/TestPGLog.cc @@ -313,7 +313,7 @@ public: pg_info_t oinfo = tcase.get_divinfo(); proc_replica_log( - oinfo, olog, omissing, pg_shard_t(1, shard_id_t(0))); + oinfo, olog, omissing, pg_shard_t(1, shard_id_t(0)), false); ceph_assert(oinfo.last_update >= log.tail); @@ -434,7 +434,7 @@ TEST_F(PGLogTest, rewind_divergent_log) { TestHandler h(remove_snap); rewind_divergent_log(newhead, info, &h, - dirty_info, dirty_big_info); + dirty_info, dirty_big_info, false); EXPECT_TRUE(log.objects.count(divergent)); EXPECT_TRUE(missing.is_missing(divergent_object)); @@ -499,7 +499,7 @@ TEST_F(PGLogTest, rewind_divergent_log) { TestHandler h(remove_snap); rewind_divergent_log(newhead, info, &h, - dirty_info, dirty_big_info); + dirty_info, dirty_big_info, false); EXPECT_TRUE(missing.is_missing(divergent_object)); EXPECT_EQ(0U, log.objects.count(divergent_object)); @@ -538,7 +538,7 @@ TEST_F(PGLogTest, rewind_divergent_log) { TestHandler h(remove_snap); roll_forward_to(eversion_t(1, 6), &info, &h); rewind_divergent_log(eversion_t(1, 5), info, &h, - dirty_info, dirty_big_info); + dirty_info, dirty_big_info, false); pg_log_t log; reset_backfill_claim_log(log, &info, &h); } @@ -1417,7 +1417,7 @@ TEST_F(PGLogTest, proc_replica_log) { EXPECT_EQ(last_complete, oinfo.last_complete); missing.may_include_deletes = false; - proc_replica_log(oinfo, olog, omissing, from); + proc_replica_log(oinfo, olog, omissing, from, false); EXPECT_FALSE(omissing.have_missing()); EXPECT_EQ(last_update, oinfo.last_update); @@ -1491,7 +1491,7 @@ TEST_F(PGLogTest, proc_replica_log) { EXPECT_EQ(olog.head, oinfo.last_complete); missing.may_include_deletes = false; - proc_replica_log(oinfo, olog, omissing, from); + proc_replica_log(oinfo, olog, omissing, from, false); EXPECT_FALSE(omissing.have_missing()); } @@ -1593,7 +1593,7 @@ TEST_F(PGLogTest, proc_replica_log) { EXPECT_EQ(olog.head, oinfo.last_complete); missing.may_include_deletes = false; - proc_replica_log(oinfo, olog, omissing, from); + proc_replica_log(oinfo, olog, omissing, from, false); EXPECT_TRUE(omissing.have_missing()); EXPECT_TRUE(omissing.is_missing(divergent_object)); @@ -1680,7 +1680,7 @@ TEST_F(PGLogTest, proc_replica_log) { EXPECT_EQ(olog.head, oinfo.last_complete); missing.may_include_deletes = false; - proc_replica_log(oinfo, olog, omissing, from); + proc_replica_log(oinfo, olog, omissing, from, false); EXPECT_TRUE(omissing.have_missing()); EXPECT_TRUE(omissing.is_missing(divergent_object)); @@ -1770,7 +1770,7 @@ TEST_F(PGLogTest, proc_replica_log) { EXPECT_EQ(olog.head, oinfo.last_complete); missing.may_include_deletes = false; - proc_replica_log(oinfo, olog, omissing, from); + proc_replica_log(oinfo, olog, omissing, from, false); EXPECT_TRUE(omissing.have_missing()); EXPECT_TRUE(omissing.is_missing(divergent_object)); @@ -1864,7 +1864,7 @@ TEST_F(PGLogTest, proc_replica_log) { EXPECT_EQ(olog.head, oinfo.last_complete); missing.may_include_deletes = false; - proc_replica_log(oinfo, olog, omissing, from); + proc_replica_log(oinfo, olog, omissing, from, false); EXPECT_TRUE(omissing.have_missing()); EXPECT_TRUE(omissing.get_items().begin()->second.need == eversion_t(1, 1)); @@ -2944,7 +2944,7 @@ TEST_F(PGLogTest, _merge_object_divergent_entries) { orig_entries, oinfo, log.get_can_rollback_to(), missing, &rollbacker, - this); + false, this); // No core dump } { @@ -2970,7 +2970,7 @@ TEST_F(PGLogTest, _merge_object_divergent_entries) { orig_entries, oinfo, log.get_can_rollback_to(), missing, &rollbacker, - this); + false, this); // No core dump } } -- 2.39.5