From: Bill Scales Date: Tue, 25 Mar 2025 17:41:57 +0000 (+0000) Subject: osd: EC Optimizations: Update pwlc in pg_info_t X-Git-Tag: v20.3.0~29^2~19 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=886112b8136fb05b55be4906dbfd5a85c0db0d64;p=ceph.git osd: EC Optimizations: Update pwlc in pg_info_t Optimized EC pools add extra data to the log entry to track which shards were updated by a partial write. When the log entry is completed this needs to be summarized in the partial_writes_last_complete map in pg_info_t. Summarising this data in pg_info_t makes it easy to determine whether the reason a shard is behind is because it is missing update or has just not been involved in recent updates. This also ensures that even if there is a long sequence of updates that all skip updating a shard that a record of this is retained in the info structure even after the log has been trimmed. Edited by aainscow as suggested in comment here: https://github.com/ceph/ceph/pull/62522/files#r2050803678 Signed-off-by: Bill Scales Signed-off-by: Alex Ainscow --- diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 0a20f2c4d624..1ea50542dfce 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -477,6 +477,10 @@ public: void trim(const pg_log_entry_t &entry) override { // TODO } + void partial_write(pg_info_t *info, const pg_log_entry_t &entry) override { + // TODO + ceph_assert(entry.written_shards.empty() && info->partial_writes_last_complete.empty()); + } }; PGLog::LogEntryHandlerRef get_log_handler( ceph::os::Transaction &t) final { diff --git a/src/osd/PG.h b/src/osd/PG.h index 28b0e5937efe..907c50a66928 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1127,6 +1127,9 @@ protected: void trim(const pg_log_entry_t &entry) override { pg->get_pgbackend()->trim(entry, t); } + void partial_write(pg_info_t *info, const pg_log_entry_t &entry) override { + pg->get_pgbackend()->partial_write(info, entry); + } }; void update_object_snap_mapping( diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc index e91614192f5c..19ac1ab70ea6 100644 --- a/src/osd/PGBackend.cc +++ b/src/osd/PGBackend.cc @@ -411,6 +411,55 @@ void PGBackend::try_stash( ghobject_t(hoid, v, get_parent()->whoami_shard().shard)); } +void PGBackend::partial_write( + pg_info_t *info, + const pg_log_entry_t &entry) +{ + ceph_assert(info != nullptr); + auto dpp = get_parent()->get_dpp(); + if (!entry.written_shards.empty()) { + ldpp_dout(dpp, 20) << __func__ << " version=" << entry.version + << " written_shards=" << entry.written_shards + << " present_shards=" << entry.present_shards + << " pwlc=" << info->partial_writes_last_complete + << dendl; + const pg_pool_t &pool = get_parent()->get_pool(); + for (unsigned int shard = 0; + shard < get_parent()->get_pool().size; + shard++) { + if (pool.is_nonprimary_shard(shard_id_t(shard))) { + if (!entry.is_written_shard(shard_id_t(shard))) { + if (!info->partial_writes_last_complete.contains(shard_id_t(shard))) { + // 1st partial write since all logs were updated + info->partial_writes_last_complete[shard_id_t(shard)] = + std::pair(entry.prior_version, entry.version); + } else if (info->partial_writes_last_complete[shard_id_t(shard)] + .second.version + 1 == entry.version.version) { + // Subsequent partial write, version is sequential + info->partial_writes_last_complete[shard_id_t(shard)].second = + entry.version; + } else { + // Subsequent partial write, discontiguous versions + ldpp_dout(dpp, 20) << __func__ << " cannot update shard " << shard + << dendl; + } + } else { + // Log updated or shard absent, partial write entry not required + info->partial_writes_last_complete.erase(shard_id_t(shard)); + } + } + } + ldpp_dout(dpp, 20) << __func__ << " after pwlc=" + << info->partial_writes_last_complete << dendl; + } else { + // All shard updated - clear partial write data + if (!info->partial_writes_last_complete.empty()) { + ldpp_dout(dpp, 20) << __func__ << " clear pwlc" << dendl; + } + info->partial_writes_last_complete.clear(); + } +} + void PGBackend::remove( const hobject_t &hoid, ObjectStore::Transaction *t) { diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index 32ca5f17820c..4e5be28b637a 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -491,6 +491,10 @@ typedef std::shared_ptr OSDMapRef; const pg_log_entry_t &entry, ObjectStore::Transaction *t); + void partial_write( + pg_info_t *info, + const pg_log_entry_t &entry); + void remove( const hobject_t &hoid, ObjectStore::Transaction *t); diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc index 9e7c0405a63e..5831cef8c2b6 100644 --- a/src/osd/PGLog.cc +++ b/src/osd/PGLog.cc @@ -465,7 +465,7 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd, for (auto &&oe: divergent) { dout(10) << "merge_log divergent " << oe << dendl; } - log.roll_forward_to(log.head, rollbacker); + log.roll_forward_to(log.head, &info, rollbacker); mempool::osd_pglog::list new_entries; new_entries.splice(new_entries.end(), olog.log, from, to); diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h index 10c7fc0e7ea7..4d899f2356b3 100644 --- a/src/osd/PGLog.h +++ b/src/osd/PGLog.h @@ -146,6 +146,9 @@ struct PGLog : DoutPrefixProvider { virtual void try_stash( const hobject_t &hoid, version_t v) = 0; + virtual void partial_write( + pg_info_t *info, + const pg_log_entry_t &entry) = 0; virtual ~LogEntryHandler() {} }; using LogEntryHandlerRef = std::unique_ptr; @@ -249,18 +252,28 @@ public: return *this; } - void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) { + void trim_rollback_info_to(eversion_t to, pg_info_t *info, LogEntryHandler *h) { advance_can_rollback_to( to, [&](pg_log_entry_t &entry) { h->trim(entry); + h->partial_write(info, entry); }); } - bool roll_forward_to(eversion_t to, LogEntryHandler *h) { + bool roll_forward_to(eversion_t to, pg_info_t *info, LogEntryHandler *h) { return advance_can_rollback_to( to, [&](pg_log_entry_t &entry) { h->rollforward(entry); + h->partial_write(info, entry); + }); + } + + void skip_can_rollback_to_to_head(pg_info_t *info, LogEntryHandler *h) { + advance_can_rollback_to( + head, + [&](pg_log_entry_t &entry) { + h->partial_write(info, entry); }); } @@ -826,9 +839,11 @@ public: void roll_forward_to( eversion_t roll_forward_to, + pg_info_t *info, LogEntryHandler *h) { if (log.roll_forward_to( roll_forward_to, + info, h)) dirty_log = true; } @@ -837,20 +852,22 @@ public: return log.get_can_rollback_to(); } - void roll_forward(LogEntryHandler *h) { + void roll_forward(pg_info_t *info, LogEntryHandler *h) { roll_forward_to( log.head, + info, h); } - void skip_rollforward() { - log.skip_can_rollback_to_to_head(); + void skip_rollforward(pg_info_t *info, LogEntryHandler *h) { + // Update pwlc during backfill + log.skip_can_rollback_to_to_head(info, h); } //////////////////// get or std::set log & missing //////////////////// - void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) { - log.trim_rollback_info_to(log.head, h); + void reset_backfill_claim_log(const pg_log_t &o, pg_info_t *info, LogEntryHandler *h) { + log.trim_rollback_info_to(log.head, info, h); log.claim_log_and_clear_rollback_info(o); missing.clear(); mark_dirty_to(eversion_t::max()); diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 4dd7543e10ac..e3d10e4024e8 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -3012,7 +3012,7 @@ void PeeringState::activate( } if (acting_set_writeable()) { PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; - pg_log.roll_forward(rollbacker.get()); + pg_log.roll_forward(&info, rollbacker.get()); } } @@ -3398,7 +3398,7 @@ void PeeringState::merge_from( } PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)}; - pg_log.roll_forward(handler.get()); + pg_log.roll_forward(&info, handler.get()); info.last_complete = info.last_update; // to fake out trim() pg_log.reset_recovery_pointers(); @@ -3437,7 +3437,7 @@ void PeeringState::merge_from( // prepare log PGLog::LogEntryHandlerRef handler{ source->pl->get_log_handler(rctx.transaction)}; - source->pg_log.roll_forward(handler.get()); + source->pg_log.roll_forward(&info, handler.get()); source->info.last_complete = source->info.last_update; // to fake out trim() source->pg_log.reset_recovery_pointers(); source->pg_log.trim(source->info.last_update, source->info); @@ -4116,10 +4116,10 @@ bool PeeringState::append_log_entries_update_missing( rollbacker.get()); if (pg_committed_to && entries.rbegin()->soid > info.last_backfill) { - pg_log.roll_forward(rollbacker.get()); + pg_log.roll_forward(&info, rollbacker.get()); } if (pg_committed_to && *pg_committed_to > pg_log.get_can_rollback_to()) { - pg_log.roll_forward_to(*pg_committed_to, rollbacker.get()); + pg_log.roll_forward_to(*pg_committed_to, &info, rollbacker.get()); last_rollback_info_trimmed_to_applied = *pg_committed_to; } @@ -4247,7 +4247,7 @@ void PeeringState::append_log( * from the backend and we do not end up in a situation, where the * object is deleted before we can _merge_object_divergent_entries(). */ - pg_log.skip_rollforward(); + pg_log.skip_rollforward(&info, handler.get()); } for (auto p = logv.begin(); p != logv.end(); ++p) { @@ -4258,12 +4258,13 @@ void PeeringState::append_log( * above */ if (transaction_applied && p->soid > info.last_backfill) { - pg_log.roll_forward(handler.get()); + pg_log.roll_forward(&info, handler.get()); } } if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) { pg_log.roll_forward_to( roll_forward_to, + &info, handler.get()); last_rollback_info_trimmed_to_applied = roll_forward_to; } @@ -4299,7 +4300,7 @@ void PeeringState::recover_got( * to roll it back anyway (and we'll be rolled forward shortly * anyway) */ PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)}; - pg_log.roll_forward_to(v, handler.get()); + pg_log.roll_forward_to(v, &info, handler.get()); } psdout(10) << "got missing " << oid << " v " << v << dendl; @@ -6645,7 +6646,7 @@ boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt) ps->dirty_big_info = true; // maybe. PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; - ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get()); + ps->pg_log.reset_backfill_claim_log(msg->log, &ps->info, rollbacker.get()); ps->pg_log.reset_backfill(); } else { @@ -6794,7 +6795,7 @@ PeeringState::Deleting::Deleting(my_context ctx) // clear log PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; - ps->pg_log.roll_forward(rollbacker.get()); + ps->pg_log.roll_forward(&ps->info, rollbacker.get()); // adjust info to backfill ps->info.set_last_backfill(hobject_t()); diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc index 1fff469d1fe5..5b85e0f9d2e8 100644 --- a/src/test/osd/TestPGLog.cc +++ b/src/test/osd/TestPGLog.cc @@ -233,6 +233,9 @@ public: } void trim( const pg_log_entry_t &entry) override {} + void partial_write( + pg_info_t *info, + const pg_log_entry_t &entry) override {} }; template @@ -356,6 +359,9 @@ struct TestHandler : public PGLog::LogEntryHandler { } void trim( const pg_log_entry_t &entry) override {} + void partial_write( + pg_info_t *info, + const pg_log_entry_t &entry) override {} }; TEST_F(PGLogTest, rewind_divergent_log) { @@ -530,11 +536,11 @@ TEST_F(PGLogTest, rewind_divergent_log) { add(e); } TestHandler h(remove_snap); - roll_forward_to(eversion_t(1, 6), &h); + roll_forward_to(eversion_t(1, 6), &info, &h); rewind_divergent_log(eversion_t(1, 5), info, &h, dirty_info, dirty_big_info); pg_log_t log; - reset_backfill_claim_log(log, &h); + reset_backfill_claim_log(log, &info, &h); } }