From: Bill Scales Date: Wed, 16 Jul 2025 14:55:40 +0000 (+0100) Subject: osd: Optimized EC invalid pwlc for shards doing backfill/async X-Git-Tag: testing/wip-vshankar-testing-20250813.085004-debug~8^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=534fc76d40a86a49bfabab247d3a703cbb575e27;p=ceph-ci.git osd: Optimized EC invalid pwlc for shards doing backfill/async Shards performing backfill or async recovery receive log entries (but not transactions) for updates to missing or yet-to-be-backfilled objects. These log entries get applied and completed immediately because there is nothing that can be rolled back. This causes pwlc to advance too early and causes problems if other shards do not complete the update and end up rolling it back. This fix sets pwlc to be invalid when such a log entry is applied and completed; it then remains invalid until the next interval when peering runs again. Other shards will continue to update pwlc, and any complete subset of shards in a future interval will include at least one shard that has continued to update pwlc. Signed-off-by: Bill Scales --- diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc index d4ca5b83ca0..3eedd4d462d 100644 --- a/src/osd/PGBackend.cc +++ b/src/osd/PGBackend.cc @@ -440,7 +440,12 @@ void PGBackend::partial_write( } auto &&[old_v, new_v] = pwlc_iter->second; if (old_v == new_v) { - if (old_v.version >= entry.version.version) { + if (old_v.version == eversion_t::max().version) { + // shard is backfilling or in async recovery, pwlc is + // invalid + ldpp_dout(dpp, 20) << __func__ << " pwlc invalid " << shard + << dendl; + } else if (old_v.version >= entry.version.version) { // Abnormal case - consider_adjusting_pwlc may advance pwlc // during peering because all shards have updates but these // have not been marked complete. 
At the end of peering @@ -463,9 +468,13 @@ void PGBackend::partial_write( } else if (pwlc_iter != info->partial_writes_last_complete.end()) { auto &&[old_v, new_v] = pwlc_iter->second; // Log updated or shard absent, partial write entry is a no-op - if (old_v.version >= entry.version.version) { - // Abnormal case - see above - ldpp_dout(dpp, 20) << __func__ << " pwlc is ahead of entry " << shard + if (old_v.version == eversion_t::max().version) { + // shard is backfilling or in async recovery, pwlc is invalid + ldpp_dout(dpp, 20) << __func__ << " pwlc invalid " << shard + << dendl; + } else if (old_v.version >= entry.version.version) { + // Abnormal case - see above + ldpp_dout(dpp, 20) << __func__ << " pwlc is ahead of entry " << shard << dendl; } else { old_v = new_v = entry.version; diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 17842b324f5..1bcf7936fce 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -4627,6 +4627,16 @@ void PeeringState::append_log( * object is deleted before we can _merge_object_divergent_entries(). */ pg_log.skip_rollforward(&info, handler.get()); + /* Invalidate pwlc for this shard until the next interval when + * it will be updated with the pwlc from another shard + */ + for (auto & [shard, versionrange] : + info.partial_writes_last_complete) { + auto & [fromversion, toversion] = versionrange; + fromversion.epoch = 0; + fromversion.version = eversion_t::max().version; + toversion = fromversion; + } } for (auto p = logv.begin(); p != logv.end(); ++p) {