From f6d5d102213063c7a10432334f54b616f66fd5c3 Mon Sep 17 00:00:00 2001
From: Bill Scales
Date: Fri, 4 Jul 2025 11:51:05 +0100
Subject: [PATCH] osd: EC Optimizations proc_master_log bug fixes

1. proc_master_log can roll forward full-writes that have been applied
   to all shards but not yet completed. Add a new function
   consider_adjusting_pwlc to roll forward pwlc. Later, partial_write
   may be called to process the same writes again, which could roll
   pwlc backwards. Modify partial_write so it does not undo pwlc.

2. At the end of proc_master_log we want the new authoritative view of
   pwlc to persist - this may be better or worse than the stale view
   of pwlc held by other shards. consider_rollback_pwlc sometimes
   updated the epoch in toversion (the second value of the range
   fromversion-toversion); we now always update the epoch. However,
   updating toversion.epoch causes problems because this version is
   sometimes copied to last_update and last_complete - using the wrong
   epoch there messes everything up in later peering cycles. Instead
   we now update fromversion.epoch. This requires changes to
   apply_pwlc and to an assert in Stray::react(const MInfoRec&)
   (see the first sketch below).

3. Calling apply_pwlc at the end of proc_master_log is too early -
   updating last_update and last_complete there breaks GetMissing. We
   need to do this later when activating (changes to
   search_for_missing and activate).

4. proc_master_log was calling partial_write with the wrong previous
   version - this causes problems after a split when the log is
   sparsely populated (see the second sketch below).

5. Merging PGs did not set up pwlc correctly, which can cause issues
   in future peering cycles. The pwlc can simply be reset; we need to
   update the epoch to make sure this view of pwlc persists vs stale
   pwlc from other shards (see the third sketch below).

Signed-off-by: Bill Scales
(cherry picked from commit 0b8593a0112e31705acb581ac388a4ef1df31b4b)
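A minimal sketch of the pwlc handling described in points 1 and 2,
assuming a simplified model of the real structures: Eversion, PwlcMap,
adjust_pwlc and stamp_epoch are hypothetical stand-ins for eversion_t,
partial_writes_last_complete, consider_adjusting_pwlc and the epoch
stamping in consider_rollback_pwlc, not the actual Ceph code.

    // Hypothetical model: pwlc maps each shard to the range of log
    // versions [fromversion, toversion] completed by partial writes.
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <utility>

    struct Eversion {
      uint64_t epoch = 0;    // peering interval that made the update
      uint64_t version = 0;  // position in the PG log
      bool operator<(const Eversion &o) const {
        return version < o.version;  // sketch: order by log position
      }
    };

    // shard id -> [fromversion, toversion]
    using PwlcMap = std::map<int, std::pair<Eversion, Eversion>>;

    // Point 1: full writes seen by every shard (but not yet marked
    // complete) are rolled forward to last_complete; a divergent
    // advance beyond last_complete is rolled back the same way.
    void adjust_pwlc(PwlcMap &pwlc, Eversion last_complete) {
      for (auto &[shard, range] : pwlc) {
        if (range.second < last_complete || last_complete < range.second) {
          range = {last_complete, last_complete};
        }
      }
    }

    // Point 2: stamp the current epoch on fromversion, not toversion,
    // so this authoritative view beats stale pwlc from other shards,
    // while toversion stays safe to copy to last_update/last_complete.
    void stamp_epoch(PwlcMap &pwlc, uint64_t current_epoch) {
      for (auto &[shard, range] : pwlc) {
        range.first.epoch = current_epoch;
      }
    }

    int main() {
      PwlcMap pwlc{{0, {{5, 90}, {5, 95}}}};
      adjust_pwlc(pwlc, {5, 100});  // full write 100 applied everywhere
      stamp_epoch(pwlc, 6);         // new peering interval is epoch 6
      std::cout << pwlc[0].first.epoch << " "
                << pwlc[0].second.version << "\n";  // prints: 6 100
    }

In this model partial_write can later process the same entries again
without rolling the range backwards, which is what the
old_v.version >= entry.version.version guard in PGBackend.cc preserves.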
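For point 4, a self-contained sketch of why the previous version passed
to partial_write must come from the preceding log entry (or the log
tail for the first entry) rather than olog.head; Entry, tail and the
log contents are made-up illustration values, not Ceph types:

    #include <cstdint>
    #include <iostream>
    #include <iterator>
    #include <list>

    struct Entry { uint64_t version; };

    int main() {
      // After a PG split the log can be sparsely populated
      std::list<Entry> log{{10}, {40}, {70}};
      uint64_t tail = 5;  // version immediately before the log
      for (auto p = log.begin(); p != log.end(); ++p) {
        uint64_t previous_version =
            (p == log.begin()) ? tail : std::prev(p)->version;
        // this is where partial_write(&info, previous_version, *p)
        // would run in proc_master_log
        std::cout << previous_version << " -> " << p->version << "\n";
      }
      // prints 5 -> 10, 10 -> 40, 40 -> 70; reusing olog.head as the
      // previous version would be wrong for all but the first entry
    }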
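And for point 5, a sketch of the pwlc reset on PG merge, again with
hypothetical Ev/pwlc stand-ins rather than the real eversion_t and
info.partial_writes_last_complete:

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <utility>

    struct Ev { uint64_t epoch = 0, version = 0; };

    int main() {
      std::map<int, std::pair<Ev, Ev>> pwlc{{0, {{3, 8}, {3, 12}}}};
      Ev last_update{4, 20};       // merged PG's last_update
      uint64_t current_epoch = 5;  // epoch of the merge
      for (auto &[shard, range] : pwlc) {
        // old_v = new_v = info.last_update in merge_from
        range.first = range.second = last_update;
        // old_v.epoch = get_osdmap_epoch() makes this view win over
        // stale pwlc held by other shards
        range.first.epoch = current_epoch;
      }
      std::cout << pwlc[0].first.epoch << " "
                << pwlc[0].second.version << "\n";  // prints: 5 20
    }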
---
 src/osd/PGBackend.cc    | 24 ++++++++---
 src/osd/PeeringState.cc | 95 +++++++++++++++++++++++++++--------------
 src/osd/PeeringState.h  |  1 +
 3 files changed, 84 insertions(+), 36 deletions(-)

diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc
index b199c9248dc..d4ca5b83ca0 100644
--- a/src/osd/PGBackend.cc
+++ b/src/osd/PGBackend.cc
@@ -440,8 +440,18 @@ void PGBackend::partial_write(
     }
     auto &&[old_v, new_v] = pwlc_iter->second;
     if (old_v == new_v) {
-      old_v = previous_version;
-      new_v = entry.version;
+      if (old_v.version >= entry.version.version) {
+        // Abnormal case - consider_adjusting_pwlc may advance pwlc
+        // during peering because all shards have updates but these
+        // have not been marked complete. At the end of peering
+        // partial_write catches up with these entries - these need
+        // to be ignored to preserve old_v.epoch
+        ldpp_dout(dpp, 20) << __func__ << " pwlc is ahead of entry " << shard
+                           << dendl;
+      } else {
+        old_v = previous_version;
+        new_v = entry.version;
+      }
     } else if (new_v == previous_version) {
       // Subsequent partial write, contiguous versions
       new_v = entry.version;
@@ -453,9 +463,13 @@
     } else if (pwlc_iter != info->partial_writes_last_complete.end()) {
       auto &&[old_v, new_v] = pwlc_iter->second;
       // Log updated or shard absent, partial write entry is a no-op
-      // FIXME: In a later commit (or later PR) we should look at other ways of
-      // actually clearing the PWLC once all shards have seen the update.
-      old_v = new_v = entry.version;
+      if (old_v.version >= entry.version.version) {
+        // Abnormal case - see above
+        ldpp_dout(dpp, 20) << __func__ << " pwlc is ahead of entry " << shard
+                           << dendl;
+      } else {
+        old_v = new_v = entry.version;
+      }
     }
   }
   ldpp_dout(dpp, 20) << __func__ << " after pwlc="
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
index 521af37c836..ce520a9f4b5 100644
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -332,7 +332,7 @@ void PeeringState::apply_pwlc(const std::pair<eversion_t, eversion_t> pwlc,
   const auto & [fromversion, toversion] = pwlc;
   bool logged = false;
   if ((toversion > info.last_complete) &&
-      (fromversion <= info.last_complete)) {
+      (fromversion.version <= info.last_complete.version)) {
     psdout(10) << "osd." << shard << " has last_complete "
                << info.last_complete
                << " pwlc can advance last_complete to " << toversion
@@ -341,7 +341,7 @@
     info.last_complete = toversion;
   }
   if ((toversion > info.last_update) &&
-      (fromversion <= info.last_update)) {
+      (fromversion.version <= info.last_update.version)) {
     if (!logged) {
       psdout(10) << "osd." << shard << " has last_complete "
                  << info.last_complete << " and last_update "
@@ -387,8 +387,10 @@ void PeeringState::update_peer_info(const pg_shard_t &from,
       info.partial_writes_last_complete[shard];
   // Prefer pwlc with a newer toversion, if toversion matches prefer an
   // older fromversion.
-  if ((otoversion > toversion) ||
-      ((otoversion == toversion) && (ofromversion < fromversion))) {
+  if ((ofromversion.epoch > fromversion.epoch) ||
+      ((ofromversion.epoch == fromversion.epoch) && (otoversion > toversion)) ||
+      ((ofromversion.epoch == fromversion.epoch) && (otoversion == toversion) &&
+       (ofromversion.version < fromversion.version))) {
     if (!updated) {
       updated = true;
       psdout(10) << "osd." << from
@@ -2737,6 +2739,9 @@ bool PeeringState::search_for_missing(
     tinfo.pgid.shard = pg_whoami.shard;
     // add partial write from our info
     tinfo.partial_writes_last_complete = info.partial_writes_last_complete;
+    if (info.partial_writes_last_complete.contains(from.shard)) {
+      apply_pwlc(info.partial_writes_last_complete[from.shard], from, tinfo);
+    }
     if (!tinfo.partial_writes_last_complete.empty()) {
       psdout(20) << "sending info to " << from << " pwlc="
                  << tinfo.partial_writes_last_complete
@@ -2998,7 +3003,7 @@ void PeeringState::activate(
                << " is up to date, queueing in pending_activators" << dendl;
       if (!info.partial_writes_last_complete.empty()) {
         psdout(20) << "sending info to " << peer
-                   << " pwcl=" << info.partial_writes_last_complete
+                   << " pwlc=" << info.partial_writes_last_complete
                    << " info=" << info
                    << dendl;
       }
@@ -3034,6 +3039,7 @@ void PeeringState::activate(
                  << "] " << pi.last_backfill
                  << " to " << info.last_update;
+      pi.partial_writes_last_complete = info.partial_writes_last_complete;
       pi.last_update = info.last_update;
       pi.last_complete = info.last_update;
       pi.set_last_backfill(hobject_t());
@@ -3268,22 +3274,44 @@ void PeeringState::proc_primary_info(
   }
 }
 
+void PeeringState::consider_adjusting_pwlc(eversion_t last_complete)
+{
+  for (const auto & [shard, versionrange] :
+         info.partial_writes_last_complete) {
+    auto [fromversion, toversion] = versionrange;
+    if (last_complete > toversion) {
+      // Full writes are being rolled forward, eventually
+      // partial_write will be called to advance pwlc, but we need
+      // to preempt that here before proc_master_log considers
+      // rolling forward partial writes
+      info.partial_writes_last_complete[shard] = std::pair(last_complete,
+                                                           last_complete);
+      psdout(10) << "shard " << shard << " pwlc rolled forward to "
+                 << info.partial_writes_last_complete[shard] << dendl;
+    } else if (last_complete < toversion) {
+      // A divergent update has advanced pwlc ahead of last_complete,
+      // roll backwards to the last completed full write and then
+      // let proc_master_log roll forward partial writes
+      info.partial_writes_last_complete[shard] = std::pair(last_complete,
+                                                           last_complete);
+      psdout(10) << "shard " << shard << " pwlc rolled backward to "
+                 << info.partial_writes_last_complete[shard] << dendl;
+    }
+  }
+}
+
 void PeeringState::consider_rollback_pwlc(eversion_t last_complete)
 {
   for (const auto & [shard, versionrange] :
          info.partial_writes_last_complete) {
     auto [fromversion, toversion] = versionrange;
-    if (last_complete < fromversion) {
+    if (last_complete.version < fromversion.version) {
       // It is possible that we need to rollback pwlc, this can happen if
       // peering is attempted with an OSD missing but does not manage to
       // activate (typically because of a wait upthru) before the missing
       // OSD returns
       info.partial_writes_last_complete[shard] = std::pair(last_complete,
                                                            last_complete);
-      // Assign the current epoch to the version number so that this is
-      // recognised as the newest pwlc update
-      info.partial_writes_last_complete[shard].second.epoch =
-        get_osdmap_epoch();
       psdout(10) << "shard " << shard << " pwlc rolled back to "
                  << info.partial_writes_last_complete[shard] << dendl;
     } else if (last_complete < toversion) {
@@ -3291,6 +3319,11 @@
       psdout(10) << "shard " << shard << " pwlc rolled back to "
                  << info.partial_writes_last_complete[shard] << dendl;
     }
+    // Always assign the current epoch to the version number so that
+    // pwlc adjustments made by the whole proc_master_log process
+    // are recognized as the newest updates
+    info.partial_writes_last_complete[shard].first.epoch =
+      get_osdmap_epoch();
   }
 }
 
@@ -3324,7 +3357,7 @@ void PeeringState::proc_master_log(
   all_info[pg_whoami] = info;
   // Normal case is that both logs have entry olog.head
   bool can_check_next_entry = (p->version == olog.head);
-  if (p->version < olog.head) {
+  if ((p->version < olog.head) || (p == pg_log.get_log().log.begin())) {
     // After a PG split there may be gaps in the log where entries were
     // split to the other PG. This can result in olog.head being ahead
    // of p->version. So long as there are no entries in olog between
@@ -3337,8 +3370,10 @@
       can_check_next_entry = true;
     }
   }
+  if (can_check_next_entry) {
+    consider_adjusting_pwlc(p->version);
+  }
   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
-  bool update_pwlc = false;
   while (can_check_next_entry) {
     ++p;
     if (p == pg_log.get_log().log.end()) {
@@ -3381,20 +3416,17 @@
       // This entry can be kept, only shards that didn't participate in
       // the partial write missed the update
       psdout(20) << "keeping entry " << p->version << dendl;
-      rollbacker.get()->partial_write(&info, olog.head, *p);
-      update_pwlc = true;
+      eversion_t previous_version;
+      if (p == pg_log.get_log().log.begin()) {
+        previous_version = pg_log.get_tail();
+      } else {
+        previous_version = std::prev(p)->version;
+      }
+      rollbacker.get()->partial_write(&info, previous_version, *p);
       olog.head = p->version;
       // We need to continue processing the log, so don't break.
     }
-    if (update_pwlc) {
-      psdout(20) << "applying pwlc updates" << dendl;
-      for (auto & [shard, peer] : peer_info) {
-        if (info.partial_writes_last_complete.contains(shard.shard)) {
-          apply_pwlc(info.partial_writes_last_complete[shard.shard], shard, peer);
-        }
-      }
-    }
   }
 
   // merge log into our own log to build master log. no need to
   // make any adjustments to their missing map; we are taking their
@@ -3402,7 +3434,6 @@
   // non-divergent).
   merge_log(t, oinfo, std::move(olog), from);
   peer_info[from] = oinfo;
-  update_peer_info(from, oinfo);
   psdout(10) << " peer osd." << from << " now " << oinfo
             << " " << omissing << dendl;
   might_have_unfound.insert(from);
@@ -3778,13 +3809,15 @@
     past_intervals = source->past_intervals;
   }
 
-  // merge pwlc
+  // merge pwlc - reset
   if (!info.partial_writes_last_complete.empty()) {
-    psdout(10) << "before pwlc=" << info.partial_writes_last_complete << dendl;
-  }
-  update_peer_info(pg_whoami, source->info);
-  if (!info.partial_writes_last_complete.empty()) {
-    psdout(10) << "after pwlc=" << info.partial_writes_last_complete << dendl;
+    for (auto &&[shard, versionrange] :
+           info.partial_writes_last_complete) {
+      auto &&[old_v, new_v] = versionrange;
+      old_v = new_v = info.last_update;
+      old_v.epoch = get_osdmap_epoch();
+    }
+    psdout(10) << "merged pwlc=" << info.partial_writes_last_complete << dendl;
   }
 }
 
@@ -6857,7 +6890,7 @@ boost::statechart::result PeeringState::ReplicaActive::react(
   i.history.last_epoch_started = evt.activation_epoch;
   i.history.last_interval_started = i.history.same_interval_since;
   if (!i.partial_writes_last_complete.empty()) {
-    psdout(20) << "sending info to " << ps->get_primary() << " pwcl="
+    psdout(20) << "sending info to " << ps->get_primary() << " pwlc="
               << i.partial_writes_last_complete << " info=" << i << dendl;
   }
   rctx.send_info(
@@ -7081,7 +7114,7 @@ boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
              << " our last_update=" << ps->info.last_update << dendl;
     // Our last update must be in the range described by partial write
     // last_complete
-    ceph_assert(ps->info.last_update >= pwlc.first);
+    ceph_assert(ps->info.last_update.version >= pwlc.first.version);
     // Last complete must match the partial write last_update
     ceph_assert(pwlc.second == infoevt.info.last_update);
   } else {
diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
index d87eb7cc6bc..e756879dd8d 100644
--- a/src/osd/PeeringState.h
+++ b/src/osd/PeeringState.h
@@ -1783,6 +1783,7 @@ private:
                        pg_log_t&& olog, pg_shard_t from);
   void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
+  void consider_adjusting_pwlc(eversion_t last_complete);
   void consider_rollback_pwlc(eversion_t last_complete);
   void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo,
                        pg_log_t&& olog, pg_missing_t&& omissing,
-- 
2.47.3