From: Bill Scales Date: Wed, 14 May 2025 07:39:40 +0000 (+0100) Subject: osd: EC Optimizations bug fix for flip/flop acting set X-Git-Tag: v20.1.0~68^2~30 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=5df15c098c2b006e3026ad9d506ec5dbe91d3375;p=ceph.git osd: EC Optimizations bug fix for flip/flop acting set EC optimizations pools have a set of non-primary shards which cannot become the primary because they do not have all the metadata updates. If one of these shards is chosen as the primary, it will set the acting set to force another shard to be chosen. It is important that the selected acting set is the same acting set that will be chosen by the next primary (assuming nothing else changes); otherwise a PG can get into a state where the acting set flip-flops between two different states, leaving the PG stuck in peering and hanging I/O. A bug in update_peer_info meant that non-primary shards did not present the same info to choose_acting_set as primary shards, because they were not updating their pg_info_t based on pwlc (partial_writes_last_complete) information from other shards.
Signed-off-by: Bill Scales (cherry picked from commit 54b265f811e545885916367d7d63c7f4d734fae0) --- diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 7e6d06c9343..585b4a3b793 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -364,7 +364,7 @@ void PeeringState::update_peer_info(const pg_shard_t &from, } // 3 cases: // We are the primary - from is the shard that sent the oinfo - // We are a replica - from is the primary, it will not have pwlc infomation + // We are a replica - from is the primary, it will not have pwlc infomation for itself // Merge - from is pg_whoami, oinfo is a source pg that is being merged if ((from != pg_whoami) && info.partial_writes_last_complete.contains(from.shard)) { @@ -391,6 +391,34 @@ void PeeringState::update_peer_info(const pg_shard_t &from, } } } + // Non-primary shards might need to apply pwlc to update info + if (info.partial_writes_last_complete.contains(pg_whoami.shard)) { + // Check if last_complete and last_update can be advanced based on + // knowledge of partial_writes + const auto & [fromversion, toversion] = + info.partial_writes_last_complete[pg_whoami.shard]; + if (toversion > info.last_complete) { + if (fromversion <= info.last_complete) { + psdout(10) << "osd." << pg_whoami << " has last_complete " + << info.last_complete + << " but pwlc says its at " << toversion + << dendl; + info.last_complete = toversion; + if (toversion > info.last_update) { + info.last_update = toversion; + } + if (toversion > pg_log.get_head()) { + pg_log.set_head(toversion); + } + } else { + psdout(10) << "osd." << pg_whoami << " has last_complete " + << info.last_complete + << " cannot apply pwlc from " << fromversion + << " to " << toversion + << dendl; + } + } + } } bool PeeringState::proc_replica_notify(const pg_shard_t &from, const pg_notify_t ¬ify)