From: Bill Scales Date: Wed, 1 Oct 2025 14:52:23 +0000 (+0100) Subject: osd: Optimized EC missing list not updated on recovering shard X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=b82ed27ce811e42f9a3c48cef3099c18e31e9605;p=ceph-ci.git osd: Optimized EC missing list not updated on recovering shard Shards that are recovering (last_complete != last_update) that skip transactions and log entries because of partial writes are using pwlc to advance last_update. However simply incrementing last_update is not sufficient - there are scenarios where the needed version of a missing object has to be updated. If the shard is already missing object X at version V1 and there was a partial write at V2 that did not update the shard, it does not need to retain the log entry, but it does need to update the missing list to say it needs V2 rather than V1. This ensures all shards report a need for an object at the same version and avoids an assert in MissingLoc::add_active_missing when the primary is trying to combine the missing lists from all the shards to work out what has to be recovered. Avoiding applying pwlc during the early phase of the peering process ensures the missing list gets updated. However if a shard is not missing object X and there was a partial write at V2 that did not update the shard then at the end of peering it is still necessary to advance last_update by applying pwlc. This ensures that in later peering cycles the code does not change its mind and think the shard is now missing object X. The fix is to be more sophisticated about when pwlc can be used to advance last_update for a recovering shard. The code now passes in a parameter indicating whether we are in the early (pre-activate) or later phase of peering. This also means that additional calls to apply_pwlc are needed when peering gets to activating and is searching for missing to make updates that were not made earlier. 
Fixes: https://tracker.ceph.com/issues/73249 Signed-off-by: Bill Scales --- diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 698b44ed5fa..fef041f5eec 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -324,6 +324,7 @@ void PeeringState::query_unfound(Formatter *f, string state) void PeeringState::apply_pwlc(const std::pair pwlc, const pg_shard_t &shard, pg_info_t &info, + peering_stage stage, pg_log_t *log1, PGLog *log2) { @@ -331,7 +332,9 @@ void PeeringState::apply_pwlc(const std::pair pwlc, // knowledge of partial_writes const auto & [fromversion, toversion] = pwlc; if (toversion > info.last_update) { - if (fromversion <= info.last_update) { + if ((fromversion <= info.last_update) && + ((stage == AFTER_ACTIVATE) || + (info.last_complete == info.last_update))) { if (info.last_complete == info.last_update) { psdout(10) << "osd." << shard << " has last_complete" << "=last_update " << info.last_update @@ -364,7 +367,8 @@ void PeeringState::apply_pwlc(const std::pair pwlc, } void PeeringState::update_peer_info(const pg_shard_t &from, - const pg_info_t &oinfo) + const pg_info_t &oinfo, + peering_stage stage) { // Merge pwlc information from another shard into // info.partial_writes_last_complete keeping the newest @@ -423,14 +427,15 @@ void PeeringState::update_peer_info(const pg_shard_t &from, if (is_primary()) { for (auto & [shard, peer] : peer_info) { if (info.partial_writes_last_complete.contains(shard.shard)) { - apply_pwlc(info.partial_writes_last_complete[shard.shard], shard, peer); + apply_pwlc(info.partial_writes_last_complete[shard.shard], shard, peer, + stage); } } } // Non-primary shards might need to apply pwlc to update info if (info.partial_writes_last_complete.contains(pg_whoami.shard)) { apply_pwlc(info.partial_writes_last_complete[pg_whoami.shard], pg_whoami, - info, &pg_log); + info, stage, &pg_log); } } @@ -457,7 +462,7 @@ bool PeeringState::proc_replica_notify(const pg_shard_t &from, const pg_notify_t 
peer_info[from] = oinfo; stats_last_update[from] = oinfo.last_update; - update_peer_info(from, oinfo); + update_peer_info(from, oinfo, BEFORE_ACTIVATE); might_have_unfound.insert(from); update_history(oinfo.history); @@ -2755,8 +2760,16 @@ bool PeeringState::search_for_missing( PeeringCtxWrapper &ctx) { uint64_t num_unfound_before = missing_loc.num_unfound(); + pg_info_t tinfo(oinfo); + tinfo.pgid.shard = pg_whoami.shard; + // add partial write from our info + tinfo.partial_writes_last_complete = info.partial_writes_last_complete; + tinfo.partial_writes_last_complete_epoch = info.partial_writes_last_complete_epoch; + if (info.partial_writes_last_complete.contains(from.shard)) { + apply_pwlc(info.partial_writes_last_complete[from.shard], from, tinfo, AFTER_ACTIVATE); + } bool found_missing = missing_loc.add_source_info( - from, oinfo, omissing, ctx.handle); + from, tinfo, omissing, ctx.handle); if (found_missing && num_unfound_before != missing_loc.num_unfound()) pl->publish_stats_to_osd(); // avoid doing this if the peer is empty. This is abit of paranoia @@ -2765,14 +2778,6 @@ bool PeeringState::search_for_missing( // last_update=0'0 that's impossible.) 
if (found_missing && oinfo.last_update != eversion_t()) { - pg_info_t tinfo(oinfo); - tinfo.pgid.shard = pg_whoami.shard; - // add partial write from our info - tinfo.partial_writes_last_complete = info.partial_writes_last_complete; - tinfo.partial_writes_last_complete_epoch = info.partial_writes_last_complete_epoch; - if (info.partial_writes_last_complete.contains(from.shard)) { - apply_pwlc(info.partial_writes_last_complete[from.shard], from, tinfo); - } if (!tinfo.partial_writes_last_complete.empty()) { psdout(20) << "sending info to " << from << " pwlc=e" << tinfo.partial_writes_last_complete_epoch @@ -3195,6 +3200,10 @@ void PeeringState::activate( ceph_assert(pi_it != peer_info.end()); auto pm_it = peer_missing.find(*i); ceph_assert(pm_it != peer_missing.end()); + if (info.partial_writes_last_complete.contains(i->shard)) { + apply_pwlc(info.partial_writes_last_complete[i->shard], *i, + pi_it->second, AFTER_ACTIVATE); + } missing_loc.add_source_info( *i, pi_it->second, @@ -3372,7 +3381,7 @@ void PeeringState::proc_master_log( if (info.partial_writes_last_complete.contains(from.shard)) { apply_pwlc(info.partial_writes_last_complete[from.shard], from, oinfo, - &olog); + BEFORE_ACTIVATE, &olog); } bool invalidate_stats = false; @@ -3563,12 +3572,12 @@ void PeeringState::proc_replica_log( if (info.partial_writes_last_complete.contains(from.shard)) { apply_pwlc(info.partial_writes_last_complete[from.shard], from, oinfo, - &olog); + BEFORE_ACTIVATE, &olog); } pg_log.proc_replica_log(oinfo, olog, omissing, from, pool.info.allows_ecoptimizations()); peer_info[from] = oinfo; - update_peer_info(from, oinfo); + update_peer_info(from, oinfo, BEFORE_ACTIVATE); psdout(10) << " peer osd." 
<< from << " now " << oinfo << " " << omissing << dendl; might_have_unfound.insert(from); @@ -7074,10 +7083,10 @@ boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& loge ObjectStore::Transaction &t = context().get_cur_transaction(); if (msg->info.partial_writes_last_complete.contains(ps->pg_whoami.shard)) { ps->apply_pwlc(msg->info.partial_writes_last_complete[ps->pg_whoami.shard], - ps->pg_whoami, ps->info, &ps->pg_log); + ps->pg_whoami, ps->info, AFTER_ACTIVATE, &ps->pg_log); } ps->merge_log(t, logevt.msg->info, std::move(logevt.msg->log), logevt.from); - ps->update_peer_info(logevt.from, logevt.msg->info); + ps->update_peer_info(logevt.from, logevt.msg->info, AFTER_ACTIVATE); ceph_assert(ps->pg_log.get_head() == ps->info.last_update); if (logevt.msg->lease) { ps->proc_lease(*logevt.msg->lease); @@ -7199,10 +7208,10 @@ boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt) } else { if (msg->info.partial_writes_last_complete.contains(ps->pg_whoami.shard)) { ps->apply_pwlc(msg->info.partial_writes_last_complete[ps->pg_whoami.shard], - ps->pg_whoami, ps->info, &ps->pg_log); + ps->pg_whoami, ps->info, AFTER_ACTIVATE, &ps->pg_log); } ps->merge_log(t, msg->info, std::move(msg->log), logevt.from); - ps->update_peer_info(logevt.from, msg->info); + ps->update_peer_info(logevt.from, msg->info, AFTER_ACTIVATE); } if (logevt.msg->lease) { ps->proc_lease(*logevt.msg->lease); @@ -7262,7 +7271,7 @@ boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt) // Log must be consistent with info ceph_assert(ps->pg_log.get_head() == ps->info.last_update); // Update pwlc - ps->update_peer_info(infoevt.from, infoevt.info); + ps->update_peer_info(infoevt.from, infoevt.info, AFTER_ACTIVATE); post_event(Activate(infoevt.info.last_epoch_started)); return transit(); } @@ -8113,7 +8122,7 @@ boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt) psdout(10) << "Noting missing from osd." 
<< logevt.from << dendl; ps->peer_missing[logevt.from].claim(std::move(logevt.msg->missing)); ps->peer_info[logevt.from] = logevt.msg->info; - ps->update_peer_info(logevt.from, logevt.msg->info); + ps->update_peer_info(logevt.from, logevt.msg->info, BEFORE_ACTIVATE); return discard_event(); } diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h index 226ba4281dc..2774244e581 100644 --- a/src/osd/PeeringState.h +++ b/src/osd/PeeringState.h @@ -1589,26 +1589,35 @@ public: void update_heartbeat_peers(); void query_unfound(Formatter *f, std::string state); + + enum peering_stage { + BEFORE_ACTIVATE, + AFTER_ACTIVATE + }; + void apply_pwlc(const std::pair pwlc, const pg_shard_t &shard, pg_info_t &info, + peering_stage stage, pg_log_t *log1, PGLog *log2); void apply_pwlc(const std::pair pwlc, const pg_shard_t &shard, pg_info_t &info, + peering_stage stage, pg_log_t *log) { - apply_pwlc(pwlc, shard, info, log, nullptr); + apply_pwlc(pwlc, shard, info, stage, log, nullptr); } void apply_pwlc(const std::pair pwlc, const pg_shard_t &shard, pg_info_t &info, + peering_stage stage, PGLog *log = nullptr) { - apply_pwlc(pwlc, shard, info, nullptr, log); + apply_pwlc(pwlc, shard, info, stage, nullptr, log); } - void update_peer_info(const pg_shard_t &from, const pg_info_t &oinfo); + void update_peer_info(const pg_shard_t &from, const pg_info_t &oinfo, peering_stage stage); bool proc_replica_notify(const pg_shard_t &from, const pg_notify_t ¬ify); void remove_down_peer_info(const OSDMapRef &osdmap); void check_recovery_sources(const OSDMapRef& map);