From 56fbf22db3b393fdcf69442b23fae7f694fdef89 Mon Sep 17 00:00:00 2001 From: Bill Scales Date: Fri, 1 Aug 2025 10:39:16 +0100 Subject: [PATCH] osd: Optimized EC choose_async_recovery_ec must use auth_shard Optimized EC pools modify how GetLog and choose_acting work, if the auth_shard is a non-primary shard and the (new) primary is behind the auth_shard then we cannot just get the log from the non-primary shard because it will be missing entries for partial writes. Instead we need to get the log from a shard that has the full log first and then repeat GetLog to get the log from the auth_shard. choose_acting was modifying auth_shard in the case where we need to get the log from another shard first. This is wrong - the remainder of the logic in choose_acting and in particular choose_async_recovery_ec needs to use the auth_shard to calculate what the acting set will be. Using a different shard occasional can cause a different acting set to be selected (because of thresholds about the number of log entries behind a shard needs to be to perform async recovery) and this can lead to two shards flip/flopping with different opinions about what the acting set should be. Fix is to separate out which shard will be returned to GetLog from the auth_shard which will be used for acting set calculations. Signed-off-by: Bill Scales (cherry picked from commit 3c2161ee7350a05e0d81a23ce24cd0712dfef5fb) --- src/osd/PeeringState.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index eec4653a350..33a7d33170b 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -2503,6 +2503,7 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id, auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting, false, history_les_bound); + auto get_log_shard = auth_log_shard; if ((repeat_getlog != nullptr) && auth_log_shard != all_info.end() && @@ -2516,16 +2517,16 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id, // has to be rolled backwards psdout(10) << "auth_log_shard " << auth_log_shard->first << " is ahead but is a non_primary shard" << dendl; - auth_log_shard = find_best_info(all_info, restrict_to_up_acting, + get_log_shard = find_best_info(all_info, restrict_to_up_acting, true, history_les_bound); - if (auth_log_shard != all_info.end()) { + if (get_log_shard != all_info.end()) { psdout(10) << "auth_log_shard " << auth_log_shard->first << " selected instead" << dendl; *repeat_getlog = true; } } - if (auth_log_shard == all_info.end()) { + if (get_log_shard == all_info.end()) { if (up != acting) { psdout(10) << "no suitable info found (incomplete backfills?)," << " reverting to up" << dendl; @@ -2540,7 +2541,7 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id, } ceph_assert(!auth_log_shard->second.is_incomplete()); - auth_log_shard_id = auth_log_shard->first; + auth_log_shard_id = get_log_shard->first; set want_backfill, want_acting_backfill; vector want; -- 2.39.5