]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd: Optimized EC choose_async_recovery_ec must use auth_shard
authorBill Scales <bill_scales@uk.ibm.com>
Fri, 1 Aug 2025 09:39:16 +0000 (10:39 +0100)
committerAlex Ainscow <aainscow@uk.ibm.com>
Wed, 17 Sep 2025 08:43:26 +0000 (09:43 +0100)
Optimized EC pools modify how GetLog and choose_acting work,
if the auth_shard is a non-primary shard and the (new) primary
is behind the auth_shard then we cannot just get the log from
the non-primary shard because it will be missing entries for
partial writes. Instead we need to get the log from a shard
that has the full log first and then repeat GetLog to get
the log from the auth_shard.

choose_acting was modifying auth_shard in the case where
we need to get the log from another shard first. This is
wrong - the remainder of the logic in choose_acting and
in particular choose_async_recovery_ec needs to use the
auth_shard to calculate what the acting set will be.
Using a different shard occasional can cause a
different acting set to be selected (because of
thresholds about the number of log entries behind
a shard needs to be to perform async recovery) and
this can lead to two shards flip/flopping with
different opinions about what the acting set should be.

Fix is to separate out which shard will be returned
to GetLog from the auth_shard which will be used
for acting set calculations.

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
(cherry picked from commit 3c2161ee7350a05e0d81a23ce24cd0712dfef5fb)

src/osd/PeeringState.cc

index eec4653a3504b53575760c9c9471ffc02272292e..33a7d33170bb946608a931fc0b198c553ed63059 100644 (file)
@@ -2503,6 +2503,7 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
 
   auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting,
                                       false, history_les_bound);
+  auto get_log_shard = auth_log_shard;
 
   if ((repeat_getlog != nullptr) &&
       auth_log_shard != all_info.end() &&
@@ -2516,16 +2517,16 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
     // has to be rolled backwards
     psdout(10) << "auth_log_shard " << auth_log_shard->first
               << " is ahead but is a non_primary shard" << dendl;
-    auth_log_shard = find_best_info(all_info, restrict_to_up_acting,
+    get_log_shard = find_best_info(all_info, restrict_to_up_acting,
                                    true, history_les_bound);
-    if (auth_log_shard != all_info.end()) {
+    if (get_log_shard != all_info.end()) {
       psdout(10) << "auth_log_shard " << auth_log_shard->first
                 << " selected instead" << dendl;
       *repeat_getlog = true;
     }
   }
 
-  if (auth_log_shard == all_info.end()) {
+  if (get_log_shard == all_info.end()) {
     if (up != acting) {
       psdout(10) << "no suitable info found (incomplete backfills?),"
                 << " reverting to up" << dendl;
@@ -2540,7 +2541,7 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
   }
 
   ceph_assert(!auth_log_shard->second.is_incomplete());
-  auth_log_shard_id = auth_log_shard->first;
+  auth_log_shard_id = get_log_shard->first;
 
   set<pg_shard_t> want_backfill, want_acting_backfill;
   vector<int> want;