]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd: Optimized EC missing list not updated on recovering shard
authorBill Scales <bill_scales@uk.ibm.com>
Wed, 1 Oct 2025 14:52:23 +0000 (15:52 +0100)
committerAlex Ainscow <aainscow@uk.ibm.com>
Tue, 21 Oct 2025 13:34:54 +0000 (14:34 +0100)
Shards that are recovering (last_complete != last_update) that
skip transactions and log entries because of partial writes are
using pwlc to advance last_update. However simply incrementing
last_update is not sufficient - there are scenarios where the
needed version of a missing object has to be updated.

If the shard is already missing object X at version V1 and there was
a partial write at V2 that did not update the shard, it does not need
to retain the log entry, but it does need to update the missing list
to say it needs V2 rather than V1. This ensures all shards report
a need for an object at the same version and avoids an assert in
MissingLoc::add_active_missing when the primary is trying to
combine the missing lists from all the shards to work out what has
to be recovered. Avoiding applying pwlc during the early phase
of the peering process ensures the missing list gets updated.

However if a shard is not missing object X and there was a partial
write at V2 that did not update the shard then at the end of peering
it is still necessary to advance last_upadte by applying pwlc. This
ensures that in later peering cycles the code does not change its
mind and think the shard is now missing object X.

The fix is to be more sophisticated about when pwlc can be used
to advance last_update for a recovering shard. The code now
passes in a parameter indicating whether we are in the early
(pre activate) or later phase of peering. This also means that
additional calls to apply_pwlc are needed when peering gets to
activating and is searching for missing to make updates that were
not made earlier.

Fixes: https://tracker.ceph.com/issues/73249
Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/osd/PeeringState.cc
src/osd/PeeringState.h

index 698b44ed5fa38cedc193fa8d925e4e0d16bc356b..fef041f5eec3d1dbdfb318cafd74c307bb002fab 100644 (file)
@@ -324,6 +324,7 @@ void PeeringState::query_unfound(Formatter *f, string state)
 void PeeringState::apply_pwlc(const std::pair<eversion_t, eversion_t> pwlc,
                              const pg_shard_t &shard,
                              pg_info_t &info,
+                             peering_stage stage,
                              pg_log_t *log1,
                              PGLog *log2)
 {
@@ -331,7 +332,9 @@ void PeeringState::apply_pwlc(const std::pair<eversion_t, eversion_t> pwlc,
   // knowledge of partial_writes
   const auto & [fromversion, toversion] = pwlc;
   if (toversion > info.last_update) {
-    if (fromversion <= info.last_update) {
+    if ((fromversion <= info.last_update) &&
+       ((stage == AFTER_ACTIVATE) ||
+        (info.last_complete == info.last_update))) {
       if (info.last_complete == info.last_update) {
        psdout(10) << "osd." << shard << " has last_complete"
                   << "=last_update " << info.last_update
@@ -364,7 +367,8 @@ void PeeringState::apply_pwlc(const std::pair<eversion_t, eversion_t> pwlc,
 }
 
 void PeeringState::update_peer_info(const pg_shard_t &from,
-                                   const pg_info_t &oinfo)
+                                   const pg_info_t &oinfo,
+                                   peering_stage stage)
 {
   // Merge pwlc information from another shard into
   // info.partial_writes_last_complete keeping the newest
@@ -423,14 +427,15 @@ void PeeringState::update_peer_info(const pg_shard_t &from,
   if (is_primary()) {
     for (auto & [shard, peer] : peer_info) {
       if (info.partial_writes_last_complete.contains(shard.shard)) {
-       apply_pwlc(info.partial_writes_last_complete[shard.shard], shard, peer);
+       apply_pwlc(info.partial_writes_last_complete[shard.shard], shard, peer,
+                  stage);
       }
     }
   }
   // Non-primary shards might need to apply pwlc to update info
   if (info.partial_writes_last_complete.contains(pg_whoami.shard)) {
     apply_pwlc(info.partial_writes_last_complete[pg_whoami.shard], pg_whoami,
-              info, &pg_log);
+              info, stage, &pg_log);
   }
 }
 
@@ -457,7 +462,7 @@ bool PeeringState::proc_replica_notify(const pg_shard_t &from, const pg_notify_t
   peer_info[from] = oinfo;
   stats_last_update[from] = oinfo.last_update;
 
-  update_peer_info(from, oinfo);
+  update_peer_info(from, oinfo, BEFORE_ACTIVATE);
   might_have_unfound.insert(from);
 
   update_history(oinfo.history);
@@ -2755,8 +2760,16 @@ bool PeeringState::search_for_missing(
   PeeringCtxWrapper &ctx)
 {
   uint64_t num_unfound_before = missing_loc.num_unfound();
+  pg_info_t tinfo(oinfo);
+  tinfo.pgid.shard = pg_whoami.shard;
+  // add partial write from our info
+  tinfo.partial_writes_last_complete = info.partial_writes_last_complete;
+  tinfo.partial_writes_last_complete_epoch = info.partial_writes_last_complete_epoch;
+  if (info.partial_writes_last_complete.contains(from.shard)) {
+    apply_pwlc(info.partial_writes_last_complete[from.shard], from, tinfo, AFTER_ACTIVATE);
+  }
   bool found_missing = missing_loc.add_source_info(
-    from, oinfo, omissing, ctx.handle);
+    from, tinfo, omissing, ctx.handle);
   if (found_missing && num_unfound_before != missing_loc.num_unfound())
     pl->publish_stats_to_osd();
   // avoid doing this if the peer is empty.  This is abit of paranoia
@@ -2765,14 +2778,6 @@ bool PeeringState::search_for_missing(
   // last_update=0'0 that's impossible.)
   if (found_missing &&
       oinfo.last_update != eversion_t()) {
-    pg_info_t tinfo(oinfo);
-    tinfo.pgid.shard = pg_whoami.shard;
-    // add partial write from our info
-    tinfo.partial_writes_last_complete = info.partial_writes_last_complete;
-    tinfo.partial_writes_last_complete_epoch = info.partial_writes_last_complete_epoch;
-    if (info.partial_writes_last_complete.contains(from.shard)) {
-      apply_pwlc(info.partial_writes_last_complete[from.shard], from, tinfo);
-    }
     if (!tinfo.partial_writes_last_complete.empty()) {
       psdout(20) << "sending info to " << from
                 << " pwlc=e" << tinfo.partial_writes_last_complete_epoch
@@ -3195,6 +3200,10 @@ void PeeringState::activate(
          ceph_assert(pi_it != peer_info.end());
          auto pm_it = peer_missing.find(*i);
          ceph_assert(pm_it != peer_missing.end());
+          if (info.partial_writes_last_complete.contains(i->shard)) {
+           apply_pwlc(info.partial_writes_last_complete[i->shard], *i,
+                      pi_it->second, AFTER_ACTIVATE);
+         }
          missing_loc.add_source_info(
            *i,
            pi_it->second,
@@ -3372,7 +3381,7 @@ void PeeringState::proc_master_log(
 
   if (info.partial_writes_last_complete.contains(from.shard)) {
     apply_pwlc(info.partial_writes_last_complete[from.shard], from, oinfo,
-              &olog);
+              BEFORE_ACTIVATE, &olog);
   }
 
   bool invalidate_stats = false;
@@ -3563,12 +3572,12 @@ void PeeringState::proc_replica_log(
 
   if (info.partial_writes_last_complete.contains(from.shard)) {
     apply_pwlc(info.partial_writes_last_complete[from.shard], from, oinfo,
-              &olog);
+              BEFORE_ACTIVATE, &olog);
   }
   pg_log.proc_replica_log(oinfo, olog, omissing, from, pool.info.allows_ecoptimizations());
 
   peer_info[from] = oinfo;
-  update_peer_info(from, oinfo);
+  update_peer_info(from, oinfo, BEFORE_ACTIVATE);
   psdout(10) << " peer osd." << from << " now "
             << oinfo << " " << omissing << dendl;
   might_have_unfound.insert(from);
@@ -7074,10 +7083,10 @@ boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& loge
   ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
   if (msg->info.partial_writes_last_complete.contains(ps->pg_whoami.shard)) {
     ps->apply_pwlc(msg->info.partial_writes_last_complete[ps->pg_whoami.shard],
-                  ps->pg_whoami, ps->info, &ps->pg_log);
+                  ps->pg_whoami, ps->info, AFTER_ACTIVATE, &ps->pg_log);
   }
   ps->merge_log(t, logevt.msg->info, std::move(logevt.msg->log), logevt.from);
-  ps->update_peer_info(logevt.from, logevt.msg->info);
+  ps->update_peer_info(logevt.from, logevt.msg->info, AFTER_ACTIVATE);
   ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
   if (logevt.msg->lease) {
     ps->proc_lease(*logevt.msg->lease);
@@ -7199,10 +7208,10 @@ boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
   } else {
     if (msg->info.partial_writes_last_complete.contains(ps->pg_whoami.shard)) {
       ps->apply_pwlc(msg->info.partial_writes_last_complete[ps->pg_whoami.shard],
-                    ps->pg_whoami, ps->info, &ps->pg_log);
+                    ps->pg_whoami, ps->info, AFTER_ACTIVATE, &ps->pg_log);
     }
     ps->merge_log(t, msg->info, std::move(msg->log), logevt.from);
-    ps->update_peer_info(logevt.from, msg->info);
+    ps->update_peer_info(logevt.from, msg->info, AFTER_ACTIVATE);
   }
   if (logevt.msg->lease) {
     ps->proc_lease(*logevt.msg->lease);
@@ -7262,7 +7271,7 @@ boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
   // Log must be consistent with info
   ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
   // Update pwlc
-  ps->update_peer_info(infoevt.from, infoevt.info);
+  ps->update_peer_info(infoevt.from, infoevt.info, AFTER_ACTIVATE);
   post_event(Activate(infoevt.info.last_epoch_started));
   return transit<ReplicaActive>();
 }
@@ -8113,7 +8122,7 @@ boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
   psdout(10) << "Noting missing from osd." << logevt.from << dendl;
   ps->peer_missing[logevt.from].claim(std::move(logevt.msg->missing));
   ps->peer_info[logevt.from] = logevt.msg->info;
-  ps->update_peer_info(logevt.from, logevt.msg->info);
+  ps->update_peer_info(logevt.from, logevt.msg->info, BEFORE_ACTIVATE);
   return discard_event();
 }
 
index 226ba4281dca90a15f2fac8d4b30e44aad479170..2774244e58107160f730f5b155e229167293aa3d 100644 (file)
@@ -1589,26 +1589,35 @@ public:
 
   void update_heartbeat_peers();
   void query_unfound(Formatter *f, std::string state);
+
+  enum peering_stage {
+    BEFORE_ACTIVATE,
+    AFTER_ACTIVATE
+  };
+
   void apply_pwlc(const std::pair<eversion_t, eversion_t> pwlc,
                  const pg_shard_t &shard,
                  pg_info_t &info,
+                 peering_stage stage,
                  pg_log_t *log1,
                  PGLog *log2);
   void apply_pwlc(const std::pair<eversion_t, eversion_t> pwlc,
                  const pg_shard_t &shard,
                  pg_info_t &info,
+                 peering_stage stage,
                  pg_log_t *log)
   {
-    apply_pwlc(pwlc, shard, info, log, nullptr);
+    apply_pwlc(pwlc, shard, info, stage, log, nullptr);
   }
   void apply_pwlc(const std::pair<eversion_t, eversion_t> pwlc,
                  const pg_shard_t &shard,
                  pg_info_t &info,
+                 peering_stage stage,
                  PGLog *log = nullptr)
   {
-    apply_pwlc(pwlc, shard, info, nullptr, log);
+    apply_pwlc(pwlc, shard, info, stage, nullptr, log);
   }
-  void update_peer_info(const pg_shard_t &from, const pg_info_t &oinfo);
+  void update_peer_info(const pg_shard_t &from, const pg_info_t &oinfo, peering_stage stage);
   bool proc_replica_notify(const pg_shard_t &from, const pg_notify_t &notify);
   void remove_down_peer_info(const OSDMapRef &osdmap);
   void check_recovery_sources(const OSDMapRef& map);