git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: EC Optimizations: Backfill changes for partial writes 62710/head
author Bill Scales <bill_scales@uk.ibm.com>
Thu, 3 Apr 2025 11:15:31 +0000 (12:15 +0100)
committer Bill Scales <bill_scales@uk.ibm.com>
Thu, 10 Apr 2025 11:31:37 +0000 (12:31 +0100)
Optimized EC pools support partial writes that do not update every shard.
Consequently, shards that are not updated can have out-of-date version
numbers. The primary shard's object_info_t is always updated and tracks
the expected version of each shard. To avoid unnecessary backfill work,
backfill now uses this extra data in object_info_t when comparing version
numbers to work out whether a shard is missing updates or simply did not
participate in recent partial writes.
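
As a rough illustration of the idea only (not the actual implementation;
Shard, Version, NO_SHARD and expected_versions are illustrative stand-ins
for shard_id_t, eversion_t, shard_id_t::NO_SHARD and the per-object logic
in scan_for_backfill_primary()/scan_range_primary()), the primary derives
one version entry per backfill target from object_info_t, falling back to
a shared default for shards that saw every write:

  // Minimal sketch with stand-in types; assumes oi_version and
  // shard_versions come from the primary's object_info_t.
  #include <map>
  #include <set>
  #include <utility>

  using Shard = int;                              // stands in for shard_id_t
  using Version = std::pair<unsigned, unsigned>;  // stands in for eversion_t
  constexpr Shard NO_SHARD = -1;                  // default entry for unlisted shards

  std::map<Shard, Version> expected_versions(
      const Version& oi_version,                       // object_info_t::version
      const std::map<Shard, Version>& shard_versions,  // object_info_t::shard_versions
      const std::set<Shard>& backfill_targets)
  {
    std::map<Shard, Version> out;
    if (shard_versions.empty()) {
      out[NO_SHARD] = oi_version;   // every shard participated in every write
      return out;
    }
    for (Shard s : backfill_targets) {
      auto it = shard_versions.find(s);
      if (it != shard_versions.end()) {
        out[s] = it->second;        // this shard skipped recent partial writes
      } else {
        out[NO_SHARD] = oi_version; // shared default for fully written shards
      }
    }
    return out;
  }

A shard whose scanned version matches its entry (or the NO_SHARD default
when it has no entry) is up to date and does not need a backfill push.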

See comments in src/osd/recovery_types.h

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
15 files changed:
src/crimson/osd/CMakeLists.txt
src/crimson/osd/backfill_facades.h
src/crimson/osd/backfill_state.cc
src/crimson/osd/backfill_state.h
src/crimson/osd/pg_recovery.cc
src/crimson/osd/recovery_backend.cc
src/crimson/osd/recovery_backend.h
src/osd/CMakeLists.txt
src/osd/PG.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/osd/recovery_types.cc [deleted file]
src/osd/recovery_types.h
src/test/crimson/CMakeLists.txt
src/test/crimson/test_backfill.cc

index a3b6b47d4d44a40a1d0f655e2eda0cadea8f7341..ff78904c8faf0e81212ae7cf37227da08c0421bb 100644 (file)
@@ -55,7 +55,6 @@ add_executable(crimson-osd
   ${PROJECT_SOURCE_DIR}/src/osd/MissingLoc.cc
   ${PROJECT_SOURCE_DIR}/src/osd/PGLog.cc
   ${PROJECT_SOURCE_DIR}/src/osd/SnapMapper.cc
-  ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc
   ${PROJECT_SOURCE_DIR}/src/osd/osd_perf_counters.cc
   ${PROJECT_SOURCE_DIR}/src/mgr/OSDPerfMetricTypes.cc
   watch.cc
index de1cda434b85ca22db128f3df44614eab46c0c57..f98efa5bea669d99cc2b261620ed84ef47367f00 100644 (file)
@@ -62,6 +62,11 @@ struct PeeringFacade final : BackfillState::PeeringFacade {
     const std::vector<pg_shard_t> &peers) override {
     return peering_state.prepare_backfill_for_missing(soid, v, peers);
   }
+
+  const pg_pool_t& get_pool() const override {
+    return peering_state.get_pgpool().info;
+  }
+
   PeeringFacade(PeeringState& peering_state)
     : peering_state(peering_state) {
   }
index 7fea4b7d5842316255f49b6723ca7d5446360fdf..38373929b96b3d3155898b39d31413fc2fbdae89 100644 (file)
@@ -85,7 +85,7 @@ BackfillState::Initial::react(const BackfillState::Triggered& evt)
   ceph_assert(backfill_state().last_backfill_started == \
               peering_state().earliest_backfill());
   ceph_assert(peering_state().is_backfilling());
-  // initialize BackfillIntervals
+  // initialize ReplicaBackfillIntervals
   for (const auto& bt : peering_state().get_backfill_targets()) {
     backfill_state().peer_backfill_info[bt].reset(
       peering_state().get_peer_last_backfill(bt));
@@ -134,13 +134,52 @@ void BackfillState::Enqueuing::maybe_update_range()
          if (e.is_update()) {
            DEBUGDPP("maybe_update_range(lambda): {} updated to ver {}",
              pg(), e.soid, e.version);
-            primary_bi.objects.erase(e.soid);
-            primary_bi.objects.insert(std::make_pair(e.soid,
-                                                             e.version));
+           if (e.written_shards.empty()) {
+             // Log entry updates all shards, replace all entries for e.soid
+             primary_bi.objects.erase(e.soid);
+             primary_bi.objects.insert(
+                          std::make_pair(e.soid,
+                                         std::make_pair(shard_id_t::NO_SHARD,
+                                                        e.version)));
+           } else {
+             // Update backfill interval for shards modified by log entry
+             std::map<shard_id_t,eversion_t> versions;
+             // Create map from existing entries in backfill entry
+             const auto & [begin, end] = primary_bi.objects.equal_range(e.soid);
+             for (const auto & entry : std::ranges::subrange(begin, end)) {
+               const auto & [shard, version] = entry.second;
+               versions[shard] = version;
+             }
+             // Update entries in map that are modified by log entry
+             bool uses_default = false;
+             for (const auto & shard : peering_state().get_backfill_targets()) {
+               if (e.is_written_shard(shard.shard)) {
+                 versions.erase(shard.shard);
+                 uses_default = true;
+               } else {
+                 if (!versions.contains(shard.shard)) {
+                   versions[shard.shard] = e.prior_version;
+                 }
+                 //Else: keep existing version
+               }
+             }
+             if (uses_default) {
+               versions[shard_id_t::NO_SHARD] = e.version;
+             } else {
+               versions.erase(shard_id_t::NO_SHARD);
+             }
+             // Erase and recreate backfill interval for e.soid using map
+             primary_bi.objects.erase(e.soid);
+             for (auto & [shard, version] : versions) {
+               primary_bi.objects.insert(
+                            std::make_pair(e.soid,
+                                           std::make_pair(shard, version)));
+             }
+           }
          } else if (e.is_delete()) {
             DEBUGDPP("maybe_update_range(lambda): {} removed",
              pg(), e.soid);
-            primary_bi.objects.erase(e.soid);
+            primary_bi.objects.erase(e.soid); // Erase all entries for e.soid
           }
         }
       };
@@ -168,8 +207,8 @@ void BackfillState::Enqueuing::trim_backfill_infos()
 
 /* static */ bool BackfillState::Enqueuing::all_enqueued(
   const PeeringFacade& peering_state,
-  const BackfillInterval& backfill_info,
-  const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
+  const PrimaryBackfillInterval& backfill_info,
+  const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info)
 {
   const bool all_local_enqueued = \
     backfill_info.extends_to_end() && backfill_info.empty();
@@ -184,7 +223,8 @@ void BackfillState::Enqueuing::trim_backfill_infos()
 }
 
 hobject_t BackfillState::Enqueuing::earliest_peer_backfill(
-  const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
+  const std::map<pg_shard_t,
+                 ReplicaBackfillInterval>& peer_backfill_info) const
 {
   hobject_t e = hobject_t::get_max();
   for (const pg_shard_t& bt : peering_state().get_backfill_targets()) {
@@ -196,8 +236,8 @@ hobject_t BackfillState::Enqueuing::earliest_peer_backfill(
 }
 
 bool BackfillState::Enqueuing::should_rescan_replicas(
-  const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
-  const BackfillInterval& backfill_info) const
+  const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
+  const PrimaryBackfillInterval& backfill_info) const
 {
   const auto& targets = peering_state().get_backfill_targets();
   return std::any_of(std::begin(targets), std::end(targets),
@@ -208,8 +248,8 @@ bool BackfillState::Enqueuing::should_rescan_replicas(
 }
 
 bool BackfillState::Enqueuing::should_rescan_primary(
-  const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
-  const BackfillInterval& backfill_info) const
+  const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
+  const PrimaryBackfillInterval& backfill_info) const
 {
   return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) &&
         !backfill_info.extends_to_end() && backfill_info.empty();
@@ -218,7 +258,7 @@ bool BackfillState::Enqueuing::should_rescan_primary(
 void BackfillState::Enqueuing::trim_backfilled_object_from_intervals(
   BackfillState::Enqueuing::result_t&& result,
   hobject_t& last_backfill_started,
-  std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
+  std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info)
 {
   std::for_each(std::begin(result.pbi_targets), std::end(result.pbi_targets),
     [&peer_backfill_info] (const auto& bt) {
@@ -257,13 +297,28 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
   result_t result { {}, primary_bi.begin };
   std::map<hobject_t, std::pair<eversion_t, std::vector<pg_shard_t>>> backfills;
 
+  std::map<shard_id_t,eversion_t> versions;
+  auto it = primary_bi.objects.begin();
+  const hobject_t& hoid = it->first;
+  eversion_t obj_v;
+  while (it != primary_bi.objects.end() && it->first == hoid) {
+    obj_v = std::max(obj_v, it->second.second);
+    versions[it->second.first] = it->second.second;
+    ++it;
+  }
+
   for (const auto& bt : peering_state().get_backfill_targets()) {
     const auto& peer_bi = backfill_state().peer_backfill_info.at(bt);
 
     // Find all check peers that have the wrong version
-    if (const eversion_t& obj_v = primary_bi.objects.begin()->second;
-        check == primary_bi.begin && check == peer_bi.begin) {
-      if (peer_bi.objects.begin()->second != obj_v) {
+    if (check == primary_bi.begin && check == peer_bi.begin) {
+      eversion_t replicaobj_v;
+      if (versions.contains(bt.shard)) {
+       replicaobj_v = versions.at(bt.shard);
+      } else {
+       replicaobj_v = versions.at(shard_id_t::NO_SHARD);
+      }
+      if (peer_bi.objects.begin()->second != replicaobj_v) {
        std::ignore = backfill_state().progress_tracker->enqueue_push(
          primary_bi.begin);
        auto &[v, peers] = backfills[primary_bi.begin];
@@ -298,8 +353,9 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
 }
 
 bool BackfillState::Enqueuing::Enqueuing::all_emptied(
-  const BackfillInterval& local_backfill_info,
-  const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
+  const PrimaryBackfillInterval& local_backfill_info,
+  const std::map<pg_shard_t,
+                 ReplicaBackfillInterval>& peer_backfill_info) const
 {
   const auto& targets = peering_state().get_backfill_targets();
   const auto replicas_emptied =
@@ -459,8 +515,8 @@ BackfillState::PrimaryScanning::react(ObjectPushed evt)
 
 // -- ReplicasScanning
 bool BackfillState::ReplicasScanning::replica_needs_scan(
-  const BackfillInterval& replica_backfill_info,
-  const BackfillInterval& local_backfill_info)
+  const ReplicaBackfillInterval& replica_backfill_info,
+  const PrimaryBackfillInterval& local_backfill_info)
 {
   return replica_backfill_info.empty() && \
          replica_backfill_info.begin <= local_backfill_info.begin && \
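
The written_shards handling added to maybe_update_range() above (and the
matching change in PrimaryLogPG::update_range() further down) boils down to
merging one log entry into the per-shard version map kept for an object. A
hedged sketch with the same stand-in types as the example in the commit
message (Shard, Version and NO_SHARD are illustrative, not the Ceph types):

  // Sketch of the per-shard version merge applied for one log entry.
  #include <map>
  #include <set>
  #include <utility>

  using Shard = int;
  using Version = std::pair<unsigned, unsigned>;
  constexpr Shard NO_SHARD = -1;

  void apply_log_entry(std::map<Shard, Version>& versions,  // entries for one object
                       const std::set<Shard>& written_shards,
                       const std::set<Shard>& backfill_targets,
                       const Version& prior_version,
                       const Version& version)
  {
    if (written_shards.empty()) {     // full write: collapse to one default entry
      versions.clear();
      versions[NO_SHARD] = version;
      return;
    }
    bool uses_default = false;
    for (Shard s : backfill_targets) {
      if (written_shards.count(s)) {
        versions.erase(s);            // written shard moves to the new default
        uses_default = true;
      } else if (!versions.count(s)) {
        versions[s] = prior_version;  // untouched shard stays at its prior version
      }                               // else: keep the existing entry
    }
    if (uses_default) {
      versions[NO_SHARD] = version;
    } else {
      versions.erase(NO_SHARD);
    }
  }

Written shards collapse into a single NO_SHARD entry at the new version,
while backfill targets the entry skipped keep (or gain) an explicit entry
at their older version.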
index 75129d39745014cbf4ac597259f43f4c97a2b39b..c295dd9c460c3631414418e11b6f1dc4a8e00f69 100644 (file)
@@ -15,6 +15,7 @@
 
 #include "osd/recovery_types.h"
 #include "osd/PGLog.h"
+#include "osd/PeeringState.h"
 
 namespace crimson::osd {
 
@@ -27,16 +28,16 @@ struct BackfillState {
 
   // events comes first
   struct PrimaryScanned : sc::event<PrimaryScanned> {
-    BackfillInterval result;
-    PrimaryScanned(BackfillInterval&& result)
+    PrimaryBackfillInterval result;
+    PrimaryScanned(PrimaryBackfillInterval&& result)
       : result(std::move(result)) {
     }
   };
 
   struct ReplicaScanned : sc::event<ReplicaScanned> {
     pg_shard_t from;
-    BackfillInterval result;
-    ReplicaScanned(pg_shard_t from, BackfillInterval&& result)
+    ReplicaBackfillInterval result;
+    ReplicaScanned(pg_shard_t from, ReplicaBackfillInterval&& result)
       : from(std::move(from)),
         result(std::move(result)) {
     }
@@ -166,8 +167,8 @@ public:
     // completed yet.
     static bool all_enqueued(
       const PeeringFacade& peering_state,
-      const BackfillInterval& backfill_info,
-      const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+      const PrimaryBackfillInterval& backfill_info,
+      const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info);
 
   private:
     void maybe_update_range();
@@ -176,25 +177,27 @@ public:
     // these methods take BackfillIntervals instead of extracting them from
     // the state to emphasize the relationships across the main loop.
     bool all_emptied(
-      const BackfillInterval& local_backfill_info,
-      const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+      const PrimaryBackfillInterval& local_backfill_info,
+      const std::map<pg_shard_t,
+                     ReplicaBackfillInterval>& peer_backfill_info) const;
     hobject_t earliest_peer_backfill(
-      const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+      const std::map<pg_shard_t,
+                     ReplicaBackfillInterval>& peer_backfill_info) const;
     bool should_rescan_replicas(
-      const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
-      const BackfillInterval& backfill_info) const;
+      const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
+      const PrimaryBackfillInterval& backfill_info) const;
     // indicate whether a particular acting primary needs to scanned again
     // to process next piece of the hobject_t's namespace.
     // the logic is per analogy to replica_needs_scan(). See comments there.
     bool should_rescan_primary(
-      const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
-      const BackfillInterval& backfill_info) const;
+      const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
+      const PrimaryBackfillInterval& backfill_info) const;
 
     // the result_t is intermediary between {remove,update}_on_peers() and
-    // updating BackfillIntervals in trim_backfilled_object_from_intervals.
-    // This step is important because it affects the main loop's condition,
-    // and thus deserves to be exposed instead of being called deeply from
-    // {remove,update}_on_peers().
+    // updating ReplicaBackfillIntervals in
+    // trim_backfilled_object_from_intervals. This step is important
+    // because it affects the main loop's condition, and thus deserves to be
+    // exposed instead of being called deeply from {remove,update}_on_peers().
     struct [[nodiscard]] result_t {
       std::set<pg_shard_t> pbi_targets;
       hobject_t new_last_backfill_started;
@@ -202,7 +205,7 @@ public:
     void trim_backfilled_object_from_intervals(
       result_t&&,
       hobject_t& last_backfill_started,
-      std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+      std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info);
     result_t remove_on_peers(const hobject_t& check);
     result_t update_on_peers(const hobject_t& check);
   };
@@ -242,12 +245,12 @@ public:
     sc::result react(Triggered);
 
     // indicate whether a particular peer should be scanned to retrieve
-    // BackfillInterval for new range of hobject_t namespace.
+    // ReplicaBackfillInterval for new range of hobject_t namespace.
     // true when bi.objects is exhausted, replica bi's end is not MAX,
     // and primary bi'begin is further than the replica's one.
     static bool replica_needs_scan(
-      const BackfillInterval& replica_backfill_info,
-      const BackfillInterval& local_backfill_info);
+      const ReplicaBackfillInterval& replica_backfill_info,
+      const PrimaryBackfillInterval& local_backfill_info);
 
   private:
     std::set<pg_shard_t> waiting_on_backfill;
@@ -339,8 +342,8 @@ private:
     backfill_suspend_state.should_go_enqueuing = true;
   }
   hobject_t last_backfill_started;
-  BackfillInterval backfill_info;
-  std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
+  PrimaryBackfillInterval backfill_info;
+  std::map<pg_shard_t, ReplicaBackfillInterval> peer_backfill_info;
   BackfillMachine backfill_machine;
   std::unique_ptr<ProgressTracker> progress_tracker;
   size_t replicas_in_backfill = 0;
@@ -408,6 +411,7 @@ struct BackfillState::PeeringFacade {
     const hobject_t &soid,
     const eversion_t &v,
     const std::vector<pg_shard_t> &peers) = 0;
+  virtual const pg_pool_t& get_pool() const = 0;
   virtual ~PeeringFacade() {}
 };
 
index f219411acda9d660faf52ec584413ab7549e35c0..39afa0383ac3910b452822db09683d78cf3ef3eb 100644 (file)
@@ -508,11 +508,12 @@ void PGRecovery::request_primary_scan(
 {
   logger().debug("{}", __func__);
   using crimson::common::local_conf;
-  std::ignore = pg->get_recovery_backend()->scan_for_backfill(
+  std::ignore = pg->get_recovery_backend()->scan_for_backfill_primary(
     begin,
     local_conf()->osd_backfill_scan_min,
-    local_conf()->osd_backfill_scan_max
-  ).then_interruptible([this] (BackfillInterval bi) {
+    local_conf()->osd_backfill_scan_max,
+    pg->get_peering_state().get_backfill_targets()
+  ).then_interruptible([this] (PrimaryBackfillInterval bi) {
     logger().debug("request_primary_scan:{}", __func__);
     using BackfillState = crimson::osd::BackfillState;
     backfill_state->process_event(
index b0dbcfba9839fb089c80fb5762ab2e8f4ddf2d7f..81174e2ba471f7f7658d4cea596640e9e0bf71a7 100644 (file)
@@ -222,17 +222,81 @@ RecoveryBackend::handle_backfill_remove(
       pg.get_collection_ref(), std::move(t)).or_terminate());
 }
 
-RecoveryBackend::interruptible_future<BackfillInterval>
-RecoveryBackend::scan_for_backfill(
+RecoveryBackend::interruptible_future<PrimaryBackfillInterval>
+RecoveryBackend::scan_for_backfill_primary(
+  const hobject_t start,
+  [[maybe_unused]] const std::int64_t min,
+  const std::int64_t max,
+  const std::set<pg_shard_t> &backfill_targets)
+{
+  LOG_PREFIX(RecoveryBackend::scan_for_backfill_primary);
+  DEBUGDPP("starting from {}", pg, start);
+  auto version_map = seastar::make_lw_shared<std::multimap<hobject_t,
+                      std::pair<shard_id_t,eversion_t>>>();
+  auto&& [objects, next] = co_await backend->list_objects(start, max);
+  co_await interruptor::parallel_for_each(objects,
+    seastar::coroutine::lambda([FNAME, this, version_map, backfill_targets]
+    (const hobject_t& object) -> interruptible_future<> {
+    DEBUGDPP("querying obj:{}", pg, object);
+    auto obc_manager = pg.obc_loader.get_obc_manager(object);
+    co_await pg.obc_loader.load_and_lock(
+      obc_manager, RWState::RWREAD
+    ).handle_error_interruptible(
+      crimson::ct_error::assert_all("unexpected error")
+    );
+
+    if (obc_manager.get_obc()->obs.exists) {
+      auto version = obc_manager.get_obc()->obs.oi.version;
+      auto shard_versions = obc_manager.get_obc()->obs.oi.shard_versions;
+      if (shard_versions.empty()) {
+       version_map->emplace(object, std::make_pair(shard_id_t::NO_SHARD,
+                                                   version));
+      } else {
+       bool added_default = false;
+       for (auto & shard: backfill_targets) {
+         if (shard_versions.contains(shard.shard)) {
+           version = shard_versions.at(shard.shard);
+           version_map->emplace(object, std::make_pair(shard.shard, version));
+         } else if (!added_default) {
+           version_map->emplace(object, std::make_pair(shard_id_t::NO_SHARD,
+                                                       version));
+           added_default = true;
+         }
+       }
+      }
+      DEBUGDPP("found: {}  {}", pg,
+               object, version);
+      co_return;
+    } else {
+      // if the object does not exist here, it must have been removed
+      // between the collection_list_partial and here.  This can happen
+      // for the first item in the range, which is usually last_backfill.
+      co_return;
+    }
+  }));
+  PrimaryBackfillInterval bi;
+  bi.begin = std::move(start);
+  bi.end = std::move(next);
+  bi.objects = std::move(*version_map);
+  DEBUGDPP("{} PrimaryBackfillInterval filled, leaving, {}",
+           "scan_for_backfill_primary",
+           pg, bi);
+  co_return std::move(bi);
+}
+
+RecoveryBackend::interruptible_future<ReplicaBackfillInterval>
+RecoveryBackend::scan_for_backfill_replica(
   const hobject_t start,
   [[maybe_unused]] const std::int64_t min,
   const std::int64_t max)
 {
-  LOG_PREFIX(RecoveryBackend::scan_for_backfill);
+  LOG_PREFIX(RecoveryBackend::scan_for_backfill_replica);
   DEBUGDPP("starting from {}", pg, start);
-  auto version_map = seastar::make_lw_shared<std::map<hobject_t, eversion_t>>();
+  auto version_map = seastar::make_lw_shared<std::map<hobject_t,
+                                                     eversion_t>>();
   auto&& [objects, next] = co_await backend->list_objects(start, max);
-  co_await interruptor::parallel_for_each(objects, seastar::coroutine::lambda([FNAME, this, version_map]
+  co_await interruptor::parallel_for_each(objects,
+    seastar::coroutine::lambda([FNAME, this, version_map]
     (const hobject_t& object) -> interruptible_future<> {
     DEBUGDPP("querying obj:{}", pg, object);
     auto obc_manager = pg.obc_loader.get_obc_manager(object);
@@ -255,12 +319,12 @@ RecoveryBackend::scan_for_backfill(
       co_return;
     }
   }));
-  BackfillInterval bi;
+  ReplicaBackfillInterval bi;
   bi.begin = std::move(start);
   bi.end = std::move(next);
   bi.objects = std::move(*version_map);
-  DEBUGDPP("{} BackfillInterval filled, leaving, {}",
-           "scan_for_backfill",
+  DEBUGDPP("{} ReplicaBackfillInterval filled, leaving, {}",
+           "scan_for_backfill_replica",
            pg, bi);
   co_return std::move(bi);
 }
@@ -283,7 +347,7 @@ RecoveryBackend::handle_scan_get_digest(
       PeeringState::BackfillTooFull());
     return seastar::now();
   }
-  return scan_for_backfill(
+  return scan_for_backfill_replica(
     std::move(m.begin),
     crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_min"),
     crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_max")
@@ -312,7 +376,7 @@ RecoveryBackend::handle_scan_digest(
   // Check that from is in backfill_targets vector
   ceph_assert(pg.is_backfill_target(m.from));
 
-  BackfillInterval bi;
+  ReplicaBackfillInterval bi;
   bi.begin = m.begin;
   bi.end = m.end;
   {
index 818e85f67b1d9c282b0f39d64edaa104c36951e1..adffc0b9ac42e283b4d022de258a3b1f2a215cec 100644 (file)
@@ -89,7 +89,13 @@ public:
     const hobject_t& soid,
     eversion_t need) = 0;
 
-  interruptible_future<BackfillInterval> scan_for_backfill(
+  interruptible_future<PrimaryBackfillInterval> scan_for_backfill_primary(
+    const hobject_t from,
+    std::int64_t min,
+    std::int64_t max,
+    const std::set<pg_shard_t> &backfill_targets);
+
+  interruptible_future<ReplicaBackfillInterval> scan_for_backfill_replica(
     const hobject_t from,
     std::int64_t min,
     std::int64_t max);
index e7f579f38410cbf0a095955012c982e847e7d825..5d302209e16ee620fe97431d6610a1f4b96c06e5 100644 (file)
@@ -38,7 +38,6 @@ set(osd_srcs
   scheduler/mClockScheduler.cc
   PeeringState.cc
   PGStateUtils.cc
-  recovery_types.cc
   MissingLoc.cc
   osd_perf_counters.cc
   ECCommonL.cc
index d27d8196ad3467a942a5e1ddd29248e67da72a98..785614f1b35758fc6f8a939592bb514ed6de2fae 100644 (file)
@@ -867,8 +867,8 @@ protected:
   std::set<int> probe_targets;
 
 protected:
-  BackfillInterval backfill_info;
-  std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
+  PrimaryBackfillInterval backfill_info;
+  std::map<pg_shard_t, ReplicaBackfillInterval> peer_backfill_info;
   bool backfill_reserving;
 
   // The primary's num_bytes and local num_bytes for this pg, only valid
index 1340ec88679a7f6379fcc3588dbe480c2a3d5d09..b1d88bcf4220a241df7f0d2fc49013da195a1d84 100644 (file)
@@ -4508,11 +4508,11 @@ void PrimaryLogPG::do_scan(
        return;
       }
 
-      BackfillInterval bi;
+      ReplicaBackfillInterval bi;
       bi.begin = m->begin;
       // No need to flush, there won't be any in progress writes occuring
       // past m->begin
-      scan_range(
+      scan_range_replica(
        cct->_conf->osd_backfill_scan_min,
        cct->_conf->osd_backfill_scan_max,
        &bi,
@@ -4534,7 +4534,7 @@ void PrimaryLogPG::do_scan(
       // Check that from is in backfill_targets vector
       ceph_assert(is_backfill_target(from));
 
-      BackfillInterval& bi = peer_backfill_info[from];
+      ReplicaBackfillInterval& bi = peer_backfill_info[from];
       bi.begin = m->begin;
       bi.end = m->end;
       auto p = m->get_data().cbegin();
@@ -13873,7 +13873,7 @@ bool PrimaryLogPG::all_peer_done() const
   for (const pg_shard_t& bt : get_backfill_targets()) {
     const auto piter = peer_backfill_info.find(bt);
     ceph_assert(piter != peer_backfill_info.end());
-    const BackfillInterval& pbi = piter->second;
+    const ReplicaBackfillInterval& pbi = piter->second;
     // See if peer has more to process
     if (!pbi.extends_to_end() || !pbi.empty())
        return false;
@@ -13986,7 +13986,7 @@ uint64_t PrimaryLogPG::recover_backfill(
         i != get_backfill_targets().end();
         ++i) {
       pg_shard_t bt = *i;
-      BackfillInterval& pbi = peer_backfill_info[bt];
+      ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
 
       dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
       if (pbi.begin <= backfill_info.begin &&
@@ -14033,7 +14033,7 @@ uint64_t PrimaryLogPG::recover_backfill(
           i != get_backfill_targets().end();
           ++i) {
         pg_shard_t bt = *i;
-        BackfillInterval& pbi = peer_backfill_info[bt];
+        ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
         if (pbi.begin == check)
           check_targets.insert(bt);
       }
@@ -14045,7 +14045,7 @@ uint64_t PrimaryLogPG::recover_backfill(
           i != check_targets.end();
           ++i) {
         pg_shard_t bt = *i;
-        BackfillInterval& pbi = peer_backfill_info[bt];
+        ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
         ceph_assert(pbi.begin == check);
 
         to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
@@ -14059,17 +14059,31 @@ uint64_t PrimaryLogPG::recover_backfill(
       // and we can't increment ops without requeueing ourself
       // for recovery.
     } else {
-      eversion_t& obj_v = backfill_info.objects.begin()->second;
-
+      // Unpack versions for the object being backfilled
+      auto it = backfill_info.objects.begin();
+      const hobject_t& hoid = it->first;
+      eversion_t obj_v;
+      std::map<shard_id_t,eversion_t> versions;
+      while (it != backfill_info.objects.end() && it->first == hoid) {
+       obj_v = std::max(obj_v, it->second.second);
+       versions[it->second.first] = it->second.second;
+       ++it;
+      }
       vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
       for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
           i != get_backfill_targets().end();
           ++i) {
        pg_shard_t bt = *i;
-       BackfillInterval& pbi = peer_backfill_info[bt];
+       ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
         // Find all check peers that have the wrong version
        if (check == backfill_info.begin && check == pbi.begin) {
-         if (pbi.objects.begin()->second != obj_v) {
+         eversion_t replicaobj_v;
+         if (versions.contains(bt.shard)) {
+           replicaobj_v = versions.at(bt.shard);
+         } else {
+           replicaobj_v = versions.at(shard_id_t::NO_SHARD);
+         }
+         if (pbi.objects.begin()->second != replicaobj_v) {
            need_ver_targs.push_back(bt);
          } else {
            keep_ver_targs.push_back(bt);
@@ -14141,7 +14155,7 @@ uint64_t PrimaryLogPG::recover_backfill(
           i != check_targets.end();
           ++i) {
         pg_shard_t bt = *i;
-        BackfillInterval& pbi = peer_backfill_info[bt];
+        ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
         pbi.pop_front();
       }
     }
@@ -14309,17 +14323,18 @@ int PrimaryLogPG::prep_backfill_object_push(
 }
 
 void PrimaryLogPG::update_range(
-  BackfillInterval *bi,
+  PrimaryBackfillInterval *bi,
   ThreadPool::TPHandle &handle)
 {
   int local_min = cct->_conf->osd_backfill_scan_min;
   int local_max = cct->_conf->osd_backfill_scan_max;
+  const std::set<pg_shard_t>& backfill_targets = get_backfill_targets();
 
   if (bi->version < info.log_tail) {
     dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
             << dendl;
     bi->version = info.last_update;
-    scan_range(local_min, local_max, bi, handle);
+    scan_range_primary(local_min, local_max, bi, handle, backfill_targets);
   }
 
   if (bi->version >= projected_last_update) {
@@ -14350,14 +14365,48 @@ void PrimaryLogPG::update_range(
        if (e.is_update()) {
          dout(10) << __func__ << ": " << e.soid << " updated to version "
                   << e.version << dendl;
-         bi->objects.erase(e.soid);
-         bi->objects.insert(
-           make_pair(
-             e.soid,
-             e.version));
+         if (e.written_shards.empty()) {
+           // Log entry updates all shards, replace all entries for e.soid
+           bi->objects.erase(e.soid);
+           bi->objects.insert(make_pair(e.soid,
+                                        make_pair(shard_id_t::NO_SHARD,
+                                                  e.version)));
+         } else {
+           // Update backfill interval for shards modified by log entry
+           std::map<shard_id_t,eversion_t> versions;
+           // Create map from existing entries in backfill entry
+           const auto & [begin, end] = bi->objects.equal_range(e.soid);
+           for (const auto & entry : std::ranges::subrange(begin, end)) {
+             const auto & [shard, version] = entry.second;
+             versions[shard] = version;
+           }
+           // Update entries in map that are modified by log entry
+           bool uses_default = false;
+           for (const auto & shard : backfill_targets) {
+             if (e.is_written_shard(shard.shard)) {
+               versions.erase(shard.shard);
+               uses_default = true;
+             } else {
+               if (!versions.contains(shard.shard)) {
+                 versions[shard.shard] = e.prior_version;
+               }
+               //Else: keep existing version
+             }
+           }
+           if (uses_default) {
+             versions[shard_id_t::NO_SHARD] = e.version;
+           } else {
+             versions.erase(shard_id_t::NO_SHARD);
+           }
+           // Erase and recreate backfill interval for e.soid using map
+           bi->objects.erase(e.soid);
+           for (auto & [shard, version] : versions) {
+             bi->objects.insert(make_pair(e.soid, make_pair(shard, version)));
+           }
+         }
        } else if (e.is_delete()) {
          dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
-         bi->objects.erase(e.soid);
+         bi->objects.erase(e.soid); // Erase all entries for e.soid
        }
       }
     };
@@ -14367,16 +14416,18 @@ void PrimaryLogPG::update_range(
     projected_log.scan_log_after(bi->version, func);
     bi->version = projected_last_update;
   } else {
-    ceph_abort_msg("scan_range should have raised bi->version past log_tail");
+    ceph_abort_msg("scan_range_primary should have raised bi->version past log_tail");
   }
 }
 
-void PrimaryLogPG::scan_range(
-  int min, int max, BackfillInterval *bi,
-  ThreadPool::TPHandle &handle)
+void PrimaryLogPG::scan_range_primary(
+  int min, int max, PrimaryBackfillInterval *bi,
+  ThreadPool::TPHandle &handle,
+  const std::set<pg_shard_t> &backfill_targets)
 {
   ceph_assert(is_locked());
-  dout(10) << "scan_range from " << bi->begin << dendl;
+  dout(10) << "scan_range_primary from " << bi->begin <<
+              " backfill_targets " << backfill_targets << dendl;
   bi->clear_objects();
 
   vector<hobject_t> ls;
@@ -14388,9 +14439,13 @@ void PrimaryLogPG::scan_range(
 
   for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
     handle.reset_tp_timeout();
-    ObjectContextRef obc;
-    if (is_primary())
-      obc = object_contexts.lookup(*p);
+
+    ceph_assert(is_primary());
+
+    eversion_t version;
+    std::map<shard_id_t,eversion_t> shard_versions;
+    ObjectContextRef obc = object_contexts.lookup(*p);
+
     if (obc) {
       if (!obc->obs.exists) {
        /* If the object does not exist here, it must have been removed
@@ -14399,8 +14454,8 @@ void PrimaryLogPG::scan_range(
         */
        continue;
       }
-      bi->objects[*p] = obc->obs.oi.version;
-      dout(20) << "  " << *p << " " << obc->obs.oi.version << dendl;
+      version = obc->obs.oi.version;
+      shard_versions = obc->obs.oi.shard_versions;
     } else {
       bufferlist bl;
       int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
@@ -14413,12 +14468,64 @@ void PrimaryLogPG::scan_range(
 
       ceph_assert(r >= 0);
       object_info_t oi(bl);
-      bi->objects[*p] = oi.version;
-      dout(20) << "  " << *p << " " << oi.version << dendl;
+      version = oi.version;
+      shard_versions = oi.shard_versions;
+    }
+    dout(20) << "  " << *p << " " << version << dendl;
+    if (shard_versions.empty()) {
+      bi->objects.insert(make_pair(*p, std::make_pair(shard_id_t::NO_SHARD,
+                                                     version)));
+    } else {
+      bool added_default = false;
+      for (auto & shard: backfill_targets) {
+       if (shard_versions.contains(shard.shard)) {
+         version = shard_versions.at(shard.shard);
+         bi->objects.insert(make_pair(*p, std::make_pair(shard.shard,
+                                                         version)));
+       } else if (!added_default) {
+         bi->objects.insert(make_pair(*p, std::make_pair(shard_id_t::NO_SHARD,
+                                                         version)));
+         added_default = true;
+       }
+      }
     }
   }
 }
 
+void PrimaryLogPG::scan_range_replica(
+  int min, int max, ReplicaBackfillInterval *bi,
+  ThreadPool::TPHandle &handle)
+{
+  ceph_assert(is_locked());
+  dout(10) << "scan_range_replica from " << bi->begin << dendl;
+  bi->clear_objects();
+
+  vector<hobject_t> ls;
+  ls.reserve(max);
+  int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
+  ceph_assert(r >= 0);
+  dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
+  dout(20) << ls << dendl;
+
+  for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+    handle.reset_tp_timeout();
+
+    ceph_assert(!is_primary());
+    bufferlist bl;
+    int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
+    /* If the object does not exist here, it must have been removed
+     * between the collection_list_partial and here.  This can happen
+     * for the first item in the range, which is usually last_backfill.
+     */
+    if (r == -ENOENT)
+      continue;
+
+    ceph_assert(r >= 0);
+    object_info_t oi(bl);
+    bi->objects[*p] = oi.version;
+    dout(20) << "  " << *p << " " << oi.version << dendl;
+  }
+}
 
 /** check_local
  *
index 94c0b13b5141d661ac19e65695e07d3c7ad57417..aafd618b754deb18bd432a6f644f68338fa05a55 100644 (file)
@@ -1138,7 +1138,7 @@ protected:
     }
     {
       f->open_array_section("peer_backfill_info");
-      for (std::map<pg_shard_t, BackfillInterval>::const_iterator pbi =
+      for (std::map<pg_shard_t, ReplicaBackfillInterval>::const_iterator pbi =
             peer_backfill_info.begin();
           pbi != peer_backfill_info.end(); ++pbi) {
         f->dump_stream("osd") << pbi->first;
@@ -1327,14 +1327,20 @@ protected:
    * @bi.begin first item should be >= this value
    * @bi [out] resulting std::map of objects to eversion_t's
    */
-  void scan_range(
-    int min, int max, BackfillInterval *bi,
+  void scan_range_replica(
+    int min, int max, ReplicaBackfillInterval *bi,
     ThreadPool::TPHandle &handle
     );
 
+  void scan_range_primary(
+    int min, int max, PrimaryBackfillInterval *bi,
+    ThreadPool::TPHandle &handle,
+    const std::set<pg_shard_t> &backfill_targets
+    );
+
   /// Update a hash range to reflect changes since the last scan
   void update_range(
-    BackfillInterval *bi,        ///< [in,out] interval to update
+    PrimaryBackfillInterval *bi, ///< [in,out] interval to update
     ThreadPool::TPHandle &handle ///< [in] tp handle
     );
 
diff --git a/src/osd/recovery_types.cc b/src/osd/recovery_types.cc
deleted file mode 100644 (file)
index 70ba1fd..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "recovery_types.h"
-
-std::ostream& operator<<(std::ostream& out, const BackfillInterval& bi)
-{
-  out << "BackfillInfo(" << bi.begin << "-" << bi.end
-      << " " << bi.objects.size() << " objects";
-  if (!bi.objects.empty())
-    out << " " << bi.objects;
-  out << ")";
-  return out;
-}
-
-
index cffb2b7ea4c06251884e5180933a10d68cdb1172..159620949f028d29638f467d356c1934e9a927af 100644 (file)
  * Possible states:
  * 1) begin == end == hobject_t() indicates the the interval is unpopulated
  * 2) Else, objects contains all objects in [begin, end)
+ *
+ * ReplicaBackfillInterval
+ *
+ * Stores a map of hobject_t and eversion to track the version number of
+ * the objects being backfilled in an interval for one specific shard
+ *
+ * PrimaryBackfillInterval
+ *
+ * Stores a multimap of hobject and pair<shard_id_t, eversion>.
+ *
+ * Only shards that are backfill targets will be tracked. For replicated and
+ * non-optimized EC pools there is one entry per hobject_t and shard_id_t will
+ * be NO_SHARD.
+ *
+ * For optimized EC pools partial writes mean it is possible that different
+ * shards have different eversions, hence there may be multiple entries per
+ * hobject_t. To conserve memory it is permitted to have an entry for NO_SHARD
+ * and additional entries for the same hobject for specific shards. In this
+ * case shards that are not specifically listed are expected to be at the
+ * eversion for the NO_SHARD entry.
+ *
+ * Example: EC pool with 4+2 profile
+ *
+ *   test:head, <NO_SHARD, 1'23>
+ *   test:head, <1,        1'20>
+ *
+ * Shards 0 and 2-5 are expected to be at version 1'23, shard 1 has skipped
+ * recent updates and is expected to be at version 1'20
  */
-struct BackfillInterval {
+
+template <typename T>
+class BackfillInterval {
+public:
   // info about a backfill interval on a peer
   eversion_t version; /// version at which the scan occurred
-  std::map<hobject_t,eversion_t> objects;
   hobject_t begin;
   hobject_t end;
+  T objects;
+
+  virtual ~BackfillInterval() = default;
+  BackfillInterval() = default;
+  BackfillInterval(const BackfillInterval&) = default;
+  BackfillInterval(BackfillInterval&&) = default;
+  BackfillInterval& operator=(const BackfillInterval&) = default;
+  BackfillInterval& operator=(BackfillInterval&&) = default;
 
   /// clear content
-  void clear() {
-    *this = BackfillInterval();
-  }
+  virtual void clear() = 0;
 
   /// clear objects std::list only
   void clear_objects() {
@@ -60,10 +96,60 @@ struct BackfillInterval {
 
   /// Adjusts begin to the first object
   void trim() {
-    if (!objects.empty())
+    if (!objects.empty()) {
       begin = objects.begin()->first;
-    else
+    } else {
       begin = end;
+    }
+  }
+
+  /// drop first entry, and adjust @begin accordingly
+  virtual void pop_front() = 0;
+
+  /// dump
+  virtual void dump(ceph::Formatter *f) const = 0;
+};
+
+class PrimaryBackfillInterval: public BackfillInterval<std::multimap<hobject_t,
+                                       std::pair<shard_id_t, eversion_t>>> {
+public:
+
+  /// clear content
+  void clear() override {
+    *this = PrimaryBackfillInterval();
+  }
+
+  /// drop first entry, and adjust @begin accordingly
+  void pop_front() override {
+    ceph_assert(!objects.empty());
+    // Use erase(key) to erase all entries for key
+    objects.erase(objects.begin()->first);
+    trim();
+  }
+
+  /// dump
+  void dump(ceph::Formatter *f) const override {
+    f->dump_stream("begin") << begin;
+    f->dump_stream("end") << end;
+    f->open_array_section("objects");
+    for (const auto& [hoid, shard_version] : objects) {
+      const auto& [shard, version] = shard_version;
+      f->open_object_section("object");
+      f->dump_stream("object") << hoid;
+      f->dump_stream("shard") << shard;
+      f->dump_stream("version") << version;
+      f->close_section();
+    }
+    f->close_section();
+  }
+};
+
+class ReplicaBackfillInterval: public BackfillInterval<std::map<hobject_t,
+                                                               eversion_t>> {
+public:
+  /// clear content
+  void clear() override {
+    *this = ReplicaBackfillInterval();
   }
 
   /// drop first entry, and adjust @begin accordingly
@@ -74,25 +160,32 @@ struct BackfillInterval {
   }
 
   /// dump
-  void dump(ceph::Formatter *f) const {
+  void dump(ceph::Formatter *f) const override {
     f->dump_stream("begin") << begin;
     f->dump_stream("end") << end;
     f->open_array_section("objects");
-    for (std::map<hobject_t, eversion_t>::const_iterator i =
-           objects.begin();
-         i != objects.end();
-         ++i) {
+    for (const auto& [hoid, version] : objects) {
       f->open_object_section("object");
-      f->dump_stream("object") << i->first;
-      f->dump_stream("version") << i->second;
+      f->dump_stream("object") << hoid;
+      f->dump_stream("version") << version;
       f->close_section();
     }
     f->close_section();
   }
 };
 
-std::ostream &operator<<(std::ostream &out, const BackfillInterval &bi);
+template<typename T> std::ostream& operator<<(std::ostream& out,
+                                             const BackfillInterval<T>& bi)
+{
+  out << "BackfillInfo(" << bi.begin << "-" << bi.end << " ";
+  if (!bi.objects.empty()) {
+    out << bi.objects.size() << " objects " << bi.objects;
+  }
+  out << ")";
+  return out;
+}
 
 #if FMT_VERSION >= 90000
-template <> struct fmt::formatter<BackfillInterval> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<PrimaryBackfillInterval> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<ReplicaBackfillInterval> : fmt::ostream_formatter {};
 #endif
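
For reference, deciding which version a particular backfill target should
hold for the object at the front of a PrimaryBackfillInterval reduces to a
shard-specific lookup with a NO_SHARD fallback; this is the comparison made
in Enqueuing::update_on_peers() and PrimaryLogPG::recover_backfill(). A
hedged sketch with illustrative stand-in types (not the Ceph API):

  #include <map>
  #include <utility>

  using Shard = int;
  using Version = std::pair<unsigned, unsigned>;
  constexpr Shard NO_SHARD = -1;

  // versions holds the multimap entries for one hobject_t, keyed by shard.
  // A NO_SHARD default is present whenever some target lacks its own entry.
  Version expected_on(const std::map<Shard, Version>& versions, Shard shard)
  {
    auto it = versions.find(shard);
    if (it != versions.end()) {
      return it->second;            // shard-specific entry wins
    }
    return versions.at(NO_SHARD);   // otherwise the shared default
  }

A peer whose scanned version differs from expected_on(...) needs a push;
one that merely skipped partial writes matches its entry and is left alone.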
index 3456514b5fda9ac41a7df2efac873555980dc020..de4f4b486921480cd304ae62fd6fbe47f3fe5d45 100644 (file)
@@ -2,11 +2,10 @@
 add_executable(unittest-crimson-backfill
   test_backfill.cc
   ${PROJECT_SOURCE_DIR}/src/auth/Crypto.cc
-  ${PROJECT_SOURCE_DIR}/src/crimson/osd/backfill_state.cc
-  ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc)
+  ${PROJECT_SOURCE_DIR}/src/crimson/osd/backfill_state.cc)
 add_ceph_unittest(unittest-crimson-backfill
   --memory 256M --smp 1)
-target_link_libraries(unittest-crimson-backfill crimson GTest::Main)
+target_link_libraries(unittest-crimson-backfill crimson GTest::Main Boost::MPL)
 
 add_executable(unittest-seastar-buffer
   test_buffer.cc)
index 117024a48d4b07beaed1bb7466053846d5abfe54..39d6700d4a07e44c0337416b693093f131663578 100644 (file)
@@ -14,6 +14,7 @@
 #include <string>
 
 #include <boost/statechart/event_base.hpp>
+
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
@@ -67,10 +68,31 @@ struct FakeStore {
                                 : hobject_t::get_max();
   }
 
+  // Permit rhs (reference) objects to be the same version or 1 version older
+  bool looks_like(const FakeStore& rhs) const {
+    if (std::size(objs) != std::size(rhs.objs)) {
+      return false;
+    }
+    for (auto &[obj, version] : objs) {
+      if (!rhs.objs.contains(obj)) {
+       return false;
+      }
+      auto version_r = rhs.objs.at(obj);
+      if ((version.epoch != version_r.epoch) ||
+         ((version.version != version_r.version) &&
+          (version.version != version_r.version + 1)))
+      {
+       return false;
+      }
+    }
+    return true;
+  }
+
   bool operator==(const FakeStore& rhs) const {
     return std::size(objs) == std::size(rhs.objs) && \
            std::equal(std::begin(objs), std::end(objs), std::begin(rhs.objs));
   }
+
   bool operator!=(const FakeStore& rhs) const {
     return !(*this == rhs);
   }
@@ -92,6 +114,7 @@ struct FakePrimary {
   eversion_t projected_last_update;
   eversion_t log_tail;
   PGLog pg_log;
+  pg_pool_t pool;
   PGLog::IndexedLog projected_log;
 
   FakePrimary(FakeStore&& store)
@@ -184,7 +207,7 @@ public:
     const bool all_replica_match = std::all_of(
       std::begin(backfill_targets), std::end(backfill_targets),
       [&reference] (const auto kv) {
-        return kv.second.store == reference;
+        return kv.second.store.looks_like(reference);
       });
     return backfill_source.store == reference && all_replica_match;
   }
@@ -244,6 +267,10 @@ struct BackfillFixture::PeeringFacade
     return backfill_source.pg_log;
   }
 
+  const pg_pool_t& get_pool() const override {
+    return backfill_source.pool;
+  }
+
   void scan_log_after(eversion_t, scan_log_func_t) const override {
     /* NOP */
   }
@@ -304,7 +331,7 @@ void BackfillFixture::request_replica_scan(
   const hobject_t& begin,
   const hobject_t& end)
 {
-  BackfillInterval bi;
+  ReplicaBackfillInterval bi;
   bi.end = backfill_targets.at(target).store.list(begin, [&bi](auto kv) {
     bi.objects.insert(std::move(kv));
   });
@@ -317,9 +344,35 @@ void BackfillFixture::request_replica_scan(
 void BackfillFixture::request_primary_scan(
   const hobject_t& begin)
 {
-  BackfillInterval bi;
+  PrimaryBackfillInterval bi;
   bi.end = backfill_source.store.list(begin, [&bi](auto kv) {
-    bi.objects.insert(std::move(kv));
+    auto && [hoid,version] = kv;
+    eversion_t version_zero;
+    eversion_t version_next = eversion_t(version.epoch, version.version + 1);
+    switch (std::rand() % 4) {
+    case 0:
+      // All shards at same version (Replica, EC, optimized EC after full-stripe write)
+      bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version)));
+      break;
+    case 1:
+      // Optimized EC partial write - Shard 3 at an earlier version
+      bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(3), version_zero)));
+      bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version)));
+      break;
+    case 2:
+      // Optimized EC partial write - Shard 1 and 2 at an earlier version
+      bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version_next)));
+      bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(1), version)));
+      bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(2), version)));
+      break;
+    case 3:
+      // Optimized EC partial write - Shard 1, 2 and 3 at different earlier versions
+      bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version_next)));
+      bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(1), version)));
+      bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(2), version)));
+      bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(3), version_zero)));
+      break;
+    }
   });
   bi.begin = begin;
   bi.version = backfill_source.last_update;
@@ -381,8 +434,10 @@ struct BackfillFixtureBuilder {
 
   BackfillFixtureBuilder&& add_target(FakeStore::objs_t objs) && {
     const auto new_osd_num = std::size(backfill_targets);
+    const auto new_shard_id = shard_id_t(1 + new_osd_num);
     const auto [ _, inserted ] = backfill_targets.emplace(
-      new_osd_num, FakeReplica{ FakeStore{std::move(objs)} });
+      pg_shard_t(new_osd_num, new_shard_id),
+      FakeReplica{ FakeStore{std::move(objs)} });
     ceph_assert(inserted);
     return std::move(*this);
   }
@@ -438,6 +493,27 @@ TEST(backfill, one_empty_replica)
   EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
 }
 
+TEST(backfill, one_same_one_empty_replica)
+{
+  const auto reference_store = FakeStore{ {
+    { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} },
+    { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} },
+    { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} },
+  }};
+  auto cluster_fixture = BackfillFixtureBuilder::add_source(
+    reference_store.objs
+  ).add_target(
+    reference_store.objs
+  ).add_target(
+    { /* nothing 2 */ }
+  ).get_result();
+
+  EXPECT_CALL(cluster_fixture, backfilled);
+  cluster_fixture.next_till_done();
+
+  EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
+}
+
 TEST(backfill, two_empty_replicas)
 {
   const auto reference_store = FakeStore{ {
@@ -459,6 +535,34 @@ TEST(backfill, two_empty_replicas)
   EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
 }
 
+TEST(backfill, one_behind_one_empty_replica)
+{
+  const auto reference_store = FakeStore{ {
+    { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {8, 234} },
+    { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 250} },
+    { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 247} },
+    //"1:0256710c:::rbd_data.1018ac3e755.00000000000000b1:head", deleted
+  }};
+  const auto behind_store = FakeStore{ {
+    { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {8, 234} },
+    //"1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head"  missing
+    { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {8, 165} },
+    { "1:0256710c:::rbd_data.1018ac3e755.00000000000000b1:head", {8, 169} },
+  }};
+  auto cluster_fixture = BackfillFixtureBuilder::add_source(
+    reference_store.objs
+  ).add_target(
+    { /* nothing 1 */ }
+  ).add_target(
+    behind_store.objs
+  ).get_result();
+
+  EXPECT_CALL(cluster_fixture, backfilled);
+  cluster_fixture.next_till_done();
+
+  EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
+}
+
 TEST(backfill, cancel_resume_middle_of_primaryscan)
 {
   const auto reference_store = FakeStore{ {