Optimized EC pools support partial writes that do not update every shard.
Consequently, shards that are not updated can have out-of-date version
numbers. The primary shard's object_info_t is always updated and tracks the
expected version of each shard. To avoid unnecessary backfill work, the
version comparisons that decide whether a shard needs backfill must use
this extra data in the object_info_t to work out whether a shard is
missing updates or simply did not participate in recent partial writes.

See the comments in src/osd/recovery_types.h.
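
For illustration only, a minimal sketch of the lookup this enables,
assuming the shard_versions map carried by object_info_t in this series
(the helper name is hypothetical):

    // Expected version of a backfill target shard under partial writes.
    // A shard with no entry in shard_versions participated in every
    // write and is expected to be at the object's overall version.
    eversion_t expected_shard_version(const object_info_t &oi,
                                      const shard_id_t shard)
    {
      if (auto it = oi.shard_versions.find(shard);
          it != oi.shard_versions.end()) {
        return it->second; // shard skipped recent partial writes
      }
      return oi.version;   // shard saw every update
    }

A backfill target whose object is at this expected version is clean;
only a mismatch means the shard is genuinely missing updates.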
Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
${PROJECT_SOURCE_DIR}/src/osd/MissingLoc.cc
${PROJECT_SOURCE_DIR}/src/osd/PGLog.cc
${PROJECT_SOURCE_DIR}/src/osd/SnapMapper.cc
- ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc
${PROJECT_SOURCE_DIR}/src/osd/osd_perf_counters.cc
${PROJECT_SOURCE_DIR}/src/mgr/OSDPerfMetricTypes.cc
watch.cc
const std::vector<pg_shard_t> &peers) override {
return peering_state.prepare_backfill_for_missing(soid, v, peers);
}
+
+ const pg_pool_t& get_pool() const override {
+ return peering_state.get_pgpool().info;
+ }
+
PeeringFacade(PeeringState& peering_state)
: peering_state(peering_state) {
}
ceph_assert(backfill_state().last_backfill_started == \
peering_state().earliest_backfill());
ceph_assert(peering_state().is_backfilling());
- // initialize BackfillIntervals
+ // initialize ReplicaBackfillIntervals
for (const auto& bt : peering_state().get_backfill_targets()) {
backfill_state().peer_backfill_info[bt].reset(
peering_state().get_peer_last_backfill(bt));
if (e.is_update()) {
DEBUGDPP("maybe_update_range(lambda): {} updated to ver {}",
pg(), e.soid, e.version);
- primary_bi.objects.erase(e.soid);
- primary_bi.objects.insert(std::make_pair(e.soid,
- e.version));
+ if (e.written_shards.empty()) {
+ // Log entry updates all shards, replace all entries for e.soid
+ primary_bi.objects.erase(e.soid);
+ primary_bi.objects.insert(
+ std::make_pair(e.soid,
+ std::make_pair(shard_id_t::NO_SHARD,
+ e.version)));
+ } else {
+ // Update backfill interval for shards modified by log entry
+ std::map<shard_id_t,eversion_t> versions;
+        // Create map from existing entries in the backfill interval
+ const auto & [begin, end] = primary_bi.objects.equal_range(e.soid);
+ for (const auto & entry : std::ranges::subrange(begin, end)) {
+ const auto & [shard, version] = entry.second;
+ versions[shard] = version;
+ }
+ // Update entries in map that are modified by log entry
+ bool uses_default = false;
+ for (const auto & shard : peering_state().get_backfill_targets()) {
+ if (e.is_written_shard(shard.shard)) {
+ versions.erase(shard.shard);
+ uses_default = true;
+ } else {
+ if (!versions.contains(shard.shard)) {
+ versions[shard.shard] = e.prior_version;
+ }
+            // Else: keep existing version
+ }
+ }
+ if (uses_default) {
+ versions[shard_id_t::NO_SHARD] = e.version;
+ } else {
+ versions.erase(shard_id_t::NO_SHARD);
+ }
+ // Erase and recreate backfill interval for e.soid using map
+ primary_bi.objects.erase(e.soid);
+ for (auto & [shard, version] : versions) {
+ primary_bi.objects.insert(
+ std::make_pair(e.soid,
+ std::make_pair(shard, version)));
+ }
+ }
} else if (e.is_delete()) {
DEBUGDPP("maybe_update_range(lambda): {} removed",
pg(), e.soid);
- primary_bi.objects.erase(e.soid);
+ primary_bi.objects.erase(e.soid); // Erase all entries for e.soid
}
}
};
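
A worked example (invented values) of the rebuild above; the same rule
appears again in PrimaryLogPG::update_range() for the classic OSD:

    // Backfill targets on shards 1 and 3; existing interval entry
    // { NO_SHARD -> 1'23 }; log entry e with e.version = 1'25,
    // e.prior_version = 1'23 and e.written_shards = {1}.
    //
    //   shard 1: written             -> erase its entry, uses_default = true
    //   shard 3: not written, absent -> versions[3] = e.prior_version (1'23)
    //   uses_default                 -> versions[NO_SHARD] = e.version (1'25)
    //
    // Result: { NO_SHARD -> 1'25, 3 -> 1'23 }; shard 3 is expected to have
    // skipped the partial write, so it is not treated as missing updates.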
/* static */ bool BackfillState::Enqueuing::all_enqueued(
const PeeringFacade& peering_state,
- const BackfillInterval& backfill_info,
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
+ const PrimaryBackfillInterval& backfill_info,
+ const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info)
{
const bool all_local_enqueued = \
backfill_info.extends_to_end() && backfill_info.empty();
}
hobject_t BackfillState::Enqueuing::earliest_peer_backfill(
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
+ const std::map<pg_shard_t,
+ ReplicaBackfillInterval>& peer_backfill_info) const
{
hobject_t e = hobject_t::get_max();
for (const pg_shard_t& bt : peering_state().get_backfill_targets()) {
}
bool BackfillState::Enqueuing::should_rescan_replicas(
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
- const BackfillInterval& backfill_info) const
+ const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
+ const PrimaryBackfillInterval& backfill_info) const
{
const auto& targets = peering_state().get_backfill_targets();
return std::any_of(std::begin(targets), std::end(targets),
}
bool BackfillState::Enqueuing::should_rescan_primary(
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
- const BackfillInterval& backfill_info) const
+ const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
+ const PrimaryBackfillInterval& backfill_info) const
{
return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) &&
!backfill_info.extends_to_end() && backfill_info.empty();
void BackfillState::Enqueuing::trim_backfilled_object_from_intervals(
BackfillState::Enqueuing::result_t&& result,
hobject_t& last_backfill_started,
- std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
+ std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info)
{
std::for_each(std::begin(result.pbi_targets), std::end(result.pbi_targets),
[&peer_backfill_info] (const auto& bt) {
result_t result { {}, primary_bi.begin };
std::map<hobject_t, std::pair<eversion_t, std::vector<pg_shard_t>>> backfills;
+ std::map<shard_id_t,eversion_t> versions;
+ auto it = primary_bi.objects.begin();
+ const hobject_t& hoid = it->first;
+ eversion_t obj_v;
+ while (it != primary_bi.objects.end() && it->first == hoid) {
+ obj_v = std::max(obj_v, it->second.second);
+ versions[it->second.first] = it->second.second;
+ ++it;
+ }
+
for (const auto& bt : peering_state().get_backfill_targets()) {
const auto& peer_bi = backfill_state().peer_backfill_info.at(bt);
// Find all check peers that have the wrong version
- if (const eversion_t& obj_v = primary_bi.objects.begin()->second;
- check == primary_bi.begin && check == peer_bi.begin) {
- if (peer_bi.objects.begin()->second != obj_v) {
+ if (check == primary_bi.begin && check == peer_bi.begin) {
+ eversion_t replicaobj_v;
+ if (versions.contains(bt.shard)) {
+ replicaobj_v = versions.at(bt.shard);
+ } else {
+ replicaobj_v = versions.at(shard_id_t::NO_SHARD);
+ }
+ if (peer_bi.objects.begin()->second != replicaobj_v) {
std::ignore = backfill_state().progress_tracker->enqueue_push(
primary_bi.begin);
auto &[v, peers] = backfills[primary_bi.begin];
}
bool BackfillState::Enqueuing::Enqueuing::all_emptied(
- const BackfillInterval& local_backfill_info,
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
+ const PrimaryBackfillInterval& local_backfill_info,
+ const std::map<pg_shard_t,
+ ReplicaBackfillInterval>& peer_backfill_info) const
{
const auto& targets = peering_state().get_backfill_targets();
const auto replicas_emptied =
// -- ReplicasScanning
bool BackfillState::ReplicasScanning::replica_needs_scan(
- const BackfillInterval& replica_backfill_info,
- const BackfillInterval& local_backfill_info)
+ const ReplicaBackfillInterval& replica_backfill_info,
+ const PrimaryBackfillInterval& local_backfill_info)
{
return replica_backfill_info.empty() && \
replica_backfill_info.begin <= local_backfill_info.begin && \
#include "osd/recovery_types.h"
#include "osd/PGLog.h"
+#include "osd/PeeringState.h"
namespace crimson::osd {
// events comes first
struct PrimaryScanned : sc::event<PrimaryScanned> {
- BackfillInterval result;
- PrimaryScanned(BackfillInterval&& result)
+ PrimaryBackfillInterval result;
+ PrimaryScanned(PrimaryBackfillInterval&& result)
: result(std::move(result)) {
}
};
struct ReplicaScanned : sc::event<ReplicaScanned> {
pg_shard_t from;
- BackfillInterval result;
- ReplicaScanned(pg_shard_t from, BackfillInterval&& result)
+ ReplicaBackfillInterval result;
+ ReplicaScanned(pg_shard_t from, ReplicaBackfillInterval&& result)
: from(std::move(from)),
result(std::move(result)) {
}
// completed yet.
static bool all_enqueued(
const PeeringFacade& peering_state,
- const BackfillInterval& backfill_info,
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+ const PrimaryBackfillInterval& backfill_info,
+ const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info);
private:
void maybe_update_range();
// these methods take BackfillIntervals instead of extracting them from
// the state to emphasize the relationships across the main loop.
bool all_emptied(
- const BackfillInterval& local_backfill_info,
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+ const PrimaryBackfillInterval& local_backfill_info,
+ const std::map<pg_shard_t,
+ ReplicaBackfillInterval>& peer_backfill_info) const;
hobject_t earliest_peer_backfill(
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+ const std::map<pg_shard_t,
+ ReplicaBackfillInterval>& peer_backfill_info) const;
bool should_rescan_replicas(
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
- const BackfillInterval& backfill_info) const;
+ const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
+ const PrimaryBackfillInterval& backfill_info) const;
// indicate whether a particular acting primary needs to be scanned again
// to process the next piece of the hobject_t's namespace.
// the logic is per analogy to replica_needs_scan(). See comments there.
bool should_rescan_primary(
- const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
- const BackfillInterval& backfill_info) const;
+ const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
+ const PrimaryBackfillInterval& backfill_info) const;
// the result_t is intermediary between {remove,update}_on_peers() and
- // updating BackfillIntervals in trim_backfilled_object_from_intervals.
- // This step is important because it affects the main loop's condition,
- // and thus deserves to be exposed instead of being called deeply from
- // {remove,update}_on_peers().
+ // updating ReplicaBackfillIntervals in
+ // trim_backfilled_object_from_intervals. This step is important
+ // because it affects the main loop's condition, and thus deserves to be
+ // exposed instead of being called deeply from {remove,update}_on_peers().
struct [[nodiscard]] result_t {
std::set<pg_shard_t> pbi_targets;
hobject_t new_last_backfill_started;
void trim_backfilled_object_from_intervals(
result_t&&,
hobject_t& last_backfill_started,
- std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+ std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info);
result_t remove_on_peers(const hobject_t& check);
result_t update_on_peers(const hobject_t& check);
};
sc::result react(Triggered);
// indicate whether a particular peer should be scanned to retrieve
- // BackfillInterval for new range of hobject_t namespace.
+ // ReplicaBackfillInterval for new range of hobject_t namespace.
// true when bi.objects is exhausted, replica bi's end is not MAX,
// and primary bi's begin is further than the replica's one.
static bool replica_needs_scan(
- const BackfillInterval& replica_backfill_info,
- const BackfillInterval& local_backfill_info);
+ const ReplicaBackfillInterval& replica_backfill_info,
+ const PrimaryBackfillInterval& local_backfill_info);
private:
std::set<pg_shard_t> waiting_on_backfill;
backfill_suspend_state.should_go_enqueuing = true;
}
hobject_t last_backfill_started;
- BackfillInterval backfill_info;
- std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
+ PrimaryBackfillInterval backfill_info;
+ std::map<pg_shard_t, ReplicaBackfillInterval> peer_backfill_info;
BackfillMachine backfill_machine;
std::unique_ptr<ProgressTracker> progress_tracker;
size_t replicas_in_backfill = 0;
const hobject_t &soid,
const eversion_t &v,
const std::vector<pg_shard_t> &peers) = 0;
+ virtual const pg_pool_t& get_pool() const = 0;
virtual ~PeeringFacade() {}
};
{
logger().debug("{}", __func__);
using crimson::common::local_conf;
- std::ignore = pg->get_recovery_backend()->scan_for_backfill(
+ std::ignore = pg->get_recovery_backend()->scan_for_backfill_primary(
begin,
local_conf()->osd_backfill_scan_min,
- local_conf()->osd_backfill_scan_max
- ).then_interruptible([this] (BackfillInterval bi) {
+ local_conf()->osd_backfill_scan_max,
+ pg->get_peering_state().get_backfill_targets()
+ ).then_interruptible([this] (PrimaryBackfillInterval bi) {
logger().debug("request_primary_scan:{}", __func__);
using BackfillState = crimson::osd::BackfillState;
backfill_state->process_event(
pg.get_collection_ref(), std::move(t)).or_terminate());
}
-RecoveryBackend::interruptible_future<BackfillInterval>
-RecoveryBackend::scan_for_backfill(
+RecoveryBackend::interruptible_future<PrimaryBackfillInterval>
+RecoveryBackend::scan_for_backfill_primary(
+ const hobject_t start,
+ [[maybe_unused]] const std::int64_t min,
+ const std::int64_t max,
+ const std::set<pg_shard_t> &backfill_targets)
+{
+ LOG_PREFIX(RecoveryBackend::scan_for_backfill_primary);
+ DEBUGDPP("starting from {}", pg, start);
+ auto version_map = seastar::make_lw_shared<std::multimap<hobject_t,
+ std::pair<shard_id_t,eversion_t>>>();
+ auto&& [objects, next] = co_await backend->list_objects(start, max);
+ co_await interruptor::parallel_for_each(objects,
+ seastar::coroutine::lambda([FNAME, this, version_map, backfill_targets]
+ (const hobject_t& object) -> interruptible_future<> {
+ DEBUGDPP("querying obj:{}", pg, object);
+ auto obc_manager = pg.obc_loader.get_obc_manager(object);
+ co_await pg.obc_loader.load_and_lock(
+ obc_manager, RWState::RWREAD
+ ).handle_error_interruptible(
+ crimson::ct_error::assert_all("unexpected error")
+ );
+
+ if (obc_manager.get_obc()->obs.exists) {
+ auto version = obc_manager.get_obc()->obs.oi.version;
+ auto shard_versions = obc_manager.get_obc()->obs.oi.shard_versions;
+ if (shard_versions.empty()) {
+ version_map->emplace(object, std::make_pair(shard_id_t::NO_SHARD,
+ version));
+ } else {
+ bool added_default = false;
+ for (auto & shard: backfill_targets) {
+          if (shard_versions.contains(shard.shard)) {
+            // This shard missed recent partial writes; record its version
+            version_map->emplace(
+              object,
+              std::make_pair(shard.shard, shard_versions.at(shard.shard)));
+ } else if (!added_default) {
+ version_map->emplace(object, std::make_pair(shard_id_t::NO_SHARD,
+ version));
+ added_default = true;
+ }
+ }
+ }
+ DEBUGDPP("found: {} {}", pg,
+ object, version);
+ co_return;
+ } else {
+ // if the object does not exist here, it must have been removed
+ // between the collection_list_partial and here. This can happen
+ // for the first item in the range, which is usually last_backfill.
+ co_return;
+ }
+ }));
+ PrimaryBackfillInterval bi;
+ bi.begin = std::move(start);
+ bi.end = std::move(next);
+ bi.objects = std::move(*version_map);
+ DEBUGDPP("{} PrimaryBackfillInterval filled, leaving, {}",
+ "scan_for_backfill_primary",
+ pg, bi);
+ co_return std::move(bi);
+}
+
+RecoveryBackend::interruptible_future<ReplicaBackfillInterval>
+RecoveryBackend::scan_for_backfill_replica(
const hobject_t start,
[[maybe_unused]] const std::int64_t min,
const std::int64_t max)
{
- LOG_PREFIX(RecoveryBackend::scan_for_backfill);
+ LOG_PREFIX(RecoveryBackend::scan_for_backfill_replica);
DEBUGDPP("starting from {}", pg, start);
- auto version_map = seastar::make_lw_shared<std::map<hobject_t, eversion_t>>();
+ auto version_map = seastar::make_lw_shared<std::map<hobject_t,
+ eversion_t>>();
auto&& [objects, next] = co_await backend->list_objects(start, max);
- co_await interruptor::parallel_for_each(objects, seastar::coroutine::lambda([FNAME, this, version_map]
+ co_await interruptor::parallel_for_each(objects,
+ seastar::coroutine::lambda([FNAME, this, version_map]
(const hobject_t& object) -> interruptible_future<> {
DEBUGDPP("querying obj:{}", pg, object);
auto obc_manager = pg.obc_loader.get_obc_manager(object);
co_return;
}
}));
- BackfillInterval bi;
+ ReplicaBackfillInterval bi;
bi.begin = std::move(start);
bi.end = std::move(next);
bi.objects = std::move(*version_map);
- DEBUGDPP("{} BackfillInterval filled, leaving, {}",
- "scan_for_backfill",
+ DEBUGDPP("{} ReplicaBackfillInterval filled, leaving, {}",
+ "scan_for_backfill_replica",
pg, bi);
co_return std::move(bi);
}
PeeringState::BackfillTooFull());
return seastar::now();
}
- return scan_for_backfill(
+ return scan_for_backfill_replica(
std::move(m.begin),
crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_min"),
crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_max")
// Check that from is one of the backfill_targets
ceph_assert(pg.is_backfill_target(m.from));
- BackfillInterval bi;
+ ReplicaBackfillInterval bi;
bi.begin = m.begin;
bi.end = m.end;
{
const hobject_t& soid,
eversion_t need) = 0;
- interruptible_future<BackfillInterval> scan_for_backfill(
+ interruptible_future<PrimaryBackfillInterval> scan_for_backfill_primary(
+ const hobject_t from,
+ std::int64_t min,
+ std::int64_t max,
+ const std::set<pg_shard_t> &backfill_targets);
+
+ interruptible_future<ReplicaBackfillInterval> scan_for_backfill_replica(
const hobject_t from,
std::int64_t min,
std::int64_t max);
scheduler/mClockScheduler.cc
PeeringState.cc
PGStateUtils.cc
- recovery_types.cc
MissingLoc.cc
osd_perf_counters.cc
ECCommonL.cc
std::set<int> probe_targets;
protected:
- BackfillInterval backfill_info;
- std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
+ PrimaryBackfillInterval backfill_info;
+ std::map<pg_shard_t, ReplicaBackfillInterval> peer_backfill_info;
bool backfill_reserving;
// The primary's num_bytes and local num_bytes for this pg, only valid
return;
}
- BackfillInterval bi;
+ ReplicaBackfillInterval bi;
bi.begin = m->begin;
// No need to flush; there won't be any in-progress writes occurring
// past m->begin
- scan_range(
+ scan_range_replica(
cct->_conf->osd_backfill_scan_min,
cct->_conf->osd_backfill_scan_max,
&bi,
// Check that from is one of the backfill_targets
ceph_assert(is_backfill_target(from));
- BackfillInterval& bi = peer_backfill_info[from];
+ ReplicaBackfillInterval& bi = peer_backfill_info[from];
bi.begin = m->begin;
bi.end = m->end;
auto p = m->get_data().cbegin();
for (const pg_shard_t& bt : get_backfill_targets()) {
const auto piter = peer_backfill_info.find(bt);
ceph_assert(piter != peer_backfill_info.end());
- const BackfillInterval& pbi = piter->second;
+ const ReplicaBackfillInterval& pbi = piter->second;
// See if peer has more to process
if (!pbi.extends_to_end() || !pbi.empty())
return false;
i != get_backfill_targets().end();
++i) {
pg_shard_t bt = *i;
- BackfillInterval& pbi = peer_backfill_info[bt];
+ ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
if (pbi.begin <= backfill_info.begin &&
i != get_backfill_targets().end();
++i) {
pg_shard_t bt = *i;
- BackfillInterval& pbi = peer_backfill_info[bt];
+ ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
if (pbi.begin == check)
check_targets.insert(bt);
}
i != check_targets.end();
++i) {
pg_shard_t bt = *i;
- BackfillInterval& pbi = peer_backfill_info[bt];
+ ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
ceph_assert(pbi.begin == check);
to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
// and we can't increment ops without requeueing ourself
// for recovery.
} else {
- eversion_t& obj_v = backfill_info.objects.begin()->second;
-
+ // Unpack versions for the object being backfilled
+ auto it = backfill_info.objects.begin();
+ const hobject_t& hoid = it->first;
+ eversion_t obj_v;
+ std::map<shard_id_t,eversion_t> versions;
+ while (it != backfill_info.objects.end() && it->first == hoid) {
+ obj_v = std::max(obj_v, it->second.second);
+ versions[it->second.first] = it->second.second;
+ ++it;
+ }
vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
i != get_backfill_targets().end();
++i) {
pg_shard_t bt = *i;
- BackfillInterval& pbi = peer_backfill_info[bt];
+ ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
// Find all check peers that have the wrong version
if (check == backfill_info.begin && check == pbi.begin) {
- if (pbi.objects.begin()->second != obj_v) {
+ eversion_t replicaobj_v;
+ if (versions.contains(bt.shard)) {
+ replicaobj_v = versions.at(bt.shard);
+ } else {
+ replicaobj_v = versions.at(shard_id_t::NO_SHARD);
+ }
+ if (pbi.objects.begin()->second != replicaobj_v) {
need_ver_targs.push_back(bt);
} else {
keep_ver_targs.push_back(bt);
i != check_targets.end();
++i) {
pg_shard_t bt = *i;
- BackfillInterval& pbi = peer_backfill_info[bt];
+ ReplicaBackfillInterval& pbi = peer_backfill_info[bt];
pbi.pop_front();
}
}
}
void PrimaryLogPG::update_range(
- BackfillInterval *bi,
+ PrimaryBackfillInterval *bi,
ThreadPool::TPHandle &handle)
{
int local_min = cct->_conf->osd_backfill_scan_min;
int local_max = cct->_conf->osd_backfill_scan_max;
+ const std::set<pg_shard_t>& backfill_targets = get_backfill_targets();
if (bi->version < info.log_tail) {
dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
<< dendl;
bi->version = info.last_update;
- scan_range(local_min, local_max, bi, handle);
+ scan_range_primary(local_min, local_max, bi, handle, backfill_targets);
}
if (bi->version >= projected_last_update) {
if (e.is_update()) {
dout(10) << __func__ << ": " << e.soid << " updated to version "
<< e.version << dendl;
- bi->objects.erase(e.soid);
- bi->objects.insert(
- make_pair(
- e.soid,
- e.version));
+ if (e.written_shards.empty()) {
+ // Log entry updates all shards, replace all entries for e.soid
+ bi->objects.erase(e.soid);
+ bi->objects.insert(make_pair(e.soid,
+ make_pair(shard_id_t::NO_SHARD,
+ e.version)));
+ } else {
+ // Update backfill interval for shards modified by log entry
+ std::map<shard_id_t,eversion_t> versions;
+        // Create map from existing entries in the backfill interval
+ const auto & [begin, end] = bi->objects.equal_range(e.soid);
+ for (const auto & entry : std::ranges::subrange(begin, end)) {
+ const auto & [shard, version] = entry.second;
+ versions[shard] = version;
+ }
+ // Update entries in map that are modified by log entry
+ bool uses_default = false;
+ for (const auto & shard : backfill_targets) {
+ if (e.is_written_shard(shard.shard)) {
+ versions.erase(shard.shard);
+ uses_default = true;
+ } else {
+ if (!versions.contains(shard.shard)) {
+ versions[shard.shard] = e.prior_version;
+ }
+            // Else: keep existing version
+ }
+ }
+ if (uses_default) {
+ versions[shard_id_t::NO_SHARD] = e.version;
+ } else {
+ versions.erase(shard_id_t::NO_SHARD);
+ }
+ // Erase and recreate backfill interval for e.soid using map
+ bi->objects.erase(e.soid);
+ for (auto & [shard, version] : versions) {
+ bi->objects.insert(make_pair(e.soid, make_pair(shard, version)));
+ }
+ }
} else if (e.is_delete()) {
dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
- bi->objects.erase(e.soid);
+ bi->objects.erase(e.soid); // Erase all entries for e.soid
}
}
};
projected_log.scan_log_after(bi->version, func);
bi->version = projected_last_update;
} else {
- ceph_abort_msg("scan_range should have raised bi->version past log_tail");
+ ceph_abort_msg("scan_range_primary should have raised bi->version past log_tail");
}
}
-void PrimaryLogPG::scan_range(
- int min, int max, BackfillInterval *bi,
- ThreadPool::TPHandle &handle)
+void PrimaryLogPG::scan_range_primary(
+ int min, int max, PrimaryBackfillInterval *bi,
+ ThreadPool::TPHandle &handle,
+ const std::set<pg_shard_t> &backfill_targets)
{
ceph_assert(is_locked());
- dout(10) << "scan_range from " << bi->begin << dendl;
+ dout(10) << "scan_range_primary from " << bi->begin <<
+ " backfill_targets " << backfill_targets << dendl;
bi->clear_objects();
vector<hobject_t> ls;
for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
handle.reset_tp_timeout();
- ObjectContextRef obc;
- if (is_primary())
- obc = object_contexts.lookup(*p);
+
+ ceph_assert(is_primary());
+
+ eversion_t version;
+ std::map<shard_id_t,eversion_t> shard_versions;
+ ObjectContextRef obc = object_contexts.lookup(*p);
+
if (obc) {
if (!obc->obs.exists) {
/* If the object does not exist here, it must have been removed
*/
continue;
}
- bi->objects[*p] = obc->obs.oi.version;
- dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
+ version = obc->obs.oi.version;
+ shard_versions = obc->obs.oi.shard_versions;
} else {
bufferlist bl;
int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
ceph_assert(r >= 0);
object_info_t oi(bl);
- bi->objects[*p] = oi.version;
- dout(20) << " " << *p << " " << oi.version << dendl;
+ version = oi.version;
+ shard_versions = oi.shard_versions;
+ }
+ dout(20) << " " << *p << " " << version << dendl;
+ if (shard_versions.empty()) {
+ bi->objects.insert(make_pair(*p, std::make_pair(shard_id_t::NO_SHARD,
+ version)));
+ } else {
+ bool added_default = false;
+ for (auto & shard: backfill_targets) {
+        if (shard_versions.contains(shard.shard)) {
+          // This shard missed recent partial writes; record its version
+          bi->objects.insert(make_pair(*p,
+            std::make_pair(shard.shard, shard_versions.at(shard.shard))));
+ } else if (!added_default) {
+ bi->objects.insert(make_pair(*p, std::make_pair(shard_id_t::NO_SHARD,
+ version)));
+ added_default = true;
+ }
+ }
}
}
}
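
An illustrative sketch (invented values) of what the loop above inserts
for one object with oi.version = 1'23, shard_versions = {1: 1'20} and
backfill targets on shards 1 and 3:

    // bi->objects gains two entries for *p:
    //   (*p, {1, 1'20})         shard 1 missed recent partial writes
    //   (*p, {NO_SHARD, 1'23})  default entry covering shard 3 and any
    //                           other shard without a specific entry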
+void PrimaryLogPG::scan_range_replica(
+ int min, int max, ReplicaBackfillInterval *bi,
+ ThreadPool::TPHandle &handle)
+{
+ ceph_assert(is_locked());
+ dout(10) << "scan_range_replica from " << bi->begin << dendl;
+ bi->clear_objects();
+
+ vector<hobject_t> ls;
+ ls.reserve(max);
+ int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
+ ceph_assert(r >= 0);
+ dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
+ dout(20) << ls << dendl;
+
+ for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ handle.reset_tp_timeout();
+
+ ceph_assert(!is_primary());
+ bufferlist bl;
+ int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
+ /* If the object does not exist here, it must have been removed
+ * between the collection_list_partial and here. This can happen
+ * for the first item in the range, which is usually last_backfill.
+ */
+ if (r == -ENOENT)
+ continue;
+
+ ceph_assert(r >= 0);
+ object_info_t oi(bl);
+ bi->objects[*p] = oi.version;
+ dout(20) << " " << *p << " " << oi.version << dendl;
+ }
+}
/** check_local
*
}
{
f->open_array_section("peer_backfill_info");
- for (std::map<pg_shard_t, BackfillInterval>::const_iterator pbi =
+ for (std::map<pg_shard_t, ReplicaBackfillInterval>::const_iterator pbi =
peer_backfill_info.begin();
pbi != peer_backfill_info.end(); ++pbi) {
f->dump_stream("osd") << pbi->first;
* @bi.begin first item should be >= this value
 * @bi [out] resulting std::map of objects to eversion_t values
*/
- void scan_range(
- int min, int max, BackfillInterval *bi,
+ void scan_range_replica(
+ int min, int max, ReplicaBackfillInterval *bi,
ThreadPool::TPHandle &handle
);
+ void scan_range_primary(
+ int min, int max, PrimaryBackfillInterval *bi,
+ ThreadPool::TPHandle &handle,
+ const std::set<pg_shard_t> &backfill_targets
+ );
+
/// Update a hash range to reflect changes since the last scan
void update_range(
- BackfillInterval *bi, ///< [in,out] interval to update
+ PrimaryBackfillInterval *bi, ///< [in,out] interval to update
ThreadPool::TPHandle &handle ///< [in] tp handle
);
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "recovery_types.h"
-
-std::ostream& operator<<(std::ostream& out, const BackfillInterval& bi)
-{
- out << "BackfillInfo(" << bi.begin << "-" << bi.end
- << " " << bi.objects.size() << " objects";
- if (!bi.objects.empty())
- out << " " << bi.objects;
- out << ")";
- return out;
-}
-
-
* Possible states:
 * 1) begin == end == hobject_t() indicates that the interval is unpopulated
* 2) Else, objects contains all objects in [begin, end)
+ *
+ * ReplicaBackfillInterval
+ *
+ * Stores a map from hobject_t to eversion_t to track the version number of
+ * the objects being backfilled in an interval for one specific shard.
+ *
+ * PrimaryBackfillInterval
+ *
+ * Stores a multimap from hobject_t to pair<shard_id_t, eversion_t>.
+ *
+ * Only shards that are backfill targets will be tracked. For replicated and
+ * non-optimized EC pools there is one entry per hobject_t and shard_id_t will
+ * be NO_SHARD.
+ *
+ * For optimized EC pools partial writes mean it is possible that different
+ * shards have different eversions, hence there may be multiple entries per
+ * hobject_t. To conserve memory it is permitted to have an entry for NO_SHARD
+ * and additional entries for specific shards of the same hobject_t. In this
+ * case, shards that are not specifically listed are expected to be at the
+ * eversion of the NO_SHARD entry.
+ *
+ * Example: EC pool with 4+2 profile
+ *
+ * test:head, <NO_SHARD, 1'23>
+ * test:head, <1, 1'20>
+ *
+ * Shards 0 and 2-5 are expected to be at version 1'23; shard 1 has skipped
+ * recent updates and is expected to be at version 1'20.
*/
-struct BackfillInterval {
+
+template <typename T>
+class BackfillInterval {
+public:
// info about a backfill interval on a peer
eversion_t version; /// version at which the scan occurred
- std::map<hobject_t,eversion_t> objects;
hobject_t begin;
hobject_t end;
+ T objects;
+
+ virtual ~BackfillInterval() = default;
+ BackfillInterval() = default;
+ BackfillInterval(const BackfillInterval&) = default;
+ BackfillInterval(BackfillInterval&&) = default;
+ BackfillInterval& operator=(const BackfillInterval&) = default;
+ BackfillInterval& operator=(BackfillInterval&&) = default;
/// clear content
- void clear() {
- *this = BackfillInterval();
- }
+ virtual void clear() = 0;
/// clear the objects container only
void clear_objects() {
/// Adjusts begin to the first object
void trim() {
- if (!objects.empty())
+ if (!objects.empty()) {
begin = objects.begin()->first;
- else
+ } else {
begin = end;
+ }
+ }
+
+ /// drop first entry, and adjust @begin accordingly
+ virtual void pop_front() = 0;
+
+ /// dump
+ virtual void dump(ceph::Formatter *f) const = 0;
+};
+
+class PrimaryBackfillInterval: public BackfillInterval<std::multimap<hobject_t,
+ std::pair<shard_id_t, eversion_t>>> {
+public:
+
+ /// clear content
+ void clear() override {
+ *this = PrimaryBackfillInterval();
+ }
+
+ /// drop first entry, and adjust @begin accordingly
+ void pop_front() override {
+ ceph_assert(!objects.empty());
+ // Use erase(key) to erase all entries for key
+ objects.erase(objects.begin()->first);
+ trim();
+ }
+
+ /// dump
+ void dump(ceph::Formatter *f) const override {
+ f->dump_stream("begin") << begin;
+ f->dump_stream("end") << end;
+ f->open_array_section("objects");
+ for (const auto& [hoid, shard_version] : objects) {
+ const auto& [shard, version] = shard_version;
+ f->open_object_section("object");
+ f->dump_stream("object") << hoid;
+ f->dump_stream("shard") << shard;
+ f->dump_stream("version") << version;
+ f->close_section();
+ }
+ f->close_section();
+ }
+};
+
+class ReplicaBackfillInterval: public BackfillInterval<std::map<hobject_t,
+ eversion_t>> {
+public:
+ /// clear content
+ void clear() override {
+ *this = ReplicaBackfillInterval();
}
/// drop first entry, and adjust @begin accordingly
}
/// dump
- void dump(ceph::Formatter *f) const {
+ void dump(ceph::Formatter *f) const override {
f->dump_stream("begin") << begin;
f->dump_stream("end") << end;
f->open_array_section("objects");
- for (std::map<hobject_t, eversion_t>::const_iterator i =
- objects.begin();
- i != objects.end();
- ++i) {
+ for (const auto& [hoid, version] : objects) {
f->open_object_section("object");
- f->dump_stream("object") << i->first;
- f->dump_stream("version") << i->second;
+ f->dump_stream("object") << hoid;
+ f->dump_stream("version") << version;
f->close_section();
}
f->close_section();
}
};
-std::ostream &operator<<(std::ostream &out, const BackfillInterval &bi);
+template<typename T> std::ostream& operator<<(std::ostream& out,
+ const BackfillInterval<T>& bi)
+{
+ out << "BackfillInfo(" << bi.begin << "-" << bi.end << " ";
+ if (!bi.objects.empty()) {
+ out << bi.objects.size() << " objects " << bi.objects;
+ }
+ out << ")";
+ return out;
+}
#if FMT_VERSION >= 90000
-template <> struct fmt::formatter<BackfillInterval> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<PrimaryBackfillInterval> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<ReplicaBackfillInterval> : fmt::ostream_formatter {};
#endif
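
To make the NO_SHARD fallback concrete, here is a hedged sketch (not
part of the patch) of resolving the version a backfill target shard is
expected to be at for the first object of a trimmed
PrimaryBackfillInterval; it mirrors the lookup in
Enqueuing::update_on_peers() and in the classic OSD hunks above:

    // Shard-specific entries win; anything else falls back to NO_SHARD.
    // Assumes the interval is trimmed, so bi.begin is the first object.
    eversion_t expected_version(const PrimaryBackfillInterval &bi,
                                const shard_id_t shard)
    {
      std::map<shard_id_t, eversion_t> versions;
      const auto &[begin, end] = bi.objects.equal_range(bi.begin);
      for (const auto &entry : std::ranges::subrange(begin, end)) {
        const auto &[s, version] = entry.second;
        versions[s] = version;
      }
      if (auto it = versions.find(shard); it != versions.end()) {
        return it->second;
      }
      // Shards without a specific entry are at the NO_SHARD version
      return versions.at(shard_id_t::NO_SHARD);
    }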
add_executable(unittest-crimson-backfill
test_backfill.cc
${PROJECT_SOURCE_DIR}/src/auth/Crypto.cc
- ${PROJECT_SOURCE_DIR}/src/crimson/osd/backfill_state.cc
- ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc)
+ ${PROJECT_SOURCE_DIR}/src/crimson/osd/backfill_state.cc)
add_ceph_unittest(unittest-crimson-backfill
--memory 256M --smp 1)
-target_link_libraries(unittest-crimson-backfill crimson GTest::Main)
+target_link_libraries(unittest-crimson-backfill crimson GTest::Main Boost::MPL)
add_executable(unittest-seastar-buffer
test_buffer.cc)
#include <string>
#include <boost/statechart/event_base.hpp>
+
#include <gmock/gmock.h>
#include <gtest/gtest.h>
: hobject_t::get_max();
}
+ // Permit rhs (reference) objects to be the same version or 1 version older
+ bool looks_like(const FakeStore& rhs) const {
+ if (std::size(objs) != std::size(rhs.objs)) {
+ return false;
+ }
+ for (auto &[obj, version] : objs) {
+ if (!rhs.objs.contains(obj)) {
+ return false;
+ }
+ auto version_r = rhs.objs.at(obj);
+ if ((version.epoch != version_r.epoch) ||
+ ((version.version != version_r.version) &&
+ (version.version != version_r.version + 1)))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
bool operator==(const FakeStore& rhs) const {
return std::size(objs) == std::size(rhs.objs) && \
std::equal(std::begin(objs), std::end(objs), std::begin(rhs.objs));
}
+
bool operator!=(const FakeStore& rhs) const {
return !(*this == rhs);
}
eversion_t projected_last_update;
eversion_t log_tail;
PGLog pg_log;
+ pg_pool_t pool;
PGLog::IndexedLog projected_log;
FakePrimary(FakeStore&& store)
const bool all_replica_match = std::all_of(
std::begin(backfill_targets), std::end(backfill_targets),
[&reference] (const auto kv) {
- return kv.second.store == reference;
+ return kv.second.store.looks_like(reference);
});
return backfill_source.store == reference && all_replica_match;
}
return backfill_source.pg_log;
}
+ const pg_pool_t& get_pool() const override {
+ return backfill_source.pool;
+ }
+
void scan_log_after(eversion_t, scan_log_func_t) const override {
/* NOP */
}
const hobject_t& begin,
const hobject_t& end)
{
- BackfillInterval bi;
+ ReplicaBackfillInterval bi;
bi.end = backfill_targets.at(target).store.list(begin, [&bi](auto kv) {
bi.objects.insert(std::move(kv));
});
void BackfillFixture::request_primary_scan(
const hobject_t& begin)
{
- BackfillInterval bi;
+ PrimaryBackfillInterval bi;
bi.end = backfill_source.store.list(begin, [&bi](auto kv) {
- bi.objects.insert(std::move(kv));
+ auto && [hoid,version] = kv;
+ eversion_t version_zero;
+ eversion_t version_next = eversion_t(version.epoch, version.version + 1);
+ switch (std::rand() % 4) {
+ case 0:
+ // All shards at same version (Replica, EC, optimized EC after full-stripe write)
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version)));
+ break;
+ case 1:
+ // Optimized EC partial write - Shard 3 at an earlier version
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(3), version_zero)));
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version)));
+ break;
+ case 2:
+      // Optimized EC partial write - Shards 1 and 2 at an earlier version
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version_next)));
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(1), version)));
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(2), version)));
+ break;
+ case 3:
+      // Optimized EC partial write - Shards 1, 2 and 3 at different earlier versions
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t::NO_SHARD, version_next)));
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(1), version)));
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(2), version)));
+ bi.objects.insert(std::make_pair(hoid, std::make_pair(shard_id_t(3), version_zero)));
+ break;
+ }
});
bi.begin = begin;
bi.version = backfill_source.last_update;
BackfillFixtureBuilder&& add_target(FakeStore::objs_t objs) && {
const auto new_osd_num = std::size(backfill_targets);
+ const auto new_shard_id = shard_id_t(1 + new_osd_num);
const auto [ _, inserted ] = backfill_targets.emplace(
- new_osd_num, FakeReplica{ FakeStore{std::move(objs)} });
+ pg_shard_t(new_osd_num, new_shard_id),
+ FakeReplica{ FakeStore{std::move(objs)} });
ceph_assert(inserted);
return std::move(*this);
}
EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
}
+TEST(backfill, one_same_one_empty_replica)
+{
+ const auto reference_store = FakeStore{ {
+ { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} },
+ { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} },
+ { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} },
+ }};
+ auto cluster_fixture = BackfillFixtureBuilder::add_source(
+ reference_store.objs
+ ).add_target(
+ reference_store.objs
+ ).add_target(
+ { /* nothing 2 */ }
+ ).get_result();
+
+ EXPECT_CALL(cluster_fixture, backfilled);
+ cluster_fixture.next_till_done();
+
+ EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
+}
+
TEST(backfill, two_empty_replicas)
{
const auto reference_store = FakeStore{ {
EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
}
+TEST(backfill, one_behind_one_empty_replica)
+{
+ const auto reference_store = FakeStore{ {
+ { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {8, 234} },
+ { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 250} },
+ { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 247} },
+ //"1:0256710c:::rbd_data.1018ac3e755.00000000000000b1:head", deleted
+ }};
+ const auto behind_store = FakeStore{ {
+ { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {8, 234} },
+ //"1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head" missing
+ { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {8, 165} },
+ { "1:0256710c:::rbd_data.1018ac3e755.00000000000000b1:head", {8, 169} },
+ }};
+ auto cluster_fixture = BackfillFixtureBuilder::add_source(
+ reference_store.objs
+ ).add_target(
+ { /* nothing 1 */ }
+ ).add_target(
+ behind_store.objs
+ ).get_result();
+
+ EXPECT_CALL(cluster_fixture, backfilled);
+ cluster_fixture.next_till_done();
+
+ EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
+}
+
TEST(backfill, cancel_resume_middle_of_primaryscan)
{
const auto reference_store = FakeStore{ {