osd: Deduplicate zeros in EC slice iterator

author Alex Ainscow <aainscow@uk.ibm.com>

Fri, 27 Jun 2025 15:00:56 +0000 (16:00 +0100)

committer Alex Ainscow <aainscow@uk.ibm.com>

Thu, 4 Sep 2025 21:08:52 +0000 (22:08 +0100)
author Alex Ainscow <aainscow@uk.ibm.com>
Fri, 27 Jun 2025 15:00:56 +0000 (16:00 +0100)
committer Alex Ainscow <aainscow@uk.ibm.com>
Thu, 4 Sep 2025 21:08:52 +0000 (22:08 +0100)
diff --git a/src/include/interval_set.h b/src/include/interval_set.h

index 61392289c2afbeeb5e9052db686ffc4f52d91a26..9ed8d3a9cabe2060d4bab5054731323cdb6d656d 100644 (file)
--- a/src/include/interval_set.h
+++ b/src/include/interval_set.h
@@ -999,4 +999,14 @@ public:
  template<typename T, template<typename, typename, typename ...> class C, bool strict>
  struct fmt::is_range<interval_set<T, C, strict>, char> : std::false_type {};
  
+template <typename T>
+struct is_interval_set : std::false_type {};
+
+template <typename T, template<typename, typename, typename ...> class C, bool strict>
+struct is_interval_set<interval_set<T, C, strict>> : std::true_type {};
+
+template <typename T>
+inline constexpr bool is_interval_set_v = is_interval_set<T>::value;
+
+#undef strict_mode_assert
  #endif
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc

index 0f91eba5bf8686181cb50bac626ea6d91dccd24b..112a65e85d4fa54b7f09b179b0c35d49583b2f55 100644 (file)
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -223,19 +223,20 @@ void ECBackend::RecoveryBackend::handle_recovery_push(
      m->t.touch(coll, tobj);
    }
  
-  if (!op.data_included.empty()) {
-    uint64_t start = op.data_included.range_start();
-    uint64_t end = op.data_included.range_end();
-    ceph_assert(op.data.length() == (end - start));
-
-    m->t.write(
-      coll,
-      tobj,
-      start,
-      op.data.length(),
-      op.data);
-  } else {
-    ceph_assert(op.data.length() == 0);
+  ceph_assert(op.data.length() == op.data_included.size());
+  uint64_t tobj_size = 0;
+
+  uint64_t cursor = 0;
+  for (auto [off, len] : op.data_included) {
+    bufferlist bl;
+    if (len != op.data.length()) {
+      bl.substr_of(op.data, cursor, len);
+    } else {
+      bl = op.data;
+    }
+    m->t.write(coll, tobj, off, len, bl);
+    tobj_size = off + len;
+    cursor += len;
    }
  
    if (op.before_progress.first) {
@@ -246,6 +247,15 @@ void ECBackend::RecoveryBackend::handle_recovery_push(
        op.attrset);
    }
  
+  if (op.after_progress.data_complete) {
+    uint64_t shard_size = sinfo.object_size_to_shard_size(op.recovery_info.size,
+      get_parent()->whoami_shard().shard);
+    ceph_assert(shard_size >= tobj_size);
+    if (shard_size != tobj_size) {
+      m->t.truncate( coll, tobj, shard_size);
+    }
+  }
+
    if (op.after_progress.data_complete && !oneshot) {
      dout(10) << __func__ << ": Removing oid "
              << tobj.hobj << " from the temp collection" << dendl;
@@ -361,7 +371,7 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
  
    uint64_t aligned_size = ECUtil::align_next(op.obc->obs.oi.size);
  
-  int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size);
+  int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size, get_parent()->get_dpp(), true);
    ceph_assert(r == 0);
  
    // Finally, we don't want to write any padding, so truncate the buffer
@@ -601,22 +611,24 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
        if (after_progress.data_recovered_to >= op.obc->obs.oi.size) {
          after_progress.data_complete = true;
        }
+
        for (auto &&pg_shard: op.missing_on) {
          m->pushes[pg_shard].push_back(PushOp());
          PushOp &pop = m->pushes[pg_shard].back();
          pop.soid = op.hoid;
          pop.version = op.recovery_info.oi.get_version_for_shard(pg_shard.shard);
-        op.returned_data->get_shard_first_buffer(pg_shard.shard, pop.data);
+
+        op.returned_data->get_sparse_buffer(pg_shard.shard, pop.data, pop.data_included);
+        ceph_assert(pop.data.length() == pop.data_included.size());
+
          dout(10) << __func__ << ": pop shard=" << pg_shard
                   << ", oid=" << pop.soid
                   << ", before_progress=" << op.recovery_progress
                  << ", after_progress=" << after_progress
                  << ", pop.data.length()=" << pop.data.length()
+                 << ", pop.data_included=" << pop.data_included
                  << ", size=" << op.obc->obs.oi.size << dendl;
-        if (pop.data.length())
-          pop.data_included.union_insert(
-            op.returned_data->get_shard_first_offset(pg_shard.shard),
-            pop.data.length());
+
          if (op.recovery_progress.first) {
            if (sinfo.is_nonprimary_shard(pg_shard.shard)) {
              if (pop.version == op.recovery_info.oi.version) {
diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc

index 0db49bc621a2e4a9a40e75de16d107873d8b37bb..377fd86452d547d8643685727fd7aab67fac8143 100644 (file)
--- a/src/osd/ECCommon.cc
+++ b/src/osd/ECCommon.cc
@@ -487,7 +487,8 @@ struct ClientReadCompleter final : ECCommon::ReadCompleter {
        /* Decode any missing buffers */
        int r = res.buffers_read.decode(read_pipeline.ec_impl,
                                    req.shard_want_to_read,
-                                  req.object_size);
+                                  req.object_size,
+                                  read_pipeline.get_parent()->get_dpp());
        ceph_assert( r == 0 );
        dout(30) << __func__ << ": after decode: "
                 << res.buffers_read.debug_string(2048, 8)
diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc

index 996a37f03d8060a5a3cab482cf1ef2fb23d62e39..1142db8008f4a2911b342e41b8f05050d180bdb3 100644 (file)
--- a/src/osd/ECTransaction.cc
+++ b/src/osd/ECTransaction.cc
@@ -68,7 +68,7 @@ void ECTransaction::Generate::encode_and_write() {
       */
      read_sem->zero_pad(plan.will_write);
      to_write.pad_with_other(plan.will_write, *read_sem);
-    r = to_write.encode_parity_delta(ec_impl, *read_sem);
+    r = to_write.encode_parity_delta(ec_impl, *read_sem, dpp);
    } else {
      r = to_write.encode(ec_impl);
    }
diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc

index b1522a7fbd8c77e31a54bb0f7ceeb6591d6775a9..911e314deaf802db2d60c6c8bc6d8f9161cdb5c8 100644 (file)
--- a/src/osd/ECUtil.cc
+++ b/src/osd/ECUtil.cc
@@ -475,19 +475,23 @@ void shard_extent_map_t::insert_parity_buffers() {
    }
  }
  
-slice_iterator<shard_id_t, extent_map> shard_extent_map_t::begin_slice_iterator(
-    const shard_id_set &out) {
-  return slice_iterator(extent_maps, out);
+slice_iterator shard_extent_map_t::begin_slice_iterator(
+    const shard_id_set &out,
+    DoutPrefixProvider *dpp,
+    const shard_id_set *dedup_zeros) {
+  return slice_iterator(extent_maps, out, dpp, dedup_zeros);
  }
  
  /* Encode parity chunks, using the encode_chunks interface into the
   * erasure coding. This generates all parity using full stripe writes.
   */
-int shard_extent_map_t::_encode(const ErasureCodeInterfaceRef &ec_impl) {
+int shard_extent_map_t::encode(const ErasureCodeInterfaceRef &ec_impl,
+    DoutPrefixProvider *dpp,
+    shard_id_set *dedup_zeros) {
    shard_id_set out_set = sinfo->get_parity_shards();
    bool rebuild_req = false;
  
-  for (auto iter = begin_slice_iterator(out_set); !iter.is_end(); ++iter) {
+  for (auto iter = begin_slice_iterator(out_set, dpp, dedup_zeros); !iter.is_end(); ++iter) {
      if (!iter.is_page_aligned()) {
        rebuild_req = true;
        break;
@@ -503,25 +507,19 @@ int shard_extent_map_t::_encode(const ErasureCodeInterfaceRef &ec_impl) {
  
    if (rebuild_req) {
      pad_and_rebuild_to_ec_align();
-    return _encode(ec_impl);
+    return encode(ec_impl, dpp, dedup_zeros);
    }
  
    return 0;
  }
  
-/* Encode parity chunks, using the encode_chunks interface into the
- * erasure coding. This generates all parity using full stripe writes.
- */
-int shard_extent_map_t::encode(const ErasureCodeInterfaceRef &ec_impl) {
-  return _encode(ec_impl);
-}
-
  /* Encode parity chunks, using the parity delta write interfaces on plugins
   * that support them.
   */
  int shard_extent_map_t::encode_parity_delta(
      const ErasureCodeInterfaceRef &ec_impl,
-    shard_extent_map_t &old_sem) {
+    shard_extent_map_t &old_sem,
+    DoutPrefixProvider *dpp) {
    shard_id_set out_set = sinfo->get_parity_shards();
  
    pad_and_rebuild_to_ec_align();
@@ -542,7 +540,7 @@ int shard_extent_map_t::encode_parity_delta(
  
      s.compute_ro_range();
  
-    for (auto iter = s.begin_slice_iterator(out_set); !iter.is_end(); ++iter) {
+    for (auto iter = s.begin_slice_iterator(out_set, dpp); !iter.is_end(); ++iter) {
        ceph_assert(iter.is_page_aligned());
        shard_id_map<bufferptr> &data_shards = iter.get_in_bufferptrs();
        shard_id_map<bufferptr> &parity_shards = iter.get_out_bufferptrs();
@@ -644,7 +642,9 @@ void shard_extent_map_t::trim(const shard_extent_set_t &trim_to) {
  
  int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl,
                                 const shard_extent_set_t &want,
-                               uint64_t object_size) {
+                               uint64_t object_size,
+                               DoutPrefixProvider *dpp,
+                               bool dedup_zeros) {
    shard_id_set want_set;
    shard_id_set have_set;
    want.populate_shard_id_set(want_set);
@@ -679,11 +679,11 @@ int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl,
        decode_for_parity.intersection_of(want.at(shard), read_mask.at(shard));
        pad_on_shard(decode_for_parity, shard);
      }
-    r = _decode(ec_impl, want_set, decode_set);
+    r = _decode(ec_impl, want_set, decode_set, dpp);
    }
    if (!r && !encode_set.empty()) {
      pad_on_shards(want, encode_set);
-    r = _encode(ec_impl);
+    r = encode(ec_impl, dpp, dedup_zeros?&need_set:nullptr);
    }
  
    // If we failed to decode, then bail out, or the trimming below might fail.
@@ -702,9 +702,10 @@ int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl,
  
  int shard_extent_map_t::_decode(const ErasureCodeInterfaceRef &ec_impl,
                                  const shard_id_set &want_set,
-                                const shard_id_set &need_set) {
+                                const shard_id_set &need_set,
+                                DoutPrefixProvider *dpp) {
    bool rebuild_req = false;
-  for (auto iter = begin_slice_iterator(need_set); !iter.is_end(); ++iter) {
+  for (auto iter = begin_slice_iterator(need_set, dpp); !iter.is_end(); ++iter) {
      if (!iter.is_page_aligned()) {
        rebuild_req = true;
        break;
@@ -719,7 +720,7 @@ int shard_extent_map_t::_decode(const ErasureCodeInterfaceRef &ec_impl,
  
    if (rebuild_req) {
      pad_and_rebuild_to_ec_align();
-    return _decode(ec_impl, want_set, need_set);
+    return _decode(ec_impl, want_set, need_set, dpp);
    }
  
    compute_ro_range();
diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h

index 04ac579d69948daec4090e672ff8d67f3e6d42d9..f31a0b9108388b32759392481c83820b9bba857c 100644 (file)
--- a/src/osd/ECUtil.h
+++ b/src/osd/ECUtil.h
@@ -67,9 +67,8 @@ using extent_map = interval_map<uint64_t, ceph::buffer::list, bl_split_merge,
   * K must a key suitable for a mini_flat_map.
   * T must be either an extent map or a reference to an extent map.
   */
-template <typename K, typename T>
  class slice_iterator {
-  mini_flat_map<K, T> &input;
+  mini_flat_map<shard_id_t, extent_map> &input;
    uint64_t offset = std::numeric_limits<uint64_t>::max();
    uint64_t length = std::numeric_limits<uint64_t>::max();
    uint64_t start = std::numeric_limits<uint64_t>::max();
@@ -79,8 +78,81 @@ class slice_iterator {
    shard_id_map<bufferptr> in;
    shard_id_map<bufferptr> out;
    const shard_id_set &out_set;
+  const shard_id_set *dedup_set;
+  DoutPrefixProvider *dpp;
+
+  /* zero dedup is used by the slice iterator to detect zero buffers and replace
+   * them with the dedup'd zero buffer. It keeps a replacement buffer which
+   * once full (bl.length() == len) can be used to swap out the input buffer.
+   */
+  struct zeros {
+    uint64_t off;
+    uint64_t len;
+    bufferlist bl;
+
+    zeros(uint64_t _off, uint64_t _len) : off(_off), len(_len) {}
+
+    bool dedup(bufferptr &bp) {
+      bool is_zeros = false;
+      uint64_t bp_len = bp.length();
+      uint64_t off = 0;
+      char *c_str = bp.c_str();
+      // Skip any non-aligned chunk.
+      uint64_t analysed = p2roundup((uintptr_t)c_str, EC_ALIGN_SIZE) - (uintptr_t)c_str;
+
+      while (off + analysed <= bp_len) {
+        bool new_is_zeros;
+        if (bp_len - off - analysed < EC_ALIGN_SIZE) {
+          new_is_zeros = false;
+        } else {
+          new_is_zeros = mem_is_zero(c_str + off + analysed, EC_ALIGN_SIZE);
+        }
+        if (new_is_zeros != is_zeros && analysed) {
+          if (is_zeros) {
+            bl.append_zero2(analysed);
+          } else {
+            bl.append(bufferptr(bp, off, analysed));
+          }
+          off += analysed;
+          analysed = 0;
+        }
+        is_zeros = new_is_zeros;
+        analysed += EC_ALIGN_SIZE;
+      }
+      if (is_zeros) {
+        bl.append_zero2(bp_len - off);
+      } else {
+        bl.append(bufferptr(bp, off, bp_len - off));
+      }
+
+      return bl.length() == len;
+    }
+  };
+
+  std::optional<shard_id_map<zeros>> zeros;
+
+  void zeros_dedup() {
+    for (auto &&[shard, _zeros] : *zeros) {
+
+      if (!out.contains(shard) && !in.contains(shard)) {
+        continue;
+      }
+
+      bufferptr &bp = out.contains(shard) ? out.at(shard) : in.at(shard);
+      if (_zeros.dedup(bp)) {
+        ldpp_dout(dpp, 20) << __func__ << ": overwrite input[" << shard << "]="
+                           << _zeros.off << "~" << _zeros.len
+                           << " with bl=" << _zeros.bl << dendl;
+        input.at(shard).insert(_zeros.off, _zeros.len, _zeros.bl);
+        zeros->erase(shard);
+      }
+    }
+  }
  
    void advance() {
+    if (dedup_set) {
+      zeros_dedup();
+    }
      in.clear();
      out.clear();
      offset = start;
@@ -121,9 +193,15 @@ class slice_iterator {
          // Create a new buffer pointer for the result. We don't want the client
          // manipulating the ptr.
          if (out_set.contains(shard)) {
+          ldpp_dout(dpp, 20) << __func__ << " out[" << shard << "]="
+                             << start << "~" << (end - start)
+                             << dendl;
            out.emplace(
              shard, bufferptr(bl_iter.get_current_ptr(), 0, end - start));
          } else {
+          ldpp_dout(dpp, 20) << __func__ << " in[" << shard << "]="
+                   << start << "~" << (end - start)
+                   << dendl;
            in.emplace(
              shard, bufferptr(bl_iter.get_current_ptr(), 0, end - start));
          }
@@ -142,6 +220,9 @@ class slice_iterator {
                bl.invalidate_crc();
              }
              iters.at(shard).second = emap_iter.get_val().begin();
+            if (zeros) {
+              zeros->emplace(shard, emap_iter.get_off(), emap_iter.get_len());
+            }
            }
          }
        } else
@@ -169,15 +250,29 @@ class slice_iterator {
    }
  
  public:
-  slice_iterator(mini_flat_map<K, T> &_input, const shard_id_set &out_set) :
+  slice_iterator(
+      mini_flat_map<shard_id_t, extent_map> &_input,
+      const shard_id_set &out_set,
+      DoutPrefixProvider *_dpp,
+      const shard_id_set *dedup_set) :
      input(_input),
      iters(input.max_size()),
      in(input.max_size()),
      out(input.max_size()),
-    out_set(out_set) {
+    out_set(out_set),
+    dedup_set(dedup_set),
+    dpp(_dpp) {
+
+    if (dedup_set) {
+      zeros.emplace(input.max_size());
+    }
+
      for (auto &&[shard, emap] : input) {
        auto emap_iter = emap.begin();
        auto bl_iter = emap_iter.get_val().begin();
+      if (zeros) {
+        zeros->emplace(shard, emap_iter.get_off(), emap_iter.get_len());
+      }
        auto p = std::make_pair(std::move(emap_iter), std::move(bl_iter));
        iters.emplace(shard, std::move(p));
  
@@ -756,8 +851,10 @@ public:
    uint64_t end_offset;
    shard_id_map<extent_map> extent_maps;
  
-  slice_iterator<shard_id_t, extent_map> begin_slice_iterator(
-      const shard_id_set &out_set);
+  slice_iterator begin_slice_iterator(
+      const shard_id_set &out,
+      DoutPrefixProvider *dpp,
+      const shard_id_set *dedup_zeros = nullptr);
  
    /* This caculates the ro offset for an offset into a particular shard */
    uint64_t calc_ro_offset(raw_shard_id_t raw_shard, int shard_offset) const {
@@ -890,10 +987,12 @@ public:
    void append_zeros_to_ro_offset(uint64_t ro_offset);
    void insert_ro_extent_map(const extent_map &host_extent_map);
    extent_set get_extent_superset() const;
-  int encode(const ErasureCodeInterfaceRef &ec_impl);
-  int _encode(const ErasureCodeInterfaceRef &ec_impl);
+  int encode(const ErasureCodeInterfaceRef &ec_impl,
+    DoutPrefixProvider *dpp = nullptr,
+    shard_id_set *dedup_zeros = nullptr);
    int encode_parity_delta(const ErasureCodeInterfaceRef &ec_impl,
-                          shard_extent_map_t &old_sem);
+                          shard_extent_map_t &old_sem,
+                          DoutPrefixProvider *dpp);
  
    void pad_on_shards(const shard_extent_set_t &pad_to,
                       const shard_id_set &shards);
@@ -904,10 +1003,13 @@ public:
    void trim(const shard_extent_set_t &trim_to);
    int decode(const ErasureCodeInterfaceRef &ec_impl,
               const shard_extent_set_t &want,
-             uint64_t object_size);
+             uint64_t object_size,
+             DoutPrefixProvider *dpp = nullptr,
+             bool dedup_zeros = false);
    int _decode(const ErasureCodeInterfaceRef &ec_impl,
                const shard_id_set &want_set,
-              const shard_id_set &need_set);
+              const shard_id_set &need_set,
+              DoutPrefixProvider *dpp);
    void get_buffer(shard_id_t shard, uint64_t offset, uint64_t length,
                    buffer::list &append_to) const;
    void get_shard_first_buffer(shard_id_t shard, buffer::list &append_to) const;
@@ -977,6 +1079,29 @@ public:
      return changed;
    }
  
+  template <typename IntervalSetT> requires is_interval_set_v<IntervalSetT>
+  void get_sparse_buffer(shard_id_t shard, bufferlist &bl_out, IntervalSetT &iset) {
+    ceph_assert(bl_out.length() == 0);
+    if (!extent_maps.contains(shard)) {
+      return;
+    }
+    for (auto iter = extent_maps.at(shard).begin(); iter != extent_maps.at(shard).end(); ++iter) {
+      uint64_t off = iter.get_off();
+      bufferlist &bl = iter.get_val();
+
+      auto bl_iter = bl.begin();
+      for (const auto &bp : bl.buffers()) {
+        uint64_t len = bp.length();
+        if (!bp.is_zero_fast()) {
+          iset.insert(off, bp.length());
+          bl_out.append(bp);
+        }
+        off += len;
+        bl_iter += len;
+      }
+    }
+  }
+
    friend std::ostream &operator<<(std::ostream &lhs,
                                    const shard_extent_map_t &rhs);
  
diff --git a/src/test/osd/TestECBackend.cc b/src/test/osd/TestECBackend.cc

index a84648d02996cc96d43ea9b766ff854a578dfade..1dd4faa6e681dbd8bf2190985bbb2b4378dcd364 100644 (file)
--- a/src/test/osd/TestECBackend.cc
+++ b/src/test/osd/TestECBackend.cc
@@ -141,25 +141,32 @@ public:
    int minimum_to_decode(const shard_id_set &want_to_read, const shard_id_set &available,
                          shard_id_set &minimum_set,
                         shard_id_map<std::vector<std::pair<int, int>>> *minimum_sub_chunks) override {
-    shard_id_t parity_shard_index(data_chunk_count);
+    bool recover = false;
      for (shard_id_t shard : want_to_read) {
        if (available.contains(shard)) {
          minimum_set.insert(shard);
        } else {
-        // Shard is missing.  Recover with every other shard and one parity
-        // for each missing shard.
-        for (shard_id_t i; i<data_chunk_count; ++i) {
-          if (available.contains(i)) {
-            minimum_set.insert(i);
-          } else {
-            minimum_set.insert(parity_shard_index);
-            ++parity_shard_index;
-          }
-
-          if (int(parity_shard_index) == chunk_count)
-            return -EIO; // Cannot recover.
+        recover = true;
+        break;
+      }
+    }
+
+    if (recover) {
+      minimum_set.clear();
+
+      // Shard is missing.  Recover with every other shard and one parity
+      // for each missing shard.
+      for (auto a : available) {
+        minimum_set.insert(a);
+        if (minimum_set.size() == data_chunk_count) {
+          break;
          }
        }
+
+      if (minimum_set.size() != data_chunk_count) {
+        minimum_set.clear();
+        return -EIO; // Cannot recover.
+      }
      }
  
      for (auto &&shard : minimum_set) {
@@ -237,7 +244,7 @@ public:
    int decode_chunks(const shard_id_set &want_to_read,
                      shard_id_map<bufferptr> &in, shard_id_map<bufferptr> &out) override
    {
-    if (in.size() < data_chunk_count) {
+    if (std::cmp_less(in.size(), data_chunk_count)) {
        ADD_FAILURE();
      }
      uint64_t len = 0;
@@ -258,6 +265,9 @@ public:
        if (len != bp.length()) {
          ADD_FAILURE();
        }
+      if (bp.is_zero_fast()) {
+        ADD_FAILURE();
+      }
      }
      return 0;
    }
@@ -1189,7 +1199,7 @@ TEST(ECCommon, get_remaining_shards)
        ECCommon::shard_read_t shard_read;
        shard_read.subchunk = ecode->default_sub_chunk;
        shard_read.extents.insert(0,4096);
-      unsigned int shard_id = i==missing_shard?parity_shard:i;
+      unsigned int shard_id = std::cmp_equal(i, missing_shard) ? parity_shard : i;
        shard_read.pg_shard = pg_shard_t(shard_id, shard_id_t(shard_id));
        ref.shard_reads[shard_id_t(shard_id)] = shard_read;
      }
@@ -1265,15 +1275,30 @@ TEST(ECCommon, encode)
    semap.encode(ec_impl);
  }
  
-TEST(ECCommon, decode)
+bufferlist create_buf(uint64_t len) {
+  bufferlist bl;
+
+  while (bl.length() < len) {
+    uint64_t pages = std::rand() % 5;
+    uint64_t len_to_add = std::min(len - bl.length(), pages * EC_ALIGN_SIZE);
+    bl.append_zero(len_to_add);
+  }
+  ceph_assert(bl.is_aligned(EC_ALIGN_SIZE));
+  ceph_assert(len == bl.length());
+  return bl;
+}
+
+
+void test_decode(unsigned int k, unsigned int m, uint64_t chunk_size, uint64_t object_size, const ECUtil::shard_extent_set_t &want, const shard_id_set &acting_set)
  {
-  const uint64_t align_size = EC_ALIGN_SIZE;
-  const uint64_t swidth = 3*align_size;
-  const unsigned int k = 3;
-  const unsigned int m = 2;
+  const uint64_t swidth = k*chunk_size;
  
    ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
    ECListenerStub listenerStub;
+  listenerStub.acting_shards.clear();
+  for (auto s : acting_set) {
+    listenerStub.acting_shards.insert(pg_shard_t(int(s), s));
+  }
    ASSERT_EQ(s.get_stripe_width(), swidth);
    ASSERT_EQ(s.get_chunk_size(), swidth/k);
  
@@ -1284,67 +1309,171 @@ TEST(ECCommon, decode)
    ErasureCodeInterfaceRef ec_impl(ecode);
    ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
  
+
    ECUtil::shard_extent_map_t semap(&s);
-  bufferlist bl12k;
-  bl12k.append_zero(12288);
-  bufferlist bl8k;
-  bl8k.append_zero(8192);
-  bufferlist bl16k;
-  bl16k.append_zero(16384);
-  semap.insert_in_shard(shard_id_t(1), 512000, bl12k);
-  semap.insert_in_shard(shard_id_t(1), 634880, bl12k);
-  semap.insert_in_shard(shard_id_t(2), 512000, bl12k);
-  semap.insert_in_shard(shard_id_t(2), 630784, bl16k);
-  semap.insert_in_shard(shard_id_t(3), 516096, bl8k);
-  semap.insert_in_shard(shard_id_t(3), 634880, bl12k);
-  ECUtil::shard_extent_set_t want = semap.get_extent_set();
-
-  want[shard_id_t(0)].insert(516096, 8192);
-  want[shard_id_t(0)].insert(634880, 12288);
-  want[shard_id_t(4)].insert(516096, 8192);
-  want[shard_id_t(4)].insert(634880, 12288);
-
-  ceph_assert(0 == semap.decode(ec_impl, want, 2*1024*1024));
+  hobject_t hoid;
+  ECCommon::read_request_t read_request(want, false, object_size);
+  ASSERT_EQ(0, pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request));
+  for (auto [shard, read] : read_request.shard_reads) {
+    for (auto [off, len] : read.extents) {
+      semap.insert_in_shard(shard, off, create_buf(len));
+    }
+  }
+
+  ASSERT_EQ(0, semap.decode(ec_impl, want, object_size, nullptr, true));
  }
  
+TEST(ECCommon, decode) {
+  unsigned int k = 4;
+  unsigned int m = 2;
+  uint64_t chunk_size = 4096;
+  uint64_t object_size = k * 256 * 1024 + 4096 + 1;
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
  
-TEST(ECCommon, decode2)
-{
-  const unsigned int k = 4;
-  const unsigned int m = 2;
-  const uint64_t align_size = EC_ALIGN_SIZE;
-  const uint64_t swidth = k*align_size;
+  want[shard_id_t(1)].insert(256 * 1024, 4096);
+  want[shard_id_t(4)].insert(256 * 1024, 4096);
  
+  acting_set.insert_range(shard_id_t(1), 4);
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
  
-  ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
-  ECListenerStub listenerStub;
-  ASSERT_EQ(s.get_stripe_width(), swidth);
-  ASSERT_EQ(s.get_chunk_size(), swidth/k);
  
-  const std::vector<int> chunk_mapping = {}; // no remapping
-  ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl();
-  ecode->data_chunk_count = k;
-  ecode->chunk_count = k + m;
-  ErasureCodeInterfaceRef ec_impl(ecode);
-  ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+TEST(ECCommon, decode2)
+{
+  unsigned int k = 4;
+  unsigned int m = 2;
+  uint64_t chunk_size = 4096;
+  uint64_t object_size = 2104*1024;
+
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
  
-  ECUtil::shard_extent_map_t semap(&s);
-  bufferlist bl528k;
-  bl528k.append_zero(528*1024);
-  bufferlist bl524k;
-  bl524k.append_zero(524*1024);
-  semap.insert_in_shard(shard_id_t(0), 0, bl524k);
-  semap.insert_in_shard(shard_id_t(1), 0, bl528k);
-  semap.insert_in_shard(shard_id_t(3), 0, bl524k);
-  semap.insert_in_shard(shard_id_t(4), 0, bl528k);
-  ECUtil::shard_extent_set_t want(k + m);
-
-  //shard_want_to_read={1:[0~540672],2:[0~536576],3:[0~536576],4:[0~540672],5:[0~540672]}
    want[shard_id_t(1)].insert(0, 528*1024);
    want[shard_id_t(2)].insert(0, 524*1024);
    want[shard_id_t(3)].insert(0, 524*1024);
    want[shard_id_t(4)].insert(0, 528*1024);
    want[shard_id_t(5)].insert(0, 528*1024);
  
-  ceph_assert(0 == semap.decode(ec_impl, want, 2104*1024));
+  acting_set.insert(shard_id_t(0));
+  acting_set.insert(shard_id_t(1));
+  acting_set.insert(shard_id_t(3));
+  acting_set.insert(shard_id_t(4));
+
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
+
+TEST(ECCommon, decode3) {
+  /* For this problematic IO, we want to reads:
+   * first is readable - shard 0, 0~4k
+   * second is on missing shard - shard 2, 16k~4k
+   *
+   * Recovery would work out it needs to recover shard 2, so would need
+   * shards 0,1,3,4 - howecer it works out that shard 3 does not need a read
+   *                  because the object is off the end!
+   *
+   * So the reads we end up doing are to 0,1 and 4 only.
+   */
+  unsigned int k = 4;
+  unsigned int m = 1;
+  uint64_t chunk_size = 4096;
+  uint64_t object_size = 4 * chunk_size * k + 2 * chunk_size + 1;
+
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
+  want[shard_id_t(0)].insert(0, chunk_size);
+  want[shard_id_t(2)].insert(4 * chunk_size, chunk_size);
+
+  acting_set.insert(shard_id_t(0));
+  acting_set.insert(shard_id_t(1));
+  acting_set.insert(shard_id_t(3));
+  acting_set.insert(shard_id_t(4));
+
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
+
+TEST(ECCommon, decode4) {
+  const unsigned int k = 5;
+  const unsigned int m = 2;
+  const uint64_t chunk_size = 4096;
+  const uint64_t object_size = 3243718;
+
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
+  want[shard_id_t(0)].insert(544768, 106496);
+  want[shard_id_t(1)].insert(544151, 106799);
+  want[shard_id_t(2)].insert(540672, 106496);
+  want[shard_id_t(3)].insert(540672, 106496);
+  want[shard_id_t(4)].insert(540672, 106496);
+
+  acting_set.insert(shard_id_t(0));
+  acting_set.insert_range(shard_id_t(2), 4);
+
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
+
+TEST(ECCommon, decode5) {
+  const unsigned int k = 6;
+  const unsigned int m = 4;
+  const uint64_t chunk_size = 4096;
+  const uint64_t object_size = 3428595;
+
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
+  want[shard_id_t(0)].insert(0, 573440);
+  want[shard_id_t(1)].insert(0, 573440);
+  want[shard_id_t(2)].insert(0, 573440);
+  want[shard_id_t(3)].insert(0, 569587);
+  want[shard_id_t(4)].insert(0, 569344);
+  want[shard_id_t(5)].insert(0, 569344);
+
+  acting_set.insert(shard_id_t(0));
+  acting_set.insert(shard_id_t(3));
+  acting_set.insert_range(shard_id_t(6), 4);
+
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
+
+TEST(ECCommon, decode6) {
+  const unsigned int k = 8;
+  const unsigned int m = 4;
+  const uint64_t chunk_size = 4096;
+  const uint64_t object_size = 3092488;
+
+
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
+  want[shard_id_t(0)].insert(262144, 126976);
+  want[shard_id_t(1)].insert(262144, 126976);
+  want[shard_id_t(2)].insert(262144, 126976);
+  want[shard_id_t(3)].insert(262144, 126976);
+  want[shard_id_t(4)].insert(262144, 122880);
+  want[shard_id_t(5)].insert(262144, 122880);
+  want[shard_id_t(6)].insert(262144, 122880);
+  want[shard_id_t(7)].insert(262144, 122880);
+  want[shard_id_t(8)].insert(262144, 126976);
+  want[shard_id_t(9)].insert(262144, 126976);
+  want[shard_id_t(10)].insert(262144, 126976);
+
+  acting_set.insert(shard_id_t(0));
+  acting_set.insert_range(shard_id_t(2), 2);
+  acting_set.insert_range(shard_id_t(5), 5);
+
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
+
+TEST(ECCommon, decode7) {
+  const unsigned int k = 3;
+  const unsigned int m = 3;
+  const uint64_t chunk_size = 4096;
+  const uint64_t object_size = 89236;
+
+
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
+  want[shard_id_t(5)].insert(0, 32*1024);
+
+  acting_set.insert_range(shard_id_t(0), 3);
+
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
  }
 \ No newline at end of file
diff --git a/src/test/osd/TestECUtil.cc b/src/test/osd/TestECUtil.cc

index ccf93d312da5e8baa967b9541d16b21d1cce721f..c9fa63b8a6b46af2ae6ef839fff567b2f2020d61 100644 (file)
--- a/src/test/osd/TestECUtil.cc
+++ b/src/test/osd/TestECUtil.cc
@@ -646,7 +646,7 @@ TEST(ECUtil, slice_iterator)
    out_set.insert_range(shard_id_t(0), 3);
    shard_extent_map_t sem(&sinfo);
    {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
      ASSERT_TRUE(iter.get_out_bufferptrs().empty());
    }
  
@@ -660,7 +660,7 @@ TEST(ECUtil, slice_iterator)
    sem.insert_in_shard(shard_id_t(0), 0, a);
    sem.insert_in_shard(shard_id_t(1), 0, b);
    {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
  
      {
        auto out = iter.get_out_bufferptrs();
@@ -699,7 +699,7 @@ TEST(ECUtil, slice_iterator)
    sem.insert_in_shard(shard_id_t(1), 4096*4, e);
  
    {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
  
      {
        auto out = iter.get_out_bufferptrs();
@@ -755,7 +755,7 @@ TEST(ECUtil, slice_iterator)
    sem.insert_in_shard(shard_id_t(1), 4096*2, d);
  
    {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
  
      {
        auto out = iter.get_out_bufferptrs();
@@ -794,7 +794,7 @@ TEST(ECUtil, slice_iterator_subset_out)
    out_set.insert(shard_id_t(1));
    shard_extent_map_t sem(&sinfo);
    {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
      ASSERT_TRUE(iter.get_in_bufferptrs().empty());
      ASSERT_TRUE(iter.get_out_bufferptrs().empty());
    }
@@ -809,7 +809,7 @@ TEST(ECUtil, slice_iterator_subset_out)
    sem.insert_in_shard(shard_id_t(0), 0, a);
    sem.insert_in_shard(shard_id_t(1), 0, b);
    {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
  
      {
        auto in = iter.get_in_bufferptrs();
@@ -841,7 +841,7 @@ TEST(ECUtil, slice_iterator_subset_out)
    sem.insert_in_shard(shard_id_t(1), 4096*4, e);
  
    {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
  
      {
        auto in = iter.get_in_bufferptrs();
@@ -896,7 +896,7 @@ TEST(ECUtil, slice_iterator_subset_out)
    sem.insert_in_shard(shard_id_t(1), 4096*2, d);
  
    {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
  
      {
        auto in = iter.get_in_bufferptrs();
author	Alex Ainscow <aainscow@uk.ibm.com>
	Fri, 27 Jun 2025 15:00:56 +0000 (16:00 +0100)
committer	Alex Ainscow <aainscow@uk.ibm.com>
	Thu, 4 Sep 2025 21:08:52 +0000 (22:08 +0100)
src/include/interval_set.h		patch \| blob \| history
src/osd/ECBackend.cc		patch \| blob \| history
src/osd/ECCommon.cc		patch \| blob \| history
src/osd/ECTransaction.cc		patch \| blob \| history
src/osd/ECUtil.cc		patch \| blob \| history
src/osd/ECUtil.h		patch \| blob \| history
src/test/osd/TestECBackend.cc		patch \| blob \| history
src/test/osd/TestECUtil.cc		patch \| blob \| history