From fd809c4b7db86a4373d563f54dc345370e0fd9ab Mon Sep 17 00:00:00 2001
From: Alex Ainscow <aainscow@uk.ibm.com>
Date: Fri, 27 Jun 2025 16:00:56 +0100
Subject: [PATCH] osd: Deduplicate zeros in EC slice iterator

Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
(cherry picked from commit 06658fdac16dde95d20a8907511afb7fde7313da)
Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
---
 src/include/interval_set.h    |  10 ++
 src/osd/ECBackend.cc          |  50 ++++---
 src/osd/ECCommon.cc           |   3 +-
 src/osd/ECTransaction.cc      |   2 +-
 src/osd/ECUtil.cc             |  43 +++---
 src/osd/ECUtil.h              | 147 +++++++++++++++++--
 src/test/osd/TestECBackend.cc | 267 +++++++++++++++++++++++++---------
 src/test/osd/TestECUtil.cc    |  16 +-
 8 files changed, 408 insertions(+), 130 deletions(-)
diff --git a/src/include/interval_set.h b/src/include/interval_set.h
index 61392289c2afb..9ed8d3a9cabe2 100644
--- a/src/include/interval_set.h
+++ b/src/include/interval_set.h
@@ -999,4 +999,14 @@ public:
 template<typename T, template<typename, typename, typename ...> class C, bool strict>
 struct fmt::is_range<interval_set<T, C, strict>, char> : std::false_type {};
 
+template <typename T>  // Trait: is T an instantiation of interval_set? False by default.
+struct is_interval_set : std::false_type {};
+// Partial specialisation: true for every interval_set<T, C, strict>.
+template <typename T, template<typename, typename, typename ...> class C, bool strict>
+struct is_interval_set<interval_set<T, C, strict>> : std::true_type {};
+// Value shorthand for is_interval_set<T>::value.
+template <typename T>
+inline constexpr bool is_interval_set_v = is_interval_set<T>::value;
+
+#undef strict_mode_assert
 #endif
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index 0f91eba5bf868..112a65e85d4fa 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -223,19 +223,20 @@ void ECBackend::RecoveryBackend::handle_recovery_push(
     m->t.touch(coll, tobj);
   }
 
-  if (!op.data_included.empty()) {
-    uint64_t start = op.data_included.range_start();
-    uint64_t end = op.data_included.range_end();
-    ceph_assert(op.data.length() == (end - start));
-
-    m->t.write(
-      coll,
-      tobj,
-      start,
-      op.data.length(),
-      op.data);
-  } else {
-    ceph_assert(op.data.length() == 0);
+  ceph_assert(op.data.length() == op.data_included.size());
+  uint64_t tobj_size = 0;
+
+  uint64_t cursor = 0;
+  for (auto [off, len] : op.data_included) {
+    bufferlist bl;
+    if (len != op.data.length()) {
+      bl.substr_of(op.data, cursor, len);
+    } else {
+      bl = op.data;
+    }
+    m->t.write(coll, tobj, off, len, bl);
+    tobj_size = off + len;
+    cursor += len;
   }
 
   if (op.before_progress.first) {
@@ -246,6 +247,15 @@ void ECBackend::RecoveryBackend::handle_recovery_push(
       op.attrset);
   }
 
+  if (op.after_progress.data_complete) {
+    uint64_t shard_size = sinfo.object_size_to_shard_size(op.recovery_info.size,
+      get_parent()->whoami_shard().shard);
+    ceph_assert(shard_size >= tobj_size);
+    if (shard_size != tobj_size) {
+      m->t.truncate( coll, tobj, shard_size);
+    }
+  }
+
   if (op.after_progress.data_complete && !oneshot) {
     dout(10) << __func__ << ": Removing oid "
 	     << tobj.hobj << " from the temp collection" << dendl;
@@ -361,7 +371,7 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
 
   uint64_t aligned_size = ECUtil::align_next(op.obc->obs.oi.size);
 
-  int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size);
+  int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size, get_parent()->get_dpp(), true);
   ceph_assert(r == 0);
 
   // Finally, we don't want to write any padding, so truncate the buffer
@@ -601,22 +611,24 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
       if (after_progress.data_recovered_to >= op.obc->obs.oi.size) {
         after_progress.data_complete = true;
       }
+
       for (auto &&pg_shard: op.missing_on) {
         m->pushes[pg_shard].push_back(PushOp());
         PushOp &pop = m->pushes[pg_shard].back();
         pop.soid = op.hoid;
         pop.version = op.recovery_info.oi.get_version_for_shard(pg_shard.shard);
-        op.returned_data->get_shard_first_buffer(pg_shard.shard, pop.data);
+
+        op.returned_data->get_sparse_buffer(pg_shard.shard, pop.data, pop.data_included);
+        ceph_assert(pop.data.length() == pop.data_included.size());
+
         dout(10) << __func__ << ": pop shard=" << pg_shard
                  << ", oid=" << pop.soid
                  << ", before_progress=" << op.recovery_progress
 		 << ", after_progress=" << after_progress
 		 << ", pop.data.length()=" << pop.data.length()
+                 << ", pop.data_included=" << pop.data_included
 		 << ", size=" << op.obc->obs.oi.size << dendl;
-        if (pop.data.length())
-          pop.data_included.union_insert(
-            op.returned_data->get_shard_first_offset(pg_shard.shard),
-            pop.data.length());
+
         if (op.recovery_progress.first) {
           if (sinfo.is_nonprimary_shard(pg_shard.shard)) {
             if (pop.version == op.recovery_info.oi.version) {
diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc
index 0db49bc621a2e..377fd86452d54 100644
--- a/src/osd/ECCommon.cc
+++ b/src/osd/ECCommon.cc
@@ -487,7 +487,8 @@ struct ClientReadCompleter final : ECCommon::ReadCompleter {
       /* Decode any missing buffers */
       int r = res.buffers_read.decode(read_pipeline.ec_impl,
                                   req.shard_want_to_read,
-                                  req.object_size);
+                                  req.object_size,
+                                  read_pipeline.get_parent()->get_dpp());
       ceph_assert( r == 0 );
       dout(30) << __func__ << ": after decode: "
                << res.buffers_read.debug_string(2048, 8)
diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc
index 996a37f03d806..1142db8008f4a 100644
--- a/src/osd/ECTransaction.cc
+++ b/src/osd/ECTransaction.cc
@@ -68,7 +68,7 @@ void ECTransaction::Generate::encode_and_write() {
      */
     read_sem->zero_pad(plan.will_write);
     to_write.pad_with_other(plan.will_write, *read_sem);
-    r = to_write.encode_parity_delta(ec_impl, *read_sem);
+    r = to_write.encode_parity_delta(ec_impl, *read_sem, dpp);
   } else {
     r = to_write.encode(ec_impl);
   }
diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc
index b1522a7fbd8c7..911e314deaf80 100644
--- a/src/osd/ECUtil.cc
+++ b/src/osd/ECUtil.cc
@@ -475,19 +475,23 @@ void shard_extent_map_t::insert_parity_buffers() {
   }
 }
 
-slice_iterator<shard_id_t, extent_map> shard_extent_map_t::begin_slice_iterator(
-    const shard_id_set &out) {
-  return slice_iterator(extent_maps, out);
+slice_iterator shard_extent_map_t::begin_slice_iterator(
+    const shard_id_set &out,
+    DoutPrefixProvider *dpp,
+    const shard_id_set *dedup_zeros) {
+  return slice_iterator(extent_maps, out, dpp, dedup_zeros);
 }
 
 /* Encode parity chunks, using the encode_chunks interface into the
  * erasure coding. This generates all parity using full stripe writes.
  */
-int shard_extent_map_t::_encode(const ErasureCodeInterfaceRef &ec_impl) {
+int shard_extent_map_t::encode(const ErasureCodeInterfaceRef &ec_impl,
+    DoutPrefixProvider *dpp,
+    shard_id_set *dedup_zeros) {
   shard_id_set out_set = sinfo->get_parity_shards();
   bool rebuild_req = false;
 
-  for (auto iter = begin_slice_iterator(out_set); !iter.is_end(); ++iter) {
+  for (auto iter = begin_slice_iterator(out_set, dpp, dedup_zeros); !iter.is_end(); ++iter) {
     if (!iter.is_page_aligned()) {
       rebuild_req = true;
       break;
@@ -503,25 +507,19 @@ int shard_extent_map_t::_encode(const ErasureCodeInterfaceRef &ec_impl) {
 
   if (rebuild_req) {
     pad_and_rebuild_to_ec_align();
-    return _encode(ec_impl);
+    return encode(ec_impl, dpp, dedup_zeros);
   }
 
   return 0;
 }
 
-/* Encode parity chunks, using the encode_chunks interface into the
- * erasure coding. This generates all parity using full stripe writes.
- */
-int shard_extent_map_t::encode(const ErasureCodeInterfaceRef &ec_impl) {
-  return _encode(ec_impl);
-}
-
 /* Encode parity chunks, using the parity delta write interfaces on plugins
  * that support them.
  */
 int shard_extent_map_t::encode_parity_delta(
     const ErasureCodeInterfaceRef &ec_impl,
-    shard_extent_map_t &old_sem) {
+    shard_extent_map_t &old_sem,
+    DoutPrefixProvider *dpp) {
   shard_id_set out_set = sinfo->get_parity_shards();
 
   pad_and_rebuild_to_ec_align();
@@ -542,7 +540,7 @@ int shard_extent_map_t::encode_parity_delta(
 
     s.compute_ro_range();
 
-    for (auto iter = s.begin_slice_iterator(out_set); !iter.is_end(); ++iter) {
+    for (auto iter = s.begin_slice_iterator(out_set, dpp); !iter.is_end(); ++iter) {
       ceph_assert(iter.is_page_aligned());
       shard_id_map<bufferptr> &data_shards = iter.get_in_bufferptrs();
       shard_id_map<bufferptr> &parity_shards = iter.get_out_bufferptrs();
@@ -644,7 +642,9 @@ void shard_extent_map_t::trim(const shard_extent_set_t &trim_to) {
 
 int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl,
                                const shard_extent_set_t &want,
-                               uint64_t object_size) {
+                               uint64_t object_size,
+                               DoutPrefixProvider *dpp,
+                               bool dedup_zeros) {
   shard_id_set want_set;
   shard_id_set have_set;
   want.populate_shard_id_set(want_set);
@@ -679,11 +679,11 @@ int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl,
       decode_for_parity.intersection_of(want.at(shard), read_mask.at(shard));
       pad_on_shard(decode_for_parity, shard);
     }
-    r = _decode(ec_impl, want_set, decode_set);
+    r = _decode(ec_impl, want_set, decode_set, dpp);
   }
   if (!r && !encode_set.empty()) {
     pad_on_shards(want, encode_set);
-    r = _encode(ec_impl);
+    r = encode(ec_impl, dpp, dedup_zeros?&need_set:nullptr);
   }
 
   // If we failed to decode, then bail out, or the trimming below might fail.
@@ -702,9 +702,10 @@ int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl,
 
 int shard_extent_map_t::_decode(const ErasureCodeInterfaceRef &ec_impl,
                                 const shard_id_set &want_set,
-                                const shard_id_set &need_set) {
+                                const shard_id_set &need_set,
+                                DoutPrefixProvider *dpp) {
   bool rebuild_req = false;
-  for (auto iter = begin_slice_iterator(need_set); !iter.is_end(); ++iter) {
+  for (auto iter = begin_slice_iterator(need_set, dpp); !iter.is_end(); ++iter) {
     if (!iter.is_page_aligned()) {
       rebuild_req = true;
       break;
@@ -719,7 +720,7 @@ int shard_extent_map_t::_decode(const ErasureCodeInterfaceRef &ec_impl,
 
   if (rebuild_req) {
     pad_and_rebuild_to_ec_align();
-    return _decode(ec_impl, want_set, need_set);
+    return _decode(ec_impl, want_set, need_set, dpp);
   }
 
   compute_ro_range();
diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h
index 04ac579d69948..f31a0b9108388 100644
--- a/src/osd/ECUtil.h
+++ b/src/osd/ECUtil.h
@@ -67,9 +67,8 @@ using extent_map = interval_map<uint64_t, ceph::buffer::list, bl_split_merge,
  * K must a key suitable for a mini_flat_map.
  * T must be either an extent map or a reference to an extent map.
  */
-template <typename K, typename T>
 class slice_iterator {
-  mini_flat_map<K, T> &input;
+  mini_flat_map<shard_id_t, extent_map> &input;
   uint64_t offset = std::numeric_limits<uint64_t>::max();
   uint64_t length = std::numeric_limits<uint64_t>::max();
   uint64_t start = std::numeric_limits<uint64_t>::max();
@@ -79,8 +78,81 @@ class slice_iterator {
   shard_id_map<bufferptr> in;
   shard_id_map<bufferptr> out;
   const shard_id_set &out_set;
+  const shard_id_set *dedup_set;
+  DoutPrefixProvider *dpp;
+
+  /* zero dedup is used by the slice iterator to detect zero buffers and replace
+   * them with the dedup'd zero buffer. It keeps a replacement buffer which
+   * once full (bl.length() == len) can be used to swap out the input buffer.
+   */
+  struct zeros {
+    uint64_t off;   // offset of the extent this replacement buffer covers
+    uint64_t len;   // length of that extent; bl is complete once it reaches len
+    bufferlist bl;  // replacement contents accumulated by successive dedup() calls
+
+    zeros(uint64_t _off, uint64_t _len) : off(_off), len(_len) {}
+    // Append bp to bl, using append_zero2() for zero-filled EC_ALIGN_SIZE pages; true once bl is full.
+    bool dedup(bufferptr &bp) {
+      bool is_zeros = false;
+      uint64_t bp_len = bp.length();
+      uint64_t done = 0;  // bytes of bp already appended to bl (named so it does not shadow member off)
+      char *c_str = bp.c_str();
+      // Skip any non-aligned chunk.
+      uint64_t analysed = p2roundup((uintptr_t)c_str, EC_ALIGN_SIZE) - (uintptr_t)c_str;
+
+      while (done + analysed <= bp_len) {
+        bool new_is_zeros;
+        if (bp_len - done - analysed < EC_ALIGN_SIZE) {
+          new_is_zeros = false;  // a trailing partial page is always treated as data
+        } else {
+          new_is_zeros = mem_is_zero(c_str + done + analysed, EC_ALIGN_SIZE);
+        }
+        if (new_is_zeros != is_zeros && analysed) {  // run type changed: flush the finished run
+          if (is_zeros) {
+            bl.append_zero2(analysed);
+          } else {
+            bl.append(bufferptr(bp, done, analysed));
+          }
+          done += analysed;
+          analysed = 0;
+        }
+        is_zeros = new_is_zeros;
+        analysed += EC_ALIGN_SIZE;
+      }
+      if (is_zeros) {
+        bl.append_zero2(bp_len - done);
+      } else {
+        bl.append(bufferptr(bp, done, bp_len - done));
+      }
+
+      return bl.length() == len;
+    }
+  };
+
+  std::optional<shard_id_map<zeros>> zeros;
+
+  void zeros_dedup() {  // Feed the current slice's buffer for each tracked shard into its zeros entry.
+    for (auto &&[shard, _zeros] : *zeros) {
+      // A shard with no buffer in the current slice (neither out nor in) has nothing to analyse.
+      if (!out.contains(shard) && !in.contains(shard)) {
+        continue;
+      }
+
+      bufferptr &bp = out.contains(shard) ? out.at(shard) : in.at(shard);
+      if (_zeros.dedup(bp)) {  // true once the replacement bl has reached _zeros.len
+        ldpp_dout(dpp, 20) << __func__ << ": overwrite input[" << shard << "]="
+                           << _zeros.off << "~" << _zeros.len
+                           << " with bl=" << _zeros.bl << dendl;
+        input.at(shard).insert(_zeros.off, _zeros.len, _zeros.bl);
+        zeros->erase(shard);  // NOTE(review): erases from *zeros while range-iterating it - confirm the map tolerates this
+      }
+    }
+  }
 
   void advance() {
+    if (dedup_set) {
+      zeros_dedup();
+    }
     in.clear();
     out.clear();
     offset = start;
@@ -121,9 +193,15 @@ class slice_iterator {
         // Create a new buffer pointer for the result. We don't want the client
         // manipulating the ptr.
         if (out_set.contains(shard)) {
+          ldpp_dout(dpp, 20) << __func__ << " out[" << shard << "]="
+                             << start << "~" << (end - start)
+                             << dendl;
           out.emplace(
             shard, bufferptr(bl_iter.get_current_ptr(), 0, end - start));
         } else {
+          ldpp_dout(dpp, 20) << __func__ << " in[" << shard << "]="
+                   << start << "~" << (end - start)
+                   << dendl;
           in.emplace(
             shard, bufferptr(bl_iter.get_current_ptr(), 0, end - start));
         }
@@ -142,6 +220,9 @@ class slice_iterator {
               bl.invalidate_crc();
             }
             iters.at(shard).second = emap_iter.get_val().begin();
+            if (zeros) {
+              zeros->emplace(shard, emap_iter.get_off(), emap_iter.get_len());
+            }
           }
         }
       } else
@@ -169,15 +250,29 @@ class slice_iterator {
   }
 
 public:
-  slice_iterator(mini_flat_map<K, T> &_input, const shard_id_set &out_set) :
+  slice_iterator(
+      mini_flat_map<shard_id_t, extent_map> &_input,
+      const shard_id_set &out_set,
+      DoutPrefixProvider *_dpp,
+      const shard_id_set *dedup_set) :
     input(_input),
     iters(input.max_size()),
     in(input.max_size()),
     out(input.max_size()),
-    out_set(out_set) {
+    out_set(out_set),
+    dedup_set(dedup_set),
+    dpp(_dpp) {
+
+    if (dedup_set) {
+      zeros.emplace(input.max_size());
+    }
+
     for (auto &&[shard, emap] : input) {
       auto emap_iter = emap.begin();
       auto bl_iter = emap_iter.get_val().begin();
+      if (zeros) {
+        zeros->emplace(shard, emap_iter.get_off(), emap_iter.get_len());
+      }
       auto p = std::make_pair(std::move(emap_iter), std::move(bl_iter));
       iters.emplace(shard, std::move(p));
 
@@ -756,8 +851,10 @@ public:
   uint64_t end_offset;
   shard_id_map<extent_map> extent_maps;
 
-  slice_iterator<shard_id_t, extent_map> begin_slice_iterator(
-      const shard_id_set &out_set);
+  slice_iterator begin_slice_iterator(
+      const shard_id_set &out,
+      DoutPrefixProvider *dpp,
+      const shard_id_set *dedup_zeros = nullptr);
 
   /* This caculates the ro offset for an offset into a particular shard */
   uint64_t calc_ro_offset(raw_shard_id_t raw_shard, int shard_offset) const {
@@ -890,10 +987,12 @@ public:
   void append_zeros_to_ro_offset(uint64_t ro_offset);
   void insert_ro_extent_map(const extent_map &host_extent_map);
   extent_set get_extent_superset() const;
-  int encode(const ErasureCodeInterfaceRef &ec_impl);
-  int _encode(const ErasureCodeInterfaceRef &ec_impl);
+  int encode(const ErasureCodeInterfaceRef &ec_impl,
+    DoutPrefixProvider *dpp = nullptr,
+    shard_id_set *dedup_zeros = nullptr);
   int encode_parity_delta(const ErasureCodeInterfaceRef &ec_impl,
-                          shard_extent_map_t &old_sem);
+                          shard_extent_map_t &old_sem,
+                          DoutPrefixProvider *dpp);
 
   void pad_on_shards(const shard_extent_set_t &pad_to,
                      const shard_id_set &shards);
@@ -904,10 +1003,13 @@ public:
   void trim(const shard_extent_set_t &trim_to);
   int decode(const ErasureCodeInterfaceRef &ec_impl,
              const shard_extent_set_t &want,
-             uint64_t object_size);
+             uint64_t object_size,
+             DoutPrefixProvider *dpp = nullptr,
+             bool dedup_zeros = false);
   int _decode(const ErasureCodeInterfaceRef &ec_impl,
               const shard_id_set &want_set,
-              const shard_id_set &need_set);
+              const shard_id_set &need_set,
+              DoutPrefixProvider *dpp);
   void get_buffer(shard_id_t shard, uint64_t offset, uint64_t length,
                   buffer::list &append_to) const;
   void get_shard_first_buffer(shard_id_t shard, buffer::list &append_to) const;
@@ -977,6 +1079,29 @@ public:
     return changed;
   }
 
+  /* Append every buffer of shard for which is_zero_fast() is false to bl_out and
+   * record its offset~length in iset. bl_out must be empty on entry; buffers that
+   * are skipped leave gaps in iset. Does nothing if shard has no extent map. */
+  template <typename IntervalSetT> requires is_interval_set_v<IntervalSetT>
+  void get_sparse_buffer(shard_id_t shard, bufferlist &bl_out, IntervalSetT &iset) {
+    ceph_assert(bl_out.length() == 0);
+    if (!extent_maps.contains(shard)) {
+      return;
+    }
+    for (auto iter = extent_maps.at(shard).begin(); iter != extent_maps.at(shard).end(); ++iter) {
+      uint64_t off = iter.get_off();
+      bufferlist &bl = iter.get_val();
+      for (const auto &bp : bl.buffers()) {
+        uint64_t len = bp.length();
+        if (!bp.is_zero_fast()) {
+          iset.insert(off, len);
+          bl_out.append(bp);
+        }
+        off += len;
+      }
+    }
+  }
+
   friend std::ostream &operator<<(std::ostream &lhs,
                                   const shard_extent_map_t &rhs);
 
diff --git a/src/test/osd/TestECBackend.cc b/src/test/osd/TestECBackend.cc
index a84648d02996c..1dd4faa6e681d 100644
--- a/src/test/osd/TestECBackend.cc
+++ b/src/test/osd/TestECBackend.cc
@@ -141,25 +141,32 @@ public:
   int minimum_to_decode(const shard_id_set &want_to_read, const shard_id_set &available,
                         shard_id_set &minimum_set,
 			shard_id_map<std::vector<std::pair<int, int>>> *minimum_sub_chunks) override {
-    shard_id_t parity_shard_index(data_chunk_count);
+    bool recover = false;
     for (shard_id_t shard : want_to_read) {
       if (available.contains(shard)) {
         minimum_set.insert(shard);
       } else {
-        // Shard is missing.  Recover with every other shard and one parity
-        // for each missing shard.
-        for (shard_id_t i; i<data_chunk_count; ++i) {
-          if (available.contains(i)) {
-            minimum_set.insert(i);
-          } else {
-            minimum_set.insert(parity_shard_index);
-            ++parity_shard_index;
-          }
-
-          if (int(parity_shard_index) == chunk_count)
-            return -EIO; // Cannot recover.
+        recover = true;
+        break;
+      }
+    }
+
+    if (recover) {
+      minimum_set.clear();
+
+      // Shard is missing.  Recover with every other shard and one parity
+      // for each missing shard.
+      for (auto a : available) {
+        minimum_set.insert(a);
+        if (minimum_set.size() == data_chunk_count) {
+          break;
         }
       }
+
+      if (minimum_set.size() != data_chunk_count) {
+        minimum_set.clear();
+        return -EIO; // Cannot recover.
+      }
     }
 
     for (auto &&shard : minimum_set) {
@@ -237,7 +244,7 @@ public:
   int decode_chunks(const shard_id_set &want_to_read,
                     shard_id_map<bufferptr> &in, shard_id_map<bufferptr> &out) override
   {
-    if (in.size() < data_chunk_count) {
+    if (std::cmp_less(in.size(), data_chunk_count)) {
       ADD_FAILURE();
     }
     uint64_t len = 0;
@@ -258,6 +265,9 @@ public:
       if (len != bp.length()) {
         ADD_FAILURE();
       }
+      if (bp.is_zero_fast()) {
+        ADD_FAILURE();
+      }
     }
     return 0;
   }
@@ -1189,7 +1199,7 @@ TEST(ECCommon, get_remaining_shards)
       ECCommon::shard_read_t shard_read;
       shard_read.subchunk = ecode->default_sub_chunk;
       shard_read.extents.insert(0,4096);
-      unsigned int shard_id = i==missing_shard?parity_shard:i;
+      unsigned int shard_id = std::cmp_equal(i, missing_shard) ? parity_shard : i;
       shard_read.pg_shard = pg_shard_t(shard_id, shard_id_t(shard_id));
       ref.shard_reads[shard_id_t(shard_id)] = shard_read;
     }
@@ -1265,15 +1275,30 @@ TEST(ECCommon, encode)
   semap.encode(ec_impl);
 }
 
-TEST(ECCommon, decode)
+bufferlist create_buf(uint64_t len) {  // Test helper: builds a bufferlist of exactly len bytes via append_zero().
+  bufferlist bl;
+  // Each pass appends 0-4 pages of EC_ALIGN_SIZE, capped at the bytes still needed.
+  while (bl.length() < len) {
+    uint64_t pages = std::rand() % 5;  // may be 0, which makes len_to_add 0 for this pass
+    uint64_t len_to_add = std::min(len - bl.length(), pages * EC_ALIGN_SIZE);
+    bl.append_zero(len_to_add);
+  }
+  ceph_assert(bl.is_aligned(EC_ALIGN_SIZE));
+  ceph_assert(len == bl.length());
+  return bl;
+}
+
+
+void test_decode(unsigned int k, unsigned int m, uint64_t chunk_size, uint64_t object_size, const ECUtil::shard_extent_set_t &want, const shard_id_set &acting_set)
 {
-  const uint64_t align_size = EC_ALIGN_SIZE;
-  const uint64_t swidth = 3*align_size;
-  const unsigned int k = 3;
-  const unsigned int m = 2;
+  const uint64_t swidth = k*chunk_size;
 
   ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
   ECListenerStub listenerStub;
+  listenerStub.acting_shards.clear();
+  for (auto s : acting_set) {
+    listenerStub.acting_shards.insert(pg_shard_t(int(s), s));
+  }
   ASSERT_EQ(s.get_stripe_width(), swidth);
   ASSERT_EQ(s.get_chunk_size(), swidth/k);
 
@@ -1284,67 +1309,171 @@ TEST(ECCommon, decode)
   ErasureCodeInterfaceRef ec_impl(ecode);
   ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
 
+
   ECUtil::shard_extent_map_t semap(&s);
-  bufferlist bl12k;
-  bl12k.append_zero(12288);
-  bufferlist bl8k;
-  bl8k.append_zero(8192);
-  bufferlist bl16k;
-  bl16k.append_zero(16384);
-  semap.insert_in_shard(shard_id_t(1), 512000, bl12k);
-  semap.insert_in_shard(shard_id_t(1), 634880, bl12k);
-  semap.insert_in_shard(shard_id_t(2), 512000, bl12k);
-  semap.insert_in_shard(shard_id_t(2), 630784, bl16k);
-  semap.insert_in_shard(shard_id_t(3), 516096, bl8k);
-  semap.insert_in_shard(shard_id_t(3), 634880, bl12k);
-  ECUtil::shard_extent_set_t want = semap.get_extent_set();
-
-  want[shard_id_t(0)].insert(516096, 8192);
-  want[shard_id_t(0)].insert(634880, 12288);
-  want[shard_id_t(4)].insert(516096, 8192);
-  want[shard_id_t(4)].insert(634880, 12288);
-
-  ceph_assert(0 == semap.decode(ec_impl, want, 2*1024*1024));
+  hobject_t hoid;
+  ECCommon::read_request_t read_request(want, false, object_size);
+  ASSERT_EQ(0, pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request));
+  for (auto [shard, read] : read_request.shard_reads) {
+    for (auto [off, len] : read.extents) {
+      semap.insert_in_shard(shard, off, create_buf(len));
+    }
+  }
+
+  ASSERT_EQ(0, semap.decode(ec_impl, want, object_size, nullptr, true));
 }
 
+TEST(ECCommon, decode) {
+  unsigned int k = 4;
+  unsigned int m = 2;
+  uint64_t chunk_size = 4096;
+  uint64_t object_size = k * 256 * 1024 + 4096 + 1;
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
 
-TEST(ECCommon, decode2)
-{
-  const unsigned int k = 4;
-  const unsigned int m = 2;
-  const uint64_t align_size = EC_ALIGN_SIZE;
-  const uint64_t swidth = k*align_size;
+  want[shard_id_t(1)].insert(256 * 1024, 4096);
+  want[shard_id_t(4)].insert(256 * 1024, 4096);
 
+  acting_set.insert_range(shard_id_t(1), 4);
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
 
-  ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
-  ECListenerStub listenerStub;
-  ASSERT_EQ(s.get_stripe_width(), swidth);
-  ASSERT_EQ(s.get_chunk_size(), swidth/k);
 
-  const std::vector<int> chunk_mapping = {}; // no remapping
-  ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl();
-  ecode->data_chunk_count = k;
-  ecode->chunk_count = k + m;
-  ErasureCodeInterfaceRef ec_impl(ecode);
-  ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+TEST(ECCommon, decode2)
+{
+  unsigned int k = 4;
+  unsigned int m = 2;
+  uint64_t chunk_size = 4096;
+  uint64_t object_size = 2104*1024;
+
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
 
-  ECUtil::shard_extent_map_t semap(&s);
-  bufferlist bl528k;
-  bl528k.append_zero(528*1024);
-  bufferlist bl524k;
-  bl524k.append_zero(524*1024);
-  semap.insert_in_shard(shard_id_t(0), 0, bl524k);
-  semap.insert_in_shard(shard_id_t(1), 0, bl528k);
-  semap.insert_in_shard(shard_id_t(3), 0, bl524k);
-  semap.insert_in_shard(shard_id_t(4), 0, bl528k);
-  ECUtil::shard_extent_set_t want(k + m);
-
-  //shard_want_to_read={1:[0~540672],2:[0~536576],3:[0~536576],4:[0~540672],5:[0~540672]}
   want[shard_id_t(1)].insert(0, 528*1024);
   want[shard_id_t(2)].insert(0, 524*1024);
   want[shard_id_t(3)].insert(0, 524*1024);
   want[shard_id_t(4)].insert(0, 528*1024);
   want[shard_id_t(5)].insert(0, 528*1024);
 
-  ceph_assert(0 == semap.decode(ec_impl, want, 2104*1024));
+  acting_set.insert(shard_id_t(0));
+  acting_set.insert(shard_id_t(1));
+  acting_set.insert(shard_id_t(3));
+  acting_set.insert(shard_id_t(4));
+
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
+
+TEST(ECCommon, decode3) {
+  /* For this problematic IO, we want two reads:
+   * first is readable - shard 0, 0~4k
+   * second is on missing shard - shard 2, 16k~4k
+   *
+   * Recovery would work out it needs to recover shard 2, so would need
+   * shards 0,1,3,4 - however it works out that shard 3 does not need a read
+   *                  because the object is off the end!
+   *
+   * So the reads we end up doing are to 0,1 and 4 only.
+   */
+  unsigned int k = 4;
+  unsigned int m = 1;
+  uint64_t chunk_size = 4096;
+  uint64_t object_size = 4 * chunk_size * k + 2 * chunk_size + 1;
+
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
+  want[shard_id_t(0)].insert(0, chunk_size);
+  want[shard_id_t(2)].insert(4 * chunk_size, chunk_size);
+
+  acting_set.insert(shard_id_t(0));
+  acting_set.insert(shard_id_t(1));
+  acting_set.insert(shard_id_t(3));
+  acting_set.insert(shard_id_t(4));
+
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
+
+TEST(ECCommon, decode4) {
+  const unsigned int k = 5;
+  const unsigned int m = 2;
+  const uint64_t chunk_size = 4096;
+  const uint64_t object_size = 3243718;
+  // Want shards 0-4; shard 1's extent is not 4096-aligned and shard 1 is never inserted into acting_set.
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
+  want[shard_id_t(0)].insert(544768, 106496);
+  want[shard_id_t(1)].insert(544151, 106799);
+  want[shard_id_t(2)].insert(540672, 106496);
+  want[shard_id_t(3)].insert(540672, 106496);
+  want[shard_id_t(4)].insert(540672, 106496);
+
+  acting_set.insert(shard_id_t(0));
+  acting_set.insert_range(shard_id_t(2), 4);  // presumably (first, count), i.e. shards 2-5 - TODO confirm
+
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
+
+TEST(ECCommon, decode5) {
+  const unsigned int k = 6;
+  const unsigned int m = 4;
+  const uint64_t chunk_size = 4096;
+  const uint64_t object_size = 3428595;
+  // Want shards 0-5 from offset 0 (shard 3's length is not 4096-aligned); of those only 0 and 3 are acting.
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
+  want[shard_id_t(0)].insert(0, 573440);
+  want[shard_id_t(1)].insert(0, 573440);
+  want[shard_id_t(2)].insert(0, 573440);
+  want[shard_id_t(3)].insert(0, 569587);
+  want[shard_id_t(4)].insert(0, 569344);
+  want[shard_id_t(5)].insert(0, 569344);
+
+  acting_set.insert(shard_id_t(0));
+  acting_set.insert(shard_id_t(3));
+  acting_set.insert_range(shard_id_t(6), 4);  // presumably (first, count), i.e. shards 6-9 - TODO confirm
+
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
+
+TEST(ECCommon, decode6) {
+  const unsigned int k = 8;
+  const unsigned int m = 4;
+  const uint64_t chunk_size = 4096;
+  const uint64_t object_size = 3092488;
+  // Want shards 0-10, all starting at 262144; acting_set is built from shard 0 plus two insert_range() calls.
+
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
+  want[shard_id_t(0)].insert(262144, 126976);
+  want[shard_id_t(1)].insert(262144, 126976);
+  want[shard_id_t(2)].insert(262144, 126976);
+  want[shard_id_t(3)].insert(262144, 126976);
+  want[shard_id_t(4)].insert(262144, 122880);
+  want[shard_id_t(5)].insert(262144, 122880);
+  want[shard_id_t(6)].insert(262144, 122880);
+  want[shard_id_t(7)].insert(262144, 122880);
+  want[shard_id_t(8)].insert(262144, 126976);
+  want[shard_id_t(9)].insert(262144, 126976);
+  want[shard_id_t(10)].insert(262144, 126976);
+
+  acting_set.insert(shard_id_t(0));
+  acting_set.insert_range(shard_id_t(2), 2);  // presumably (first, count), i.e. shards 2-3 - TODO confirm
+  acting_set.insert_range(shard_id_t(5), 5);  // presumably shards 5-9 - TODO confirm
+
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
+
+TEST(ECCommon, decode7) {
+  const unsigned int k = 3;
+  const unsigned int m = 3;
+  const uint64_t chunk_size = 4096;
+  const uint64_t object_size = 89236;
+  // Only shard 5 is wanted; acting_set is filled by a single insert_range() starting at shard 0.
+
+  ECUtil::shard_extent_set_t want(k+m);
+  shard_id_set acting_set;
+  want[shard_id_t(5)].insert(0, 32*1024);
+
+  acting_set.insert_range(shard_id_t(0), 3);  // presumably (first, count), i.e. shards 0-2 - TODO confirm
+
+  test_decode(k, m, chunk_size, object_size, want, acting_set);
+}
\ No newline at end of file
diff --git a/src/test/osd/TestECUtil.cc b/src/test/osd/TestECUtil.cc
index ccf93d312da5e..c9fa63b8a6b46 100644
--- a/src/test/osd/TestECUtil.cc
+++ b/src/test/osd/TestECUtil.cc
@@ -646,7 +646,7 @@ TEST(ECUtil, slice_iterator)
   out_set.insert_range(shard_id_t(0), 3);
   shard_extent_map_t sem(&sinfo);
   {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
     ASSERT_TRUE(iter.get_out_bufferptrs().empty());
   }
 
@@ -660,7 +660,7 @@ TEST(ECUtil, slice_iterator)
   sem.insert_in_shard(shard_id_t(0), 0, a);
   sem.insert_in_shard(shard_id_t(1), 0, b);
   {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
 
     {
       auto out = iter.get_out_bufferptrs();
@@ -699,7 +699,7 @@ TEST(ECUtil, slice_iterator)
   sem.insert_in_shard(shard_id_t(1), 4096*4, e);
 
   {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
 
     {
       auto out = iter.get_out_bufferptrs();
@@ -755,7 +755,7 @@ TEST(ECUtil, slice_iterator)
   sem.insert_in_shard(shard_id_t(1), 4096*2, d);
 
   {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
 
     {
       auto out = iter.get_out_bufferptrs();
@@ -794,7 +794,7 @@ TEST(ECUtil, slice_iterator_subset_out)
   out_set.insert(shard_id_t(1));
   shard_extent_map_t sem(&sinfo);
   {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
     ASSERT_TRUE(iter.get_in_bufferptrs().empty());
     ASSERT_TRUE(iter.get_out_bufferptrs().empty());
   }
@@ -809,7 +809,7 @@ TEST(ECUtil, slice_iterator_subset_out)
   sem.insert_in_shard(shard_id_t(0), 0, a);
   sem.insert_in_shard(shard_id_t(1), 0, b);
   {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
 
     {
       auto in = iter.get_in_bufferptrs();
@@ -841,7 +841,7 @@ TEST(ECUtil, slice_iterator_subset_out)
   sem.insert_in_shard(shard_id_t(1), 4096*4, e);
 
   {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
 
     {
       auto in = iter.get_in_bufferptrs();
@@ -896,7 +896,7 @@ TEST(ECUtil, slice_iterator_subset_out)
   sem.insert_in_shard(shard_id_t(1), 4096*2, d);
 
   {
-    auto iter = sem.begin_slice_iterator(out_set);
+    auto iter = sem.begin_slice_iterator(out_set, nullptr);
 
     {
       auto in = iter.get_in_bufferptrs();
-- 
2.39.5