osd: Multiple Decode fixes.

author Alex Ainscow <aainscow@uk.ibm.com>

Wed, 18 Jun 2025 19:46:49 +0000 (20:46 +0100)

committer Alex Ainscow <aainscow@uk.ibm.com>

Fri, 25 Jul 2025 07:43:09 +0000 (08:43 +0100)
author Alex Ainscow <aainscow@uk.ibm.com>
Wed, 18 Jun 2025 19:46:49 +0000 (20:46 +0100)
committer Alex Ainscow <aainscow@uk.ibm.com>
Fri, 25 Jul 2025 07:43:09 +0000 (08:43 +0100)
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc

index 883fbb58189f0e0e862bf24d4c05bbe0ed7affaa..dd802c5cc0de976e7fefb65b1e652a4e70e185d5 100644 (file)
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -312,7 +312,7 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
      const hobject_t &hoid,
      ECUtil::shard_extent_map_t &&buffers_read,
      std::optional<map<string, bufferlist, less<>>> attrs,
-    const ECUtil::shard_extent_set_t &want_to_read,
+    read_request_t &req,
      RecoveryMessages *m) {
    dout(10) << __func__ << ": returned " << hoid << " " << buffers_read << dendl;
    ceph_assert(recovery_ops.contains(hoid));
@@ -341,6 +341,13 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
        ceph_assert(op.obc);
        op.recovery_info.size = op.obc->obs.oi.size;
        op.recovery_info.oi = op.obc->obs.oi;
+
+      // We didn't know the size before, meaning the zero for decode calculations
+      // will be off. Recalculate them!
+      req.object_size = op.obc->obs.oi.size;
+      int r = read_pipeline.get_min_avail_to_read_shards(
+        op.hoid, true, false, req);
+      ceph_assert(r == 0);
      }
    }
    ceph_assert(op.xattrs.size());
@@ -352,7 +359,7 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
    sinfo.ro_size_to_read_mask(op.recovery_info.size, read_mask);
    ECUtil::shard_extent_set_t shard_want_to_read(sinfo.get_k_plus_m());
  
-  for (auto &[shard, eset] : want_to_read) {
+  for (auto &[shard, eset] : req.shard_want_to_read) {
      /* Read buffers do not need recovering! */
      if (buffers_read.contains(shard)) {
        continue;
@@ -371,6 +378,11 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
  
    uint64_t aligned_size = ECUtil::align_next(op.obc->obs.oi.size);
  
+  dout(20) << __func__ << " before decode: oid=" << op.hoid << " EC_DEBUG_BUFFERS: "
+         << op.returned_data->debug_string(2048, 0)
+         << dendl;
+
+  op.returned_data->add_zero_padding_for_decode(req.zeros_for_decode);
    int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size, get_parent()->get_dpp(), true);
    ceph_assert(r == 0);
  
@@ -386,7 +398,7 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
    }
  
    dout(20) << __func__ << ": oid=" << op.hoid << dendl;
-  dout(20) << __func__ << "EC_DEBUG_BUFFERS: "
+  dout(20) << __func__ << " after decode: oid=" << op.hoid << " EC_DEBUG_BUFFERS: "
             << op.returned_data->debug_string(2048, 0)
             << dendl;
  
@@ -442,7 +454,7 @@ struct RecoveryReadCompleter : ECCommon::ReadCompleter {
        hoid,
        std::move(res.buffers_read),
        res.attrs,
-      req.shard_want_to_read,
+      req,
        &rm);
    }
  
@@ -1250,6 +1262,13 @@ void ECBackend::handle_sub_read_reply(
      rop.debug_log.emplace_back(ECUtil::ERROR, op.from, complete.buffers_read);
      complete.buffers_read.erase_shard(from.shard);
      complete.processed_read_requests.erase(from.shard);
+    // If we are doing redundant reads, then we must take care that any failed
+    // reads are not replaced with a zero buffer. When fast_reads are disabled,
+    // the send_all_remaining_reads() call will replace the zeros_for_decode
+    // based on the recovery read.
+    if (rop.do_redundant_reads) {
+      rop.to_read.at(hoid).zeros_for_decode.erase(from.shard);
+    }
      dout(20) << __func__ << " shard=" << from << " error=" << err << dendl;
    }
  
@@ -1263,9 +1282,10 @@ void ECBackend::handle_sub_read_reply(
    rop.in_progress.erase(from);
    unsigned is_complete = 0;
    bool need_resend = false;
+  bool all_sub_reads_done = rop.in_progress.empty();
    // For redundant reads check for completion as each shard comes in,
    // or in a non-recovery read check for completion once all the shards read.
-  if (rop.do_redundant_reads || rop.in_progress.empty()) {
+  if (rop.do_redundant_reads || all_sub_reads_done) {
      for (auto &&[oid, read_result]: rop.complete) {
        shard_id_set have;
        read_result.processed_read_requests.populate_shard_id_set(have);
@@ -1275,29 +1295,30 @@ void ECBackend::handle_sub_read_reply(
            populate_shard_id_set(want_to_read);
  
        dout(20) << __func__ << " read_result: " << read_result << dendl;
+      // If all reads are done, we can safely assume that zero buffers can
+      // be applied.
+      if (all_sub_reads_done) {
+        rop.to_read.at(oid).zeros_for_decode.populate_shard_id_set(have);
+      }
  
        int err = ec_impl->minimum_to_decode(want_to_read, have, dummy_minimum,
                                              nullptr);
        if (err) {
          dout(20) << __func__ << " minimum_to_decode failed" << dendl;
-        if (rop.in_progress.empty()) {
+        if (all_sub_reads_done) {
            // If we don't have enough copies, try other pg_shard_ts if available.
            // During recovery there may be multiple osds with copies of the same shard,
            // so getting EIO from one may result in multiple passes through this code path.
-          if (!rop.do_redundant_reads) {
-            rop.debug_log.emplace_back(ECUtil::REQUEST_MISSING, op.from);
-            int r = read_pipeline.send_all_remaining_reads(oid, rop);
-            if (r == 0) {
-              // We found that new reads are required to do a decode.
-              need_resend = true;
-              continue;
-            } else if (r > 0) {
-              // No new reads were requested. This means that some parity
-              // shards can be assumed to be zeros.
-              err = 0;
-            }
-            // else insufficient shards are available, keep the errors.
+
+          rop.debug_log.emplace_back(ECUtil::REQUEST_MISSING, op.from);
+          int r = read_pipeline.send_all_remaining_reads(oid, rop);
+          if (r == 0 && !rop.do_redundant_reads) {
+            // We found that new reads are required to do a decode.
+            need_resend = true;
+            continue;
            }
+          // else insufficient shards are available, keep the errors.
+
            // Couldn't read any additional shards so handle as completed with errors
            // We don't want to confuse clients / RBD with objectstore error
            // values in particular ENOENT.  We may have different error returns
@@ -1341,6 +1362,17 @@ void ECBackend::handle_sub_read_reply(
      dout(20) << __func__ << " Complete: " << rop << dendl;
      rop.trace.event("ec read complete");
      rop.debug_log.emplace_back(ECUtil::COMPLETE, op.from);
+
+    /* If do_redundant_reads is set then there might be some in progress
+     * reads remaining.  We need to make sure that these non-read shards
+     * do not get padded. If there was no in progress read, then the zero
+     * padding is allowed to stay.
+     */
+    for (auto pg_shard : rop.in_progress) {
+      for (auto &&[oid, read] : rop.to_read) {
+        read.zeros_for_decode.erase(pg_shard.shard);
+      }
+    }
      read_pipeline.complete_read_op(std::move(rop));
    } else {
      dout(10) << __func__ << " readop not complete: " << rop << dendl;
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h

index 0eafbf4acb69a0efd132c2a8b0300cafd4990a5d..15f358b313db261eeff5bfd0ad4ed4cdd5189a88 100644 (file)
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -335,7 +335,7 @@ class ECBackend : public ECCommon {
          ECUtil::shard_extent_map_t &&buffers_read,
          std::optional<std::map<std::string, ceph::buffer::list, std::less<>>>
            attrs,
-        const ECUtil::shard_extent_set_t &want_to_read,
+        read_request_t &req,
          RecoveryMessages *m);
      void handle_recovery_push(
          const PushOp &op,
diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc

index 0c66957f9c8c78338cb7bdbd6ee2306b9221b4aa..316de749131663fabeddbec9dde19ceafa5a687d 100644 (file)
--- a/src/osd/ECCommon.cc
+++ b/src/osd/ECCommon.cc
@@ -252,12 +252,20 @@ int ECCommon::ReadPipeline::get_min_avail_to_read_shards(
       * redundant reads is set, then we want to have the same reads on
       * every extent. Otherwise, we need to read every shard only if the
       * necessary shard is missing.
+     * In some (recovery) scenarios, we can "want" a shard, but not "need" to
+     * read it.  This typically happens when we do not have a data shard and the
+     * recovery for this will read enough shards to also generate all the parity.
+     *
+     * Since parity shards are often larger than data shards, we must make sure
+     * to read the extra bit!
       */
-    if (!have.contains(shard) || do_redundant_reads) {
+    if (!have.contains(shard) || do_redundant_reads ||
+        (want.contains(shard) && !need_set.contains(shard))) {
        extra_extents.union_of(extent_set);
      }
    }
  
+  read_request.zeros_for_decode.clear();
    for (auto &shard: need_set) {
      if (!have.contains(shard)) {
        continue;
@@ -279,6 +287,15 @@ int ECCommon::ReadPipeline::get_min_avail_to_read_shards(
        shard_read.extents.intersection_of(extents, read_mask.at(shard));
      }
  
+    if (zero_mask.contains(shard)) {
+      extents.intersection_of(zero_mask.at(shard));
+
+      /* Any remaining extents can be assumed ot be zeros... so record these. */
+      if (!extents.empty()) {
+        read_request.zeros_for_decode.emplace(shard, std::move(extents));
+      }
+    }
+
      if (!shard_read.extents.empty()) {
        read_request.shard_reads[shard_id] = std::move(shard_read);
      }
@@ -376,7 +393,7 @@ int ECCommon::ReadPipeline::get_remaining_shards(
      }
    }
  
-  return read_request.shard_reads.empty()?1:0;
+  return 0;
  }
  
  void ECCommon::ReadPipeline::start_read_op(
@@ -552,6 +569,7 @@ struct ClientReadCompleter final : ECCommon::ReadCompleter {
                 << res.buffers_read.debug_string(2048, 0)
                 << dendl;
        /* Decode any missing buffers */
+      res.buffers_read.add_zero_padding_for_decode(req.zeros_for_decode);
        int r = res.buffers_read.decode(read_pipeline.ec_impl,
                                    req.shard_want_to_read,
                                    req.object_size,
diff --git a/src/osd/ECCommon.h b/src/osd/ECCommon.h

index 13be9c94857eb8672418cae56fdd4ef737b3582a..6e935a8ea13fbb156a8b05cee32bd4b8c19a8fa6 100644 (file)
--- a/src/osd/ECCommon.h
+++ b/src/osd/ECCommon.h
@@ -101,6 +101,7 @@ struct ECCommon {
      const std::list<ec_align_t> to_read;
      const uint32_t flags = 0;
      const ECUtil::shard_extent_set_t shard_want_to_read;
+    ECUtil::shard_extent_set_t zeros_for_decode;
      shard_id_map<shard_read_t> shard_reads;
      bool want_attrs = false;
      uint64_t object_size;
@@ -112,6 +113,7 @@ struct ECCommon {
        to_read(to_read),
        flags(to_read.front().flags),
        shard_want_to_read(shard_want_to_read),
+      zeros_for_decode(shard_want_to_read.get_max_shards()),
        shard_reads(shard_want_to_read.get_max_shards()),
        want_attrs(want_attrs),
        object_size(object_size) {}
@@ -119,6 +121,7 @@ struct ECCommon {
      read_request_t(const ECUtil::shard_extent_set_t &shard_want_to_read,
                 bool want_attrs, uint64_t object_size) :
        shard_want_to_read(shard_want_to_read),
+      zeros_for_decode(shard_want_to_read.get_max_shards()),
        shard_reads(shard_want_to_read.get_max_shards()),
        want_attrs(want_attrs),
        object_size(object_size) {}
@@ -129,6 +132,7 @@ struct ECCommon {
        os << "read_request_t(to_read=[" << to_read << "]"
            << ", flags=" << flags
            << ", shard_want_to_read=" << shard_want_to_read
+          << ", zeros_for_decode=" << zeros_for_decode
            << ", shard_reads=" << shard_reads
            << ", want_attrs=" << want_attrs
            << ")";
diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc

index 911e314deaf802db2d60c6c8bc6d8f9161cdb5c8..399d915549a5888f69ab100705e5774c9bb3f94e 100644 (file)
--- a/src/osd/ECUtil.cc
+++ b/src/osd/ECUtil.cc
@@ -571,11 +571,7 @@ void shard_extent_map_t::pad_on_shards(const shard_extent_set_t &pad_to,
      if (!pad_to.contains(shard)) {
        continue;
      }
-    for (auto &[off, length] : pad_to.at(shard)) {
-      bufferlist bl;
-      bl.push_back(buffer::create_aligned(length, EC_ALIGN_SIZE));
-      insert_in_shard(shard, off, bl);
-    }
+    pad_on_shard(pad_to.at(shard), shard);
    }
  }
  
@@ -585,10 +581,14 @@ void shard_extent_map_t::pad_on_shard(const extent_set &pad_to,
    if (pad_to.size() == 0) {
      return;
    }
+
    for (auto &[off, length] : pad_to) {
      bufferlist bl;
-    bl.push_back(buffer::create_aligned(length, EC_ALIGN_SIZE));
-    insert_in_shard(shard, off, bl);
+    uint64_t start = align_prev(off);
+    uint64_t end = align_next(off + length);
+
+    bl.push_back(buffer::create_aligned(end - start, EC_ALIGN_SIZE));
+    insert_in_shard(shard, start, bl);
    }
  }
  
@@ -599,11 +599,7 @@ void shard_extent_map_t::pad_on_shards(const extent_set &pad_to,
      return;
    }
    for (auto &shard : shards) {
-    for (auto &[off, length] : pad_to) {
-      bufferlist bl;
-      bl.push_back(buffer::create_aligned(length, EC_ALIGN_SIZE));
-      insert_in_shard(shard, off, bl);
-    }
+    pad_on_shard(pad_to, shard);
    }
  }
  
@@ -657,32 +653,44 @@ int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl,
      return 0;
    }
  
-  if (add_zero_padding_for_decode(object_size, need_set)) {
-    // We added some zero buffers, which means our have and need set may change
-    extent_maps.populate_bitset_set(have_set);
-    need_set = shard_id_set::difference(want_set, have_set);
-  }
-
    shard_id_set decode_set = shard_id_set::intersection(need_set, sinfo->get_data_shards());
    shard_id_set encode_set = shard_id_set::intersection(need_set, sinfo->get_parity_shards());
-  shard_extent_set_t read_mask(sinfo->get_k_plus_m());
-  sinfo->ro_size_to_read_mask(object_size, read_mask);
+
+  if (!encode_set.empty()) {
+    shard_extent_set_t read_mask(sinfo->get_k_plus_m());
+    sinfo->ro_size_to_read_mask(object_size, read_mask);
+
+    /* The function has been asked to "decode" parity. To achieve this, we
+     * need all the data shards to be present... So first see if there are
+     * any missing...
+     */
+    shard_id_set decode_for_parity_shards = shard_id_set::difference(sinfo->get_data_shards(), have_set);
+    decode_for_parity_shards = shard_id_set::intersection(decode_for_parity_shards, read_mask.get_shard_id_set());
+
+    if (!decode_for_parity_shards.empty()) {
+      /* So there are missing data shards which need decoding before we encode,
+       * We need to add these to the decode set and insert buffers for the
+       * decode to happen.
+       */
+      decode_set.insert(decode_for_parity_shards);
+      need_set.insert(decode_for_parity_shards);
+      extent_set decode_for_parity = get_extent_superset();
+
+      for (auto shard : decode_for_parity_shards) {
+        extent_set parity_pad;
+        parity_pad.intersection_of(decode_for_parity, read_mask.at(shard));
+        pad_on_shard(decode_for_parity, shard);
+      }
+    }
+  }
+
    int r = 0;
    if (!decode_set.empty()) {
      pad_on_shards(want, decode_set);
-    /* If we are going to be encoding, we need to make sure all the necessary
-     * shards are decoded. The get_min_available functions should have already
-     * worked out what needs to be read for this.
-     */
-    for (auto shard : encode_set) {
-      extent_set decode_for_parity;
-      decode_for_parity.intersection_of(want.at(shard), read_mask.at(shard));
-      pad_on_shard(decode_for_parity, shard);
-    }
      r = _decode(ec_impl, want_set, decode_set, dpp);
    }
    if (!r && !encode_set.empty()) {
-    pad_on_shards(want, encode_set);
+    pad_on_shards(get_extent_superset(), sinfo->get_parity_shards());
      r = encode(ec_impl, dpp, dedup_zeros?&need_set:nullptr);
    }
  
@@ -705,14 +713,20 @@ int shard_extent_map_t::_decode(const ErasureCodeInterfaceRef &ec_impl,
                                  const shard_id_set &need_set,
                                  DoutPrefixProvider *dpp) {
    bool rebuild_req = false;
+
    for (auto iter = begin_slice_iterator(need_set, dpp); !iter.is_end(); ++iter) {
      if (!iter.is_page_aligned()) {
        rebuild_req = true;
        break;
      }
+
      shard_id_map<bufferptr> &in = iter.get_in_bufferptrs();
      shard_id_map<bufferptr> &out = iter.get_out_bufferptrs();
  
+    if (out.empty()) {
+      continue;
+    }
+
      if (int ret = ec_impl->decode_chunks(want_set, in, out)) {
        return ret;
      }
diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h

index 699bc10ef6c1653832fc89387d321c64e17f8a39..5452ec67f3632c2d156ed0f78c5ece9930d890c7 100644 (file)
--- a/src/osd/ECUtil.h
+++ b/src/osd/ECUtil.h
@@ -1062,16 +1062,9 @@ public:
      }
    }
  
-  bool add_zero_padding_for_decode(uint64_t object_size, shard_id_set &exclude_set) {
-    shard_extent_set_t zeros(sinfo->get_k_plus_m());
-    sinfo->ro_size_to_zero_mask(object_size, zeros);
-    extent_set superset = get_extent_superset();
+  void add_zero_padding_for_decode(ECUtil::shard_extent_set_t &zeros) {
      bool changed = false;
      for (auto &&[shard, z] : zeros) {
-      if (exclude_set.contains(shard)) {
-        continue;
-      }
-      z.intersection_of(superset);
        for (auto [off, len] : z) {
          changed = true;
          bufferlist bl;
@@ -1083,8 +1076,6 @@ public:
      if (changed) {
        compute_ro_range();
      }
-
-    return changed;
    }
  
    template <typename IntervalSetT> requires is_interval_set_v<IntervalSetT>
diff --git a/src/test/osd/TestECBackend.cc b/src/test/osd/TestECBackend.cc

index 1dd4faa6e681dbd8bf2190985bbb2b4378dcd364..ca299abd9631108574b410788c3b46a4cfbe9c40 100644 (file)
--- a/src/test/osd/TestECBackend.cc
+++ b/src/test/osd/TestECBackend.cc
@@ -1320,6 +1320,7 @@ void test_decode(unsigned int k, unsigned int m, uint64_t chunk_size, uint64_t o
      }
    }
  
+  semap.add_zero_padding_for_decode(read_request.zeros_for_decode);
    ASSERT_EQ(0, semap.decode(ec_impl, want, object_size, nullptr, true));
  }
  
@@ -1476,4 +1477,4 @@ TEST(ECCommon, decode7) {
    acting_set.insert_range(shard_id_t(0), 3);
  
    test_decode(k, m, chunk_size, object_size, want, acting_set);
-}
-\ No newline at end of file
+}
author	Alex Ainscow <aainscow@uk.ibm.com>
	Wed, 18 Jun 2025 19:46:49 +0000 (20:46 +0100)
committer	Alex Ainscow <aainscow@uk.ibm.com>
	Fri, 25 Jul 2025 07:43:09 +0000 (08:43 +0100)
src/osd/ECBackend.cc		patch \| blob \| history
src/osd/ECBackend.h		patch \| blob \| history
src/osd/ECCommon.cc		patch \| blob \| history
src/osd/ECCommon.h		patch \| blob \| history
src/osd/ECUtil.cc		patch \| blob \| history
src/osd/ECUtil.h		patch \| blob \| history
src/test/osd/TestECBackend.cc		patch \| blob \| history