]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/osd: port changes from classial FastEC
authorRadoslaw Zarzynski <rzarzyns@redhat.com>
Fri, 28 Nov 2025 12:55:10 +0000 (12:55 +0000)
committerRadoslaw Zarzynski <rzarzyns@redhat.com>
Tue, 27 Jan 2026 19:43:00 +0000 (19:43 +0000)
This commit ports changes from c116b8615d68a3926dc78a4965cc0a28ff85d4f2.

Signed-off-by: Radoslaw Zarzynski <rzarzyns@redhat.com>
src/crimson/osd/ec_backend.cc

index 4517df37f1823fee3e71c4be2f0f8aa15e6467f7..8d9cf2e959868b1f9603e0eb1305d30c1a00bef2 100644 (file)
@@ -693,6 +693,13 @@ ECBackend::handle_rep_read_reply(ECSubReadReply& mop)
     auto &complete = rop.complete.at(hoid);
     complete.errors.emplace(from, err);
     complete.buffers_read.erase_shard(from.shard);
+    // If we are doing redundant reads, then we must take care that any failed
+    // reads are not replaced with a zero buffer. When fast_reads are disabled,
+    // the send_all_remaining_reads() call will replace the zeros_for_decode
+    // based on the recovery read.
+    if (rop.do_redundant_reads) {
+      rop.to_read.at(hoid).zeros_for_decode.erase(from.shard);
+    }
     complete.processed_read_requests.erase(from.shard);
     logger().debug("{} shard={} error={}", __func__, from, err);
   }
@@ -710,7 +717,8 @@ ECBackend::handle_rep_read_reply(ECSubReadReply& mop)
   // or in a non-recovery read check for completion once all the shards read.
   unsigned is_complete = 0;
   bool need_resend = false;
-  if (rop.do_redundant_reads || rop.in_progress.empty()) {
+  bool all_sub_reads_done = rop.in_progress.empty();
+  if (rop.do_redundant_reads || all_sub_reads_done) {
     for (auto &&[oid, read_result]: rop.complete) {
       shard_id_set have;
       read_result.processed_read_requests.populate_shard_id_set(have);
@@ -719,6 +727,12 @@ ECBackend::handle_rep_read_reply(ECSubReadReply& mop)
       rop.to_read.at(oid).shard_want_to_read.
           populate_shard_id_set(want_to_read);
 
+      // If all reads are done, we can safely assume that zero buffers can
+      // be applied.
+      if (all_sub_reads_done) {
+        rop.to_read.at(oid).zeros_for_decode.populate_shard_id_set(have);
+      }
+
       int err = -EIO; // If attributes needed but not read.
       if (!rop.to_read.at(oid).want_attrs || rop.complete.at(oid).attrs) {
         err = ec_impl->minimum_to_decode(want_to_read, have, dummy_minimum,
@@ -727,23 +741,18 @@ ECBackend::handle_rep_read_reply(ECSubReadReply& mop)
 
       if (err) {
        logger().debug("{} minimum_to_decode failed {}", __func__, err);
-        if (rop.in_progress.empty()) {
+        if (all_sub_reads_done) {
           // If we don't have enough copies, try other pg_shard_ts if available.
           // During recovery there may be multiple osds with copies of the same shard,
           // so getting EIO from one may result in multiple passes through this code path.
-          if (!rop.do_redundant_reads) {
-            int r = read_pipeline.send_all_remaining_reads(oid, rop);
-            if (r == 0) {
-              // We found that new reads are required to do a decode.
-              need_resend = true;
-              continue;
-            } else if (r >  0) {
-              // No new reads were requested. This means that some parity
-              // shards can be assumed to be zeros.
-              err = 0;
-            }
-            // else insufficient shards are available, keep the errors.
-          }
+          int r = read_pipeline.send_all_remaining_reads(oid, rop);
+          if (r == 0 && !rop.do_redundant_reads) {
+            // We found that new reads are required to do a decode.
+            need_resend = true;
+            continue;
+           }
+          // else insufficient shards are available, keep the errors.
+
           // Couldn't read any additional shards so handle as completed with errors
           // We don't want to confuse clients / RBD with objectstore error
           // values in particular ENOENT.  We may have different error returns
@@ -782,6 +791,16 @@ ECBackend::handle_rep_read_reply(ECSubReadReply& mop)
   } else if (rop.in_progress.empty() ||
              is_complete == rop.complete.size()) {
     logger().debug("{}: complete {}", __func__, rop);
+    /* If do_redundant_reads is set then there might be some in progress
+     * reads remaining.  We need to make sure that these non-read shards
+     * do not get padded. If there was no in progress read, then the zero
+     * padding is allowed to stay.
+     */
+    for (auto pg_shard : rop.in_progress) {
+      for (auto &&[oid, read] : rop.to_read) {
+        read.zeros_for_decode.erase(pg_shard.shard);
+      }
+    }
     read_pipeline.complete_read_op(std::move(rop));
   } else {
     logger().info("{}: readop not completed yet: {}", __func__, rop);