git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd: Improve backfill in new EC.
author    Alex Ainscow <aainscow@uk.ibm.com>
          Fri, 2 May 2025 09:11:45 +0000 (10:11 +0100)
committer Laura Flores <lflores@ibm.com>
          Wed, 9 Jul 2025 15:47:25 +0000 (15:47 +0000)
In old EC, the full stripe was always read and written.  In new EC, we only attempt
to recover the shards that are missing. If an old OSD is available, the read can
be directed there.

Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
(cherry picked from commit 1becd2c5f6ec1d4c31059243ac247f046efd4fe3)
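
The core of the change is in the continue_recovery_op() hunk below: the wanted
shard extents are built for a read whose size is clamped to the aligned object
size once the OI is known, and any shard that is not actually missing is dropped
from the read. A minimal standalone sketch of that narrowing follows, using plain
std::set<int> in place of shard_id_set; recovery_chunk_size, chunk_align and
align_next are illustrative stand-ins, not the Ceph definitions.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <set>

// Hypothetical stand-ins for the recovery parameters (not the Ceph values).
constexpr uint64_t recovery_chunk_size = 8ull * 1024 * 1024;  // default 8 MiB
constexpr uint64_t chunk_align = 4096;                        // illustrative alignment

uint64_t align_next(uint64_t v) {
  return (v + chunk_align - 1) / chunk_align * chunk_align;
}

int main() {
  std::set<int> all_shards = {0, 1, 2, 3, 4};     // k + m = 5
  std::set<int> missing_on_shards = {1, 3};       // only these need recovery

  uint64_t object_size = 10ull * 1024 * 1024;     // known from the OI, if available
  uint64_t data_recovered_to = 8ull * 1024 * 1024;

  // Clamp the read to the aligned end of the object once the size is known;
  // otherwise the default chunk size is used and the backend truncates.
  uint64_t read_size = std::min(recovery_chunk_size,
                                align_next(object_size) - data_recovered_to);

  // Start from every shard in the stripe, then drop the ones that are not
  // missing -- mirroring the shard_id_set::difference() loop in the patch.
  std::set<int> want = all_shards;
  for (int shard : all_shards) {
    if (!missing_on_shards.count(shard)) {
      want.erase(shard);
    }
  }

  std::cout << "read " << read_size << " bytes from shards:";
  for (int shard : want) {
    std::cout << ' ' << shard;
  }
  std::cout << '\n';
  return 0;
}

The clamp avoids issuing reads past the object's end once the size is known,
while the very first read still uses the default chunk size and relies on the
OSD returning a truncated response, as the new comment in the patch explains.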

src/osd/ECBackend.cc
src/osd/ECBackend.h
src/osd/ECUtil.h

index 9740c77ba1694abc1d480c29ff7e7006746a1603..d20791eda2b99b5b3cc46ee574f52340d85ea6e4 100644 (file)
@@ -296,7 +296,7 @@ void ECBackend::RecoveryBackend::handle_recovery_push_reply(
   if (!recovery_ops.count(op.soid))
     return;
   RecoveryOp &rop = recovery_ops[op.soid];
-  ceph_assert(rop.waiting_on_pushes.count(from));
+  ceph_assert(rop.waiting_on_pushes.contains(from));
   rop.waiting_on_pushes.erase(from);
   continue_recovery_op(rop, m);
 }
@@ -377,10 +377,6 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
 
   int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size);
   ceph_assert(r == 0);
-  // We are never appending here, so we never need hinfo.
-  op.returned_data->insert_parity_buffers();
-  r = op.returned_data->encode(ec_impl, NULL, 0);
-  ceph_assert(r==0);
 
   // Finally, we don't want to write any padding, so truncate the buffer
   // to remove it.
@@ -538,12 +534,30 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
 
       op.state = RecoveryOp::READING;
 
-      // We always read the recovery chunk size (default 8MiB + parity). If that
-      // amount of data is not available, then the backend will truncate the
-      // response.
+      /* When beginning recovery, the OI may not be known. As such the object
+       * size is not known. For the first read, attempt to read the default
+       * size.  If this is larger than the object sizes, then the OSD will
+       * return truncated reads.  If the object size is known, then attempt
+       * correctly sized reads.
+       */
+      uint64_t read_size = get_recovery_chunk_size();
+      if (op.obc) {
+        uint64_t read_to_end = ECUtil::align_next(op.obc->obs.oi.size) -
+          op.recovery_progress.data_recovered_to;
+
+        if (read_to_end < read_size) {
+          read_size = read_to_end;
+        }
+      }
       sinfo.ro_range_to_shard_extent_set_with_parity(
-        op.recovery_progress.data_recovered_to,
-        get_recovery_chunk_size(), want);
+        op.recovery_progress.data_recovered_to, read_size, want);
+
+      op.recovery_progress.data_recovered_to += read_size;
+
+      // We only need to recover shards that are missing.
+      for (auto shard : shard_id_set::difference(sinfo.get_all_shards(), op.missing_on_shards)) {
+        want.erase(shard);
+      }
 
       if (op.recovery_progress.first && op.obc) {
         op.xattrs = op.obc->attr_cache;
@@ -593,9 +607,15 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
       }
       if (read_request.shard_reads.empty()) {
         ceph_assert(op.obc);
-        ceph_assert(0 == op.obc->obs.oi.size);
-        dout(10) << __func__ << "Zero size object recovery, skipping reads."
-                 << op << dendl;
+        /* This can happen for several reasons
+         * - A zero-sized object.
+         * - The missing shards have no data.
+         * - The previous recovery did not need the last data shard. In this
+         *   case, data_recovered_to may indicate that the last shard still
+         *   needs recovery, when it does not.
+         * We can just skip the read and fall through below.
+         */
+        dout(10) << __func__ << " No reads required " << op << dendl;
         // Create an empty read result and fall through.
         op.returned_data.emplace(&sinfo);
       } else {
@@ -614,7 +634,6 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
       dout(20) << __func__ << ": returned_data=" << op.returned_data << dendl;
       op.state = RecoveryOp::WRITING;
       ObjectRecoveryProgress after_progress = op.recovery_progress;
-      after_progress.data_recovered_to = op.returned_data->get_ro_end();
       after_progress.first = false;
       if (after_progress.data_recovered_to >= op.obc->obs.oi.size) {
         after_progress.data_complete = true;
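
The "No reads required" comment above notes that a previous pass may leave the
last data shard with nothing to recover. A standalone arithmetic sketch of that
case, with illustrative values not taken from the patch and assuming the identity
chunk mapping (ro offsets [s * chunk_size, (s + 1) * chunk_size) land on data
shard s within the first stripe):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t chunk_size = 4096, k = 4, object_size = 11000;
  // Bytes of object data landing on each data shard within the first stripe.
  uint64_t shard_bytes[4];
  for (uint64_t s = 0; s < k; ++s) {
    uint64_t start = s * chunk_size;
    shard_bytes[s] = object_size > start
        ? std::min(object_size - start, chunk_size)
        : 0;
  }
  // Shards 0-2 hold data, but the last data shard is empty: if shard 3 is the
  // only one missing, there is nothing to read even though recovery progress
  // has not yet reached the aligned end of the object.
  assert(shard_bytes[3] == 0);
  return 0;
}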
index 305d84274fd5c852874f787645f63836e9a32e54..bee0cd5afab9dc1b88b6246c4697b988039eb166 100644 (file)
@@ -252,7 +252,7 @@ class ECBackend : public ECCommon {
       hobject_t hoid;
       eversion_t v;
       std::set<pg_shard_t> missing_on;
-      std::set<shard_id_t> missing_on_shards;
+      shard_id_set missing_on_shards;
 
       ObjectRecoveryInfo recovery_info;
       ObjectRecoveryProgress recovery_progress;
index 93300df3a03f192ddf2b0763f6c2ac3fa7448f68..e859b016baa32029ab8c62b1f48bba5c2825c258 100644 (file)
@@ -358,6 +358,7 @@ class stripe_info_t {
   const std::vector<raw_shard_id_t> chunk_mapping_reverse;
   const shard_id_set data_shards;
   const shard_id_set parity_shards;
+  const shard_id_set all_shards;
 
 private:
   void ro_range_to_shards(
@@ -410,6 +411,13 @@ private:
     return data_shards;
   }
 
+  static shard_id_set calc_all_shards(int k_plus_m) {
+    shard_id_set all_shards;
+    all_shards.insert_range(shard_id_t(), k_plus_m);
+    return all_shards;
+  }
+
+
 public:
   stripe_info_t(const ErasureCodeInterfaceRef &ec_impl, const pg_pool_t *pool,
                 uint64_t stripe_width
@@ -424,7 +432,8 @@ public:
         complete_chunk_mapping(ec_impl->get_chunk_mapping(), k + m)),
       chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
       data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
-      parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+      parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)),
+      all_shards(calc_all_shards(k + m)) {
     ceph_assert(stripe_width != 0);
     ceph_assert(stripe_width % k == 0);
   }
@@ -601,6 +610,11 @@ public:
     return parity_shards;
   }
 
+  auto get_all_shards() const {
+    return all_shards;
+  }
+
+
   uint64_t ro_offset_to_prev_chunk_offset(uint64_t offset) const {
     return (offset / stripe_width) * chunk_size;
   }
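
As a side note on the ECUtil.h change above: the full shard set is computed once
in the stripe_info_t constructor and cached, rather than being rebuilt on every
recovery pass. A minimal standalone sketch of that pattern, with std::set<int>
standing in for shard_id_set and calc_all_shards named after the new helper:

#include <set>

// Hypothetical stand-in for stripe_info_t caching its shard sets up front.
class stripe_info {
 public:
  explicit stripe_info(int k, int m)
      : all_shards(calc_all_shards(k + m)) {}

  const std::set<int> &get_all_shards() const { return all_shards; }

 private:
  static std::set<int> calc_all_shards(int k_plus_m) {
    std::set<int> all;
    for (int s = 0; s < k_plus_m; ++s) {
      all.insert(s);  // shard ids 0 .. k+m-1, like shard_id_set::insert_range()
    }
    return all;
  }

  const std::set<int> all_shards;
};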