]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Improve backfill in new EC.
authorAlex Ainscow <aainscow@uk.ibm.com>
Fri, 2 May 2025 09:11:45 +0000 (10:11 +0100)
committerAlex Ainscow <aainscow@uk.ibm.com>
Tue, 1 Jul 2025 12:03:30 +0000 (13:03 +0100)
In old EC, the full stripe was always read and written.  In new EC, we only attempt
to recover the shards that are missing. If an old OSD is available, the read can
be directed there.

Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
src/osd/ECBackend.cc
src/osd/ECBackend.h
src/osd/ECUtil.h

index 45f93bf88198ad0932f348bd46d72597f4b6f441..888b764fe38e823741e9313c3736af3d8039ffd7 100644 (file)
@@ -296,7 +296,7 @@ void ECBackend::RecoveryBackend::handle_recovery_push_reply(
   if (!recovery_ops.count(op.soid))
     return;
   RecoveryOp &rop = recovery_ops[op.soid];
-  ceph_assert(rop.waiting_on_pushes.count(from));
+  ceph_assert(rop.waiting_on_pushes.contains(from));
   rop.waiting_on_pushes.erase(from);
   continue_recovery_op(rop, m);
 }
@@ -377,10 +377,6 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
 
   int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size);
   ceph_assert(r == 0);
-  // We are never appending here, so we never need hinfo.
-  op.returned_data->insert_parity_buffers();
-  r = op.returned_data->encode(ec_impl, NULL, 0);
-  ceph_assert(r==0);
 
   // Finally, we don't want to write any padding, so truncate the buffer
   // to remove it.
@@ -538,12 +534,30 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
 
       op.state = RecoveryOp::READING;
 
-      // We always read the recovery chunk size (default 8MiB + parity). If that
-      // amount of data is not available, then the backend will truncate the
-      // response.
+      /* When beginning recovery, the OI may not be known. As such the object
+       * size is not known. For the first read, attempt to read the default
+       * size.  If this is larger than the object size, then the OSD will
+       * return truncated reads.  If the object size is known, then attempt
+       * correctly sized reads.
+       */
+      uint64_t read_size = get_recovery_chunk_size();
+      if (op.obc) {
+        uint64_t read_to_end = ECUtil::align_next(op.obc->obs.oi.size) -
+          op.recovery_progress.data_recovered_to;
+
+        if (read_to_end < read_size) {
+          read_size = read_to_end;
+        }
+      }
       sinfo.ro_range_to_shard_extent_set_with_parity(
-        op.recovery_progress.data_recovered_to,
-        get_recovery_chunk_size(), want);
+        op.recovery_progress.data_recovered_to, read_size, want);
+
+      op.recovery_progress.data_recovered_to += read_size;
+
+      // We only need to recover shards that are missing.
+      for (auto shard : shard_id_set::difference(sinfo.get_all_shards(), op.missing_on_shards)) {
+        want.erase(shard);
+      }
 
       if (op.recovery_progress.first && op.obc) {
         op.xattrs = op.obc->attr_cache;
@@ -593,9 +607,15 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
       }
       if (read_request.shard_reads.empty()) {
         ceph_assert(op.obc);
-        ceph_assert(0 == op.obc->obs.oi.size);
-        dout(10) << __func__ << "Zero size object recovery, skipping reads."
-                 << op << dendl;
+        /* This can happen for several reasons
+         * - A zero-sized object.
+         * - The missing shards have no data.
+         * - The previous recovery did not need the last data shard. In this
+         *   case, data_recovered_to may indicate that the last shard still
+         *   needs recovery, when it does not.
+         * We can just skip the read and fall through below.
+         */
+        dout(10) << __func__ << " No reads required " << op << dendl;
         // Create an empty read result and fall through.
         op.returned_data.emplace(&sinfo);
       } else {
@@ -614,7 +634,6 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
       dout(20) << __func__ << ": returned_data=" << op.returned_data << dendl;
       op.state = RecoveryOp::WRITING;
       ObjectRecoveryProgress after_progress = op.recovery_progress;
-      after_progress.data_recovered_to = op.returned_data->get_ro_end();
       after_progress.first = false;
       if (after_progress.data_recovered_to >= op.obc->obs.oi.size) {
         after_progress.data_complete = true;
index 305d84274fd5c852874f787645f63836e9a32e54..bee0cd5afab9dc1b88b6246c4697b988039eb166 100644 (file)
@@ -252,7 +252,7 @@ class ECBackend : public ECCommon {
       hobject_t hoid;
       eversion_t v;
       std::set<pg_shard_t> missing_on;
-      std::set<shard_id_t> missing_on_shards;
+      shard_id_set missing_on_shards;
 
       ObjectRecoveryInfo recovery_info;
       ObjectRecoveryProgress recovery_progress;
index 93300df3a03f192ddf2b0763f6c2ac3fa7448f68..e859b016baa32029ab8c62b1f48bba5c2825c258 100644 (file)
@@ -358,6 +358,7 @@ class stripe_info_t {
   const std::vector<raw_shard_id_t> chunk_mapping_reverse;
   const shard_id_set data_shards;
   const shard_id_set parity_shards;
+  const shard_id_set all_shards;
 
 private:
   void ro_range_to_shards(
@@ -410,6 +411,13 @@ private:
     return data_shards;
   }
 
+  static shard_id_set calc_all_shards(int k_plus_m) {
+    shard_id_set all_shards;
+    all_shards.insert_range(shard_id_t(), k_plus_m);
+    return all_shards;
+  }
+
+
 public:
   stripe_info_t(const ErasureCodeInterfaceRef &ec_impl, const pg_pool_t *pool,
                 uint64_t stripe_width
@@ -424,7 +432,8 @@ public:
         complete_chunk_mapping(ec_impl->get_chunk_mapping(), k + m)),
       chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
       data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
-      parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+      parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)),
+      all_shards(calc_all_shards(k + m)) {
     ceph_assert(stripe_width != 0);
     ceph_assert(stripe_width % k == 0);
   }
@@ -601,6 +610,11 @@ public:
     return parity_shards;
   }
 
+  auto get_all_shards() const {
+    return all_shards;
+  }
+
+
   uint64_t ro_offset_to_prev_chunk_offset(uint64_t offset) const {
     return (offset / stripe_width) * chunk_size;
   }