]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Fix parity updates in truncates.
authorAlex Ainscow <aainscow@uk.ibm.com>
Wed, 23 Apr 2025 14:41:11 +0000 (15:41 +0100)
committerAlex Ainscow <aainscow@uk.ibm.com>
Tue, 1 Jul 2025 12:03:30 +0000 (13:03 +0100)
Previously in optimised EC, when truncating to a partial
stripe, the parity was not being updated.  This fix reads
the non-truncated data from the final stripe and calculates
parity updates, which are written to the parity shards.

Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
src/osd/ECTransaction.cc
src/test/osd/test_ec_transaction.cc

index ebc0561410215930969ee44346f6a295bed48f9d..0284ccb0eb7f5b19fa0d9c84c3289de973949b07 100644 (file)
@@ -233,11 +233,11 @@ ECTransaction::WritePlanObj::WritePlanObj(
           to_read = std::move(pdw_reads);
           reads.clear(); // So we don't stash it at the end.
         }
-          }
+      }
 
       /* NOTE: We intentionally leave un-writable shards in the write plan.  As
        * it is actually less efficient to take them out:- PDWs still need to
-       * compute the deltas and conventional writes still need to calcualte the
+       * compute the deltas and conventional writes still need to calculate the
        * parity. The transaction will be dropped by generate_transactions.
        */
     }
@@ -247,6 +247,38 @@ ECTransaction::WritePlanObj::WritePlanObj(
     }
   }
 
+  /* Plan for truncates. A truncate can reduce a stripe to a partial stripe.
+   * This means we must update the parity buffers.  To do this, we must
+   * read the existing data on the partial stripe.
+   */
+  if (op.truncate && op.truncate->first < orig_size) {
+    ECUtil::shard_extent_set_t truncate_read(sinfo.get_k_plus_m());
+    extent_set truncate_write;
+    uint64_t prev_stripe = sinfo.ro_offset_to_prev_stripe_ro_offset(op.truncate->first);
+    uint64_t next_align = ECUtil::align_next(op.truncate->first);
+    sinfo.ro_range_to_shard_extent_set_with_superset(
+      prev_stripe, next_align - prev_stripe,
+      truncate_read, truncate_write);
+
+    /* We must always update the entire parity chunk, even if we only read
+     * a small amount of one shard.
+     */
+    truncate_write.align(sinfo.get_chunk_size());
+
+    if (!truncate_read.empty()) {
+      if (to_read) {
+        to_read->insert(truncate_read);
+      } else {
+        to_read = std::move(truncate_read);
+      }
+
+      // We only need to update the parity buffer for the write
+      for (auto && shard : sinfo.get_parity_shards()) {
+        will_write[shard] = truncate_write;
+      }
+    }
+  }
+
   /* validate post conditions:
    * to_read should have an entry for `obj` if it isn't empty
    * and if we are reading from `obj`, we can't be renaming or
@@ -595,8 +627,12 @@ ECTransaction::Generate::Generate(PGTransaction &t,
 
 void ECTransaction::Generate::truncate() {
   ceph_assert(!op.is_fresh_object());
-  // causes encode to invent zeros
-  to_write.erase_after_ro_offset(plan.orig_size);
+  /* We always read aligned. If the new size is not aligned, there will be
+   * some data in the read buffer that needs to be zeroed before the parity
+   * is calculate.  By simply removing the buffer, the parity encode functions
+   * will assume zeros.
+   */
+  to_write.erase_after_ro_offset(op.truncate->first);
   all_shards_written();
 
   debug(oid, "truncate_erase", to_write, dpp);
index 448af5b05322981a3aa0c2ae10eebbf34642b4ae..42e78bcb7e9207d5c907e6ac2a30ec60a5e5b86f 100644 (file)
@@ -401,3 +401,43 @@ TEST(ectransaction, truncate_to_bigger_without_write)
   ASSERT_FALSE(plan.to_read);
   ASSERT_EQ(0u, plan.will_write.shard_count());
 }
+
+TEST(ectransaction, truncate_to_smalelr_without_write) {
+  hobject_t h;
+  PGTransaction::ObjectOperation op;
+
+  op.truncate = std::pair(EC_ALIGN_SIZE/4, EC_ALIGN_SIZE/4);
+
+  pg_pool_t pool;
+  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+  ECUtil::stripe_info_t sinfo(2, 2, EC_ALIGN_SIZE*2, &pool);
+  shard_id_set shards;
+  shards.insert_range(shard_id_t(), 4);
+  ECTransaction::WritePlanObj plan(
+    h,
+    op,
+    sinfo,
+    shards,
+    shards,
+    false,
+    16*EC_ALIGN_SIZE,
+    std::nullopt,
+    std::nullopt,
+    ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+    nullptr,
+    0);
+
+  generic_derr << "plan " << plan << dendl;
+
+  ASSERT_TRUE(plan.to_read);
+  ECUtil::shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+  ref_read[shard_id_t(0)].insert(0, EC_ALIGN_SIZE);
+  ASSERT_EQ(ref_read, plan.to_read);
+
+  // Writes should cover parity only.
+  ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m());
+  ref_write[shard_id_t(2)].insert(0, EC_ALIGN_SIZE);
+  ref_write[shard_id_t(3)].insert(0, EC_ALIGN_SIZE);
+  ASSERT_EQ(ref_write, plan.will_write);
+}
+