From 54a06c97a279d842db0f8059ef991245c3350171 Mon Sep 17 00:00:00 2001 From: Alex Ainscow Date: Wed, 23 Apr 2025 15:41:11 +0100 Subject: [PATCH] osd: Fix parity updates in truncates. Previously in optimised EC, when truncating to a partial stripe, the parity was not being updated. This fix reads the non-truncated data from the final stripe and calculates parity updates, which are written to the parity shards. Signed-off-by: Alex Ainscow --- src/osd/ECTransaction.cc | 44 ++++++++++++++++++++++++++--- src/test/osd/test_ec_transaction.cc | 40 ++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc index ebc0561410215..0284ccb0eb7f5 100644 --- a/src/osd/ECTransaction.cc +++ b/src/osd/ECTransaction.cc @@ -233,11 +233,11 @@ ECTransaction::WritePlanObj::WritePlanObj( to_read = std::move(pdw_reads); reads.clear(); // So we don't stash it at the end. } - } + } /* NOTE: We intentionally leave un-writable shards in the write plan. As * it is actually less efficient to take them out:- PDWs still need to - * compute the deltas and conventional writes still need to calcualte the + * compute the deltas and conventional writes still need to calculate the * parity. The transaction will be dropped by generate_transactions. */ } @@ -247,6 +247,38 @@ ECTransaction::WritePlanObj::WritePlanObj( } } + /* Plan for truncates. A truncate can reduce a stripe to a partial stripe. + * This means we must update the parity buffers. To do this, we must + * read the existing data on the partial stripe. + */ + if (op.truncate && op.truncate->first < orig_size) { + ECUtil::shard_extent_set_t truncate_read(sinfo.get_k_plus_m()); + extent_set truncate_write; + uint64_t prev_stripe = sinfo.ro_offset_to_prev_stripe_ro_offset(op.truncate->first); + uint64_t next_align = ECUtil::align_next(op.truncate->first); + sinfo.ro_range_to_shard_extent_set_with_superset( + prev_stripe, next_align - prev_stripe, + truncate_read, truncate_write); + + /* We must always update the entire parity chunk, even if we only read + * a small amount of one shard. + */ + truncate_write.align(sinfo.get_chunk_size()); + + if (!truncate_read.empty()) { + if (to_read) { + to_read->insert(truncate_read); + } else { + to_read = std::move(truncate_read); + } + + // We only need to update the parity buffer for the write + for (auto && shard : sinfo.get_parity_shards()) { + will_write[shard] = truncate_write; + } + } + } + /* validate post conditions: * to_read should have an entry for `obj` if it isn't empty * and if we are reading from `obj`, we can't be renaming or @@ -595,8 +627,12 @@ ECTransaction::Generate::Generate(PGTransaction &t, void ECTransaction::Generate::truncate() { ceph_assert(!op.is_fresh_object()); - // causes encode to invent zeros - to_write.erase_after_ro_offset(plan.orig_size); + /* We always read aligned. If the new size is not aligned, there will be + * some data in the read buffer that needs to be zeroed before the parity + * is calculate. By simply removing the buffer, the parity encode functions + * will assume zeros. + */ + to_write.erase_after_ro_offset(op.truncate->first); all_shards_written(); debug(oid, "truncate_erase", to_write, dpp); diff --git a/src/test/osd/test_ec_transaction.cc b/src/test/osd/test_ec_transaction.cc index 448af5b053229..42e78bcb7e920 100644 --- a/src/test/osd/test_ec_transaction.cc +++ b/src/test/osd/test_ec_transaction.cc @@ -401,3 +401,43 @@ TEST(ectransaction, truncate_to_bigger_without_write) ASSERT_FALSE(plan.to_read); ASSERT_EQ(0u, plan.will_write.shard_count()); } + +TEST(ectransaction, truncate_to_smalelr_without_write) { + hobject_t h; + PGTransaction::ObjectOperation op; + + op.truncate = std::pair(EC_ALIGN_SIZE/4, EC_ALIGN_SIZE/4); + + pg_pool_t pool; + pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS); + ECUtil::stripe_info_t sinfo(2, 2, EC_ALIGN_SIZE*2, &pool); + shard_id_set shards; + shards.insert_range(shard_id_t(), 4); + ECTransaction::WritePlanObj plan( + h, + op, + sinfo, + shards, + shards, + false, + 16*EC_ALIGN_SIZE, + std::nullopt, + std::nullopt, + ECUtil::HashInfoRef(new ECUtil::HashInfo(1)), + nullptr, + 0); + + generic_derr << "plan " << plan << dendl; + + ASSERT_TRUE(plan.to_read); + ECUtil::shard_extent_set_t ref_read(sinfo.get_k_plus_m()); + ref_read[shard_id_t(0)].insert(0, EC_ALIGN_SIZE); + ASSERT_EQ(ref_read, plan.to_read); + + // Writes should cover parity only. + ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m()); + ref_write[shard_id_t(2)].insert(0, EC_ALIGN_SIZE); + ref_write[shard_id_t(3)].insert(0, EC_ALIGN_SIZE); + ASSERT_EQ(ref_write, plan.will_write); +} + -- 2.39.5