From fd809c4b7db86a4373d563f54dc345370e0fd9ab Mon Sep 17 00:00:00 2001 From: Alex Ainscow Date: Fri, 27 Jun 2025 16:00:56 +0100 Subject: [PATCH] osd: Deduplicate zeros in EC slice iterator Signed-off-by: Alex Ainscow (cherry picked from commit 06658fdac16dde95d20a8907511afb7fde7313da) Signed-off-by: Alex Ainscow --- src/include/interval_set.h | 10 ++ src/osd/ECBackend.cc | 50 ++++--- src/osd/ECCommon.cc | 3 +- src/osd/ECTransaction.cc | 2 +- src/osd/ECUtil.cc | 43 +++--- src/osd/ECUtil.h | 147 +++++++++++++++++-- src/test/osd/TestECBackend.cc | 267 +++++++++++++++++++++++++--------- src/test/osd/TestECUtil.cc | 16 +- 8 files changed, 408 insertions(+), 130 deletions(-) diff --git a/src/include/interval_set.h b/src/include/interval_set.h index 61392289c2afb..9ed8d3a9cabe2 100644 --- a/src/include/interval_set.h +++ b/src/include/interval_set.h @@ -999,4 +999,14 @@ public: template class C, bool strict> struct fmt::is_range, char> : std::false_type {}; +template +struct is_interval_set : std::false_type {}; + +template class C, bool strict> +struct is_interval_set> : std::true_type {}; + +template +inline constexpr bool is_interval_set_v = is_interval_set::value; + +#undef strict_mode_assert #endif diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 0f91eba5bf868..112a65e85d4fa 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -223,19 +223,20 @@ void ECBackend::RecoveryBackend::handle_recovery_push( m->t.touch(coll, tobj); } - if (!op.data_included.empty()) { - uint64_t start = op.data_included.range_start(); - uint64_t end = op.data_included.range_end(); - ceph_assert(op.data.length() == (end - start)); - - m->t.write( - coll, - tobj, - start, - op.data.length(), - op.data); - } else { - ceph_assert(op.data.length() == 0); + ceph_assert(op.data.length() == op.data_included.size()); + uint64_t tobj_size = 0; + + uint64_t cursor = 0; + for (auto [off, len] : op.data_included) { + bufferlist bl; + if (len != op.data.length()) { + bl.substr_of(op.data, cursor, len); + } else { + bl = op.data; + } + m->t.write(coll, tobj, off, len, bl); + tobj_size = off + len; + cursor += len; } if (op.before_progress.first) { @@ -246,6 +247,15 @@ void ECBackend::RecoveryBackend::handle_recovery_push( op.attrset); } + if (op.after_progress.data_complete) { + uint64_t shard_size = sinfo.object_size_to_shard_size(op.recovery_info.size, + get_parent()->whoami_shard().shard); + ceph_assert(shard_size >= tobj_size); + if (shard_size != tobj_size) { + m->t.truncate( coll, tobj, shard_size); + } + } + if (op.after_progress.data_complete && !oneshot) { dout(10) << __func__ << ": Removing oid " << tobj.hobj << " from the temp collection" << dendl; @@ -361,7 +371,7 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete( uint64_t aligned_size = ECUtil::align_next(op.obc->obs.oi.size); - int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size); + int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size, get_parent()->get_dpp(), true); ceph_assert(r == 0); // Finally, we don't want to write any padding, so truncate the buffer @@ -601,22 +611,24 @@ void ECBackend::RecoveryBackend::continue_recovery_op( if (after_progress.data_recovered_to >= op.obc->obs.oi.size) { after_progress.data_complete = true; } + for (auto &&pg_shard: op.missing_on) { m->pushes[pg_shard].push_back(PushOp()); PushOp &pop = m->pushes[pg_shard].back(); pop.soid = op.hoid; pop.version = op.recovery_info.oi.get_version_for_shard(pg_shard.shard); - op.returned_data->get_shard_first_buffer(pg_shard.shard, pop.data); + + op.returned_data->get_sparse_buffer(pg_shard.shard, pop.data, pop.data_included); + ceph_assert(pop.data.length() == pop.data_included.size()); + dout(10) << __func__ << ": pop shard=" << pg_shard << ", oid=" << pop.soid << ", before_progress=" << op.recovery_progress << ", after_progress=" << after_progress << ", pop.data.length()=" << pop.data.length() + << ", pop.data_included=" << pop.data_included << ", size=" << op.obc->obs.oi.size << dendl; - if (pop.data.length()) - pop.data_included.union_insert( - op.returned_data->get_shard_first_offset(pg_shard.shard), - pop.data.length()); + if (op.recovery_progress.first) { if (sinfo.is_nonprimary_shard(pg_shard.shard)) { if (pop.version == op.recovery_info.oi.version) { diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc index 0db49bc621a2e..377fd86452d54 100644 --- a/src/osd/ECCommon.cc +++ b/src/osd/ECCommon.cc @@ -487,7 +487,8 @@ struct ClientReadCompleter final : ECCommon::ReadCompleter { /* Decode any missing buffers */ int r = res.buffers_read.decode(read_pipeline.ec_impl, req.shard_want_to_read, - req.object_size); + req.object_size, + read_pipeline.get_parent()->get_dpp()); ceph_assert( r == 0 ); dout(30) << __func__ << ": after decode: " << res.buffers_read.debug_string(2048, 8) diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc index 996a37f03d806..1142db8008f4a 100644 --- a/src/osd/ECTransaction.cc +++ b/src/osd/ECTransaction.cc @@ -68,7 +68,7 @@ void ECTransaction::Generate::encode_and_write() { */ read_sem->zero_pad(plan.will_write); to_write.pad_with_other(plan.will_write, *read_sem); - r = to_write.encode_parity_delta(ec_impl, *read_sem); + r = to_write.encode_parity_delta(ec_impl, *read_sem, dpp); } else { r = to_write.encode(ec_impl); } diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc index b1522a7fbd8c7..911e314deaf80 100644 --- a/src/osd/ECUtil.cc +++ b/src/osd/ECUtil.cc @@ -475,19 +475,23 @@ void shard_extent_map_t::insert_parity_buffers() { } } -slice_iterator shard_extent_map_t::begin_slice_iterator( - const shard_id_set &out) { - return slice_iterator(extent_maps, out); +slice_iterator shard_extent_map_t::begin_slice_iterator( + const shard_id_set &out, + DoutPrefixProvider *dpp, + const shard_id_set *dedup_zeros) { + return slice_iterator(extent_maps, out, dpp, dedup_zeros); } /* Encode parity chunks, using the encode_chunks interface into the * erasure coding. This generates all parity using full stripe writes. */ -int shard_extent_map_t::_encode(const ErasureCodeInterfaceRef &ec_impl) { +int shard_extent_map_t::encode(const ErasureCodeInterfaceRef &ec_impl, + DoutPrefixProvider *dpp, + shard_id_set *dedup_zeros) { shard_id_set out_set = sinfo->get_parity_shards(); bool rebuild_req = false; - for (auto iter = begin_slice_iterator(out_set); !iter.is_end(); ++iter) { + for (auto iter = begin_slice_iterator(out_set, dpp, dedup_zeros); !iter.is_end(); ++iter) { if (!iter.is_page_aligned()) { rebuild_req = true; break; @@ -503,25 +507,19 @@ int shard_extent_map_t::_encode(const ErasureCodeInterfaceRef &ec_impl) { if (rebuild_req) { pad_and_rebuild_to_ec_align(); - return _encode(ec_impl); + return encode(ec_impl, dpp, dedup_zeros); } return 0; } -/* Encode parity chunks, using the encode_chunks interface into the - * erasure coding. This generates all parity using full stripe writes. - */ -int shard_extent_map_t::encode(const ErasureCodeInterfaceRef &ec_impl) { - return _encode(ec_impl); -} - /* Encode parity chunks, using the parity delta write interfaces on plugins * that support them. */ int shard_extent_map_t::encode_parity_delta( const ErasureCodeInterfaceRef &ec_impl, - shard_extent_map_t &old_sem) { + shard_extent_map_t &old_sem, + DoutPrefixProvider *dpp) { shard_id_set out_set = sinfo->get_parity_shards(); pad_and_rebuild_to_ec_align(); @@ -542,7 +540,7 @@ int shard_extent_map_t::encode_parity_delta( s.compute_ro_range(); - for (auto iter = s.begin_slice_iterator(out_set); !iter.is_end(); ++iter) { + for (auto iter = s.begin_slice_iterator(out_set, dpp); !iter.is_end(); ++iter) { ceph_assert(iter.is_page_aligned()); shard_id_map &data_shards = iter.get_in_bufferptrs(); shard_id_map &parity_shards = iter.get_out_bufferptrs(); @@ -644,7 +642,9 @@ void shard_extent_map_t::trim(const shard_extent_set_t &trim_to) { int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl, const shard_extent_set_t &want, - uint64_t object_size) { + uint64_t object_size, + DoutPrefixProvider *dpp, + bool dedup_zeros) { shard_id_set want_set; shard_id_set have_set; want.populate_shard_id_set(want_set); @@ -679,11 +679,11 @@ int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl, decode_for_parity.intersection_of(want.at(shard), read_mask.at(shard)); pad_on_shard(decode_for_parity, shard); } - r = _decode(ec_impl, want_set, decode_set); + r = _decode(ec_impl, want_set, decode_set, dpp); } if (!r && !encode_set.empty()) { pad_on_shards(want, encode_set); - r = _encode(ec_impl); + r = encode(ec_impl, dpp, dedup_zeros?&need_set:nullptr); } // If we failed to decode, then bail out, or the trimming below might fail. @@ -702,9 +702,10 @@ int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl, int shard_extent_map_t::_decode(const ErasureCodeInterfaceRef &ec_impl, const shard_id_set &want_set, - const shard_id_set &need_set) { + const shard_id_set &need_set, + DoutPrefixProvider *dpp) { bool rebuild_req = false; - for (auto iter = begin_slice_iterator(need_set); !iter.is_end(); ++iter) { + for (auto iter = begin_slice_iterator(need_set, dpp); !iter.is_end(); ++iter) { if (!iter.is_page_aligned()) { rebuild_req = true; break; @@ -719,7 +720,7 @@ int shard_extent_map_t::_decode(const ErasureCodeInterfaceRef &ec_impl, if (rebuild_req) { pad_and_rebuild_to_ec_align(); - return _decode(ec_impl, want_set, need_set); + return _decode(ec_impl, want_set, need_set, dpp); } compute_ro_range(); diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h index 04ac579d69948..f31a0b9108388 100644 --- a/src/osd/ECUtil.h +++ b/src/osd/ECUtil.h @@ -67,9 +67,8 @@ using extent_map = interval_map class slice_iterator { - mini_flat_map &input; + mini_flat_map &input; uint64_t offset = std::numeric_limits::max(); uint64_t length = std::numeric_limits::max(); uint64_t start = std::numeric_limits::max(); @@ -79,8 +78,81 @@ class slice_iterator { shard_id_map in; shard_id_map out; const shard_id_set &out_set; + const shard_id_set *dedup_set; + DoutPrefixProvider *dpp; + + /* zero dedup is used by the slice iterator to detect zero buffers and replace + * them with the dedup'd zero buffer. It keeps a replacement buffer which + * once full (bl.length() == len) can be used to swap out the input buffer. + */ + struct zeros { + uint64_t off; + uint64_t len; + bufferlist bl; + + zeros(uint64_t _off, uint64_t _len) : off(_off), len(_len) {} + + bool dedup(bufferptr &bp) { + bool is_zeros = false; + uint64_t bp_len = bp.length(); + uint64_t off = 0; + char *c_str = bp.c_str(); + // Skip any non-aligned chunk. + uint64_t analysed = p2roundup((uintptr_t)c_str, EC_ALIGN_SIZE) - (uintptr_t)c_str; + + while (off + analysed <= bp_len) { + bool new_is_zeros; + if (bp_len - off - analysed < EC_ALIGN_SIZE) { + new_is_zeros = false; + } else { + new_is_zeros = mem_is_zero(c_str + off + analysed, EC_ALIGN_SIZE); + } + if (new_is_zeros != is_zeros && analysed) { + if (is_zeros) { + bl.append_zero2(analysed); + } else { + bl.append(bufferptr(bp, off, analysed)); + } + off += analysed; + analysed = 0; + } + is_zeros = new_is_zeros; + analysed += EC_ALIGN_SIZE; + } + if (is_zeros) { + bl.append_zero2(bp_len - off); + } else { + bl.append(bufferptr(bp, off, bp_len - off)); + } + + return bl.length() == len; + } + }; + + std::optional> zeros; + + void zeros_dedup() { + for (auto &&[shard, _zeros] : *zeros) { + + if (!out.contains(shard) && !in.contains(shard)) { + continue; + } + + bufferptr &bp = out.contains(shard) ? out.at(shard) : in.at(shard); + if (_zeros.dedup(bp)) { + ldpp_dout(dpp, 20) << __func__ << ": overwrite input[" << shard << "]=" + << _zeros.off << "~" << _zeros.len + << " with bl=" << _zeros.bl << dendl; + input.at(shard).insert(_zeros.off, _zeros.len, _zeros.bl); + zeros->erase(shard); + } + } + } void advance() { + if (dedup_set) { + zeros_dedup(); + } in.clear(); out.clear(); offset = start; @@ -121,9 +193,15 @@ class slice_iterator { // Create a new buffer pointer for the result. We don't want the client // manipulating the ptr. if (out_set.contains(shard)) { + ldpp_dout(dpp, 20) << __func__ << " out[" << shard << "]=" + << start << "~" << (end - start) + << dendl; out.emplace( shard, bufferptr(bl_iter.get_current_ptr(), 0, end - start)); } else { + ldpp_dout(dpp, 20) << __func__ << " in[" << shard << "]=" + << start << "~" << (end - start) + << dendl; in.emplace( shard, bufferptr(bl_iter.get_current_ptr(), 0, end - start)); } @@ -142,6 +220,9 @@ class slice_iterator { bl.invalidate_crc(); } iters.at(shard).second = emap_iter.get_val().begin(); + if (zeros) { + zeros->emplace(shard, emap_iter.get_off(), emap_iter.get_len()); + } } } } else @@ -169,15 +250,29 @@ class slice_iterator { } public: - slice_iterator(mini_flat_map &_input, const shard_id_set &out_set) : + slice_iterator( + mini_flat_map &_input, + const shard_id_set &out_set, + DoutPrefixProvider *_dpp, + const shard_id_set *dedup_set) : input(_input), iters(input.max_size()), in(input.max_size()), out(input.max_size()), - out_set(out_set) { + out_set(out_set), + dedup_set(dedup_set), + dpp(_dpp) { + + if (dedup_set) { + zeros.emplace(input.max_size()); + } + for (auto &&[shard, emap] : input) { auto emap_iter = emap.begin(); auto bl_iter = emap_iter.get_val().begin(); + if (zeros) { + zeros->emplace(shard, emap_iter.get_off(), emap_iter.get_len()); + } auto p = std::make_pair(std::move(emap_iter), std::move(bl_iter)); iters.emplace(shard, std::move(p)); @@ -756,8 +851,10 @@ public: uint64_t end_offset; shard_id_map extent_maps; - slice_iterator begin_slice_iterator( - const shard_id_set &out_set); + slice_iterator begin_slice_iterator( + const shard_id_set &out, + DoutPrefixProvider *dpp, + const shard_id_set *dedup_zeros = nullptr); /* This caculates the ro offset for an offset into a particular shard */ uint64_t calc_ro_offset(raw_shard_id_t raw_shard, int shard_offset) const { @@ -890,10 +987,12 @@ public: void append_zeros_to_ro_offset(uint64_t ro_offset); void insert_ro_extent_map(const extent_map &host_extent_map); extent_set get_extent_superset() const; - int encode(const ErasureCodeInterfaceRef &ec_impl); - int _encode(const ErasureCodeInterfaceRef &ec_impl); + int encode(const ErasureCodeInterfaceRef &ec_impl, + DoutPrefixProvider *dpp = nullptr, + shard_id_set *dedup_zeros = nullptr); int encode_parity_delta(const ErasureCodeInterfaceRef &ec_impl, - shard_extent_map_t &old_sem); + shard_extent_map_t &old_sem, + DoutPrefixProvider *dpp); void pad_on_shards(const shard_extent_set_t &pad_to, const shard_id_set &shards); @@ -904,10 +1003,13 @@ public: void trim(const shard_extent_set_t &trim_to); int decode(const ErasureCodeInterfaceRef &ec_impl, const shard_extent_set_t &want, - uint64_t object_size); + uint64_t object_size, + DoutPrefixProvider *dpp = nullptr, + bool dedup_zeros = false); int _decode(const ErasureCodeInterfaceRef &ec_impl, const shard_id_set &want_set, - const shard_id_set &need_set); + const shard_id_set &need_set, + DoutPrefixProvider *dpp); void get_buffer(shard_id_t shard, uint64_t offset, uint64_t length, buffer::list &append_to) const; void get_shard_first_buffer(shard_id_t shard, buffer::list &append_to) const; @@ -977,6 +1079,29 @@ public: return changed; } + template requires is_interval_set_v + void get_sparse_buffer(shard_id_t shard, bufferlist &bl_out, IntervalSetT &iset) { + ceph_assert(bl_out.length() == 0); + if (!extent_maps.contains(shard)) { + return; + } + for (auto iter = extent_maps.at(shard).begin(); iter != extent_maps.at(shard).end(); ++iter) { + uint64_t off = iter.get_off(); + bufferlist &bl = iter.get_val(); + + auto bl_iter = bl.begin(); + for (const auto &bp : bl.buffers()) { + uint64_t len = bp.length(); + if (!bp.is_zero_fast()) { + iset.insert(off, bp.length()); + bl_out.append(bp); + } + off += len; + bl_iter += len; + } + } + } + friend std::ostream &operator<<(std::ostream &lhs, const shard_extent_map_t &rhs); diff --git a/src/test/osd/TestECBackend.cc b/src/test/osd/TestECBackend.cc index a84648d02996c..1dd4faa6e681d 100644 --- a/src/test/osd/TestECBackend.cc +++ b/src/test/osd/TestECBackend.cc @@ -141,25 +141,32 @@ public: int minimum_to_decode(const shard_id_set &want_to_read, const shard_id_set &available, shard_id_set &minimum_set, shard_id_map>> *minimum_sub_chunks) override { - shard_id_t parity_shard_index(data_chunk_count); + bool recover = false; for (shard_id_t shard : want_to_read) { if (available.contains(shard)) { minimum_set.insert(shard); } else { - // Shard is missing. Recover with every other shard and one parity - // for each missing shard. - for (shard_id_t i; i &in, shard_id_map &out) override { - if (in.size() < data_chunk_count) { + if (std::cmp_less(in.size(), data_chunk_count)) { ADD_FAILURE(); } uint64_t len = 0; @@ -258,6 +265,9 @@ public: if (len != bp.length()) { ADD_FAILURE(); } + if (bp.is_zero_fast()) { + ADD_FAILURE(); + } } return 0; } @@ -1189,7 +1199,7 @@ TEST(ECCommon, get_remaining_shards) ECCommon::shard_read_t shard_read; shard_read.subchunk = ecode->default_sub_chunk; shard_read.extents.insert(0,4096); - unsigned int shard_id = i==missing_shard?parity_shard:i; + unsigned int shard_id = std::cmp_equal(i, missing_shard) ? parity_shard : i; shard_read.pg_shard = pg_shard_t(shard_id, shard_id_t(shard_id)); ref.shard_reads[shard_id_t(shard_id)] = shard_read; } @@ -1265,15 +1275,30 @@ TEST(ECCommon, encode) semap.encode(ec_impl); } -TEST(ECCommon, decode) +bufferlist create_buf(uint64_t len) { + bufferlist bl; + + while (bl.length() < len) { + uint64_t pages = std::rand() % 5; + uint64_t len_to_add = std::min(len - bl.length(), pages * EC_ALIGN_SIZE); + bl.append_zero(len_to_add); + } + ceph_assert(bl.is_aligned(EC_ALIGN_SIZE)); + ceph_assert(len == bl.length()); + return bl; +} + + +void test_decode(unsigned int k, unsigned int m, uint64_t chunk_size, uint64_t object_size, const ECUtil::shard_extent_set_t &want, const shard_id_set &acting_set) { - const uint64_t align_size = EC_ALIGN_SIZE; - const uint64_t swidth = 3*align_size; - const unsigned int k = 3; - const unsigned int m = 2; + const uint64_t swidth = k*chunk_size; ECUtil::stripe_info_t s(k, m, swidth, vector(0)); ECListenerStub listenerStub; + listenerStub.acting_shards.clear(); + for (auto s : acting_set) { + listenerStub.acting_shards.insert(pg_shard_t(int(s), s)); + } ASSERT_EQ(s.get_stripe_width(), swidth); ASSERT_EQ(s.get_chunk_size(), swidth/k); @@ -1284,67 +1309,171 @@ TEST(ECCommon, decode) ErasureCodeInterfaceRef ec_impl(ecode); ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub); + ECUtil::shard_extent_map_t semap(&s); - bufferlist bl12k; - bl12k.append_zero(12288); - bufferlist bl8k; - bl8k.append_zero(8192); - bufferlist bl16k; - bl16k.append_zero(16384); - semap.insert_in_shard(shard_id_t(1), 512000, bl12k); - semap.insert_in_shard(shard_id_t(1), 634880, bl12k); - semap.insert_in_shard(shard_id_t(2), 512000, bl12k); - semap.insert_in_shard(shard_id_t(2), 630784, bl16k); - semap.insert_in_shard(shard_id_t(3), 516096, bl8k); - semap.insert_in_shard(shard_id_t(3), 634880, bl12k); - ECUtil::shard_extent_set_t want = semap.get_extent_set(); - - want[shard_id_t(0)].insert(516096, 8192); - want[shard_id_t(0)].insert(634880, 12288); - want[shard_id_t(4)].insert(516096, 8192); - want[shard_id_t(4)].insert(634880, 12288); - - ceph_assert(0 == semap.decode(ec_impl, want, 2*1024*1024)); + hobject_t hoid; + ECCommon::read_request_t read_request(want, false, object_size); + ASSERT_EQ(0, pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request)); + for (auto [shard, read] : read_request.shard_reads) { + for (auto [off, len] : read.extents) { + semap.insert_in_shard(shard, off, create_buf(len)); + } + } + + ASSERT_EQ(0, semap.decode(ec_impl, want, object_size, nullptr, true)); } +TEST(ECCommon, decode) { + unsigned int k = 4; + unsigned int m = 2; + uint64_t chunk_size = 4096; + uint64_t object_size = k * 256 * 1024 + 4096 + 1; + ECUtil::shard_extent_set_t want(k+m); + shard_id_set acting_set; -TEST(ECCommon, decode2) -{ - const unsigned int k = 4; - const unsigned int m = 2; - const uint64_t align_size = EC_ALIGN_SIZE; - const uint64_t swidth = k*align_size; + want[shard_id_t(1)].insert(256 * 1024, 4096); + want[shard_id_t(4)].insert(256 * 1024, 4096); + acting_set.insert_range(shard_id_t(1), 4); + test_decode(k, m, chunk_size, object_size, want, acting_set); +} - ECUtil::stripe_info_t s(k, m, swidth, vector(0)); - ECListenerStub listenerStub; - ASSERT_EQ(s.get_stripe_width(), swidth); - ASSERT_EQ(s.get_chunk_size(), swidth/k); - const std::vector chunk_mapping = {}; // no remapping - ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl(); - ecode->data_chunk_count = k; - ecode->chunk_count = k + m; - ErasureCodeInterfaceRef ec_impl(ecode); - ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub); +TEST(ECCommon, decode2) +{ + unsigned int k = 4; + unsigned int m = 2; + uint64_t chunk_size = 4096; + uint64_t object_size = 2104*1024; + + ECUtil::shard_extent_set_t want(k+m); + shard_id_set acting_set; - ECUtil::shard_extent_map_t semap(&s); - bufferlist bl528k; - bl528k.append_zero(528*1024); - bufferlist bl524k; - bl524k.append_zero(524*1024); - semap.insert_in_shard(shard_id_t(0), 0, bl524k); - semap.insert_in_shard(shard_id_t(1), 0, bl528k); - semap.insert_in_shard(shard_id_t(3), 0, bl524k); - semap.insert_in_shard(shard_id_t(4), 0, bl528k); - ECUtil::shard_extent_set_t want(k + m); - - //shard_want_to_read={1:[0~540672],2:[0~536576],3:[0~536576],4:[0~540672],5:[0~540672]} want[shard_id_t(1)].insert(0, 528*1024); want[shard_id_t(2)].insert(0, 524*1024); want[shard_id_t(3)].insert(0, 524*1024); want[shard_id_t(4)].insert(0, 528*1024); want[shard_id_t(5)].insert(0, 528*1024); - ceph_assert(0 == semap.decode(ec_impl, want, 2104*1024)); + acting_set.insert(shard_id_t(0)); + acting_set.insert(shard_id_t(1)); + acting_set.insert(shard_id_t(3)); + acting_set.insert(shard_id_t(4)); + + test_decode(k, m, chunk_size, object_size, want, acting_set); +} + +TEST(ECCommon, decode3) { + /* For this problematic IO, we want to reads: + * first is readable - shard 0, 0~4k + * second is on missing shard - shard 2, 16k~4k + * + * Recovery would work out it needs to recover shard 2, so would need + * shards 0,1,3,4 - howecer it works out that shard 3 does not need a read + * because the object is off the end! + * + * So the reads we end up doing are to 0,1 and 4 only. + */ + unsigned int k = 4; + unsigned int m = 1; + uint64_t chunk_size = 4096; + uint64_t object_size = 4 * chunk_size * k + 2 * chunk_size + 1; + + ECUtil::shard_extent_set_t want(k+m); + shard_id_set acting_set; + want[shard_id_t(0)].insert(0, chunk_size); + want[shard_id_t(2)].insert(4 * chunk_size, chunk_size); + + acting_set.insert(shard_id_t(0)); + acting_set.insert(shard_id_t(1)); + acting_set.insert(shard_id_t(3)); + acting_set.insert(shard_id_t(4)); + + test_decode(k, m, chunk_size, object_size, want, acting_set); +} + +TEST(ECCommon, decode4) { + const unsigned int k = 5; + const unsigned int m = 2; + const uint64_t chunk_size = 4096; + const uint64_t object_size = 3243718; + + ECUtil::shard_extent_set_t want(k+m); + shard_id_set acting_set; + want[shard_id_t(0)].insert(544768, 106496); + want[shard_id_t(1)].insert(544151, 106799); + want[shard_id_t(2)].insert(540672, 106496); + want[shard_id_t(3)].insert(540672, 106496); + want[shard_id_t(4)].insert(540672, 106496); + + acting_set.insert(shard_id_t(0)); + acting_set.insert_range(shard_id_t(2), 4); + + test_decode(k, m, chunk_size, object_size, want, acting_set); +} + +TEST(ECCommon, decode5) { + const unsigned int k = 6; + const unsigned int m = 4; + const uint64_t chunk_size = 4096; + const uint64_t object_size = 3428595; + + ECUtil::shard_extent_set_t want(k+m); + shard_id_set acting_set; + want[shard_id_t(0)].insert(0, 573440); + want[shard_id_t(1)].insert(0, 573440); + want[shard_id_t(2)].insert(0, 573440); + want[shard_id_t(3)].insert(0, 569587); + want[shard_id_t(4)].insert(0, 569344); + want[shard_id_t(5)].insert(0, 569344); + + acting_set.insert(shard_id_t(0)); + acting_set.insert(shard_id_t(3)); + acting_set.insert_range(shard_id_t(6), 4); + + test_decode(k, m, chunk_size, object_size, want, acting_set); +} + +TEST(ECCommon, decode6) { + const unsigned int k = 8; + const unsigned int m = 4; + const uint64_t chunk_size = 4096; + const uint64_t object_size = 3092488; + + + ECUtil::shard_extent_set_t want(k+m); + shard_id_set acting_set; + want[shard_id_t(0)].insert(262144, 126976); + want[shard_id_t(1)].insert(262144, 126976); + want[shard_id_t(2)].insert(262144, 126976); + want[shard_id_t(3)].insert(262144, 126976); + want[shard_id_t(4)].insert(262144, 122880); + want[shard_id_t(5)].insert(262144, 122880); + want[shard_id_t(6)].insert(262144, 122880); + want[shard_id_t(7)].insert(262144, 122880); + want[shard_id_t(8)].insert(262144, 126976); + want[shard_id_t(9)].insert(262144, 126976); + want[shard_id_t(10)].insert(262144, 126976); + + acting_set.insert(shard_id_t(0)); + acting_set.insert_range(shard_id_t(2), 2); + acting_set.insert_range(shard_id_t(5), 5); + + test_decode(k, m, chunk_size, object_size, want, acting_set); +} + +TEST(ECCommon, decode7) { + const unsigned int k = 3; + const unsigned int m = 3; + const uint64_t chunk_size = 4096; + const uint64_t object_size = 89236; + + + ECUtil::shard_extent_set_t want(k+m); + shard_id_set acting_set; + want[shard_id_t(5)].insert(0, 32*1024); + + acting_set.insert_range(shard_id_t(0), 3); + + test_decode(k, m, chunk_size, object_size, want, acting_set); } \ No newline at end of file diff --git a/src/test/osd/TestECUtil.cc b/src/test/osd/TestECUtil.cc index ccf93d312da5e..c9fa63b8a6b46 100644 --- a/src/test/osd/TestECUtil.cc +++ b/src/test/osd/TestECUtil.cc @@ -646,7 +646,7 @@ TEST(ECUtil, slice_iterator) out_set.insert_range(shard_id_t(0), 3); shard_extent_map_t sem(&sinfo); { - auto iter = sem.begin_slice_iterator(out_set); + auto iter = sem.begin_slice_iterator(out_set, nullptr); ASSERT_TRUE(iter.get_out_bufferptrs().empty()); } @@ -660,7 +660,7 @@ TEST(ECUtil, slice_iterator) sem.insert_in_shard(shard_id_t(0), 0, a); sem.insert_in_shard(shard_id_t(1), 0, b); { - auto iter = sem.begin_slice_iterator(out_set); + auto iter = sem.begin_slice_iterator(out_set, nullptr); { auto out = iter.get_out_bufferptrs(); @@ -699,7 +699,7 @@ TEST(ECUtil, slice_iterator) sem.insert_in_shard(shard_id_t(1), 4096*4, e); { - auto iter = sem.begin_slice_iterator(out_set); + auto iter = sem.begin_slice_iterator(out_set, nullptr); { auto out = iter.get_out_bufferptrs(); @@ -755,7 +755,7 @@ TEST(ECUtil, slice_iterator) sem.insert_in_shard(shard_id_t(1), 4096*2, d); { - auto iter = sem.begin_slice_iterator(out_set); + auto iter = sem.begin_slice_iterator(out_set, nullptr); { auto out = iter.get_out_bufferptrs(); @@ -794,7 +794,7 @@ TEST(ECUtil, slice_iterator_subset_out) out_set.insert(shard_id_t(1)); shard_extent_map_t sem(&sinfo); { - auto iter = sem.begin_slice_iterator(out_set); + auto iter = sem.begin_slice_iterator(out_set, nullptr); ASSERT_TRUE(iter.get_in_bufferptrs().empty()); ASSERT_TRUE(iter.get_out_bufferptrs().empty()); } @@ -809,7 +809,7 @@ TEST(ECUtil, slice_iterator_subset_out) sem.insert_in_shard(shard_id_t(0), 0, a); sem.insert_in_shard(shard_id_t(1), 0, b); { - auto iter = sem.begin_slice_iterator(out_set); + auto iter = sem.begin_slice_iterator(out_set, nullptr); { auto in = iter.get_in_bufferptrs(); @@ -841,7 +841,7 @@ TEST(ECUtil, slice_iterator_subset_out) sem.insert_in_shard(shard_id_t(1), 4096*4, e); { - auto iter = sem.begin_slice_iterator(out_set); + auto iter = sem.begin_slice_iterator(out_set, nullptr); { auto in = iter.get_in_bufferptrs(); @@ -896,7 +896,7 @@ TEST(ECUtil, slice_iterator_subset_out) sem.insert_in_shard(shard_id_t(1), 4096*2, d); { - auto iter = sem.begin_slice_iterator(out_set); + auto iter = sem.begin_slice_iterator(out_set, nullptr); { auto in = iter.get_in_bufferptrs(); -- 2.39.5