From 5813ce6c190995d304454d2437acf7fe76632685 Mon Sep 17 00:00:00 2001
From: Radoslaw Zarzynski
Date: Fri, 12 Jan 2024 15:44:51 +0000
Subject: [PATCH] osd: EC Partial Stripe Reads (Retry of #23138 and #52746)

This commit is a further resurrection of the EC partial reads concept;
this time of Mark Nelson's work sent as PR #52746. The modifications in
this commit are mostly about settling Mark's work on top of the recent
rework of `ECBackend` which shared the EC codebase with crimson-osd.

As the original description says, Mark's work is based on an earlier
attempt by Xiaofei Cui. Therefore credits go to:

* Mark Nelson (Clyso),
* Xiaofei Cui (cuixiaofei@sangfor.com.cn).

The original commit description is preserved below:

> This is a re-implementation of PR #23138 rebased on main with a couple
> of nitpicky changes to make the code a little more clear (to me at
> least). Credit goes to Xiaofei Cui
> [cuixiaofei@sangfor.com.cn](mailto:cuixiaofei@sangfor.com.cn) for the
> original implementation.
>
> Looking at the original PR's review, it does not appear that we can
> use the same technique as in 468ad4b. We don't have the ReadOp yet.
> I'm not sure if @gregsforytwo's idea to query the plugin works, but
> it's clear we are not doing the efficient thing from the get-go here.
>
> The performance and efficiency benefits for small random reads appear
> to be quite substantial, especially for large stripe widths.

Signed-off-by: Radoslaw Zarzynski
---
 src/erasure-code/ErasureCode.cc             |  26 +++-
 src/erasure-code/ErasureCode.h              |   2 +
 src/erasure-code/ErasureCodeInterface.h     |   6 +
 src/osd/ECBackend.cc                        |   9 +-
 src/osd/ECCommon.cc                         | 147 +++++++++++++++++-
 src/osd/ECCommon.h                          |  51 +++++-
 src/osd/ECUtil.cc                           |   2 +
 src/osd/ECUtil.h                            |  15 +-
 .../erasure-code/TestErasureCodeExample.cc  |   7 +-
 9 files changed, 254 insertions(+), 11 deletions(-)

diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc
index 6784fa355cf7d..b0b5e414bc3c1 100644
--- a/src/erasure-code/ErasureCode.cc
+++ b/src/erasure-code/ErasureCode.cc
@@ -352,17 +352,41 @@ int ErasureCode::decode_concat(const map<int, bufferlist> &chunks,
 			      bufferlist *decoded)
 {
   set<int> want_to_read;
+  set<int> decode_chunks;
+  bool need_decode = false;
   for (unsigned int i = 0; i < get_data_chunk_count(); i++) {
     want_to_read.insert(chunk_index(i));
   }
+#if 1
+  if (chunks.size() < get_data_chunk_count()) {
+    // for partial_read
+    for (const auto& [key, bl] : chunks) {
+      if (want_to_read.contains(key)) {
+        decode_chunks.insert(key);
+      } else {
+        need_decode = true;
+        break;
+      }
+    }
+    if (!need_decode) {
+      want_to_read.swap(decode_chunks);
+    }
+  }
+#endif
   map<int, bufferlist> decoded_map;
   int r = _decode(want_to_read, chunks, &decoded_map);
   if (r == 0) {
     for (unsigned int i = 0; i < get_data_chunk_count(); i++) {
-      decoded->claim_append(decoded_map[chunk_index(i)]);
+      if (decoded_map.contains(chunk_index(i))) {
+        decoded->claim_append(decoded_map[chunk_index(i)]);
+      }
     }
   }
   return r;
 }
+
+bool ErasureCode::is_systematic() const {
+  return true;
+}
 }
diff --git a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
index fd6d1a41f714d..5813109b70038 100644
--- a/src/erasure-code/ErasureCode.h
+++ b/src/erasure-code/ErasureCode.h
@@ -115,6 +115,8 @@ namespace ceph {
     int decode_concat(const std::map<int, bufferlist> &chunks,
 		      bufferlist *decoded) override;
 
+    bool is_systematic() const override;
+
   protected:
     int parse(const ErasureCodeProfile &profile,
 	      std::ostream *ss);
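
To illustrate the decode_concat() fast path added above: in a systematic
code every data chunk is a verbatim slice of the original object, so when
all available chunks are wanted data chunks, want_to_read can be shrunk to
exactly what we hold and reconstruction skipped. Below is a minimal
standalone sketch of that selection logic, not the Ceph code itself:
std::string stands in for bufferlist, a fixed k replaces
get_data_chunk_count(), and chunk_index() is assumed to be the identity.

#include <iostream>
#include <map>
#include <set>
#include <string>

// Illustrative stand-in for get_data_chunk_count().
constexpr unsigned k = 4;

// Decide which chunk set to ask the decoder for, mirroring the shape of
// the patched decode_concat(): with fewer than k chunks present, and all
// of them being wanted data chunks, shrink want_to_read to exactly the
// chunks already in hand and avoid reconstruction entirely.
std::set<int> choose_want_to_read(const std::map<int, std::string>& chunks) {
  std::set<int> want_to_read;
  for (unsigned i = 0; i < k; ++i)
    want_to_read.insert(static_cast<int>(i));

  if (chunks.size() < k) {
    std::set<int> decode_chunks;
    bool need_decode = false;
    for (const auto& [id, bytes] : chunks) {
      if (want_to_read.contains(id)) {
        decode_chunks.insert(id);   // a data chunk we already hold
      } else {
        need_decode = true;         // a parity chunk: a real decode is needed
        break;
      }
    }
    if (!need_decode)
      want_to_read.swap(decode_chunks);
  }
  return want_to_read;
}

int main() {
  // Two of four data chunks present, no parity: no decode required.
  std::map<int, std::string> partial = {{0, "abcd"}, {1, "efgh"}};
  for (int id : choose_want_to_read(partial))
    std::cout << id << ' ';         // prints: 0 1
  std::cout << '\n';
}
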
diff --git a/src/erasure-code/ErasureCodeInterface.h b/src/erasure-code/ErasureCodeInterface.h
index 7107f978dd4fb..2e5c3d91e3ace 100644
--- a/src/erasure-code/ErasureCodeInterface.h
+++ b/src/erasure-code/ErasureCodeInterface.h
@@ -459,6 +459,12 @@ namespace ceph {
      */
     virtual int decode_concat(const std::map<int, bufferlist> &chunks,
 			      bufferlist *decoded) = 0;
+
+
+    /**
+     * @return **true** if the EC plugin's data placement is systematic.
+     */
+    virtual bool is_systematic() const = 0;
   };
 
   typedef std::shared_ptr<ErasureCodeInterface> ErasureCodeInterfaceRef;
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index e2de6dfee9860..4c19cc2bf3b36 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -1233,10 +1233,10 @@ void ECBackend::handle_sub_read_reply(
 	 ++j, ++req_iter, ++riter) {
       ceph_assert(req_iter != rop.to_read.find(i->first)->second.to_read.end());
       ceph_assert(riter != rop.complete[i->first].returned.end());
-      pair<uint64_t, uint64_t> adjusted =
+      pair<uint64_t, uint64_t> aligned =
 	sinfo.aligned_offset_len_to_chunk(
 	  make_pair(req_iter->get<0>(), req_iter->get<1>()));
-      ceph_assert(adjusted.first == j->first);
+      ceph_assert(aligned.first == j->first);
       riter->get<2>()[from] = std::move(j->second);
     }
   }
@@ -1554,9 +1554,14 @@ void ECBackend::objects_read_async(
 	 to_read.begin();
        i != to_read.end();
        ++i) {
+#if 0
     pair<uint64_t, uint64_t> tmp =
       sinfo.offset_len_to_stripe_bounds(
 	make_pair(i->first.get<0>(), i->first.get<1>()));
+#else
+    pair<uint64_t, uint64_t> tmp =
+      make_pair(i->first.get<0>(), i->first.get<1>());
+#endif
     es.union_insert(tmp.first, tmp.second);
     flags |= i->first.get<2>();
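
The reason objects_read_async() can now pass raw extents down is that the
read pipeline picks the alignment itself: full-stripe bounds for a classic
read, chunk bounds for a partial one. Here is a rough standalone sketch of
the two roundings, assuming k = 4 data chunks of 4 KiB; the constants and
function names are illustrative only, not the actual stripe_info_t API:

#include <cstdint>
#include <iostream>
#include <utility>

constexpr uint64_t chunk_size = 4096;            // assumed chunk size
constexpr uint64_t stripe_width = 4 * chunk_size; // assumed k = 4

// Round (off, len) outward to whole stripes: what a full-stripe read does.
std::pair<uint64_t, uint64_t> to_stripe_bounds(uint64_t off, uint64_t len) {
  uint64_t aligned_off = off - (off % stripe_width);
  uint64_t tail = (off - aligned_off) + len;
  uint64_t aligned_len = (tail % stripe_width)
      ? tail - (tail % stripe_width) + stripe_width
      : tail;
  return {aligned_off, aligned_len};
}

// Round (off, len) outward to whole chunks: the partial-read alignment,
// the same arithmetic as the patch's offset_len_to_chunk_bounds().
std::pair<uint64_t, uint64_t> to_chunk_bounds(uint64_t off, uint64_t len) {
  uint64_t aligned_off = off - (off % chunk_size);
  uint64_t tail = (off - aligned_off) + len;
  uint64_t aligned_len = (tail % chunk_size)
      ? tail - (tail % chunk_size) + chunk_size
      : tail;
  return {aligned_off, aligned_len};
}

int main() {
  // A 1000-byte read at offset 5000, inside the first stripe.
  auto [s_off, s_len] = to_stripe_bounds(5000, 1000);
  auto [c_off, c_len] = to_chunk_bounds(5000, 1000);
  std::cout << "stripe-aligned: off=" << s_off << " len=" << s_len << '\n';
  // stripe-aligned: off=0 len=16384 (the whole stripe, all k shards)
  std::cout << "chunk-aligned:  off=" << c_off << " len=" << c_len << '\n';
  // chunk-aligned:  off=4096 len=4096 (one chunk, a single shard)
}
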
diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc
index 6b16431e01fc0..3fc54dce9f03e 100644
--- a/src/osd/ECCommon.cc
+++ b/src/osd/ECCommon.cc
@@ -298,6 +298,30 @@ int ECCommon::ReadPipeline::get_min_avail_to_read_shards(
   return 0;
 }
 
+void ECCommon::ReadPipeline::get_min_want_to_read_shards(
+  pair<uint64_t, uint64_t> off_len,
+  set<int> *want_to_read)
+{
+  uint64_t off = off_len.first;
+  uint64_t len = off_len.second;
+  uint64_t chunk_size = sinfo.get_chunk_size();
+  int data_chunk_count = sinfo.get_stripe_width() / sinfo.get_chunk_size();
+  const vector<int> &chunk_mapping = ec_impl->get_chunk_mapping();
+
+  int total_chunks = (chunk_size - 1 + len) / chunk_size;
+  int first_chunk = (off / chunk_size) % data_chunk_count;
+
+  if (total_chunks > data_chunk_count) {
+    total_chunks = data_chunk_count;
+  }
+
+  for (int i = 0; i < total_chunks; i++) {
+    int j = (first_chunk + i) % data_chunk_count;
+    int chunk = (int)chunk_mapping.size() > j ? chunk_mapping[j] : j;
+    want_to_read->insert(chunk);
+  }
+}
+
 int ECCommon::ReadPipeline::get_remaining_shards(
   const hobject_t &hoid,
   const set<int> &avail,
@@ -478,11 +502,21 @@ struct ClientReadCompleter : ECCommon::ReadCompleter {
     ceph_assert(res.returned.size() == to_read.size());
     ceph_assert(res.errors.empty());
     for (auto &&read: to_read) {
+#if 0
      pair<uint64_t, uint64_t> adjusted =
        read_pipeline.sinfo.offset_len_to_stripe_bounds(
          make_pair(read.get<0>(), read.get<1>()));
      ceph_assert(res.returned.front().get<0>() == adjusted.first);
      ceph_assert(res.returned.front().get<1>() == adjusted.second);
+#else
+      auto bounds = make_pair(read.get<0>(), read.get<1>());
+      auto aligned = read_pipeline.sinfo.offset_len_to_stripe_bounds(bounds);
+      if (aligned.first != read.get<0>() || aligned.second != read.get<1>()) {
+        aligned = read_pipeline.sinfo.offset_len_to_chunk_bounds(bounds);
+      }
+      ceph_assert(res.returned.front().get<0>() == aligned.first);
+      ceph_assert(res.returned.front().get<1>() == aligned.second);
+#endif
       map<int, bufferlist> to_decode;
       bufferlist bl;
       for (map<pg_shard_t, bufferlist>::iterator j =
@@ -501,11 +535,19 @@ struct ClientReadCompleter : ECCommon::ReadCompleter {
         goto out;
       }
       bufferlist trimmed;
+#if 0
      trimmed.substr_of(
        bl,
        read.get<0>() - adjusted.first,
        std::min(read.get<1>(),
                 bl.length() - (read.get<0>() - adjusted.first)));
+#else
+      // XXX: hmm, just refactor, at least at first glance
+      auto off = read.get<0>() - aligned.first;
+      auto len =
+        std::min(read.get<1>(), bl.length() - (read.get<0>() - aligned.first));
+      trimmed.substr_of(bl, off, len);
+#endif
       result.insert(
         read.get<0>(), trimmed.length(), std::move(trimmed));
       res.returned.pop_front();
@@ -524,6 +566,46 @@ out:
   ECCommon::ClientAsyncReadStatus *status;
 };
 
+bool ECCommon::ReadPipeline::should_partial_read(
+  const hobject_t &hoid,
+  std::list<boost::tuple<uint64_t, uint64_t, uint32_t>> to_read,
+  const std::set<int> &want,
+  bool fast_read,
+  bool for_recovery)
+{
+  // Don't partial read if we are doing a fast_read
+  if (fast_read) {
+    return false;
+  }
+  // Don't partial read if the EC isn't systematic
+  if (!ec_impl->is_systematic()) {
+    return false;
+  }
+  // Don't partial read if we have multiple stripes
+  if (to_read.size() != 1) {
+    return false;
+  }
+  // Only partial read if the length is inside the stripe boundary
+  auto read = to_read.front();
+  auto bounds = make_pair(read.get<0>(), read.get<1>());
+  auto aligned = sinfo.offset_len_to_stripe_bounds(bounds);
+  if (sinfo.get_stripe_width() != aligned.second) {
+    return false;
+  }
+
+  set<int> have;
+  map<shard_id_t, pg_shard_t> shards;
+  set<pg_shard_t> error_shards;
+  get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+
+  set<int> data_shards;
+  get_want_to_read_shards(&data_shards);
+
+  return includes(data_shards.begin(), data_shards.end(),
+                  want.begin(), want.end())
+    && includes(have.begin(), have.end(), want.begin(), want.end());
+}
+
 void ECCommon::ReadPipeline::objects_read_and_reconstruct(
   const map<hobject_t, std::list<boost::tuple<uint64_t, uint64_t, uint32_t>>> &reads,
   bool fast_read,
   GenContextURef<ECCommon::ec_extents_t &&> &&func)
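
should_partial_read() only green-lights the optimization;
get_min_want_to_read_shards() (added earlier in this file) then picks the
minimal shard set. A standalone sketch of that arithmetic follows, with
the stripe parameters passed in explicitly for illustration; the real code
pulls them from stripe_info_t and the plugin's get_chunk_mapping():

#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

// Given a chunk-aligned extent, select only the data shards it touches.
void min_want_to_read_shards(uint64_t off, uint64_t len,
                             uint64_t chunk_size, int data_chunk_count,
                             const std::vector<int>& chunk_mapping,
                             std::set<int>* want_to_read) {
  // Number of chunks the extent touches, capped at k.
  int total_chunks = static_cast<int>((chunk_size - 1 + len) / chunk_size);
  if (total_chunks > data_chunk_count)
    total_chunks = data_chunk_count;

  // The data chunk the extent starts in; walk round-robin from there.
  int first_chunk = static_cast<int>((off / chunk_size) % data_chunk_count);
  for (int i = 0; i < total_chunks; ++i) {
    int j = (first_chunk + i) % data_chunk_count;
    // Honor the plugin's logical-to-physical chunk mapping, if it has one.
    int chunk =
        static_cast<int>(chunk_mapping.size()) > j ? chunk_mapping[j] : j;
    want_to_read->insert(chunk);
  }
}

int main() {
  // k = 4 data chunks of 4 KiB: a chunk-aligned 8 KiB read at offset 20480
  // (stripe 1, chunk 1) needs only shards 1 and 2 instead of all four.
  std::set<int> want;
  min_want_to_read_shards(20480, 8192, 4096, 4, /*chunk_mapping=*/{}, &want);
  for (int s : want)
    std::cout << s << ' ';  // prints: 1 2
  std::cout << '\n';
}
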
@@ -537,11 +619,49 @@ void ECCommon::ReadPipeline::objects_read_and_reconstruct(
   }
 
   map<hobject_t, set<int>> obj_want_to_read;
-  set<int> want_to_read;
-  get_want_to_read_shards(&want_to_read);
 
   map<hobject_t, read_request_t> for_read_op;
   for (auto &&to_read: reads) {
+    set<int> want_to_read;
+    get_want_to_read_shards(&want_to_read);
+#if 1
+    std::list<boost::tuple<uint64_t, uint64_t, uint32_t>> align_offsets;
+    bool partial_read = should_partial_read(
+      to_read.first,
+      to_read.second,
+      want_to_read,
+      fast_read,
+      false);
+
+    // Our extent set and flags
+    extent_set es;
+    uint32_t flags = 0;
+
+    for (auto read : to_read.second) {
+      auto bounds = make_pair(read.get<0>(), read.get<1>());
+
+      // By default, align to the stripe
+      auto aligned = sinfo.offset_len_to_stripe_bounds(bounds);
+      if (partial_read) {
+        // align to the chunk instead
+        aligned = sinfo.offset_len_to_chunk_bounds(bounds);
+        set<int> new_want_to_read;
+        get_min_want_to_read_shards(aligned, &new_want_to_read);
+        want_to_read = new_want_to_read;
+      }
+      // Add the new extents/flags
+      extent_set new_es;
+      new_es.insert(aligned.first, aligned.second);
+      es.union_of(new_es);
+      flags |= read.get<2>();
+    }
+    if (!es.empty()) {
+      for (auto e = es.begin(); e != es.end(); ++e) {
+        align_offsets.push_back(
+          boost::make_tuple(e.get_start(), e.get_len(), flags));
+      }
+    }
+#endif
     map<pg_shard_t, vector<pair<int, int>>> shards;
     int r = get_min_avail_to_read_shards(
       to_read.first,
@@ -555,9 +675,14 @@ void ECCommon::ReadPipeline::objects_read_and_reconstruct(
       make_pair(
 	to_read.first,
 	read_request_t(
+#if 1
+	  align_offsets,
+#else
 	  to_read.second,
+#endif
 	  shards,
-	  false)));
+	  false,
+	  partial_read)));
     obj_want_to_read.insert(make_pair(to_read.first, want_to_read));
   }
@@ -589,6 +714,19 @@ int ECCommon::ReadPipeline::send_all_remaining_reads(
   list<boost::tuple<uint64_t, uint64_t, uint32_t>> to_read =
     rop.to_read.find(hoid)->second.to_read;
 
+  bool partial_read = rop.to_read.find(hoid)->second.partial_read;
+  // realign the offset and len to make partial reads normal reads.
+  if (partial_read) {
+    list<boost::tuple<uint64_t, uint64_t, uint32_t>> new_to_read;
+    for (const auto& read : to_read) {
+      auto bounds = make_pair(read.get<0>(), read.get<1>());
+      auto aligned = sinfo.offset_len_to_stripe_bounds(bounds);
+      new_to_read.push_back(
+        boost::make_tuple(aligned.first, aligned.second, read.get<2>()));
+    }
+    to_read = new_to_read;
+  }
+
   // (Note cuixf) If we need to read attrs and we read failed, try to read again.
   bool want_attrs =
     rop.to_read.find(hoid)->second.want_attrs &&
@@ -604,6 +742,9 @@ int ECCommon::ReadPipeline::send_all_remaining_reads(
 	to_read,
 	shards,
 	want_attrs)));
+  if (partial_read) {
+    rop.refresh_complete(hoid);
+  }
   return 0;
 }
diff --git a/src/osd/ECCommon.h b/src/osd/ECCommon.h
index 7112080cb2a82..a27c2e7065958 100644
--- a/src/osd/ECCommon.h
+++ b/src/osd/ECCommon.h
@@ -231,11 +231,13 @@ struct ECCommon {
     const std::list<boost::tuple<uint64_t, uint64_t, uint32_t>> to_read;
     std::map<pg_shard_t, std::vector<std::pair<int, int>>> need;
     bool want_attrs;
+    bool partial_read;
     read_request_t(
       const std::list<boost::tuple<uint64_t, uint64_t, uint32_t>> &to_read,
       const std::map<pg_shard_t, std::vector<std::pair<int, int>>> &need,
-      bool want_attrs)
-      : to_read(to_read), need(need), want_attrs(want_attrs) {}
+      bool want_attrs,
+      bool partial_read=false)
+      : to_read(to_read), need(need), want_attrs(want_attrs), partial_read(partial_read) {}
   };
   friend std::ostream &operator<<(std::ostream &lhs, const read_request_t &rhs);
   struct ReadOp;
@@ -364,6 +366,22 @@ struct ECCommon {
     ReadOp() = delete;
     ReadOp(const ReadOp &) = delete; // due to on_complete being unique_ptr
     ReadOp(ReadOp &&) = default;
+
+    void refresh_complete(const hobject_t &hoid) {
+      std::list<
+	boost::tuple<
+	  uint64_t, uint64_t, std::map<pg_shard_t, bufferlist>>> new_returned;
+      auto returned = complete[hoid].returned;
+      auto reads = to_read.find(hoid)->second.to_read;
+
+      auto r = returned.begin();
+      for (auto read : reads) {
+	new_returned.push_back(
+	  boost::make_tuple(read.get<0>(), read.get<1>(), r->get<2>()));
+	++r;
+      }
+      complete[hoid].returned = new_returned;
+    }
   };
   struct ReadPipeline {
     void objects_read_and_reconstruct(
@@ -371,6 +389,13 @@ struct ECCommon {
       bool fast_read,
       GenContextURef<ec_extents_t &&> &&func);
 
+    bool should_partial_read(
+      const hobject_t &hoid,
+      std::list<boost::tuple<uint64_t, uint64_t, uint32_t>> to_read,
+      const std::set<int> &want,
+      bool fast_read,
+      bool for_recovery);
+
     template <typename F>
     void filter_read_op(
       const OSDMapRef& osdmap,
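
refresh_complete() above exists because a partial read that hits a shard
error is retried as a stripe-aligned read: send_all_remaining_reads()
realigns the extents in to_read, so the completion records keyed by the
old chunk-aligned extents must be re-filed under the new ones. Here is a
simplified standalone sketch of that bookkeeping, with std::tuple,
std::string, and int standing in for boost::tuple, bufferlist, and
pg_shard_t; all of these are stand-ins, not the real types:

#include <cstdint>
#include <iostream>
#include <list>
#include <map>
#include <string>
#include <tuple>

using shard_map  = std::map<int, std::string>;
using returned_t = std::list<std::tuple<uint64_t, uint64_t, shard_map>>;
using reads_t    = std::list<std::tuple<uint64_t, uint64_t, uint32_t>>;

// Re-key each completion record to the realigned extent while keeping the
// per-shard data gathered so far, as ReadOp::refresh_complete() does.
returned_t refresh_complete(const reads_t& realigned_reads,
                            const returned_t& returned) {
  returned_t out;
  auto r = returned.begin();
  for (const auto& [off, len, flags] : realigned_reads) {
    // File the buffered shard data under the extent the retry will read.
    out.emplace_back(off, len, std::get<2>(*r));
    ++r;
  }
  return out;
}

int main() {
  // One partial read of chunk (4096, 4096), realigned to stripe (0, 16384).
  returned_t before;
  before.emplace_back(4096, 4096, shard_map{{1, "chunk1-bytes"}});
  reads_t realigned;
  realigned.emplace_back(0, 16384, 0);
  for (const auto& [off, len, shards] : refresh_complete(realigned, before))
    std::cout << "extent " << off << "+" << len
              << " keeps " << shards.size() << " shard buffer(s)\n";
}
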
@@ -430,6 +455,28 @@ struct ECCommon {
       parent(parent) {
     }
 
+    /**
+     * The basic idea here is that we only call this function when there is
+     * a partial read. The criteria for performing a partial read involve
+     * checking to make sure that we don't read across multiple stripes and
+     * that the read is within the stripe boundary (see should_partial_read).
+     *
+     * While get_want_to_read_shards creates a want_to_read based on the EC
+     * plugin's get_data_chunk_count(), this instead uses the number of
+     * chunks necessary to read the length of data and only inserts those
+     * chunks. Just like in get_want_to_read_shards, we check the plugin's
+     * mapping, but the difference is that we start at first_chunk and we
+     * end when we no longer have chunks based on the read's length.
+     *
+     * The resulting want_to_read has fewer chunks than a normal read, and
+     * thus gets intercepted in ErasureCode::decode_concat to be handled
+     * differently than when get_want_to_read_shards is used and we decode
+     * all data chunks.
+     */
+    void get_min_want_to_read_shards(
+      std::pair<uint64_t, uint64_t> off_len, ///< [in]
+      std::set<int> *want_to_read            ///< [out]
+    );
+
     int get_remaining_shards(
       const hobject_t &hoid,
       const std::set<int> &avail,
diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc
index 94b32845847a6..e1cb9de43e09b 100644
--- a/src/osd/ECUtil.cc
+++ b/src/osd/ECUtil.cc
@@ -41,7 +41,9 @@ int ECUtil::decode(
     bufferlist bl;
     int r = ec_impl->decode_concat(chunks, &bl);
     ceph_assert(r == 0);
+#if 0 //ndef EC_PARTIAL_READ
     ceph_assert(bl.length() == sinfo.get_stripe_width());
+#endif
     out->claim_append(bl);
   }
   return 0;
diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h
index dce78b8a8683e..28c2f3bd69da4 100644
--- a/src/osd/ECUtil.h
+++ b/src/osd/ECUtil.h
@@ -66,9 +66,11 @@ public:
   }
   std::pair<uint64_t, uint64_t> aligned_offset_len_to_chunk(
     std::pair<uint64_t, uint64_t> in) const {
+    // we need to align to stripe again to deal with partial chunk read.
+    std::pair<uint64_t, uint64_t> aligned = offset_len_to_stripe_bounds(in);
     return std::make_pair(
-      aligned_logical_offset_to_chunk_offset(in.first),
-      aligned_logical_offset_to_chunk_offset(in.second));
+      aligned_logical_offset_to_chunk_offset(aligned.first),
+      aligned_logical_offset_to_chunk_offset(aligned.second));
   }
   std::pair<uint64_t, uint64_t> offset_len_to_stripe_bounds(
     std::pair<uint64_t, uint64_t> in) const {
@@ -77,6 +79,15 @@ public:
       (in.first - off) + in.second);
     return std::make_pair(off, len);
   }
+  std::pair<uint64_t, uint64_t> offset_len_to_chunk_bounds(
+    std::pair<uint64_t, uint64_t> in) const {
+    uint64_t off = in.first - (in.first % chunk_size);
+    uint64_t tmp_len = (in.first - off) + in.second;
+    uint64_t len = ((tmp_len % chunk_size) ?
+      (tmp_len - (tmp_len % chunk_size) + chunk_size) :
+      tmp_len);
+    return std::make_pair(off, len);
+  }
 };
 
 int decode(
diff --git a/src/test/erasure-code/TestErasureCodeExample.cc b/src/test/erasure-code/TestErasureCodeExample.cc
index b488a604b61e7..9fcf5c3632885 100644
--- a/src/test/erasure-code/TestErasureCodeExample.cc
+++ b/src/test/erasure-code/TestErasureCodeExample.cc
@@ -197,9 +197,14 @@ TEST(ErasureCodeExample, decode)
   usable.substr_of(out, 0, in.length());
   EXPECT_TRUE(usable == in);
 
+  // partial chunk decode
+  map<int, bufferlist> partial_decode;
+  partial_decode[0] = encoded[0];
+  EXPECT_EQ(0, example.decode_concat(partial_decode, &out));
+
   // cannot recover
   map<int, bufferlist> degraded;
-  degraded[0] = encoded[0];
+  degraded[2] = encoded[2];
   EXPECT_EQ(-ERANGE, example.decode_concat(degraded, &out));
 }
-- 
2.39.5