From 33d79f303d945c8d6d2e9e9a471efc4df57617a6 Mon Sep 17 00:00:00 2001 From: Myna V Date: Thu, 25 May 2017 11:45:54 +0530 Subject: [PATCH] Introducing sub-chunks. Replaced the old decode, minimum_to_decode with new decode, minimum_to_decode in ErasureCodeInterface. Updated ECBackend, ECUtil to use the new functions.Fixed the test cases to use the new functions. Fixed the review comments. Authors: Myna, Elita. Signed-off-by: Myna Vajha --- src/erasure-code/ErasureCode.cc | 24 ++++++ src/erasure-code/ErasureCode.h | 20 ++++- src/erasure-code/ErasureCodeInterface.h | 19 +++- src/erasure-code/shec/ErasureCodeShec.h | 4 +- src/osd/ECBackend.cc | 86 +++++++++++++------ src/osd/ECBackend.h | 10 +-- src/osd/ECMsgTypes.cc | 15 +++- src/osd/ECMsgTypes.h | 1 + src/osd/ECUtil.cc | 73 ++++++++++------ src/test/erasure-code/ErasureCodeExample.h | 4 +- .../ceph_erasure_code_benchmark.cc | 6 +- .../ceph_erasure_code_non_regression.cc | 2 +- 12 files changed, 189 insertions(+), 75 deletions(-) diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc index c37adedb6c79a..4628e73cccc3e 100644 --- a/src/erasure-code/ErasureCode.cc +++ b/src/erasure-code/ErasureCode.cc @@ -107,6 +107,23 @@ int ErasureCode::minimum_to_decode(const set &want_to_read, return 0; } +int ErasureCode::minimum_to_decode(const set &want_to_read, + const set &available_chunks, + map>> *minimum) +{ + set minimum_shard_ids; + int r = minimum_to_decode(want_to_read, available_chunks, &minimum_shard_ids); + if (r != 0) { + return r; + } + vector> default_subchunks; + default_subchunks.push_back(make_pair(0, get_sub_chunk_count())); + for(auto &&id:minimum_shard_ids){ + minimum->insert(make_pair(id, default_subchunks)); + } + return 0; +} + int ErasureCode::minimum_to_decode_with_cost(const set &want_to_read, const map &available, set *minimum) @@ -215,6 +232,13 @@ int ErasureCode::decode(const set &want_to_read, return decode_chunks(want_to_read, chunks, decoded); } +int ErasureCode::decode(const set &want_to_read, + const map &chunks, + map *decoded, int chunk_size) +{ + return decode(want_to_read, chunks, decoded); +} + int ErasureCode::decode_chunks(const set &want_to_read, const map &chunks, map *decoded) diff --git a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h index fc79cf03a44ae..3ae0ce51f8da5 100644 --- a/src/erasure-code/ErasureCode.h +++ b/src/erasure-code/ErasureCode.h @@ -56,9 +56,17 @@ namespace ceph { return get_chunk_count() - get_data_chunk_count(); } - int minimum_to_decode(const std::set &want_to_read, + virtual int get_sub_chunk_count() { + return 1; + } + + virtual int minimum_to_decode(const std::set &want_to_read, const std::set &available_chunks, - std::set *minimum) override; + std::set *minimum); + + virtual int minimum_to_decode(const std::set &want_to_read, + const std::set &available, + std::map>> *minimum)override; int minimum_to_decode_with_cost(const std::set &want_to_read, const std::map &available, @@ -74,9 +82,13 @@ namespace ceph { int encode_chunks(const std::set &want_to_encode, std::map *encoded) override; - int decode(const std::set &want_to_read, + virtual int decode(const std::set &want_to_read, const std::map &chunks, - std::map *decoded) override; + std::map *decoded); + + int decode(const std::set &want_to_read, + const std::map &chunks, + std::map *decoded, int chunk_size) override; int decode_chunks(const std::set &want_to_read, const std::map &chunks, diff --git a/src/erasure-code/ErasureCodeInterface.h b/src/erasure-code/ErasureCodeInterface.h index 2159ae1ce5d2e..b0c24e1e42a71 100644 --- a/src/erasure-code/ErasureCodeInterface.h +++ b/src/erasure-code/ErasureCodeInterface.h @@ -248,6 +248,16 @@ namespace ceph { */ virtual unsigned int get_coding_chunk_count() const = 0; + /** + * Return the number of sub chunks chunks created by a call to the + * **encode** method. Each chunk can be viewed as union of sub-chunks + * For the case of array codes, the sub-chunk count > 1, where as the + * scalar codes have sub-chunk count = 1. + * + * @return the number of sub-chunks per chunk created by encode() + */ + virtual int get_sub_chunk_count() = 0; + /** * Return the size (in bytes) of a single chunk created by a call * to the **decode** method. The returned size multiplied by @@ -280,12 +290,14 @@ namespace ceph { * * @param [in] want_to_read chunk indexes to be decoded * @param [in] available chunk indexes containing valid data - * @param [out] minimum chunk indexes to retrieve + * @param [out] minimum chunk indexes and corresponding + * subchunk index offsets, count. * @return **0** on success or a negative errno on error. */ virtual int minimum_to_decode(const std::set &want_to_read, const std::set &available, - std::set *minimum) = 0; + std::map>> + *minimum) = 0; /** * Compute the smallest subset of **available** chunks that needs @@ -389,11 +401,12 @@ namespace ceph { * @param [in] want_to_read chunk indexes to be decoded * @param [in] chunks map chunk indexes to chunk data * @param [out] decoded map chunk indexes to chunk data + * @param [in] chunk_size chunk size * @return **0** on success or a negative errno on error. */ virtual int decode(const std::set &want_to_read, const std::map &chunks, - std::map *decoded) = 0; + std::map *decoded, int chunk_size) = 0; virtual int decode_chunks(const std::set &want_to_read, const std::map &chunks, diff --git a/src/erasure-code/shec/ErasureCodeShec.h b/src/erasure-code/shec/ErasureCodeShec.h index 073644e7ad0a6..33c49a320d2a0 100644 --- a/src/erasure-code/shec/ErasureCodeShec.h +++ b/src/erasure-code/shec/ErasureCodeShec.h @@ -73,7 +73,7 @@ public: int minimum_to_decode(const set &want_to_read, const set &available_chunks, - set *minimum) override; + set *minimum); int minimum_to_decode_with_cost(const set &want_to_read, const map &available, @@ -87,7 +87,7 @@ public: int decode(const set &want_to_read, const map &chunks, - map *decoded) override; + map *decoded); int decode_chunks(const set &want_to_read, const map &chunks, map *decoded) override; diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 59d9c6de2c672..e30b280b21dd4 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -257,7 +257,7 @@ struct RecoveryMessages { void read( ECBackend *ec, const hobject_t &hoid, uint64_t off, uint64_t len, - const set &need, + const map>> &need, bool attrs) { list > to_read; to_read.push_back(boost::make_tuple(off, len, 0)); @@ -411,7 +411,8 @@ void ECBackend::handle_recovery_read_complete( from[i->first.shard].claim(i->second); } dout(10) << __func__ << ": " << from << dendl; - int r = ECUtil::decode(sinfo, ec_impl, from, target); + int r; + r = ECUtil::decode(sinfo, ec_impl, from, target); assert(r == 0); if (attrs) { op.xattrs.swap(*attrs); @@ -554,7 +555,7 @@ void ECBackend::continue_recovery_op( ::encode(*(op.hinfo), op.xattrs[ECUtil::get_hinfo_key()]); } - set to_read; + map>> to_read; int r = get_min_avail_to_read_shards( op.hoid, want, true, false, &to_read); if (r != 0) { @@ -995,7 +996,9 @@ void ECBackend::handle_sub_read( ++i) { int r = 0; ECUtil::HashInfoRef hinfo; + int subchunk_size = sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count(); if (!get_parent()->get_pool().allows_ecoverwrites()) { + hinfo = get_hash_info(i->first); if (!hinfo) { r = -EIO; @@ -1007,12 +1010,32 @@ void ECBackend::handle_sub_read( } for (auto j = i->second.begin(); j != i->second.end(); ++j) { bufferlist bl; - r = store->read( - ch, - ghobject_t(i->first, ghobject_t::NO_GEN, shard), - j->get<0>(), - j->get<1>(), - bl, j->get<2>()); + if ((op.subchunks.find(i->first)->second.size() == 1) && + (op.subchunks.find(i->first)->second.front().second == + ec_impl->get_sub_chunk_count())) { + dout(25) << __func__ << " case1: reading the complete chunk/shard." << dendl; + r = store->read( + ch, + ghobject_t(i->first, ghobject_t::NO_GEN, shard), + j->get<0>(), + j->get<1>(), + bl, j->get<2>()); // Allow EIO return + } else { + dout(25) << __func__ << " case2: going to do fragmented read." << dendl; + for (int m = 0; m < (int)j->get<1>(); m += sinfo.get_chunk_size()) { + for (auto &&k:op.subchunks.find(i->first)->second) { + bufferlist bl0; + r = store->read( + ch, + ghobject_t(i->first, ghobject_t::NO_GEN, shard), + j->get<0>() + m + (k.first)*subchunk_size, + (k.second)*subchunk_size, + bl0, j->get<2>()); + bl.claim_append(bl0); + } + } + } + if (r < 0) { get_parent()->clog_error() << "Error " << r << " reading object " @@ -1203,7 +1226,8 @@ void ECBackend::handle_sub_read_reply( have.insert(j->first.shard); dout(20) << __func__ << " have shard=" << j->first.shard << dendl; } - set want_to_read, dummy_minimum; + set want_to_read; + map>> dummy_minimum; get_want_to_read_shards(&want_to_read); int err; if ((err = ec_impl->minimum_to_decode(want_to_read, have, &dummy_minimum)) < 0) { @@ -1550,7 +1574,7 @@ int ECBackend::get_min_avail_to_read_shards( const set &want, bool for_recovery, bool do_redundant_reads, - set *to_read) + map>> *to_read) { // Make sure we don't do redundant reads for recovery assert(!for_recovery || !do_redundant_reads); @@ -1560,23 +1584,25 @@ int ECBackend::get_min_avail_to_read_shards( get_all_avail_shards(hoid, have, shards, for_recovery); - set need; + map>> need; int r = ec_impl->minimum_to_decode(want, have, &need); if (r < 0) return r; if (do_redundant_reads) { - need.swap(have); + vector> subchunks_list; + subchunks_list.push_back(make_pair(0, ec_impl->get_sub_chunk_count())); + for (auto &&i: have) { + need[i] = subchunks_list; + } } if (!to_read) return 0; - for (set::iterator i = need.begin(); - i != need.end(); - ++i) { - assert(shards.count(shard_id_t(*i))); - to_read->insert(shards[shard_id_t(*i)]); + for (auto &&i:need) { + assert(shards.count(shard_id_t(i.first))); + to_read->insert(make_pair(shards[shard_id_t(i.first)], i.second)); } return 0; } @@ -1584,7 +1610,7 @@ int ECBackend::get_min_avail_to_read_shards( int ECBackend::get_remaining_shards( const hobject_t &hoid, const set &avail, - set *to_read, + map>> *to_read, bool for_recovery) { assert(to_read); @@ -1594,12 +1620,14 @@ int ECBackend::get_remaining_shards( get_all_avail_shards(hoid, have, shards, for_recovery); + vector> subchunks; + subchunks.push_back(make_pair(0, ec_impl->get_sub_chunk_count())); for (set::iterator i = have.begin(); i != have.end(); ++i) { assert(shards.count(shard_id_t(*i))); if (avail.find(*i) == avail.end()) - to_read->insert(shards[shard_id_t(*i)]); + to_read->insert(make_pair(shards[shard_id_t(*i)], subchunks)); } return 0; } @@ -1642,15 +1670,17 @@ void ECBackend::do_read_op(ReadOp &op) i != op.to_read.end(); ++i) { bool need_attrs = i->second.want_attrs; - for (set::const_iterator j = i->second.need.begin(); + + for (auto j = i->second.need.begin(); j != i->second.need.end(); ++j) { if (need_attrs) { - messages[*j].attrs_to_read.insert(i->first); + messages[j->first].attrs_to_read.insert(i->first); need_attrs = false; } - op.obj_to_source[i->first].insert(*j); - op.source_to_obj[*j].insert(i->first); + messages[j->first].subchunks[i->first] = j->second; + op.obj_to_source[i->first].insert(j->first); + op.source_to_obj[j->first].insert(i->first); } for (list >::const_iterator j = i->second.to_read.begin(); @@ -1658,10 +1688,10 @@ void ECBackend::do_read_op(ReadOp &op) ++j) { pair chunk_off_len = sinfo.aligned_offset_len_to_chunk(make_pair(j->get<0>(), j->get<1>())); - for (set::const_iterator k = i->second.need.begin(); + for (auto k = i->second.need.begin(); k != i->second.need.end(); ++k) { - messages[*k].to_read[i->first].push_back( + messages[k->first].to_read[i->first].push_back( boost::make_tuple( chunk_off_len.first, chunk_off_len.second, @@ -2273,7 +2303,7 @@ void ECBackend::objects_read_and_reconstruct( map for_read_op; for (auto &&to_read: reads) { - set shards; + map>> shards; int r = get_min_avail_to_read_shards( to_read.first, want_to_read, @@ -2315,7 +2345,7 @@ int ECBackend::send_all_remaining_reads( for (set::iterator i = ots.begin(); i != ots.end(); ++i) already_read.insert(i->shard); dout(10) << __func__ << " have/error shards=" << already_read << dendl; - set shards; + map>> shards; int r = get_remaining_shards(hoid, already_read, &shards, rop.for_recovery); if (r) return r; diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index bcae74e06afca..a65b5a0746153 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -359,12 +359,12 @@ public: }; struct read_request_t { const list > to_read; - const set need; + const map>> need; const bool want_attrs; GenContext &> *cb; read_request_t( const list > &to_read, - const set &need, + const map>> &need, bool want_attrs, GenContext &> *cb) : to_read(to_read), need(need), want_attrs(want_attrs), @@ -598,7 +598,7 @@ public: ++i) { have.insert(i->shard); } - set min; + map>> min; return ec_impl->minimum_to_decode(want, have, &min) == 0; } }; @@ -649,13 +649,13 @@ public: const set &want, ///< [in] desired shards bool for_recovery, ///< [in] true if we may use non-acting replicas bool do_redundant_reads, ///< [in] true if we want to issue redundant reads to reduce latency - set *to_read ///< [out] shards to read + map>> *to_read ///< [out] shards, corresponding subchunks to read ); ///< @return error code, 0 on success int get_remaining_shards( const hobject_t &hoid, const set &avail, - set *to_read, + map>> *to_read, bool for_recovery); int objects_get_attrs( diff --git a/src/osd/ECMsgTypes.cc b/src/osd/ECMsgTypes.cc index 4da2857055835..68539ad9c8140 100644 --- a/src/osd/ECMsgTypes.cc +++ b/src/osd/ECMsgTypes.cc @@ -166,7 +166,7 @@ void ECSubWriteReply::generate_test_instances(list& o) void ECSubRead::encode(bufferlist &bl, uint64_t features) const { if ((features & CEPH_FEATURE_OSD_FADVISE_FLAGS) == 0) { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(from, bl); ::encode(tid, bl); map >> tmp; @@ -181,21 +181,23 @@ void ECSubRead::encode(bufferlist &bl, uint64_t features) const } ::encode(tmp, bl); ::encode(attrs_to_read, bl); + ::encode(subchunks, bl); ENCODE_FINISH(bl); return; } - ENCODE_START(2, 2, bl); + ENCODE_START(3, 2, bl); ::encode(from, bl); ::encode(tid, bl); ::encode(to_read, bl); ::encode(attrs_to_read, bl); + ::encode(subchunks, bl); ENCODE_FINISH(bl); } void ECSubRead::decode(bufferlist::iterator &bl) { - DECODE_START(2, bl); + DECODE_START(3, bl); ::decode(from, bl); ::decode(tid, bl); if (struct_v == 1) { @@ -214,6 +216,13 @@ void ECSubRead::decode(bufferlist::iterator &bl) ::decode(to_read, bl); } ::decode(attrs_to_read, bl); + if (struct_v > 2 && struct_v > struct_compat) { + ::decode(subchunks, bl); + } else { + for (auto &&i:attrs_to_read) { + subchunks[i].push_back(make_pair(0,1)); + } + } DECODE_FINISH(bl); } diff --git a/src/osd/ECMsgTypes.h b/src/osd/ECMsgTypes.h index bfcb5361ae1f0..00dca1be04907 100644 --- a/src/osd/ECMsgTypes.h +++ b/src/osd/ECMsgTypes.h @@ -107,6 +107,7 @@ struct ECSubRead { ceph_tid_t tid; map >> to_read; set attrs_to_read; + map>> subchunks; void encode(bufferlist &bl, uint64_t features) const; void decode(bufferlist::iterator &bl); void dump(Formatter *f) const; diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc index 9c3c7981ea32b..d7a64e4e1808e 100644 --- a/src/osd/ECUtil.cc +++ b/src/osd/ECUtil.cc @@ -3,6 +3,7 @@ #include #include "include/encoding.h" #include "ECUtil.h" + using namespace std; int ECUtil::decode( @@ -36,8 +37,8 @@ int ECUtil::decode( } bufferlist bl; int r = ec_impl->decode_concat(chunks, &bl); - assert(bl.length() == sinfo.get_stripe_width()); assert(r == 0); + assert(bl.length() == sinfo.get_stripe_width()); out->claim_append(bl); } return 0; @@ -48,19 +49,8 @@ int ECUtil::decode( ErasureCodeInterfaceRef &ec_impl, map &to_decode, map &out) { - assert(to_decode.size()); - - uint64_t total_data_size = to_decode.begin()->second.length(); - assert(total_data_size % sinfo.get_chunk_size() == 0); - - for (map::iterator i = to_decode.begin(); - i != to_decode.end(); - ++i) { - assert(i->second.length() == total_data_size); - } - if (total_data_size == 0) - return 0; + assert(to_decode.size()); set need; for (map::iterator i = out.begin(); @@ -71,28 +61,63 @@ int ECUtil::decode( need.insert(i->first); } - for (uint64_t i = 0; i < total_data_size; i += sinfo.get_chunk_size()) { + set avail; + for (auto i = to_decode.begin(); + i != to_decode.end(); + ++i) { + assert(i->second.length() != 0); + avail.insert(i->first); + } + + map>> min; + assert(ec_impl->minimum_to_decode(need, avail, &min) == 0); + + int chunks_count = 0; + map repair_data_per_chunk; + int subchunk_size = sinfo.get_chunk_size()/ec_impl->get_sub_chunk_count(); + + for (map::iterator i = to_decode.begin(); + i != to_decode.end(); + ++i) { + assert(min.find(i->first) != min.end()); + + int repair_subchunk_count = 0; + for (auto j = min[i->first].begin(); + j != min[i->first].end(); ++j) { + repair_subchunk_count += j->second; + } + repair_data_per_chunk[i->first] = repair_subchunk_count*subchunk_size; + assert(i->second.length() % repair_data_per_chunk[i->first] == 0); + + if (i == to_decode.begin()) { + chunks_count = (int) i->second.length() / repair_data_per_chunk[i->first]; + } + else { + assert(chunks_count == + (int) i->second.length() / repair_data_per_chunk[i->first]); + } + } + + for (int i = 0; i < chunks_count; i++) { map chunks; - for (map::iterator j = to_decode.begin(); + for (auto j = to_decode.begin(); j != to_decode.end(); ++j) { - chunks[j->first].substr_of(j->second, i, sinfo.get_chunk_size()); + chunks[j->first].substr_of(j->second, + i*repair_data_per_chunk[j->first], + repair_data_per_chunk[j->first]); } map out_bls; - int r = ec_impl->decode(need, chunks, &out_bls); + int r = ec_impl->decode(need, chunks, &out_bls, sinfo.get_chunk_size()); assert(r == 0); - for (map::iterator j = out.begin(); - j != out.end(); - ++j) { + for (auto j = out.begin(); j != out.end(); ++j) { assert(out_bls.count(j->first)); assert(out_bls[j->first].length() == sinfo.get_chunk_size()); j->second->claim_append(out_bls[j->first]); } } - for (map::iterator i = out.begin(); - i != out.end(); - ++i) { - assert(i->second->length() == total_data_size); + for (auto i = out.begin(); i != out.end(); ++i) { + assert(i->second->length() == chunks_count*sinfo.get_chunk_size()); } return 0; } diff --git a/src/test/erasure-code/ErasureCodeExample.h b/src/test/erasure-code/ErasureCodeExample.h index e180f9c6b0bb5..7e5c9b656a56b 100644 --- a/src/test/erasure-code/ErasureCodeExample.h +++ b/src/test/erasure-code/ErasureCodeExample.h @@ -48,7 +48,7 @@ public: int minimum_to_decode(const set &want_to_read, const set &available_chunks, - set *minimum) override { + set *minimum) { if (includes(available_chunks.begin(), available_chunks.end(), want_to_read.begin(), want_to_read.end())) { *minimum = want_to_read; @@ -146,7 +146,7 @@ public: int decode(const set &want_to_read, const map &chunks, - map *decoded) override { + map *decoded) { // // All chunks have the same size // diff --git a/src/test/erasure-code/ceph_erasure_code_benchmark.cc b/src/test/erasure-code/ceph_erasure_code_benchmark.cc index 13b1160a28313..fcaa16b511682 100644 --- a/src/test/erasure-code/ceph_erasure_code_benchmark.cc +++ b/src/test/erasure-code/ceph_erasure_code_benchmark.cc @@ -219,7 +219,7 @@ int ErasureCodeBench::decode_erasures(const map &all_chunks, want_to_read.insert(chunk); map decoded; - code = erasure_code->decode(want_to_read, chunks, &decoded); + code = erasure_code->decode(want_to_read, chunks, &decoded, 0); if (code) return code; for (set::iterator chunk = want_to_read.begin(); @@ -303,7 +303,7 @@ int ErasureCodeBench::decode() return code; } else if (erased.size() > 0) { map decoded; - code = erasure_code->decode(want_to_read, encoded, &decoded); + code = erasure_code->decode(want_to_read, encoded, &decoded, 0); if (code) return code; } else { @@ -316,7 +316,7 @@ int ErasureCodeBench::decode() chunks.erase(erasure); } map decoded; - code = erasure_code->decode(want_to_read, chunks, &decoded); + code = erasure_code->decode(want_to_read, chunks, &decoded, 0); if (code) return code; } diff --git a/src/test/erasure-code/ceph_erasure_code_non_regression.cc b/src/test/erasure-code/ceph_erasure_code_non_regression.cc index 355191714b7c3..6f5071394ea80 100644 --- a/src/test/erasure-code/ceph_erasure_code_non_regression.cc +++ b/src/test/erasure-code/ceph_erasure_code_non_regression.cc @@ -209,7 +209,7 @@ int ErasureCodeNonRegression::decode_erasures(ErasureCodeInterfaceRef erasure_co } map decoded; - int code = erasure_code->decode(erasures, available, &decoded); + int code = erasure_code->decode(erasures, available, &decoded, 0); if (code) return code; for (set::iterator erasure = erasures.begin(); -- 2.39.5