From d94805c09faf9c37a012ae8e0ee5bde284837518 Mon Sep 17 00:00:00 2001 From: Alex Ainscow Date: Wed, 18 Jun 2025 20:46:49 +0100 Subject: [PATCH] osd: Multiple Decode fixes. Fix 1: These are multiple fixes that affected the same code. To simplify review and understanding of the changes, they have been merged into a single commit. What happened in defect is (k,m = 6,4) 1. State is: fast_reads = true, shards 0,4,5,6,7,8 are available. Shard 1 is missing this object. 2. Shard 5 only needs zeros, so read is dropped. Other sub read message sent. 3. Object on shard 1 completes recovery (so becomes not-missing) 4. Read completes, complete notices that it only has 5 reads, so calculates what it needs to re-read. 5. Calculates it needs 0,1,4,5,6,7 - and so wants to read shard 1. 6. Code assumes that enough reads should have been performed, so refused to do another reads and instead generates an EIO. The problem here is some "lazy" code in step (4). What is should be doing is working out that it can use the zero buffers and not calling get_remaining_reads(). Instead, what it attempts to do is call get_remaining_reads() and if there is no work to do, then it assumes it has everything already and completes the read with success. This assumption mostly works - but in this combination of fast_reads picking less than k shards to read from AND an object completing recovery in parallel causes issue. The solution is to wait for all reads to complete and then assume that any remaining zero buffers count as completed reads. This should then cause the plugin to declare "success" Fix 2: There are decodes in new EC which can occur when less than k shards have been read. These reads in the last stripe, where for decoding purposes, the data past the end of the shard can be considered zeros. EC does not read these, but instead relies on the decode function inventing the zero buffers. This was working correctly when fast reads were turned off, but if such an IO was encountered with fast reads turned on the logic was disabled and the IO returns an EIO. This commit fixes that logic, so that if all reads have complete and send_all_remaining_reads conveys that no new reads were requested, then decode will still be possible. FIX 3: If reading the end of an object with unequally sized objects, we pad out the end of the decode with zeros, to provide the correct data to the plugin. Previously, the code decided not to add the zeros to "needed" shards. This caused a problem where for some parity-only decodes, an incomplete set of zeros was generated, fooling the _decode function into thinking that the entire shard was zeros. In the fix, we need to cope with the case where the only data needed from the shard is the padding itself. The comments around the new code describe the logic behind the change. This makes the encode-specific use case of padding out the to-be-decoded shards unnecessary, as this is performed by the pad_on_shards function below. Also fixing some logic in calculating the need_set being passed to the decode function did not contain the extra shards needed for the decode. This need_set is actually ignored by all the plugins as far as I know, but being wrong does not seem helpful if its needed in the future. Fix 4: Extend reads when recovering parity Here is an example use case which was going wrong: 1. Start with 3+2 EC, shards 0,3,4 are 8k shard 1,2 is 4k 2. Perform a recovery, where we recover 2 and 4. 2 is missing, 4 can be copied from another OSD. 3. Recovery works out that it can do the whole recovery with shards 0,1,3. (note not 4) 4. So the "need" set is 0,1,3, the "want" set is 2,4 and the "have" set is 0,1,3,4,5 5. The logic in get_all_avail_shards then tries to work out the extents it needs - it only. looks at 2, because we "have" 4 6. Result is that we end up reading 4k on 0,1,3, then attempt to recover 8k on shard 4 from this... which clearly does not work. Fix 5: Round up padding to 4k alignment in EC The pad_on_shards was not aligning to 4k. However, the decode/encode functions were. This meant that we assigned a new buffer, then added another after - this should be faster. Fix 6: Do not invent encode buffers before doing decode. In this bug, during recovery, we could potentially be creating unwanted encode buffers and using them to decode data buffers. This fix simply removes the bad code, as there is new code above which is already doing the correct action. Fix 7: Fix miscompare with missing decodes. In this case, two OSDs failed at once. One was replaced and the other was not. This caused us to attempt to encode a missing shard while another shard was missing, which caused a miscompare because the recovery failed to do the decode properly before doing an encode. Signed-off-by: Alex Ainscow (cherry picked from commit c116b8615d68a3926dc78a4965cc0a28ff85d4f2) --- src/osd/ECBackend.cc | 70 ++++++++++++++++++++++++--------- src/osd/ECBackend.h | 2 +- src/osd/ECCommon.cc | 22 ++++++++++- src/osd/ECCommon.h | 4 ++ src/osd/ECUtil.cc | 74 +++++++++++++++++++++-------------- src/osd/ECUtil.h | 11 +----- src/test/osd/TestECBackend.cc | 10 ++--- 7 files changed, 126 insertions(+), 67 deletions(-) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 0d429d5c2302c..cd06b0a1bbc64 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -312,7 +312,7 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete( const hobject_t &hoid, ECUtil::shard_extent_map_t &&buffers_read, std::optional>> attrs, - const ECUtil::shard_extent_set_t &want_to_read, + read_request_t &req, RecoveryMessages *m) { dout(10) << __func__ << ": returned " << hoid << " " << buffers_read << dendl; ceph_assert(recovery_ops.contains(hoid)); @@ -341,6 +341,13 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete( ceph_assert(op.obc); op.recovery_info.size = op.obc->obs.oi.size; op.recovery_info.oi = op.obc->obs.oi; + + // We didn't know the size before, meaning the zero for decode calculations + // will be off. Recalculate them! + req.object_size = op.obc->obs.oi.size; + int r = read_pipeline.get_min_avail_to_read_shards( + op.hoid, true, false, req); + ceph_assert(r == 0); } } ceph_assert(op.xattrs.size()); @@ -352,7 +359,7 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete( sinfo.ro_size_to_read_mask(op.recovery_info.size, read_mask); ECUtil::shard_extent_set_t shard_want_to_read(sinfo.get_k_plus_m()); - for (auto &[shard, eset] : want_to_read) { + for (auto &[shard, eset] : req.shard_want_to_read) { /* Read buffers do not need recovering! */ if (buffers_read.contains(shard)) { continue; @@ -371,6 +378,11 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete( uint64_t aligned_size = ECUtil::align_next(op.obc->obs.oi.size); + dout(20) << __func__ << " before decode: oid=" << op.hoid << " EC_DEBUG_BUFFERS: " + << op.returned_data->debug_string(2048, 0) + << dendl; + + op.returned_data->add_zero_padding_for_decode(req.zeros_for_decode); int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size, get_parent()->get_dpp(), true); ceph_assert(r == 0); @@ -386,7 +398,7 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete( } dout(20) << __func__ << ": oid=" << op.hoid << dendl; - dout(20) << __func__ << "EC_DEBUG_BUFFERS: " + dout(20) << __func__ << " after decode: oid=" << op.hoid << " EC_DEBUG_BUFFERS: " << op.returned_data->debug_string(2048, 0) << dendl; @@ -442,7 +454,7 @@ struct RecoveryReadCompleter : ECCommon::ReadCompleter { hoid, std::move(res.buffers_read), res.attrs, - req.shard_want_to_read, + req, &rm); } @@ -1250,6 +1262,13 @@ void ECBackend::handle_sub_read_reply( rop.debug_log.emplace_back(ECUtil::ERROR, op.from, complete.buffers_read); complete.buffers_read.erase_shard(from.shard); complete.processed_read_requests.erase(from.shard); + // If we are doing redundant reads, then we must take care that any failed + // reads are not replaced with a zero buffer. When fast_reads are disabled, + // the send_all_remaining_reads() call will replace the zeros_for_decode + // based on the recovery read. + if (rop.do_redundant_reads) { + rop.to_read.at(hoid).zeros_for_decode.erase(from.shard); + } dout(20) << __func__ << " shard=" << from << " error=" << err << dendl; } @@ -1263,9 +1282,10 @@ void ECBackend::handle_sub_read_reply( rop.in_progress.erase(from); unsigned is_complete = 0; bool need_resend = false; + bool all_sub_reads_done = rop.in_progress.empty(); // For redundant reads check for completion as each shard comes in, // or in a non-recovery read check for completion once all the shards read. - if (rop.do_redundant_reads || rop.in_progress.empty()) { + if (rop.do_redundant_reads || all_sub_reads_done) { for (auto &&[oid, read_result]: rop.complete) { shard_id_set have; read_result.processed_read_requests.populate_shard_id_set(have); @@ -1275,29 +1295,30 @@ void ECBackend::handle_sub_read_reply( populate_shard_id_set(want_to_read); dout(20) << __func__ << " read_result: " << read_result << dendl; + // If all reads are done, we can safely assume that zero buffers can + // be applied. + if (all_sub_reads_done) { + rop.to_read.at(oid).zeros_for_decode.populate_shard_id_set(have); + } int err = ec_impl->minimum_to_decode(want_to_read, have, dummy_minimum, nullptr); if (err) { dout(20) << __func__ << " minimum_to_decode failed" << dendl; - if (rop.in_progress.empty()) { + if (all_sub_reads_done) { // If we don't have enough copies, try other pg_shard_ts if available. // During recovery there may be multiple osds with copies of the same shard, // so getting EIO from one may result in multiple passes through this code path. - if (!rop.do_redundant_reads) { - rop.debug_log.emplace_back(ECUtil::REQUEST_MISSING, op.from); - int r = read_pipeline.send_all_remaining_reads(oid, rop); - if (r == 0) { - // We found that new reads are required to do a decode. - need_resend = true; - continue; - } else if (r > 0) { - // No new reads were requested. This means that some parity - // shards can be assumed to be zeros. - err = 0; - } - // else insufficient shards are available, keep the errors. + + rop.debug_log.emplace_back(ECUtil::REQUEST_MISSING, op.from); + int r = read_pipeline.send_all_remaining_reads(oid, rop); + if (r == 0 && !rop.do_redundant_reads) { + // We found that new reads are required to do a decode. + need_resend = true; + continue; } + // else insufficient shards are available, keep the errors. + // Couldn't read any additional shards so handle as completed with errors // We don't want to confuse clients / RBD with objectstore error // values in particular ENOENT. We may have different error returns @@ -1341,6 +1362,17 @@ void ECBackend::handle_sub_read_reply( dout(20) << __func__ << " Complete: " << rop << dendl; rop.trace.event("ec read complete"); rop.debug_log.emplace_back(ECUtil::COMPLETE, op.from); + + /* If do_redundant_reads is set then there might be some in progress + * reads remaining. We need to make sure that these non-read shards + * do not get padded. If there was no in progress read, then the zero + * padding is allowed to stay. + */ + for (auto pg_shard : rop.in_progress) { + for (auto &&[oid, read] : rop.to_read) { + read.zeros_for_decode.erase(pg_shard.shard); + } + } read_pipeline.complete_read_op(std::move(rop)); } else { dout(10) << __func__ << " readop not complete: " << rop << dendl; diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index d85defc9e5987..1895f063098a0 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -329,7 +329,7 @@ class ECBackend : public ECCommon { ECUtil::shard_extent_map_t &&buffers_read, std::optional>> attrs, - const ECUtil::shard_extent_set_t &want_to_read, + read_request_t &req, RecoveryMessages *m); void handle_recovery_push( const PushOp &op, diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc index cdc7f1d783120..acdc801e7bbf7 100644 --- a/src/osd/ECCommon.cc +++ b/src/osd/ECCommon.cc @@ -252,12 +252,20 @@ int ECCommon::ReadPipeline::get_min_avail_to_read_shards( * redundant reads is set, then we want to have the same reads on * every extent. Otherwise, we need to read every shard only if the * necessary shard is missing. + * In some (recovery) scenarios, we can "want" a shard, but not "need" to + * read it. This typically happens when we do not have a data shard and the + * recovery for this will read enough shards to also generate all the parity. + * + * Since parity shards are often larger than data shards, we must make sure + * to read the extra bit! */ - if (!have.contains(shard) || do_redundant_reads) { + if (!have.contains(shard) || do_redundant_reads || + (want.contains(shard) && !need_set.contains(shard))) { extra_extents.union_of(extent_set); } } + read_request.zeros_for_decode.clear(); for (auto &shard: need_set) { if (!have.contains(shard)) { continue; @@ -279,6 +287,15 @@ int ECCommon::ReadPipeline::get_min_avail_to_read_shards( shard_read.extents.intersection_of(extents, read_mask.at(shard)); } + if (zero_mask.contains(shard)) { + extents.intersection_of(zero_mask.at(shard)); + + /* Any remaining extents can be assumed ot be zeros... so record these. */ + if (!extents.empty()) { + read_request.zeros_for_decode.emplace(shard, std::move(extents)); + } + } + if (!shard_read.extents.empty()) { read_request.shard_reads[shard_id] = std::move(shard_read); } @@ -376,7 +393,7 @@ int ECCommon::ReadPipeline::get_remaining_shards( } } - return read_request.shard_reads.empty()?1:0; + return 0; } void ECCommon::ReadPipeline::start_read_op( @@ -514,6 +531,7 @@ struct ClientReadCompleter final : ECCommon::ReadCompleter { << res.buffers_read.debug_string(2048, 0) << dendl; /* Decode any missing buffers */ + res.buffers_read.add_zero_padding_for_decode(req.zeros_for_decode); int r = res.buffers_read.decode(read_pipeline.ec_impl, req.shard_want_to_read, req.object_size, diff --git a/src/osd/ECCommon.h b/src/osd/ECCommon.h index a685a50291ecc..0deb98765ec5e 100644 --- a/src/osd/ECCommon.h +++ b/src/osd/ECCommon.h @@ -96,6 +96,7 @@ struct ECCommon { const std::list to_read; const uint32_t flags = 0; const ECUtil::shard_extent_set_t shard_want_to_read; + ECUtil::shard_extent_set_t zeros_for_decode; shard_id_map shard_reads; bool want_attrs = false; uint64_t object_size; @@ -107,6 +108,7 @@ struct ECCommon { to_read(to_read), flags(to_read.front().flags), shard_want_to_read(shard_want_to_read), + zeros_for_decode(shard_want_to_read.get_max_shards()), shard_reads(shard_want_to_read.get_max_shards()), want_attrs(want_attrs), object_size(object_size) {} @@ -114,6 +116,7 @@ struct ECCommon { read_request_t(const ECUtil::shard_extent_set_t &shard_want_to_read, bool want_attrs, uint64_t object_size) : shard_want_to_read(shard_want_to_read), + zeros_for_decode(shard_want_to_read.get_max_shards()), shard_reads(shard_want_to_read.get_max_shards()), want_attrs(want_attrs), object_size(object_size) {} @@ -124,6 +127,7 @@ struct ECCommon { os << "read_request_t(to_read=[" << to_read << "]" << ", flags=" << flags << ", shard_want_to_read=" << shard_want_to_read + << ", zeros_for_decode=" << zeros_for_decode << ", shard_reads=" << shard_reads << ", want_attrs=" << want_attrs << ")"; diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc index 911e314deaf80..399d915549a58 100644 --- a/src/osd/ECUtil.cc +++ b/src/osd/ECUtil.cc @@ -571,11 +571,7 @@ void shard_extent_map_t::pad_on_shards(const shard_extent_set_t &pad_to, if (!pad_to.contains(shard)) { continue; } - for (auto &[off, length] : pad_to.at(shard)) { - bufferlist bl; - bl.push_back(buffer::create_aligned(length, EC_ALIGN_SIZE)); - insert_in_shard(shard, off, bl); - } + pad_on_shard(pad_to.at(shard), shard); } } @@ -585,10 +581,14 @@ void shard_extent_map_t::pad_on_shard(const extent_set &pad_to, if (pad_to.size() == 0) { return; } + for (auto &[off, length] : pad_to) { bufferlist bl; - bl.push_back(buffer::create_aligned(length, EC_ALIGN_SIZE)); - insert_in_shard(shard, off, bl); + uint64_t start = align_prev(off); + uint64_t end = align_next(off + length); + + bl.push_back(buffer::create_aligned(end - start, EC_ALIGN_SIZE)); + insert_in_shard(shard, start, bl); } } @@ -599,11 +599,7 @@ void shard_extent_map_t::pad_on_shards(const extent_set &pad_to, return; } for (auto &shard : shards) { - for (auto &[off, length] : pad_to) { - bufferlist bl; - bl.push_back(buffer::create_aligned(length, EC_ALIGN_SIZE)); - insert_in_shard(shard, off, bl); - } + pad_on_shard(pad_to, shard); } } @@ -657,32 +653,44 @@ int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl, return 0; } - if (add_zero_padding_for_decode(object_size, need_set)) { - // We added some zero buffers, which means our have and need set may change - extent_maps.populate_bitset_set(have_set); - need_set = shard_id_set::difference(want_set, have_set); - } - shard_id_set decode_set = shard_id_set::intersection(need_set, sinfo->get_data_shards()); shard_id_set encode_set = shard_id_set::intersection(need_set, sinfo->get_parity_shards()); - shard_extent_set_t read_mask(sinfo->get_k_plus_m()); - sinfo->ro_size_to_read_mask(object_size, read_mask); + + if (!encode_set.empty()) { + shard_extent_set_t read_mask(sinfo->get_k_plus_m()); + sinfo->ro_size_to_read_mask(object_size, read_mask); + + /* The function has been asked to "decode" parity. To achieve this, we + * need all the data shards to be present... So first see if there are + * any missing... + */ + shard_id_set decode_for_parity_shards = shard_id_set::difference(sinfo->get_data_shards(), have_set); + decode_for_parity_shards = shard_id_set::intersection(decode_for_parity_shards, read_mask.get_shard_id_set()); + + if (!decode_for_parity_shards.empty()) { + /* So there are missing data shards which need decoding before we encode, + * We need to add these to the decode set and insert buffers for the + * decode to happen. + */ + decode_set.insert(decode_for_parity_shards); + need_set.insert(decode_for_parity_shards); + extent_set decode_for_parity = get_extent_superset(); + + for (auto shard : decode_for_parity_shards) { + extent_set parity_pad; + parity_pad.intersection_of(decode_for_parity, read_mask.at(shard)); + pad_on_shard(decode_for_parity, shard); + } + } + } + int r = 0; if (!decode_set.empty()) { pad_on_shards(want, decode_set); - /* If we are going to be encoding, we need to make sure all the necessary - * shards are decoded. The get_min_available functions should have already - * worked out what needs to be read for this. - */ - for (auto shard : encode_set) { - extent_set decode_for_parity; - decode_for_parity.intersection_of(want.at(shard), read_mask.at(shard)); - pad_on_shard(decode_for_parity, shard); - } r = _decode(ec_impl, want_set, decode_set, dpp); } if (!r && !encode_set.empty()) { - pad_on_shards(want, encode_set); + pad_on_shards(get_extent_superset(), sinfo->get_parity_shards()); r = encode(ec_impl, dpp, dedup_zeros?&need_set:nullptr); } @@ -705,14 +713,20 @@ int shard_extent_map_t::_decode(const ErasureCodeInterfaceRef &ec_impl, const shard_id_set &need_set, DoutPrefixProvider *dpp) { bool rebuild_req = false; + for (auto iter = begin_slice_iterator(need_set, dpp); !iter.is_end(); ++iter) { if (!iter.is_page_aligned()) { rebuild_req = true; break; } + shard_id_map &in = iter.get_in_bufferptrs(); shard_id_map &out = iter.get_out_bufferptrs(); + if (out.empty()) { + continue; + } + if (int ret = ec_impl->decode_chunks(want_set, in, out)) { return ret; } diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h index f31a0b9108388..0d02130f93c4f 100644 --- a/src/osd/ECUtil.h +++ b/src/osd/ECUtil.h @@ -1054,16 +1054,9 @@ public: } } - bool add_zero_padding_for_decode(uint64_t object_size, shard_id_set &exclude_set) { - shard_extent_set_t zeros(sinfo->get_k_plus_m()); - sinfo->ro_size_to_zero_mask(object_size, zeros); - extent_set superset = get_extent_superset(); + void add_zero_padding_for_decode(ECUtil::shard_extent_set_t &zeros) { bool changed = false; for (auto &&[shard, z] : zeros) { - if (exclude_set.contains(shard)) { - continue; - } - z.intersection_of(superset); for (auto [off, len] : z) { changed = true; bufferlist bl; @@ -1075,8 +1068,6 @@ public: if (changed) { compute_ro_range(); } - - return changed; } template requires is_interval_set_v diff --git a/src/test/osd/TestECBackend.cc b/src/test/osd/TestECBackend.cc index 1dd4faa6e681d..a554fe4c94e8e 100644 --- a/src/test/osd/TestECBackend.cc +++ b/src/test/osd/TestECBackend.cc @@ -158,12 +158,12 @@ public: // for each missing shard. for (auto a : available) { minimum_set.insert(a); - if (minimum_set.size() == data_chunk_count) { + if (std::cmp_equal(minimum_set.size(), data_chunk_count)) { break; } } - if (minimum_set.size() != data_chunk_count) { + if (std::cmp_not_equal(minimum_set.size(), data_chunk_count)) { minimum_set.clear(); return -EIO; // Cannot recover. } @@ -174,7 +174,6 @@ public: } return 0; } - [[deprecated]] int minimum_to_decode(const std::set &want_to_read, const std::set &available, @@ -1320,7 +1319,8 @@ void test_decode(unsigned int k, unsigned int m, uint64_t chunk_size, uint64_t o } } - ASSERT_EQ(0, semap.decode(ec_impl, want, object_size, nullptr, true)); + semap.add_zero_padding_for_decode(read_request.zeros_for_decode); + ASSERT_EQ(0, semap.decode(ec_impl, want, object_size)); } TEST(ECCommon, decode) { @@ -1476,4 +1476,4 @@ TEST(ECCommon, decode7) { acting_set.insert_range(shard_id_t(0), 3); test_decode(k, m, chunk_size, object_size, want, acting_set); -} \ No newline at end of file +} -- 2.39.5