From 52915db9507053c8291f94995a662b7f5292491c Mon Sep 17 00:00:00 2001 From: Alex Ainscow Date: Mon, 14 Jul 2025 16:55:40 +0100 Subject: [PATCH] osd: Optimised EC avoids ever reading more than K shards (if plugin supports it). Plugins which support partial reads should never need more than k shards to read the data, even if some shards have failed. However, rebalancing commonly requests k + m shards, as very frequently all shards are moved. If this occurs and all k + m shards are online, the read will be achieved by reading ALL shards rather than just reading k shards. This commit fixes that issue. The problem is that we don't want to change the API to the old EC, so we cannot update the plugin behaviour here. Instead, the EC code itself will reduce the number of shards it tells minimum_to_decode about. In a comment we note that bitset_set performance could be improved using _pdep_u64. This would require fiddly platform-specific code and would likely not show any performance improvements for most applications. The majority of the calls to this function will be with a bitset that has <=n set bits and will never enter this if statement. When there are >n bits set we are going to save one or more read I/Os; the cost of the for loop is insignificant vs this saving. I have left the comment in as a hint to future users of this function. Further notes were made in a review comment that are worth recording: - If performance is limited by the drives, then less read I/Os is a clear advantage. - If performance is limited by the network then less remote read I/Os is a clear advantage. - If performance is limited by the CPU then the CPU cost of M unnecessary remote read I/Os (messenger+bluestore) is almost certainly more than the cost of doing an extra encode operation to calculate the coding parities. - If performance is limited by system memory bandwidth the encode+crc generation has less overhead than the read+bluestore crc check+messenger overheads. 
Longer term this logic should probably be pushed into the plugins, in particular to give LRC the opportunity to optimize for locality of the shards. Reason for not doing this now is that it would be messy because the legacy EC code cannot support this optimization and LRC isn't yet optimizing for locality Signed-off-by: Alex Ainscow --- src/common/bitset_set.h | 23 ++++++++++++++ src/osd/ECCommon.cc | 17 +++++++++- src/test/common/test_bitset_set.cc | 51 ++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/src/common/bitset_set.h b/src/common/bitset_set.h index d6d02144959..9cde2270313 100644 --- a/src/common/bitset_set.h +++ b/src/common/bitset_set.h @@ -283,6 +283,29 @@ class bitset_set { return end(); } + /** @return a const_iterator to the nth key or end if it does not exist. + * + * This is called "find_nth" rather than overloading find, as it's clearer + * what it is doing; find(4) may imply "find(Key(4))" + */ + const_iterator find_nth(unsigned int n) const { + for (size_t i = 0; i < word_count; ++i) { + unsigned int bits_set = std::popcount(words[i]); + if (bits_set > n) { + uint64_t tmp = words[i]; + // This could be optimised with BMI _pdep_u64 + for (unsigned int j = 0; j < n; ++j) { + // This clears the least significant bit that is set to 1. + tmp &= tmp - 1; + } + return const_iterator(this, + std::countr_zero(tmp) + i * bits_per_uint64_t); + } + n -= bits_set; + } + return end(); + } + /** @return number of keys in the container. O(1) complexity on most * modern CPUs. 
*/ diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc index 1987d51adee..b98f5cb2b84 100644 --- a/src/osd/ECCommon.cc +++ b/src/osd/ECCommon.cc @@ -220,8 +220,23 @@ int ECCommon::ReadPipeline::get_min_avail_to_read_shards( read_request.shard_want_to_read.populate_shard_id_set(want); - int r = ec_impl->minimum_to_decode(want, have, need_set, + int r = 0; + auto kth_iter = want.find_nth(sinfo.get_k()); + if (kth_iter != want.end()) { + // If we support partial reads, we are making the assumption that only + // K shards need to be read to recover data. We opt here for minimising + // the number of reads over minimising the amount of parity calculations + // that are needed. + shard_id_set want_for_plugin = want; + shard_id_t kth = *kth_iter; + want_for_plugin.erase_range(kth, sinfo.get_k_plus_m() - (int)kth); + r = ec_impl->minimum_to_decode(want_for_plugin, have, need_set, need_sub_chunks.get()); + } else { + r = ec_impl->minimum_to_decode(want, have, need_set, + need_sub_chunks.get()); + } + if (r < 0) { dout(20) << "minimum_to_decode_failed r: " << r << "want: " << want << " have: " << have << " need: " << need_set << dendl; diff --git a/src/test/common/test_bitset_set.cc b/src/test/common/test_bitset_set.cc index 7c4ac61b2fa..b98f0683059 100644 --- a/src/test/common/test_bitset_set.cc +++ b/src/test/common/test_bitset_set.cc @@ -211,3 +211,54 @@ TEST(bitset_set, fmt_formatting) { oss << bitset; EXPECT_EQ(using_fmt, oss.str()); } + +TEST(bitset_set, find_nth) { + constexpr size_t range = 128; + bitset_set bitset; + + ASSERT_EQ(bitset.end(), bitset.find_nth(0) ); + ASSERT_EQ(bitset.end(), bitset.find_nth(1) ); + ASSERT_EQ(bitset.end(), bitset.find_nth(range) ); + + bitset.insert(0); + ASSERT_EQ(Key(0), *bitset.find_nth(0) ); + ASSERT_EQ(bitset.end(), bitset.find_nth(1) ); + ASSERT_EQ(bitset.end(), bitset.find_nth(range) ); + + // Single bit set + for (unsigned int i = 0; i < range; i++) { + bitset.clear(); + bitset.insert(i); + ASSERT_EQ(Key(i), 
*bitset.find_nth(0) ); + ASSERT_EQ(bitset.end(), bitset.find_nth(1) ); + ASSERT_EQ(bitset.end(), bitset.find_nth(range) ); + } + + /* Alt bits set */ + bitset.clear(); + for (unsigned int i = 0; i < range; i += 2) { + bitset.insert(i); + } + for (unsigned int i = 0; i < range / 2; i++) { + ASSERT_EQ(Key(i * 2), *bitset.find_nth(i) ); + } + ASSERT_EQ(bitset.end(), bitset.find_nth(range / 2) ); + + /* Other alt bits set */ + bitset.clear(); + for (unsigned int i = 1; i < range; i += 2) { + bitset.insert(i); + } + for (unsigned int i = 0; i < range / 2; i++) { + ASSERT_EQ(Key(i * 2 + 1), *bitset.find_nth(i) ); + } + ASSERT_EQ(bitset.end(), bitset.find_nth(range / 2) ); + + /* All bits set */ + bitset.clear(); + bitset.insert_range(Key(0), range); + for (unsigned int i = 0; i < range; i++) { + ASSERT_EQ(Key(i), *bitset.find_nth(i) ); + } + ASSERT_EQ(bitset.end(), bitset.find_nth(range) ); +} -- 2.39.5