osd: Optimised EC avoids ever reading more than K shards (if plugin supports it).

author Alex Ainscow <aainscow@uk.ibm.com>

Mon, 14 Jul 2025 15:55:40 +0000 (16:55 +0100)

committer Alex Ainscow <aainscow@uk.ibm.com>

Fri, 1 Aug 2025 08:13:12 +0000 (09:13 +0100)
author Alex Ainscow <aainscow@uk.ibm.com>
Mon, 14 Jul 2025 15:55:40 +0000 (16:55 +0100)
committer Alex Ainscow <aainscow@uk.ibm.com>
Fri, 1 Aug 2025 08:13:12 +0000 (09:13 +0100)
diff --git a/src/common/bitset_set.h b/src/common/bitset_set.h

index d6d021449598383669f46c0ae4e476890017c839..9cde22703135c66497d254939255a62af620d7fa 100644 (file)
--- a/src/common/bitset_set.h
+++ b/src/common/bitset_set.h
@@ -283,6 +283,29 @@ class bitset_set {
      return end();
    }
  
+  /** @return a const_iterator to the nth key (zero-based), or end() if absent.
+   *
+   * This is called "find_nth" rather than overloading find, as it is clearer
+   * what it is doing: find(4) may imply "find(Key(4))".
+   */
+  const_iterator find_nth(unsigned int n) const {
+    for (size_t i = 0; i < word_count; ++i) {
+      unsigned int bits_set = std::popcount(words[i]);
+      if (bits_set > n) {  // The nth remaining key lives in this word.
+        uint64_t tmp = words[i];
+        // This could be optimised with the BMI2 _pdep_u64 intrinsic.
+        for (unsigned int j = 0; j < n; ++j) {
+          // This clears the least significant bit that is set to 1.
+          tmp &= tmp - 1;
+        }
+        return const_iterator(this,
+          std::countr_zero(tmp) + i * bits_per_uint64_t);
+      }
+      n -= bits_set;  // Skip this word's keys; n is now relative to the next word.
+    }
+    return end();
+  }
+
    /** @return number of keys in the container. O(1) complexity on most
     * modern CPUs.
     */
diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc

index 1987d51adeeb54404077dfb51866c9f72c7f79f2..b98f5cb2b848033a129ed75dacd6435133b969d4 100644 (file)
--- a/src/osd/ECCommon.cc
+++ b/src/osd/ECCommon.cc
@@ -220,8 +220,23 @@ int ECCommon::ReadPipeline::get_min_avail_to_read_shards(
  
    read_request.shard_want_to_read.populate_shard_id_set(want);
  
-  int r = ec_impl->minimum_to_decode(want, have, need_set,
+  int r = 0;
+  auto kth_iter = want.find_nth(sinfo.get_k());
+  if (kth_iter != want.end()) {
+    // If we support partial reads, we are making the assumption that only
+    // K shards need to be read to recover data.  We opt here for minimising
+    // the number of reads over minimising the amount of parity calculations
+    // that are needed.
+    shard_id_set want_for_plugin = want;
+    shard_id_t kth = *kth_iter;
+    want_for_plugin.erase_range(kth, sinfo.get_k_plus_m() - (int)kth);
+    r = ec_impl->minimum_to_decode(want_for_plugin, have, need_set,
                                       need_sub_chunks.get());
+  } else {
+    r = ec_impl->minimum_to_decode(want, have, need_set,
+                                     need_sub_chunks.get());
+  }
+
    if (r < 0) {
      dout(20) << "minimum_to_decode_failed r: " << r << "want: " << want
        << " have: " << have << " need: " << need_set << dendl;
diff --git a/src/test/common/test_bitset_set.cc b/src/test/common/test_bitset_set.cc

index 7c4ac61b2fa38039de27fe286b20295c48d5c494..b98f068305982065ed92507fc9c5f7972771a7ac 100644 (file)
--- a/src/test/common/test_bitset_set.cc
+++ b/src/test/common/test_bitset_set.cc
@@ -211,3 +211,54 @@ TEST(bitset_set, fmt_formatting) {
    oss << bitset;
    EXPECT_EQ(using_fmt, oss.str());
  }
+
+TEST(bitset_set, find_nth) {
+  constexpr size_t range = 128;
+  bitset_set<range, Key> bitset;
+  // Empty set: no index resolves to a key.
+  ASSERT_EQ(bitset.end(), bitset.find_nth(0) );
+  ASSERT_EQ(bitset.end(), bitset.find_nth(1) );
+  ASSERT_EQ(bitset.end(), bitset.find_nth(range) );
+  // Only key 0 present: index 0 resolves, any higher index yields end().
+  bitset.insert(0);
+  ASSERT_EQ(Key(0), *bitset.find_nth(0) );
+  ASSERT_EQ(bitset.end(), bitset.find_nth(1) );
+  ASSERT_EQ(bitset.end(), bitset.find_nth(range) );
+
+  // Single bit set, at each possible position in turn
+  for (unsigned int i = 0; i < range; i++) {
+    bitset.clear();
+    bitset.insert(i);
+    ASSERT_EQ(Key(i), *bitset.find_nth(0) );
+    ASSERT_EQ(bitset.end(), bitset.find_nth(1) );
+    ASSERT_EQ(bitset.end(), bitset.find_nth(range) );
+  }
+
+  /* Alternate (even) bits set: the nth key is 2n */
+  bitset.clear();
+  for (unsigned int i = 0; i < range; i += 2) {
+    bitset.insert(i);
+  }
+  for (unsigned int i = 0; i < range / 2; i++) {
+    ASSERT_EQ(Key(i * 2), *bitset.find_nth(i) );
+  }
+  ASSERT_EQ(bitset.end(), bitset.find_nth(range / 2) );
+
+  /* Alternate (odd) bits set: the nth key is 2n + 1 */
+  bitset.clear();
+  for (unsigned int i = 1; i < range; i += 2) {
+    bitset.insert(i);
+  }
+  for (unsigned int i = 0; i < range / 2; i++) {
+    ASSERT_EQ(Key(i * 2 + 1), *bitset.find_nth(i) );
+  }
+  ASSERT_EQ(bitset.end(), bitset.find_nth(range / 2) );
+
+  /* All bits set: the nth key is n */
+  bitset.clear();
+  bitset.insert_range(Key(0), range);
+  for (unsigned int i = 0; i < range; i++) {
+    ASSERT_EQ(Key(i), *bitset.find_nth(i) );
+  }
+  ASSERT_EQ(bitset.end(), bitset.find_nth(range) );
+}
+\ No newline at end of file
author	Alex Ainscow <aainscow@uk.ibm.com>
	Mon, 14 Jul 2025 15:55:40 +0000 (16:55 +0100)
committer	Alex Ainscow <aainscow@uk.ibm.com>
	Fri, 1 Aug 2025 08:13:12 +0000 (09:13 +0100)
src/common/bitset_set.h		patch \| blob \| history
src/osd/ECCommon.cc		patch \| blob \| history
src/test/common/test_bitset_set.cc		patch \| blob \| history