]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/os/seastore: adjust and fix rebalance logic in lba/backref/omap tree
authorXuehan Xu <xuxuehan@qianxin.com>
Thu, 15 May 2025 09:00:17 +0000 (17:00 +0800)
committerMatan Breizman <mbreizma@redhat.com>
Thu, 22 May 2025 12:09:54 +0000 (15:09 +0300)
* For fixed_kv_btree, drop `prefer_left`.
* For omap btree, tolerate underflow nodes if rebalance has the same results.

Fixes: https://tracker.ceph.com/issues/71307
Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
Signed-off-by: Yingxin Cheng <yingxin.cheng@intel.com>
(cherry picked from commit c90d7aeb30ba4632f195c1ec01e505d155cb67f2)

src/crimson/common/fixed_kv_node_layout.h
src/crimson/os/seastore/backref/backref_tree_node.h
src/crimson/os/seastore/btree/fixed_kv_btree.h
src/crimson/os/seastore/btree/fixed_kv_node.h
src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
src/crimson/os/seastore/linked_tree_node.h
src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
src/test/crimson/test_fixed_kv_node_layout.cc

index cdba35520d613f8c2e67534d732627ec070b907f..1768f530cda850282e12e42b3f82fbae9e7a225d 100644 (file)
@@ -560,14 +560,14 @@ public:
 
   static uint32_t get_balance_pivot_idx(
     const FixedKVNodeLayout &left,
-    const FixedKVNodeLayout &right,
-    bool prefer_left)
+    const FixedKVNodeLayout &right)
   {
-    auto total = left.get_size() + right.get_size();
+    auto l_size = left.get_size();
+    auto r_size = right.get_size();
+    auto total = l_size + r_size;
     auto pivot_idx = total / 2;
-    if (total % 2 && prefer_left) {
-      pivot_idx++;
-    }
+    assert(pivot_idx > std::min(l_size, r_size)
+      && pivot_idx < std::max(l_size, r_size));
     return pivot_idx;
   }
 
@@ -575,19 +575,18 @@ public:
    * balance_into_new_nodes
    *
    * Takes the contents of left and right and copies them into
-   * replacement_left and replacement_right such that in the
-   * event that the number of elements is odd the extra goes to
-   * the left side iff prefer_left.
+   * replacement_left and replacement_right such that the size
+   * of both are balanced.
    */
   static K balance_into_new_nodes(
     const FixedKVNodeLayout &left,
     const FixedKVNodeLayout &right,
-    bool prefer_left,
+    uint32_t pivot_idx,
     FixedKVNodeLayout &replacement_left,
     FixedKVNodeLayout &replacement_right)
   {
+    assert(pivot_idx != left.get_size() && pivot_idx != right.get_size());
     auto total = left.get_size() + right.get_size();
-    auto pivot_idx = get_balance_pivot_idx(left, right, prefer_left);
     auto replacement_pivot = pivot_idx >= left.get_size() ?
       right.iter_idx(pivot_idx - left.get_size())->get_key() :
       left.iter_idx(pivot_idx)->get_key();
index e31ac24acb5a8be9b716a80e00a4c2c459aa2f29..d8e104ee44401572f4585781662b7312b2e4ce38 100644 (file)
@@ -173,7 +173,7 @@ public:
     Transaction &t,
     BackrefLeafNode &left,
     BackrefLeafNode &right,
-    bool prefer_left,
+    uint32_t pivot_idx,
     BackrefLeafNode &replacement_left,
     BackrefLeafNode &replacement_right) final {}
 
@@ -191,7 +191,7 @@ public:
     Transaction &t,
     BackrefLeafNode &left,
     BackrefLeafNode &right,
-    bool prefer_left,
+    uint32_t pivot_idx,
     BackrefLeafNode &replacement_left,
     BackrefLeafNode &replacement_right) final {}
   // backref leaf nodes don't have to resolve relative addresses
index bb7ae563b0c1b13e916747b627f0d8752985997c..d9a114af5f66bc1a19e9f76747bc0fb558683b80 100644 (file)
@@ -2064,12 +2064,13 @@ private:
         c.cache.retire_extent(c.trans, r);
         get_tree_stats<self_type>(c.trans).extents_num_delta--;
       } else {
+        auto pivot_idx = l->get_balance_pivot_idx(*l, *r);
         LOG_PREFIX(FixedKVBtree::merge_level);
         auto [replacement_l, replacement_r, pivot] =
           l->make_balanced(
             c,
             r,
-            !donor_is_left);
+            pivot_idx);
 
         parent_pos.node->update(
           liter,
index 12cdd340d618a1921e1c8287bf751e419139ae01..8ee42895276d2e3781533442e0058e9cb7fb076c 100644 (file)
@@ -341,7 +341,7 @@ struct FixedKVInternalNode
   make_balanced(
     op_context_t<NODE_KEY> c,
     Ref &_right,
-    bool prefer_left) {
+    uint32_t pivot_idx) {
     ceph_assert(_right->get_type() == this->get_type());
     auto &right = *_right->template cast<node_type_t>();
     auto replacement_left = c.cache.template alloc_new_non_data_extent<node_type_t>(
@@ -353,13 +353,13 @@ struct FixedKVInternalNode
       c.trans,
       static_cast<node_type_t&>(*this),
       right,
-      prefer_left,
+      pivot_idx,
       *replacement_left,
       *replacement_right);
     auto pivot = this->balance_into_new_nodes(
       *this,
       right,
-      prefer_left,
+      pivot_idx,
       *replacement_left,
       *replacement_right);
     replacement_left->range = replacement_left->get_meta();
@@ -368,7 +368,7 @@ struct FixedKVInternalNode
       c.trans,
       static_cast<node_type_t&>(*this),
       right,
-      prefer_left,
+      pivot_idx,
       *replacement_left,
       *replacement_right);
     return std::make_tuple(
@@ -688,14 +688,14 @@ struct FixedKVLeafNode
     Transaction &t,
     node_type_t &left,
     node_type_t &right,
-    bool prefer_left,
+    uint32_t pivot_idx,
     node_type_t &replacement_left,
     node_type_t &replacement_right) = 0;
   virtual void adjust_copy_src_dest_on_balance(
     Transaction &t,
     node_type_t &left,
     node_type_t &right,
-    bool prefer_left,
+    uint32_t pivot_idx,
     node_type_t &replacement_left,
     node_type_t &replacement_right) = 0;
 
@@ -703,7 +703,7 @@ struct FixedKVLeafNode
   make_balanced(
     op_context_t<NODE_KEY> c,
     Ref &_right,
-    bool prefer_left) {
+    uint32_t pivot_idx) {
     ceph_assert(_right->get_type() == this->get_type());
     auto &right = *_right->template cast<node_type_t>();
     auto replacement_left = c.cache.template alloc_new_non_data_extent<node_type_t>(
@@ -715,13 +715,13 @@ struct FixedKVLeafNode
       c.trans,
       static_cast<node_type_t&>(*this),
       right,
-      prefer_left,
+      pivot_idx,
       *replacement_left,
       *replacement_right);
     auto pivot = this->balance_into_new_nodes(
       *this,
       right,
-      prefer_left,
+      pivot_idx,
       *replacement_left,
       *replacement_right);
     replacement_left->range = replacement_left->get_meta();
@@ -730,7 +730,7 @@ struct FixedKVLeafNode
       c.trans,
       static_cast<node_type_t&>(*this),
       right,
-      prefer_left,
+      pivot_idx,
       *replacement_left,
       *replacement_right);
     return std::make_tuple(
index 9a66169ae2b48408a78960b3aa5cbff3dbe90667..ef363d44e6cd4b206f938a39639d36242c6a258a 100644 (file)
@@ -304,21 +304,21 @@ struct LBALeafNode
     Transaction &t,
     LBALeafNode &left,
     LBALeafNode &right,
-    bool prefer_left,
+    uint32_t pivot_idx,
     LBALeafNode &replacement_left,
     LBALeafNode &replacement_right) final {
     this->balance_child_ptrs(
-      t, left, right, prefer_left, replacement_left, replacement_right);
+      t, left, right, pivot_idx, replacement_left, replacement_right);
   }
   void adjust_copy_src_dest_on_balance(
     Transaction &t,
     LBALeafNode &left,
     LBALeafNode &right,
-    bool prefer_left,
+    uint32_t pivot_idx,
     LBALeafNode &replacement_left,
     LBALeafNode &replacement_right) final {
     this->parent_node_t::adjust_copy_src_dest_on_balance(
-      t, left, right, prefer_left, replacement_left, replacement_right);
+      t, left, right, pivot_idx, replacement_left, replacement_right);
   }
 
   CachedExtentRef duplicate_for_write(Transaction&) final {
index d420ac2ffe8b404db671bd4c4ac21c66b22fa6be..154353776a2566b5588916390d5040ff8f138705 100644 (file)
@@ -632,14 +632,14 @@ protected:
     Transaction &t,
     T &left,
     T &right,
-    bool prefer_left,
+    uint32_t pivot_idx,
     T &replacement_left,
     T &replacement_right)
   {
     size_t l_size = left.get_size();
     size_t r_size = right.get_size();
-    size_t pivot_idx = T::get_balance_pivot_idx(left, right, prefer_left);
 
+    ceph_assert(pivot_idx != l_size && pivot_idx != r_size);
     replacement_left.maybe_expand_children(pivot_idx);
     replacement_right.maybe_expand_children(r_size + l_size - pivot_idx);
 
@@ -678,17 +678,11 @@ protected:
     Transaction &t,
     T &left,
     T &right,
-    bool prefer_left,
+    uint32_t pivot_idx,
     T &replacement_left,
     T &replacement_right)
   {
     size_t l_size = left.get_size();
-    size_t r_size = right.get_size();
-    size_t total = l_size + r_size;
-    size_t pivot_idx = (l_size + r_size) / 2;
-    if (total % 2 && prefer_left) {
-      pivot_idx++;
-    }
 
     if (left.is_initial_pending()) {
       for (auto &cs : left.copy_sources) {
index 9a2ef3795749c8eef9c714a9b36b13bd81f1e50a..6a455fc882b843b2f37eb8b184d20f76e484da45 100644 (file)
@@ -101,7 +101,7 @@ struct OMapNode : LogicalChildNode {
 
   using make_balanced_iertr = base_iertr;
   using make_balanced_ret = make_balanced_iertr::future
-          <std::tuple<OMapNodeRef, OMapNodeRef, std::string>>;
+          <std::tuple<OMapNodeRef, OMapNodeRef, std::optional<std::string>>>;
   virtual make_balanced_ret make_balanced(
     omap_context_t oc,
     OMapNodeRef _right) = 0;
index 057752eb6aa0eccc54c4b8da2ef273dd6f66c830..9ed808f551841b40f0fcf02dade7aaeb9b37ccd5 100644 (file)
@@ -394,18 +394,24 @@ OMapInnerNode::make_balanced(omap_context_t oc, OMapNodeRef _right)
   LOG_PREFIX(OMapInnerNode::make_balanced);
   DEBUGT("l: {}, r: {}", oc.t, *this, *_right);
   ceph_assert(_right->get_type() == TYPE);
+  auto &right = *_right->cast<OMapInnerNode>();
+  auto pivot_idx = get_balance_pivot_idx(*this, right);
+  if (!pivot_idx) {
+    return make_balanced_ret(
+      interruptible::ready_future_marker{},
+      std::make_tuple(OMapNodeRef{}, OMapNodeRef{}, std::nullopt));
+  }
   return oc.tm.alloc_extents<OMapInnerNode>(oc.t, oc.hint,
     OMAP_INNER_BLOCK_SIZE, 2)
-    .si_then([this, _right, oc] (auto &&replacement_pair){
+    .si_then([this, &right, pivot_idx, oc] (auto &&replacement_pair){
       auto replacement_left = replacement_pair.front();
       auto replacement_right = replacement_pair.back();
-      auto &right = *_right->cast<OMapInnerNode>();
-      this->balance_child_ptrs(oc.t, *this, right, true,
+      this->balance_child_ptrs(oc.t, *this, right, *pivot_idx,
                               *replacement_left, *replacement_right);
       return make_balanced_ret(
              interruptible::ready_future_marker{},
              std::make_tuple(replacement_left, replacement_right,
-                             balance_into_new_nodes(*this, right,
+                             balance_into_new_nodes(*this, right, *pivot_idx,
                                *replacement_left, *replacement_right)));
   }).handle_error_interruptible(
     crimson::ct_error::enospc::assert_failure{"unexpected enospc"},
@@ -490,8 +496,14 @@ OMapInnerNode::merge_entry(
       ).si_then([liter=liter, riter=riter, l=l, r=r, oc, this](auto tuple) {
        LOG_PREFIX(OMapInnerNode::merge_entry);
         auto [replacement_l, replacement_r, replacement_pivot] = tuple;
-       replacement_l->init_range(l->get_begin(), replacement_pivot);
-       replacement_r->init_range(replacement_pivot, r->get_end());
+       if (!replacement_pivot) {
+         return merge_entry_ret(
+                interruptible::ready_future_marker{},
+                mutation_result_t(mutation_status_t::SUCCESS,
+                  std::nullopt, std::nullopt));
+       }
+       replacement_l->init_range(l->get_begin(), *replacement_pivot);
+       replacement_r->init_range(*replacement_pivot, r->get_end());
        DEBUGT("to update parent: {} {} {}",
          oc.t, *this, *replacement_l, *replacement_r);
        if (get_meta().depth > 2) { // l and r are inner nodes
@@ -511,7 +523,7 @@ OMapInnerNode::merge_entry(
          liter,
          replacement_l->get_laddr(),
          maybe_get_delta_buffer());
-        bool overflow = extent_will_overflow(replacement_pivot.size(),
+        bool overflow = extent_will_overflow(replacement_pivot->size(),
          std::nullopt);
         if (!overflow) {
          this->update_child_ptr(
@@ -521,7 +533,7 @@ OMapInnerNode::merge_entry(
           journal_inner_insert(
            riter,
            replacement_r->get_laddr(),
-           replacement_pivot,
+           *replacement_pivot,
            maybe_get_delta_buffer());
           std::vector<laddr_t> dec_laddrs{l->get_laddr(), r->get_laddr()};
           return dec_ref(oc, dec_laddrs
@@ -539,7 +551,7 @@ OMapInnerNode::merge_entry(
          this->remove_child_ptr(riter.get_offset());
           journal_inner_remove(riter, maybe_get_delta_buffer());
           return make_split_insert(
-           oc, riter, replacement_pivot, replacement_r
+           oc, riter, *replacement_pivot, replacement_r
          ).si_then([this, oc, l = l, r = r](auto mresult) {
            std::vector<laddr_t> dec_laddrs{
              l->get_laddr(),
index 10f070479dd5fde99c19f7943bef9a87b1b27d87..7e6df63d257ac5a2b43d0f5bd5c865acbde46139 100644 (file)
@@ -738,11 +738,9 @@ public:
     set_meta(omap_node_meta_t::merge_from(left.get_meta(), right.get_meta()));
   }
 
-  static uint32_t get_balance_pivot_idx(
+  static std::optional<uint32_t> get_balance_pivot_idx(
     const StringKVInnerNodeLayout &left,
-    const StringKVInnerNodeLayout &right,
-    // this is just for the consistency with linked tree nodes
-    bool prefer_left = false)
+    const StringKVInnerNodeLayout &right)
   {
     uint32_t left_size = omap_inner_key_t(
       left.get_node_key_ptr()[left.get_size()-1]).key_off;
@@ -773,7 +771,9 @@ public:
         }
       }
     }
-    return pivot_idx;
+    return pivot_idx == left.get_size()
+      ? std::nullopt
+      : std::make_optional<uint32_t>(pivot_idx);
   }
 
   /**
@@ -786,14 +786,15 @@ public:
   static std::string balance_into_new_nodes(
     const StringKVInnerNodeLayout &left,
     const StringKVInnerNodeLayout &right,
+    uint32_t pivot_idx,
     StringKVInnerNodeLayout &replacement_left,
     StringKVInnerNodeLayout &replacement_right)
   {
+    ceph_assert(!(left.below_min() && right.below_min()));
     uint32_t left_size = omap_inner_key_t(left.get_node_key_ptr()[left.get_size()-1]).key_off;
     uint32_t right_size = omap_inner_key_t(right.get_node_key_ptr()[right.get_size()-1]).key_off;
     uint32_t total = left_size + right_size;
     uint32_t pivot_size = total / 2;
-    uint32_t pivot_idx = get_balance_pivot_idx(left, right);
 
     auto replacement_pivot = pivot_idx >= left.get_size() ?
       right.iter_idx(pivot_idx - left.get_size())->get_key() :
index 9b6d19661ac1ed2fd7f7b67610d3c58a620981bf..62f9ea6086b0aaecc4e7bba76e313d98563b56e2 100644 (file)
@@ -245,7 +245,7 @@ TEST(FixedKVNodeTest, merge) {
   ASSERT_EQ(num, total);
 }
 
-void run_balance_test(unsigned left, unsigned right, bool prefer_left)
+void run_balance_test(unsigned left, unsigned right)
 {
   auto node = TestNode();
   auto node2 = TestNode();
@@ -276,10 +276,11 @@ void run_balance_test(unsigned left, unsigned right, bool prefer_left)
 
   auto node_balanced = TestNode();
   auto node_balanced2 = TestNode();
+  auto pivot_idx = TestNode::get_balance_pivot_idx(node, node2);
   auto pivot = TestNode::balance_into_new_nodes(
     node,
     node2,
-    prefer_left,
+    pivot_idx,
     node_balanced,
     node_balanced2);
 
@@ -287,13 +288,8 @@ void run_balance_test(unsigned left, unsigned right, bool prefer_left)
 
   unsigned left_size, right_size;
   if (total % 2) {
-    if (prefer_left) {
-      left_size = (total/2) + 1;
-      right_size = total/2;
-    } else {
       left_size = total/2;
       right_size = (total/2) + 1;
-    }
   } else {
     left_size = right_size = total/2;
   }
@@ -332,13 +328,11 @@ void run_balance_test(unsigned left, unsigned right, bool prefer_left)
 }
 
 TEST(FixedKVNodeTest, balanced) {
-  run_balance_test(CAPACITY / 2, CAPACITY, true);
-  run_balance_test(CAPACITY / 2, CAPACITY, false);
-  run_balance_test(CAPACITY, CAPACITY / 2, true);
-  run_balance_test(CAPACITY, CAPACITY / 2, false);
-  run_balance_test(CAPACITY - 1, CAPACITY / 2, true);
-  run_balance_test(CAPACITY / 2, CAPACITY - 1, false);
-  run_balance_test(CAPACITY / 2, CAPACITY / 2, false);
+  run_balance_test(CAPACITY / 2, CAPACITY);
+  run_balance_test(CAPACITY, CAPACITY / 2);
+  run_balance_test(CAPACITY - 1, CAPACITY / 2);
+  run_balance_test(CAPACITY / 2, CAPACITY - 1);
+  run_balance_test(CAPACITY / 2, CAPACITY / 2 + 2);
 }
 
 void run_replay_test(