From: Xuehan Xu Date: Thu, 15 May 2025 09:00:17 +0000 (+0800) Subject: crimson/os/seastore: adjust and fix rebalance logic in lba/backref/omap tree X-Git-Tag: testing/wip-vshankar-testing-20250521.043110-debug~10^2~2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=c90d7aeb30ba4632f195c1ec01e505d155cb67f2;p=ceph-ci.git crimson/os/seastore: adjust and fix rebalance logic in lba/backref/omap tree * For fixed_kv_btree, drop `prefer_left`. * For omap btree, tolerate underflow nodes if rebalance has the same results. Fixes: https://tracker.ceph.com/issues/71307 Signed-off-by: Xuehan Xu Signed-off-by: Yingxin Cheng --- diff --git a/src/crimson/common/fixed_kv_node_layout.h b/src/crimson/common/fixed_kv_node_layout.h index cdba35520d6..1768f530cda 100644 --- a/src/crimson/common/fixed_kv_node_layout.h +++ b/src/crimson/common/fixed_kv_node_layout.h @@ -560,14 +560,14 @@ public: static uint32_t get_balance_pivot_idx( const FixedKVNodeLayout &left, - const FixedKVNodeLayout &right, - bool prefer_left) + const FixedKVNodeLayout &right) { - auto total = left.get_size() + right.get_size(); + auto l_size = left.get_size(); + auto r_size = right.get_size(); + auto total = l_size + r_size; auto pivot_idx = total / 2; - if (total % 2 && prefer_left) { - pivot_idx++; - } + assert(pivot_idx > std::min(l_size, r_size) + && pivot_idx < std::max(l_size, r_size)); return pivot_idx; } @@ -575,19 +575,18 @@ public: * balance_into_new_nodes * * Takes the contents of left and right and copies them into - * replacement_left and replacement_right such that in the - * event that the number of elements is odd the extra goes to - * the left side iff prefer_left. + * replacement_left and replacement_right such that the size + * of both are balanced. */ static K balance_into_new_nodes( const FixedKVNodeLayout &left, const FixedKVNodeLayout &right, - bool prefer_left, + uint32_t pivot_idx, FixedKVNodeLayout &replacement_left, FixedKVNodeLayout &replacement_right) { + assert(pivot_idx != left.get_size() && pivot_idx != right.get_size()); auto total = left.get_size() + right.get_size(); - auto pivot_idx = get_balance_pivot_idx(left, right, prefer_left); auto replacement_pivot = pivot_idx >= left.get_size() ? right.iter_idx(pivot_idx - left.get_size())->get_key() : left.iter_idx(pivot_idx)->get_key(); diff --git a/src/crimson/os/seastore/backref/backref_tree_node.h b/src/crimson/os/seastore/backref/backref_tree_node.h index e31ac24acb5..d8e104ee444 100644 --- a/src/crimson/os/seastore/backref/backref_tree_node.h +++ b/src/crimson/os/seastore/backref/backref_tree_node.h @@ -173,7 +173,7 @@ public: Transaction &t, BackrefLeafNode &left, BackrefLeafNode &right, - bool prefer_left, + uint32_t pivot_idx, BackrefLeafNode &replacement_left, BackrefLeafNode &replacement_right) final {} @@ -191,7 +191,7 @@ public: Transaction &t, BackrefLeafNode &left, BackrefLeafNode &right, - bool prefer_left, + uint32_t pivot_idx, BackrefLeafNode &replacement_left, BackrefLeafNode &replacement_right) final {} // backref leaf nodes don't have to resolve relative addresses diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index bb7ae563b0c..d9a114af5f6 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -2064,12 +2064,13 @@ private: c.cache.retire_extent(c.trans, r); get_tree_stats(c.trans).extents_num_delta--; } else { + auto pivot_idx = l->get_balance_pivot_idx(*l, *r); LOG_PREFIX(FixedKVBtree::merge_level); auto [replacement_l, replacement_r, pivot] = l->make_balanced( c, r, - !donor_is_left); + pivot_idx); parent_pos.node->update( liter, diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index 12cdd340d61..8ee42895276 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -341,7 +341,7 @@ struct FixedKVInternalNode make_balanced( op_context_t c, Ref &_right, - bool prefer_left) { + uint32_t pivot_idx) { ceph_assert(_right->get_type() == this->get_type()); auto &right = *_right->template cast(); auto replacement_left = c.cache.template alloc_new_non_data_extent( @@ -353,13 +353,13 @@ struct FixedKVInternalNode c.trans, static_cast(*this), right, - prefer_left, + pivot_idx, *replacement_left, *replacement_right); auto pivot = this->balance_into_new_nodes( *this, right, - prefer_left, + pivot_idx, *replacement_left, *replacement_right); replacement_left->range = replacement_left->get_meta(); @@ -368,7 +368,7 @@ struct FixedKVInternalNode c.trans, static_cast(*this), right, - prefer_left, + pivot_idx, *replacement_left, *replacement_right); return std::make_tuple( @@ -688,14 +688,14 @@ struct FixedKVLeafNode Transaction &t, node_type_t &left, node_type_t &right, - bool prefer_left, + uint32_t pivot_idx, node_type_t &replacement_left, node_type_t &replacement_right) = 0; virtual void adjust_copy_src_dest_on_balance( Transaction &t, node_type_t &left, node_type_t &right, - bool prefer_left, + uint32_t pivot_idx, node_type_t &replacement_left, node_type_t &replacement_right) = 0; @@ -703,7 +703,7 @@ struct FixedKVLeafNode make_balanced( op_context_t c, Ref &_right, - bool prefer_left) { + uint32_t pivot_idx) { ceph_assert(_right->get_type() == this->get_type()); auto &right = *_right->template cast(); auto replacement_left = c.cache.template alloc_new_non_data_extent( @@ -715,13 +715,13 @@ struct FixedKVLeafNode c.trans, static_cast(*this), right, - prefer_left, + pivot_idx, *replacement_left, *replacement_right); auto pivot = this->balance_into_new_nodes( *this, right, - prefer_left, + pivot_idx, *replacement_left, *replacement_right); replacement_left->range = replacement_left->get_meta(); @@ -730,7 +730,7 @@ struct FixedKVLeafNode c.trans, static_cast(*this), right, - prefer_left, + pivot_idx, *replacement_left, *replacement_right); return std::make_tuple( diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h index 9a66169ae2b..ef363d44e6c 100644 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h @@ -304,21 +304,21 @@ struct LBALeafNode Transaction &t, LBALeafNode &left, LBALeafNode &right, - bool prefer_left, + uint32_t pivot_idx, LBALeafNode &replacement_left, LBALeafNode &replacement_right) final { this->balance_child_ptrs( - t, left, right, prefer_left, replacement_left, replacement_right); + t, left, right, pivot_idx, replacement_left, replacement_right); } void adjust_copy_src_dest_on_balance( Transaction &t, LBALeafNode &left, LBALeafNode &right, - bool prefer_left, + uint32_t pivot_idx, LBALeafNode &replacement_left, LBALeafNode &replacement_right) final { this->parent_node_t::adjust_copy_src_dest_on_balance( - t, left, right, prefer_left, replacement_left, replacement_right); + t, left, right, pivot_idx, replacement_left, replacement_right); } CachedExtentRef duplicate_for_write(Transaction&) final { diff --git a/src/crimson/os/seastore/linked_tree_node.h b/src/crimson/os/seastore/linked_tree_node.h index d420ac2ffe8..154353776a2 100644 --- a/src/crimson/os/seastore/linked_tree_node.h +++ b/src/crimson/os/seastore/linked_tree_node.h @@ -632,14 +632,14 @@ protected: Transaction &t, T &left, T &right, - bool prefer_left, + uint32_t pivot_idx, T &replacement_left, T &replacement_right) { size_t l_size = left.get_size(); size_t r_size = right.get_size(); - size_t pivot_idx = T::get_balance_pivot_idx(left, right, prefer_left); + ceph_assert(pivot_idx != l_size && pivot_idx != r_size); replacement_left.maybe_expand_children(pivot_idx); replacement_right.maybe_expand_children(r_size + l_size - pivot_idx); @@ -678,17 +678,11 @@ protected: Transaction &t, T &left, T &right, - bool prefer_left, + uint32_t pivot_idx, T &replacement_left, T &replacement_right) { size_t l_size = left.get_size(); - size_t r_size = right.get_size(); - size_t total = l_size + r_size; - size_t pivot_idx = (l_size + r_size) / 2; - if (total % 2 && prefer_left) { - pivot_idx++; - } if (left.is_initial_pending()) { for (auto &cs : left.copy_sources) { diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h index 9a2ef379574..6a455fc882b 100644 --- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h +++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h @@ -101,7 +101,7 @@ struct OMapNode : LogicalChildNode { using make_balanced_iertr = base_iertr; using make_balanced_ret = make_balanced_iertr::future - >; + >>; virtual make_balanced_ret make_balanced( omap_context_t oc, OMapNodeRef _right) = 0; diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc index 057752eb6aa..9ed808f5518 100644 --- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc +++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc @@ -394,18 +394,24 @@ OMapInnerNode::make_balanced(omap_context_t oc, OMapNodeRef _right) LOG_PREFIX(OMapInnerNode::make_balanced); DEBUGT("l: {}, r: {}", oc.t, *this, *_right); ceph_assert(_right->get_type() == TYPE); + auto &right = *_right->cast(); + auto pivot_idx = get_balance_pivot_idx(*this, right); + if (!pivot_idx) { + return make_balanced_ret( + interruptible::ready_future_marker{}, + std::make_tuple(OMapNodeRef{}, OMapNodeRef{}, std::nullopt)); + } return oc.tm.alloc_extents(oc.t, oc.hint, OMAP_INNER_BLOCK_SIZE, 2) - .si_then([this, _right, oc] (auto &&replacement_pair){ + .si_then([this, &right, pivot_idx, oc] (auto &&replacement_pair){ auto replacement_left = replacement_pair.front(); auto replacement_right = replacement_pair.back(); - auto &right = *_right->cast(); - this->balance_child_ptrs(oc.t, *this, right, true, + this->balance_child_ptrs(oc.t, *this, right, *pivot_idx, *replacement_left, *replacement_right); return make_balanced_ret( interruptible::ready_future_marker{}, std::make_tuple(replacement_left, replacement_right, - balance_into_new_nodes(*this, right, + balance_into_new_nodes(*this, right, *pivot_idx, *replacement_left, *replacement_right))); }).handle_error_interruptible( crimson::ct_error::enospc::assert_failure{"unexpected enospc"}, @@ -490,8 +496,14 @@ OMapInnerNode::merge_entry( ).si_then([liter=liter, riter=riter, l=l, r=r, oc, this](auto tuple) { LOG_PREFIX(OMapInnerNode::merge_entry); auto [replacement_l, replacement_r, replacement_pivot] = tuple; - replacement_l->init_range(l->get_begin(), replacement_pivot); - replacement_r->init_range(replacement_pivot, r->get_end()); + if (!replacement_pivot) { + return merge_entry_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::SUCCESS, + std::nullopt, std::nullopt)); + } + replacement_l->init_range(l->get_begin(), *replacement_pivot); + replacement_r->init_range(*replacement_pivot, r->get_end()); DEBUGT("to update parent: {} {} {}", oc.t, *this, *replacement_l, *replacement_r); if (get_meta().depth > 2) { // l and r are inner nodes @@ -511,7 +523,7 @@ OMapInnerNode::merge_entry( liter, replacement_l->get_laddr(), maybe_get_delta_buffer()); - bool overflow = extent_will_overflow(replacement_pivot.size(), + bool overflow = extent_will_overflow(replacement_pivot->size(), std::nullopt); if (!overflow) { this->update_child_ptr( @@ -521,7 +533,7 @@ OMapInnerNode::merge_entry( journal_inner_insert( riter, replacement_r->get_laddr(), - replacement_pivot, + *replacement_pivot, maybe_get_delta_buffer()); std::vector dec_laddrs{l->get_laddr(), r->get_laddr()}; return dec_ref(oc, dec_laddrs @@ -539,7 +551,7 @@ OMapInnerNode::merge_entry( this->remove_child_ptr(riter.get_offset()); journal_inner_remove(riter, maybe_get_delta_buffer()); return make_split_insert( - oc, riter, replacement_pivot, replacement_r + oc, riter, *replacement_pivot, replacement_r ).si_then([this, oc, l = l, r = r](auto mresult) { std::vector dec_laddrs{ l->get_laddr(), diff --git a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h index 10f070479dd..7e6df63d257 100644 --- a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h +++ b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h @@ -738,11 +738,9 @@ public: set_meta(omap_node_meta_t::merge_from(left.get_meta(), right.get_meta())); } - static uint32_t get_balance_pivot_idx( + static std::optional get_balance_pivot_idx( const StringKVInnerNodeLayout &left, - const StringKVInnerNodeLayout &right, - // this is just for the consistency with linked tree nodes - bool prefer_left = false) + const StringKVInnerNodeLayout &right) { uint32_t left_size = omap_inner_key_t( left.get_node_key_ptr()[left.get_size()-1]).key_off; @@ -773,7 +771,9 @@ public: } } } - return pivot_idx; + return pivot_idx == left.get_size() + ? std::nullopt + : std::make_optional(pivot_idx); } /** @@ -786,14 +786,15 @@ public: static std::string balance_into_new_nodes( const StringKVInnerNodeLayout &left, const StringKVInnerNodeLayout &right, + uint32_t pivot_idx, StringKVInnerNodeLayout &replacement_left, StringKVInnerNodeLayout &replacement_right) { + ceph_assert(!(left.below_min() && right.below_min())); uint32_t left_size = omap_inner_key_t(left.get_node_key_ptr()[left.get_size()-1]).key_off; uint32_t right_size = omap_inner_key_t(right.get_node_key_ptr()[right.get_size()-1]).key_off; uint32_t total = left_size + right_size; uint32_t pivot_size = total / 2; - uint32_t pivot_idx = get_balance_pivot_idx(left, right); auto replacement_pivot = pivot_idx >= left.get_size() ? right.iter_idx(pivot_idx - left.get_size())->get_key() : diff --git a/src/test/crimson/test_fixed_kv_node_layout.cc b/src/test/crimson/test_fixed_kv_node_layout.cc index 9b6d19661ac..62f9ea6086b 100644 --- a/src/test/crimson/test_fixed_kv_node_layout.cc +++ b/src/test/crimson/test_fixed_kv_node_layout.cc @@ -245,7 +245,7 @@ TEST(FixedKVNodeTest, merge) { ASSERT_EQ(num, total); } -void run_balance_test(unsigned left, unsigned right, bool prefer_left) +void run_balance_test(unsigned left, unsigned right) { auto node = TestNode(); auto node2 = TestNode(); @@ -276,10 +276,11 @@ void run_balance_test(unsigned left, unsigned right, bool prefer_left) auto node_balanced = TestNode(); auto node_balanced2 = TestNode(); + auto pivot_idx = TestNode::get_balance_pivot_idx(node, node2); auto pivot = TestNode::balance_into_new_nodes( node, node2, - prefer_left, + pivot_idx, node_balanced, node_balanced2); @@ -287,13 +288,8 @@ void run_balance_test(unsigned left, unsigned right, bool prefer_left) unsigned left_size, right_size; if (total % 2) { - if (prefer_left) { - left_size = (total/2) + 1; - right_size = total/2; - } else { left_size = total/2; right_size = (total/2) + 1; - } } else { left_size = right_size = total/2; } @@ -332,13 +328,11 @@ void run_balance_test(unsigned left, unsigned right, bool prefer_left) } TEST(FixedKVNodeTest, balanced) { - run_balance_test(CAPACITY / 2, CAPACITY, true); - run_balance_test(CAPACITY / 2, CAPACITY, false); - run_balance_test(CAPACITY, CAPACITY / 2, true); - run_balance_test(CAPACITY, CAPACITY / 2, false); - run_balance_test(CAPACITY - 1, CAPACITY / 2, true); - run_balance_test(CAPACITY / 2, CAPACITY - 1, false); - run_balance_test(CAPACITY / 2, CAPACITY / 2, false); + run_balance_test(CAPACITY / 2, CAPACITY); + run_balance_test(CAPACITY, CAPACITY / 2); + run_balance_test(CAPACITY - 1, CAPACITY / 2); + run_balance_test(CAPACITY / 2, CAPACITY - 1); + run_balance_test(CAPACITY / 2, CAPACITY / 2 + 2); } void run_replay_test(