* For fixed_kv_btree, drop `prefer_left`.
* For omap btree, tolerate underflow nodes if rebalance has the same results.
Fixes: https://tracker.ceph.com/issues/71307
Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
Signed-off-by: Yingxin Cheng <yingxin.cheng@intel.com>
static uint32_t get_balance_pivot_idx(
const FixedKVNodeLayout &left,
- const FixedKVNodeLayout &right,
- bool prefer_left)
+ const FixedKVNodeLayout &right)
{
- auto total = left.get_size() + right.get_size();
+ auto l_size = left.get_size();
+ auto r_size = right.get_size();
+ auto total = l_size + r_size;
auto pivot_idx = total / 2;
- if (total % 2 && prefer_left) {
- pivot_idx++;
- }
+ assert(pivot_idx > std::min(l_size, r_size)
+ && pivot_idx < std::max(l_size, r_size));
return pivot_idx;
}
* balance_into_new_nodes
*
* Takes the contents of left and right and copies them into
- * replacement_left and replacement_right such that in the
- * event that the number of elements is odd the extra goes to
- * the left side iff prefer_left.
+ * replacement_left and replacement_right such that the size
+ * of both are balanced.
*/
static K balance_into_new_nodes(
const FixedKVNodeLayout &left,
const FixedKVNodeLayout &right,
- bool prefer_left,
+ uint32_t pivot_idx,
FixedKVNodeLayout &replacement_left,
FixedKVNodeLayout &replacement_right)
{
+ assert(pivot_idx != left.get_size() && pivot_idx != right.get_size());
auto total = left.get_size() + right.get_size();
- auto pivot_idx = get_balance_pivot_idx(left, right, prefer_left);
auto replacement_pivot = pivot_idx >= left.get_size() ?
right.iter_idx(pivot_idx - left.get_size())->get_key() :
left.iter_idx(pivot_idx)->get_key();
Transaction &t,
BackrefLeafNode &left,
BackrefLeafNode &right,
- bool prefer_left,
+ uint32_t pivot_idx,
BackrefLeafNode &replacement_left,
BackrefLeafNode &replacement_right) final {}
Transaction &t,
BackrefLeafNode &left,
BackrefLeafNode &right,
- bool prefer_left,
+ uint32_t pivot_idx,
BackrefLeafNode &replacement_left,
BackrefLeafNode &replacement_right) final {}
// backref leaf nodes don't have to resolve relative addresses
c.cache.retire_extent(c.trans, r);
get_tree_stats<self_type>(c.trans).extents_num_delta--;
} else {
+ auto pivot_idx = l->get_balance_pivot_idx(*l, *r);
LOG_PREFIX(FixedKVBtree::merge_level);
auto [replacement_l, replacement_r, pivot] =
l->make_balanced(
c,
r,
- !donor_is_left);
+ pivot_idx);
parent_pos.node->update(
liter,
make_balanced(
op_context_t<NODE_KEY> c,
Ref &_right,
- bool prefer_left) {
+ uint32_t pivot_idx) {
ceph_assert(_right->get_type() == this->get_type());
auto &right = *_right->template cast<node_type_t>();
auto replacement_left = c.cache.template alloc_new_non_data_extent<node_type_t>(
c.trans,
static_cast<node_type_t&>(*this),
right,
- prefer_left,
+ pivot_idx,
*replacement_left,
*replacement_right);
auto pivot = this->balance_into_new_nodes(
*this,
right,
- prefer_left,
+ pivot_idx,
*replacement_left,
*replacement_right);
replacement_left->range = replacement_left->get_meta();
c.trans,
static_cast<node_type_t&>(*this),
right,
- prefer_left,
+ pivot_idx,
*replacement_left,
*replacement_right);
return std::make_tuple(
Transaction &t,
node_type_t &left,
node_type_t &right,
- bool prefer_left,
+ uint32_t pivot_idx,
node_type_t &replacement_left,
node_type_t &replacement_right) = 0;
virtual void adjust_copy_src_dest_on_balance(
Transaction &t,
node_type_t &left,
node_type_t &right,
- bool prefer_left,
+ uint32_t pivot_idx,
node_type_t &replacement_left,
node_type_t &replacement_right) = 0;
make_balanced(
op_context_t<NODE_KEY> c,
Ref &_right,
- bool prefer_left) {
+ uint32_t pivot_idx) {
ceph_assert(_right->get_type() == this->get_type());
auto &right = *_right->template cast<node_type_t>();
auto replacement_left = c.cache.template alloc_new_non_data_extent<node_type_t>(
c.trans,
static_cast<node_type_t&>(*this),
right,
- prefer_left,
+ pivot_idx,
*replacement_left,
*replacement_right);
auto pivot = this->balance_into_new_nodes(
*this,
right,
- prefer_left,
+ pivot_idx,
*replacement_left,
*replacement_right);
replacement_left->range = replacement_left->get_meta();
c.trans,
static_cast<node_type_t&>(*this),
right,
- prefer_left,
+ pivot_idx,
*replacement_left,
*replacement_right);
return std::make_tuple(
Transaction &t,
LBALeafNode &left,
LBALeafNode &right,
- bool prefer_left,
+ uint32_t pivot_idx,
LBALeafNode &replacement_left,
LBALeafNode &replacement_right) final {
this->balance_child_ptrs(
- t, left, right, prefer_left, replacement_left, replacement_right);
+ t, left, right, pivot_idx, replacement_left, replacement_right);
}
void adjust_copy_src_dest_on_balance(
Transaction &t,
LBALeafNode &left,
LBALeafNode &right,
- bool prefer_left,
+ uint32_t pivot_idx,
LBALeafNode &replacement_left,
LBALeafNode &replacement_right) final {
this->parent_node_t::adjust_copy_src_dest_on_balance(
- t, left, right, prefer_left, replacement_left, replacement_right);
+ t, left, right, pivot_idx, replacement_left, replacement_right);
}
CachedExtentRef duplicate_for_write(Transaction&) final {
Transaction &t,
T &left,
T &right,
- bool prefer_left,
+ uint32_t pivot_idx,
T &replacement_left,
T &replacement_right)
{
size_t l_size = left.get_size();
size_t r_size = right.get_size();
- size_t pivot_idx = T::get_balance_pivot_idx(left, right, prefer_left);
+ ceph_assert(pivot_idx != l_size && pivot_idx != r_size);
replacement_left.maybe_expand_children(pivot_idx);
replacement_right.maybe_expand_children(r_size + l_size - pivot_idx);
Transaction &t,
T &left,
T &right,
- bool prefer_left,
+ uint32_t pivot_idx,
T &replacement_left,
T &replacement_right)
{
size_t l_size = left.get_size();
- size_t r_size = right.get_size();
- size_t total = l_size + r_size;
- size_t pivot_idx = (l_size + r_size) / 2;
- if (total % 2 && prefer_left) {
- pivot_idx++;
- }
if (left.is_initial_pending()) {
for (auto &cs : left.copy_sources) {
using make_balanced_iertr = base_iertr;
using make_balanced_ret = make_balanced_iertr::future
- <std::tuple<OMapNodeRef, OMapNodeRef, std::string>>;
+ <std::tuple<OMapNodeRef, OMapNodeRef, std::optional<std::string>>>;
virtual make_balanced_ret make_balanced(
omap_context_t oc,
OMapNodeRef _right) = 0;
LOG_PREFIX(OMapInnerNode::make_balanced);
DEBUGT("l: {}, r: {}", oc.t, *this, *_right);
ceph_assert(_right->get_type() == TYPE);
+ auto &right = *_right->cast<OMapInnerNode>();
+ auto pivot_idx = get_balance_pivot_idx(*this, right);
+ if (!pivot_idx) {
+ return make_balanced_ret(
+ interruptible::ready_future_marker{},
+ std::make_tuple(OMapNodeRef{}, OMapNodeRef{}, std::nullopt));
+ }
return oc.tm.alloc_extents<OMapInnerNode>(oc.t, oc.hint,
OMAP_INNER_BLOCK_SIZE, 2)
- .si_then([this, _right, oc] (auto &&replacement_pair){
+ .si_then([this, &right, pivot_idx, oc] (auto &&replacement_pair){
auto replacement_left = replacement_pair.front();
auto replacement_right = replacement_pair.back();
- auto &right = *_right->cast<OMapInnerNode>();
- this->balance_child_ptrs(oc.t, *this, right, true,
+ this->balance_child_ptrs(oc.t, *this, right, *pivot_idx,
*replacement_left, *replacement_right);
return make_balanced_ret(
interruptible::ready_future_marker{},
std::make_tuple(replacement_left, replacement_right,
- balance_into_new_nodes(*this, right,
+ balance_into_new_nodes(*this, right, *pivot_idx,
*replacement_left, *replacement_right)));
}).handle_error_interruptible(
crimson::ct_error::enospc::assert_failure{"unexpected enospc"},
).si_then([liter=liter, riter=riter, l=l, r=r, oc, this](auto tuple) {
LOG_PREFIX(OMapInnerNode::merge_entry);
auto [replacement_l, replacement_r, replacement_pivot] = tuple;
- replacement_l->init_range(l->get_begin(), replacement_pivot);
- replacement_r->init_range(replacement_pivot, r->get_end());
+ if (!replacement_pivot) {
+ return merge_entry_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS,
+ std::nullopt, std::nullopt));
+ }
+ replacement_l->init_range(l->get_begin(), *replacement_pivot);
+ replacement_r->init_range(*replacement_pivot, r->get_end());
DEBUGT("to update parent: {} {} {}",
oc.t, *this, *replacement_l, *replacement_r);
if (get_meta().depth > 2) { // l and r are inner nodes
liter,
replacement_l->get_laddr(),
maybe_get_delta_buffer());
- bool overflow = extent_will_overflow(replacement_pivot.size(),
+ bool overflow = extent_will_overflow(replacement_pivot->size(),
std::nullopt);
if (!overflow) {
this->update_child_ptr(
journal_inner_insert(
riter,
replacement_r->get_laddr(),
- replacement_pivot,
+ *replacement_pivot,
maybe_get_delta_buffer());
std::vector<laddr_t> dec_laddrs{l->get_laddr(), r->get_laddr()};
return dec_ref(oc, dec_laddrs
this->remove_child_ptr(riter.get_offset());
journal_inner_remove(riter, maybe_get_delta_buffer());
return make_split_insert(
- oc, riter, replacement_pivot, replacement_r
+ oc, riter, *replacement_pivot, replacement_r
).si_then([this, oc, l = l, r = r](auto mresult) {
std::vector<laddr_t> dec_laddrs{
l->get_laddr(),
set_meta(omap_node_meta_t::merge_from(left.get_meta(), right.get_meta()));
}
- static uint32_t get_balance_pivot_idx(
+ static std::optional<uint32_t> get_balance_pivot_idx(
const StringKVInnerNodeLayout &left,
- const StringKVInnerNodeLayout &right,
- // this is just for the consistency with linked tree nodes
- bool prefer_left = false)
+ const StringKVInnerNodeLayout &right)
{
uint32_t left_size = omap_inner_key_t(
left.get_node_key_ptr()[left.get_size()-1]).key_off;
}
}
}
- return pivot_idx;
+ return pivot_idx == left.get_size()
+ ? std::nullopt
+ : std::make_optional<uint32_t>(pivot_idx);
}
/**
static std::string balance_into_new_nodes(
const StringKVInnerNodeLayout &left,
const StringKVInnerNodeLayout &right,
+ uint32_t pivot_idx,
StringKVInnerNodeLayout &replacement_left,
StringKVInnerNodeLayout &replacement_right)
{
+ ceph_assert(!(left.below_min() && right.below_min()));
uint32_t left_size = omap_inner_key_t(left.get_node_key_ptr()[left.get_size()-1]).key_off;
uint32_t right_size = omap_inner_key_t(right.get_node_key_ptr()[right.get_size()-1]).key_off;
uint32_t total = left_size + right_size;
uint32_t pivot_size = total / 2;
- uint32_t pivot_idx = get_balance_pivot_idx(left, right);
auto replacement_pivot = pivot_idx >= left.get_size() ?
right.iter_idx(pivot_idx - left.get_size())->get_key() :
ASSERT_EQ(num, total);
}
-void run_balance_test(unsigned left, unsigned right, bool prefer_left)
+void run_balance_test(unsigned left, unsigned right)
{
auto node = TestNode();
auto node2 = TestNode();
auto node_balanced = TestNode();
auto node_balanced2 = TestNode();
+ auto pivot_idx = TestNode::get_balance_pivot_idx(node, node2);
auto pivot = TestNode::balance_into_new_nodes(
node,
node2,
- prefer_left,
+ pivot_idx,
node_balanced,
node_balanced2);
unsigned left_size, right_size;
if (total % 2) {
- if (prefer_left) {
- left_size = (total/2) + 1;
- right_size = total/2;
- } else {
left_size = total/2;
right_size = (total/2) + 1;
- }
} else {
left_size = right_size = total/2;
}
}
TEST(FixedKVNodeTest, balanced) {
- run_balance_test(CAPACITY / 2, CAPACITY, true);
- run_balance_test(CAPACITY / 2, CAPACITY, false);
- run_balance_test(CAPACITY, CAPACITY / 2, true);
- run_balance_test(CAPACITY, CAPACITY / 2, false);
- run_balance_test(CAPACITY - 1, CAPACITY / 2, true);
- run_balance_test(CAPACITY / 2, CAPACITY - 1, false);
- run_balance_test(CAPACITY / 2, CAPACITY / 2, false);
+ run_balance_test(CAPACITY / 2, CAPACITY);
+ run_balance_test(CAPACITY, CAPACITY / 2);
+ run_balance_test(CAPACITY - 1, CAPACITY / 2);
+ run_balance_test(CAPACITY / 2, CAPACITY - 1);
+ run_balance_test(CAPACITY / 2, CAPACITY / 2 + 2);
}
void run_replay_test(