From eb4c03f4efcef432c78955ffcc478f5c4011f28a Mon Sep 17 00:00:00 2001
From: Yingxin Cheng
Date: Fri, 23 Apr 2021 11:07:12 +0800
Subject: [PATCH] crimson/onode-staged-tree: implement node-level erase/merge
 logic

Signed-off-by: Yingxin Cheng
---
 .../onode_manager/staged-fltree/node.cc       | 984 ++++++++++++++++--
 .../onode_manager/staged-fltree/node.h        |  64 +-
 .../staged-fltree/node_extent_accessor.h      |  51 +
 .../onode_manager/staged-fltree/node_impl.cc  |   4 +-
 .../onode_manager/staged-fltree/node_impl.h   |  32 +-
 .../onode_manager/staged-fltree/node_layout.h | 127 ++-
 .../staged-fltree/stages/stage.h              |  13 +
 .../staged-fltree/stages/stage_types.h        |  15 +
 .../seastore/onode_tree/test_staged_fltree.cc |  28 +-
 9 files changed, 1237 insertions(+), 81 deletions(-)

diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc
index 4f1bd74ca81..c04580d4191 100644
--- a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc
@@ -286,10 +286,24 @@ Node::Node(NodeImplURef&& impl) : impl{std::move(impl)} {}

 Node::~Node() {
   // XXX: tolerate failure between allocate() and as_child()
-  if (is_root()) {
-    super->do_untrack_root(*this);
+  if (!super && !_parent_info.has_value()) {
+    // To erase a node:
+    // 1. I'm not tracking any children or cursors
+    // 2. unlink parent/super --ptr-> me
+    // 3. unlink me --ref-> parent/super
+    // 4. extent is retired
+    assert(!impl->is_extent_valid());
+
+    // TODO: this may also happen when eagain is triggered internally; we
+    // should revisit to make sure tree operations can be aborted cleanly,
+    // without leaking resources or hitting unexpected asserts.
   } else {
-    _parent_info->ptr->do_untrack_child(*this);
+    assert(impl->is_extent_valid());
+    if (is_root()) {
+      super->do_untrack_root(*this);
+    } else {
+      _parent_info->ptr->do_untrack_child(*this);
+    }
   }
 }

@@ -458,12 +472,12 @@ template void Node::as_child<InternalNode>(const search_position_t&, Ref<InternalNode>);
 template void Node::as_child<LeafNode>(const search_position_t&, Ref<LeafNode>);

 node_future<> Node::apply_split_to_parent(
-    context_t c, Ref<Node> split_right)
+    context_t c, Ref<Node> split_right, bool update_right_index)
 {
   assert(!is_root());
   // TODO(cross-node string dedup)
   return parent_info().ptr->apply_child_split(
-      c, this, split_right);
+      c, this, split_right, update_right_index);
 }

 node_future<Ref<tree_cursor_t>>
@@ -474,6 +488,122 @@ Node::get_next_cursor_from_parent(context_t c)
   return parent_info().ptr->get_next_cursor(c, parent_info().position);
 }

+node_future<>
+Node::try_merge_adjacent(context_t c, bool update_parent_index)
+{
+  impl->validate_non_empty();
+  assert(!is_root());
+  Ref<Node> this_ref = this;
+  if (!impl->is_size_underflow()) {
+    if (update_parent_index) {
+      return fix_parent_index(c);
+    } else {
+      parent_info().ptr->validate_child_tracked(*this);
+      return node_ertr::now();
+    }
+  }
+
+  return parent_info().ptr->get_child_peers(c, parent_info().position
+  ).safe_then([c, this_ref = std::move(this_ref), this,
+               update_parent_index] (auto lr_nodes) mutable -> node_future<> {
+    auto& [lnode, rnode] = lr_nodes;
+    Ref<Node> left_for_merge;
+    Ref<Node> right_for_merge;
+    bool is_left;
+    if (!lnode && !rnode) {
+      // XXX: this is possible before node rebalance is implemented,
+      // when its parent cannot merge with its peers and has only one child
+      // (this node).
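+      // In this case left_for_merge/right_for_merge stay empty, so we fall
+      // through to the not-merged path below, which at most fixes the
+      // parent index.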
+    } else if (!lnode) {
+      left_for_merge = std::move(this_ref);
+      right_for_merge = std::move(rnode);
+      is_left = true;
+    } else if (!rnode) {
+      left_for_merge = std::move(lnode);
+      right_for_merge = std::move(this_ref);
+      is_left = false;
+    } else { // lnode && rnode
+      if (lnode->impl->free_size() > rnode->impl->free_size()) {
+        left_for_merge = std::move(lnode);
+        right_for_merge = std::move(this_ref);
+        is_left = false;
+      } else { // lnode free size <= rnode free size
+        left_for_merge = std::move(this_ref);
+        right_for_merge = std::move(rnode);
+        is_left = true;
+      }
+    }
+
+    if (left_for_merge) {
+      assert(right_for_merge);
+      auto [merge_stage, merge_size] = left_for_merge->impl->evaluate_merge(
+          *right_for_merge->impl);
+      if (merge_size <= left_for_merge->impl->total_size()) {
+        // proceed merge
+        bool update_index_after_merge;
+        if (is_left) {
+          update_index_after_merge = false;
+        } else {
+          update_index_after_merge = update_parent_index;
+        }
+        logger().info("OTree::Node::MergeAdjacent: merge {} and {} "
+                      "at merge_stage={}, merge_size={}B, update_index={} ...",
+                      left_for_merge->get_name(), right_for_merge->get_name(),
+                      merge_stage, merge_size, update_index_after_merge);
+        // we currently cannot generate a delta that depends on another
+        // extent's content, so use rebuild_extent() as a workaround to
+        // rebuild the node from a fresh extent, with no need to generate
+        // a delta.
+        return left_for_merge->rebuild_extent(c
+        ).safe_then([c, merge_stage, merge_size, update_index_after_merge,
+                     left_for_merge = std::move(left_for_merge),
+                     right_for_merge = std::move(right_for_merge)] (auto left_mut) mutable {
+          if (left_for_merge->impl->node_type() == node_type_t::LEAF) {
+            auto& left = *static_cast<LeafNode*>(left_for_merge.get());
+            left.on_layout_change();
+          }
+          search_position_t left_last_pos = left_for_merge->impl->merge(
+              left_mut, *right_for_merge->impl, merge_stage, merge_size);
+          left_for_merge->track_merge(right_for_merge, merge_stage, left_last_pos);
+          return left_for_merge->parent_info().ptr->apply_children_merge(
+              c, std::move(left_for_merge),
+              std::move(right_for_merge), update_index_after_merge);
+        });
+      } else {
+        // size would overflow if merged
+      }
+    }
+
+    // cannot merge
+    if (update_parent_index) {
+      return fix_parent_index(c);
+    } else {
+      parent_info().ptr->validate_child_tracked(*this);
+      return node_ertr::now();
+    }
+    // XXX: rebalance
+  });
+}
+
+node_future<> Node::erase_node(context_t c, Ref<Node>&& this_ref)
+{
+  assert(this_ref.get() == this);
+  assert(!is_tracking());
+  assert(!is_root());
+  assert(this_ref->use_count() == 1);
+  return parent_info().ptr->erase_child(c, std::move(this_ref));
+}
+
+node_future<> Node::fix_parent_index(context_t c)
+{
+  assert(!is_root());
+  auto& parent = parent_info().ptr;
+  // one-way unlink
+  parent->do_untrack_child(*this);
+  // the rest of the parent's tracks should still be correct
+  parent->validate_tracked_children();
+  return parent->fix_index(c, this);
+}
+
 node_future<Ref<Node>> Node::load(
     context_t c, laddr_t addr, bool expect_is_level_tail)
 {
@@ -496,6 +626,44 @@ node_future<Ref<Node>> Node::load(
   });
 }

+node_future<NodeExtentMutable> Node::rebuild_extent(context_t c)
+{
+  logger().debug("OTree::Node::Rebuild: {} ...", get_name());
+  assert(!is_root());
+  // assume I'm already ref counted by caller
+
+  // note: the laddr may change after rebuild, but we don't fix the parent
+  // mapping here because that is part of the merge process.
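+  // (the rebuilt laddr is applied to the parent by
+  //  InternalNode::apply_children_merge())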
+ return impl->rebuild_extent(c); +} + +node_future<> Node::retire(context_t c, Ref&& this_ref) +{ + logger().debug("OTree::Node::Retire: {} ...", get_name()); + assert(this_ref.get() == this); + assert(!is_tracking()); + assert(!super); + // make sure the parent also has untracked this node + assert(!_parent_info.has_value()); + assert(this_ref->use_count() == 1); + + return impl->retire_extent(c + ).safe_then([this_ref = std::move(this_ref)]{ /* deallocate node */}); +} + +void Node::make_tail(context_t c) +{ + assert(!impl->is_level_tail()); + assert(!impl->is_keys_empty()); + logger().debug("OTree::Node::MakeTail: {} ...", get_name()); + impl->prepare_mutate(c); + auto tail_pos = impl->make_tail(); + if (impl->node_type() == node_type_t::INTERNAL) { + auto& node = *static_cast(this); + node.track_make_tail(tail_pos); + } +} + /* * InternalNode */ @@ -530,7 +698,8 @@ InternalNode::get_next_cursor(context_t c, const search_position_t& pos) } node_future<> InternalNode::apply_child_split( - context_t c, Ref left_child, Ref right_child) + context_t c, Ref left_child, Ref right_child, + bool update_right_index) { auto& left_pos = left_child->parent_info().position; @@ -540,69 +709,379 @@ node_future<> InternalNode::apply_child_split( if (left_pos.is_end()) { assert(impl->is_level_tail()); assert(right_child->impl->is_level_tail()); + assert(!update_right_index); } #endif impl->prepare_mutate(c); - auto left_key = *left_child->impl->get_pivot_index(); + logger().debug("OTree::Internal::ApplyChildSplit: apply {}'s child " + "{} to split to {}, update_index={} ...", + get_name(), left_child->get_name(), + right_child->get_name(), update_right_index); + + // update layout from left_pos => left_child_addr to right_child_addr auto left_child_addr = left_child->impl->laddr(); - auto op_right_key = right_child->impl->get_pivot_index(); auto right_child_addr = right_child->impl->laddr(); - if (op_right_key.has_value()) { - logger().debug("OTree::Internal::Insert: " - "left_pos({}), left_child({}, {:#x}), right_child({}, {:#x}) ...", - left_pos, left_key, left_child_addr, *op_right_key, right_child_addr); - } else { - logger().debug("OTree::Internal::Insert: " - "left_pos({}), left_child({}, {:#x}), right_child(N/A, {:#x}) ...", - left_pos, left_key, left_child_addr, right_child_addr); - } - // update left_pos => left_child to left_pos => right_child impl->replace_child_addr(left_pos, right_child_addr, left_child_addr); - replace_track(right_child, left_child); - search_position_t insert_pos = left_pos; - auto [insert_stage, insert_size] = impl->evaluate_insert( - left_key, left_child_addr, insert_pos); - auto free_size = impl->free_size(); - if (free_size >= insert_size) { - // insert - [[maybe_unused]] auto p_value = impl->insert( - left_key, left_child_addr, insert_pos, insert_stage, insert_size); - assert(impl->free_size() == free_size - insert_size); - assert(insert_pos <= left_pos); - assert(p_value->value == left_child_addr); - track_insert(insert_pos, insert_stage, left_child, right_child); - validate_tracked_children(); - return node_ertr::now(); + // update track from left_pos => left_child to right_child + replace_track(right_child, left_child, update_right_index); + + auto left_key = *left_child->impl->get_pivot_index(); + Ref this_ref = this; + return insert_or_split( + c, left_pos, left_key, left_child, + (update_right_index ? 
right_child : nullptr) + ).safe_then([this, c, this_ref = std::move(this_ref)] (auto split_right) { + if (split_right) { + // even if update_right_index could be true, + // we haven't fixed the right_child index of this node yet, + // so my parent index should be correct now. + return apply_split_to_parent(c, split_right, false); + } else { + return node_ertr::now(); + } + }).safe_then([c, update_right_index, right_child] { + if (update_right_index) { + // right_child must be already untracked by insert_or_split() + return right_child->parent_info().ptr->fix_index(c, right_child); + } else { + // there is no need to call try_merge_adjacent() because + // the filled size of the inserted node or the split right node + // won't be reduced if update_right_index is false. + return node_ertr::now(); + } + }); +} + +node_future<> InternalNode::erase_child(context_t c, Ref&& child_ref) +{ + // this is a special version of recursive merge + impl->validate_non_empty(); + assert(child_ref->use_count() == 1); + validate_child_tracked(*child_ref); + + // fix the child's previous node as the new tail, + // and trigger prv_child_ref->try_merge_adjacent() at the end + bool fix_tail = (child_ref->parent_info().position.is_end() && + !impl->is_keys_empty()); + return node_ertr::now().safe_then([c, this, fix_tail] { + if (fix_tail) { + search_position_t new_tail_pos; + const laddr_packed_t* new_tail_p_addr = nullptr; + impl->get_largest_slot(&new_tail_pos, nullptr, &new_tail_p_addr); + return get_or_track_child(c, new_tail_pos, new_tail_p_addr->value); + } else { + return node_ertr::make_ready_future>(); + } + }).safe_then([c, this, child_ref = std::move(child_ref)] + (auto&& new_tail_child) mutable { + auto child_pos = child_ref->parent_info().position; + if (new_tail_child) { + logger().info("OTree::Internal::EraseChild: erase {}'s child {} at pos({}), " + "and fix new child tail {} at pos({}) ...", + get_name(), child_ref->get_name(), child_pos, + new_tail_child->get_name(), + new_tail_child->parent_info().position); + assert(!new_tail_child->impl->is_level_tail()); + new_tail_child->make_tail(c); + assert(new_tail_child->impl->is_level_tail()); + if (new_tail_child->impl->node_type() == node_type_t::LEAF) { + // no need to proceed merge because the filled size is not changed + new_tail_child.reset(); + } + } else { + logger().info("OTree::Internal::EraseChild: erase {}'s child {} at pos({}) ...", + get_name(), child_ref->get_name(), child_pos); + } + + do_untrack_child(*child_ref); + Ref this_ref = this; + child_ref->_parent_info.reset(); + return child_ref->retire(c, std::move(child_ref) + ).safe_then([c, this, child_pos, this_ref = std::move(this_ref)] () mutable { + if ((impl->is_level_tail() && impl->is_keys_empty()) || + (!impl->is_level_tail() && impl->is_keys_one())) { + // there is only one value left + // fast path without mutating the extent + logger().debug("OTree::Internal::EraseChild: {} has one value left, erase ...", + get_name()); +#ifndef NDEBUG + if (impl->is_level_tail()) { + assert(child_pos.is_end()); + } else { + assert(child_pos == search_position_t::begin()); + } +#endif + + if (is_root()) { + // Note: if merge/split works as expected, we should never encounter the + // situation when the internal root has <=1 children: + // + // A newly created internal root (see Node::upgrade_root()) will have 2 + // children after split is finished. 
+ // + // When merge happens, children will try to merge each other, and if the + // root detects there is only one child left, the root will be + // down-graded to the only child. + // + // In order to preserve the invariant, we need to make sure the new + // internal root also has at least 2 children. + ceph_abort("trying to erase the last item from the internal root node"); + } + + // track erase + assert(tracked_child_nodes.empty()); + + // no child should be referencing this node now, this_ref is the last one. + assert(this_ref->use_count() == 1); + Ref node_ref = this_ref; + this_ref.reset(); + return Node::erase_node(c, std::move(node_ref)); + } + + impl->prepare_mutate(c); + auto [erase_stage, next_or_last_pos] = impl->erase(child_pos); + if (child_pos.is_end()) { + // next_or_last_pos as last_pos + track_make_tail(next_or_last_pos); + } else { + // next_or_last_pos as next_pos + track_erase(child_pos, erase_stage); + } + validate_tracked_children(); + + if (is_root()) { + return try_downgrade_root(c, std::move(this_ref)); + } else { + bool update_parent_index; + if (impl->is_level_tail()) { + update_parent_index = false; + } else { + // next_or_last_pos as next_pos + next_or_last_pos.is_end() ? update_parent_index = true + : update_parent_index = false; + } + return try_merge_adjacent(c, update_parent_index); + } + }).safe_then([c, new_tail_child = std::move(new_tail_child)] { + // finally, check if the new tail child needs to merge + if (new_tail_child && !new_tail_child->is_root()) { + assert(new_tail_child->impl->is_level_tail()); + return new_tail_child->try_merge_adjacent(c, false); + } else { + return node_ertr::now(); + } + }); + }); +} + +node_future<> InternalNode::fix_index(context_t c, Ref child) +{ + impl->validate_non_empty(); + + auto& child_pos = child->parent_info().position; + // child must already be untracked before calling fix_index() + assert(tracked_child_nodes.find(child_pos) == tracked_child_nodes.end()); + validate_child_inconsistent(*child); + + impl->prepare_mutate(c); + + key_view_t new_key = *child->impl->get_pivot_index(); + logger().debug("OTree::Internal::FixIndex: fix {}'s index of child {} at pos({}), " + "new_key={} ...", + get_name(), child->get_name(), child_pos, new_key); + + // erase the incorrect item + auto [erase_stage, next_pos] = impl->erase(child_pos); + track_erase(child_pos, erase_stage); + validate_tracked_children(); + + // find out whether there is a need to fix parent index recursively + bool update_parent_index; + if (impl->is_level_tail()) { + update_parent_index = false; + } else { + next_pos.is_end() ? update_parent_index = true + : update_parent_index = false; } - // split and insert + Ref this_ref = this; - return (is_root() ? 
upgrade_root(c) : node_ertr::now() - ).safe_then([this, c] { - return InternalNode::allocate( - c, impl->field_type(), impl->is_level_tail(), impl->level()); - }).safe_then([this_ref, this, c, left_key, left_child, right_child, - insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable { - auto right_node = fresh_right.node; - auto left_child_addr = left_child->impl->laddr(); - auto [split_pos, is_insert_left, p_value] = impl->split_insert( - fresh_right.mut, *right_node->impl, left_key, left_child_addr, - insert_pos, insert_stage, insert_size); - assert(p_value->value == left_child_addr); - track_split(split_pos, right_node); - if (is_insert_left) { - track_insert(insert_pos, insert_stage, left_child); + return insert_or_split(c, next_pos, new_key, child + ).safe_then([this, c, update_parent_index, + this_ref = std::move(this_ref)] (auto split_right) { + if (split_right) { + // after split, the parent index to the split_right will be incorrect + // if update_parent_index is true. + return apply_split_to_parent(c, split_right, update_parent_index); } else { - right_node->track_insert(insert_pos, insert_stage, left_child); + // no split path + if (is_root()) { + // no need to call try_downgrade_root() because the number of keys + // has not changed. + return node_ertr::now(); + } else { + // for non-root, maybe need merge adjacent or fix parent, + // because the filled node size may be reduced. + return try_merge_adjacent(c, update_parent_index); + } } - validate_tracked_children(); - right_node->validate_tracked_children(); + }); +} - return apply_split_to_parent(c, right_node); - // TODO (optimize) - // try to acquire space from siblings before split... see btrfs +node_future<> InternalNode::apply_children_merge( + context_t c, Ref&& left_child, + Ref&& right_child, bool update_index) +{ + auto left_pos = left_child->parent_info().position; + auto left_addr = left_child->impl->laddr(); + auto& right_pos = right_child->parent_info().position; + auto right_addr = right_child->impl->laddr(); + logger().debug("OTree::Internal::ApplyChildMerge: apply {}'s child " + "{} at pos({}), to merge with {} at pos({}), update_index={} ...", + get_name(), left_child->get_name(), left_pos, + right_child->get_name(), right_pos, update_index); + +#ifndef NDEBUG + assert(left_child->parent_info().ptr == this); + assert(!left_pos.is_end()); + const laddr_packed_t* p_value_left; + impl->get_slot(left_pos, nullptr, &p_value_left); + assert(p_value_left->value == left_addr); + + assert(right_child->use_count() == 1); + assert(right_child->parent_info().ptr == this); + const laddr_packed_t* p_value_right; + if (right_pos.is_end()) { + assert(right_child->impl->is_level_tail()); + assert(left_child->impl->is_level_tail()); + assert(impl->is_level_tail()); + assert(!update_index); + p_value_right = impl->get_tail_value(); + } else { + assert(!right_child->impl->is_level_tail()); + assert(!left_child->impl->is_level_tail()); + impl->get_slot(right_pos, nullptr, &p_value_right); + } + assert(p_value_right->value == right_addr); +#endif + + // XXX: we may jump to try_downgrade_root() without mutating this node. 
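+  // (for now we always mutate: the child addr at right_pos is replaced and
+  //  left_pos is erased below)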
+ + // update layout from right_pos => right_addr to left_addr + impl->prepare_mutate(c); + impl->replace_child_addr(right_pos, left_addr, right_addr); + + // update track from right_pos => right_child to left_child + do_untrack_child(*left_child); + replace_track(left_child, right_child, update_index); + + // erase left_pos from layout + auto [erase_stage, next_pos] = impl->erase(left_pos); + track_erase(left_pos, erase_stage); + assert(next_pos == left_child->parent_info().position); + + // All good to retire the right_child. + // I'm already ref-counted by left_child. + return right_child->retire(c, std::move(right_child) + ).safe_then([c, this, update_index, + left_child = std::move(left_child)] () mutable { + Ref this_ref = this; + if (update_index) { + return left_child->fix_parent_index(c + ).safe_then([c, this, this_ref = std::move(this_ref)] { + // I'm all good but: + // - my number of keys is reduced by 1 + // - my size may underflow, + // but try_merge_adjacent() is already part of fix_index() + if (is_root()) { + return try_downgrade_root(c, std::move(this_ref)); + } else { + return node_ertr::now(); + } + }); + } else { + left_child.reset(); + validate_tracked_children(); + // I'm all good but: + // - my number of keys is reduced by 1 + // - my size may underflow + if (is_root()) { + return try_downgrade_root(c, std::move(this_ref)); + } else { + return try_merge_adjacent(c, false); + } + } + }); +} + +node_future, Ref>> InternalNode::get_child_peers( + context_t c, const search_position_t& pos) +{ + // assume I'm already ref counted by caller + search_position_t prev_pos; + const laddr_packed_t* prev_p_child_addr = nullptr; + search_position_t next_pos; + const laddr_packed_t* next_p_child_addr = nullptr; + + if (pos.is_end()) { + assert(impl->is_level_tail()); + if (!impl->is_keys_empty()) { + // got previous child only + impl->get_largest_slot(&prev_pos, nullptr, &prev_p_child_addr); + assert(prev_pos < pos); + assert(prev_p_child_addr != nullptr); + } else { + // no keys, so no peer children + } + } else { // !pos.is_end() + if (pos != search_position_t::begin()) { + // got previous child + prev_pos = pos; + impl->get_prev_slot(prev_pos, nullptr, &prev_p_child_addr); + assert(prev_pos < pos); + assert(prev_p_child_addr != nullptr); + } else { + // is already the first child, so no previous child + } + + next_pos = pos; + impl->get_next_slot(next_pos, nullptr, &next_p_child_addr); + if (next_pos.is_end()) { + if (impl->is_level_tail()) { + // the next child is the tail + next_p_child_addr = impl->get_tail_value(); + assert(pos < next_pos); + assert(next_p_child_addr != nullptr); + } else { + // next child doesn't exist + assert(next_p_child_addr == nullptr); + } + } else { + // got the next child + assert(pos < next_pos); + assert(next_p_child_addr != nullptr); + } + } + + return node_ertr::now().safe_then([this, c, prev_pos, prev_p_child_addr] { + if (prev_p_child_addr != nullptr) { + return get_or_track_child(c, prev_pos, prev_p_child_addr->value); + } else { + return node_ertr::make_ready_future>(); + } + }).safe_then([this, c, next_pos, next_p_child_addr] (Ref lnode) { + if (next_p_child_addr != nullptr) { + return get_or_track_child(c, next_pos, next_p_child_addr->value + ).safe_then([lnode] (Ref rnode) { + return node_ertr::make_ready_future, Ref>>( + lnode, rnode); + }); + } else { + return node_ertr::make_ready_future, Ref>>( + lnode, nullptr); + } }); } @@ -708,6 +1187,60 @@ node_future<> InternalNode::do_get_tree_stats( ); } +void InternalNode::track_merge( + Ref 
_right_node, match_stage_t stage, search_position_t& left_last_pos) +{ + assert(level() == _right_node->level()); + assert(impl->node_type() == _right_node->impl->node_type()); + auto& right_node = *static_cast(_right_node.get()); + if (right_node.tracked_child_nodes.empty()) { + return; + } + + match_stage_t curr_stage = STAGE_BOTTOM; + + // prepare the initial left_last_pos for offset + while (curr_stage < stage) { + left_last_pos.index_by_stage(curr_stage) = 0; + ++curr_stage; + } + ++left_last_pos.index_by_stage(curr_stage); + + // fix the tracked child nodes of right_node, stage by stage. + auto& right_tracked_children = right_node.tracked_child_nodes; + auto rit = right_tracked_children.begin(); + while (curr_stage <= STAGE_TOP) { + auto right_pos_until = search_position_t::begin(); + right_pos_until.index_by_stage(curr_stage) = INDEX_UPPER_BOUND; + auto rend = right_tracked_children.lower_bound(right_pos_until); + while (rit != rend) { + auto new_pos = rit->second->parent_info().position; + assert(new_pos == rit->first); + assert(rit->second->parent_info().ptr == &right_node); + new_pos += left_last_pos; + auto p_child = rit->second; + rit = right_tracked_children.erase(rit); + p_child->as_child(new_pos, this); + } + left_last_pos.index_by_stage(curr_stage) = 0; + ++curr_stage; + } + + // fix the end tracked child node of right_node, if exists. + if (rit != right_tracked_children.end()) { + assert(rit->first == search_position_t::end()); + assert(rit->second->parent_info().position == search_position_t::end()); + assert(right_node.impl->is_level_tail()); + assert(impl->is_level_tail()); + auto p_child = rit->second; + rit = right_tracked_children.erase(rit); + p_child->as_child(search_position_t::end(), this); + } + assert(right_tracked_children.empty()); + + validate_tracked_children(); +} + node_future<> InternalNode::test_clone_root( context_t c_other, RootNodeTracker& tracker_other) const { @@ -739,6 +1272,125 @@ node_future<> InternalNode::test_clone_root( }); } +node_future<> InternalNode::try_downgrade_root( + context_t c, Ref&& this_ref) +{ + assert(this_ref.get() == this); + assert(is_root()); + assert(impl->is_level_tail()); + if (!impl->is_keys_empty()) { + // I have more than 1 values, no need to downgrade + return node_ertr::now(); + } + + // proceed downgrade root to the only child + laddr_t child_addr = impl->get_tail_value()->value; + return get_or_track_child(c, search_position_t::end(), child_addr + ).safe_then([c, this, this_ref = std::move(this_ref)] (auto child) mutable { + logger().info("OTree::Internal::DownGradeRoot: downgrade {} to new root {}", + get_name(), child->get_name()); + // Invariant, see InternalNode::erase_child() + // the new internal root should have at least 2 children. 
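+    // So if the new root is internal it must still have at least one key:
+    // one key plus the tail value means at least 2 children, which is what
+    // the asserts below check.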
+ assert(child->impl->is_level_tail()); + if (child->impl->node_type() == node_type_t::INTERNAL) { + ceph_assert(!child->impl->is_keys_empty()); + } + + this->super->do_untrack_root(*this); + assert(tracked_child_nodes.size() == 1); + do_untrack_child(*child); + child->_parent_info.reset(); + child->make_root_from(c, std::move(this->super), impl->laddr()); + return retire(c, std::move(this_ref)); + }); +} + +node_future> InternalNode::insert_or_split( + context_t c, + const search_position_t& pos, + const key_view_t& insert_key, + Ref insert_child, + Ref outdated_child) +{ + // XXX: check the insert_child is unlinked from this node +#ifndef NDEBUG + auto _insert_key = *insert_child->impl->get_pivot_index(); + assert(insert_key.compare_to(_insert_key) == MatchKindCMP::EQ); +#endif + auto insert_value = insert_child->impl->laddr(); + auto insert_pos = pos; + logger().debug("OTree::Internal::InsertSplit: insert {} " + "with insert_key={}, insert_child={}, insert_pos({}), " + "outdated_child={} ...", + get_name(), insert_key, insert_child->get_name(), + insert_pos, (outdated_child ? "True" : "False")); + auto [insert_stage, insert_size] = impl->evaluate_insert( + insert_key, insert_value, insert_pos); + auto free_size = impl->free_size(); + if (free_size >= insert_size) { + // proceed to insert + [[maybe_unused]] auto p_value = impl->insert( + insert_key, insert_value, insert_pos, insert_stage, insert_size); + assert(impl->free_size() == free_size - insert_size); + assert(insert_pos <= pos); + assert(p_value->value == insert_value); + track_insert(insert_pos, insert_stage, insert_child); + + if (outdated_child) { + // untrack the inaccurate child after updated its position + // before validate, and before fix_index() + validate_child_inconsistent(*outdated_child); + // we will need its parent_info valid for the following fix_index() + do_untrack_child(*outdated_child); + } + + validate_tracked_children(); + return node_ertr::make_ready_future>(nullptr); + } + + // proceed to split with insert + // assume I'm already ref-counted by caller + return (is_root() ? upgrade_root(c) : node_ertr::now() + ).safe_then([this, c] { + return InternalNode::allocate( + c, impl->field_type(), impl->is_level_tail(), impl->level()); + }).safe_then([this, insert_key, insert_child, insert_pos, + insert_stage=insert_stage, insert_size=insert_size, + outdated_child](auto fresh_right) mutable { + // I'm the left_node and need to split into the right_node + auto right_node = fresh_right.node; + logger().info("OTree::Internal::InsertSplit: proceed split {} to fresh {} " + "with insert_child={}, outdated_child={} ...", + get_name(), right_node->get_name(), + insert_child->get_name(), + (outdated_child ? 
outdated_child->get_name() : "N/A")); + auto insert_value = insert_child->impl->laddr(); + auto [split_pos, is_insert_left, p_value] = impl->split_insert( + fresh_right.mut, *right_node->impl, insert_key, insert_value, + insert_pos, insert_stage, insert_size); + assert(p_value->value == insert_value); + track_split(split_pos, right_node); + if (is_insert_left) { + track_insert(insert_pos, insert_stage, insert_child); + } else { + right_node->track_insert(insert_pos, insert_stage, insert_child); + } + + if (outdated_child) { + // untrack the inaccurate child after updated its position + // before validate, and before fix_index() + auto& _parent = outdated_child->parent_info().ptr; + _parent->validate_child_inconsistent(*outdated_child); + // we will need its parent_info valid for the following fix_index() + _parent->do_untrack_child(*outdated_child); + } + + validate_tracked_children(); + right_node->validate_tracked_children(); + return right_node; + }); +} + node_future> InternalNode::get_or_track_child( context_t c, const search_position_t& position, laddr_t child_addr) { @@ -801,16 +1453,28 @@ void InternalNode::track_insert( } void InternalNode::replace_track( - Ref new_child, Ref old_child) + Ref new_child, Ref old_child, bool is_new_child_outdated) { assert(old_child->parent_info().ptr == this); auto& pos = old_child->parent_info().position; do_untrack_child(*old_child); - new_child->as_child(pos, this); + if (is_new_child_outdated) { + // we need to keep track of the outdated child through + // insert and split. + new_child->as_child(pos, this); + } else { + new_child->as_child(pos, this); + } // ok, safe to release ref old_child->_parent_info.reset(); - validate_child_tracked(*new_child); +#ifndef NDEBUG + if (is_new_child_outdated) { + validate_child_inconsistent(*new_child); + } else { + validate_child_tracked(*new_child); + } +#endif } void InternalNode::track_split( @@ -826,6 +1490,48 @@ void InternalNode::track_split( } } +template +void InternalNode::track_erase( + const search_position_t& erase_pos, match_stage_t erase_stage) +{ + auto first = tracked_child_nodes.lower_bound(erase_pos); + assert(first == tracked_child_nodes.end() || + first->first != erase_pos); + auto pos_upper_bound = erase_pos; + pos_upper_bound.index_by_stage(erase_stage) = INDEX_UPPER_BOUND; + auto last = tracked_child_nodes.lower_bound(pos_upper_bound); + std::vector p_nodes; + std::for_each(first, last, [&p_nodes](auto& kv) { + p_nodes.push_back(kv.second); + }); + tracked_child_nodes.erase(first, last); + for (auto& p_node: p_nodes) { + auto new_pos = p_node->parent_info().position; + assert(new_pos.index_by_stage(erase_stage) > 0); + --new_pos.index_by_stage(erase_stage); + p_node->as_child(new_pos, this); + } +} + +void InternalNode::track_make_tail(const search_position_t& last_pos) +{ + // assume I'm ref counted by the caller. 
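+  // the tracked child at last_pos (if any) is moved to the end position,
+  // since it becomes the tail child after make_tail()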
+ assert(impl->is_level_tail()); + assert(!last_pos.is_end()); + assert(tracked_child_nodes.find(search_position_t::end()) == + tracked_child_nodes.end()); + auto last_it = tracked_child_nodes.find(last_pos); + if (last_it != tracked_child_nodes.end()) { + assert(std::next(last_it) == tracked_child_nodes.end()); + auto p_last_child = last_it->second; + tracked_child_nodes.erase(last_it); + p_last_child->as_child(search_position_t::end(), this); + } else { + assert(tracked_child_nodes.lower_bound(last_pos) == + tracked_child_nodes.end()); + } +} + void InternalNode::validate_child(const Node& child) const { #ifndef NDEBUG @@ -849,6 +1555,25 @@ void InternalNode::validate_child(const Node& child) const #endif } +void InternalNode::validate_child_inconsistent(const Node& child) const +{ +#ifndef NDEBUG + assert(impl->level() - 1 == child.impl->level()); + assert(this == child.parent_info().ptr); + auto& child_pos = child.parent_info().position; + // the tail value has no key to fix + assert(!child_pos.is_end()); + assert(!child.impl->is_level_tail()); + + key_view_t current_key; + const laddr_packed_t* p_value; + impl->get_slot(child_pos, ¤t_key, &p_value); + key_view_t new_key = *child.impl->get_pivot_index(); + assert(current_key.compare_to(new_key) != MatchKindCMP::EQ); + assert(p_value->value == child.impl->laddr()); +#endif +} + node_future InternalNode::allocate( context_t c, field_type_t field_type, bool is_level_tail, level_t level) { @@ -915,9 +1640,74 @@ LeafNode::get_next_cursor(context_t c, const search_position_t& pos) node_future> LeafNode::erase(context_t c, const search_position_t& pos, bool get_next) { - ceph_abort("not implemented"); - return node_ertr::make_ready_future>( - tree_cursor_t::get_invalid()); + assert(!pos.is_end()); + assert(!impl->is_keys_empty()); + Ref this_ref = this; + logger().debug("OTree::Leaf::Erase: erase {}'s pos({}), get_next={} ...", + get_name(), pos, get_next); + + // get the next cursor + return node_ertr::now().safe_then([c, &pos, get_next, this] { + if (get_next) { + return get_next_cursor(c, pos); + } else { + return node_ertr::make_ready_future>(); + } + }).safe_then([c, &pos, this_ref = std::move(this_ref), + this] (Ref next_cursor) { + if (next_cursor && next_cursor->is_end()) { + // reset the node reference from the end cursor + next_cursor.reset(); + } + return node_ertr::now( + ).safe_then([c, &pos, this_ref = std::move(this_ref), this] () mutable { +#ifndef NDEBUG + assert(!impl->is_keys_empty()); + if (impl->is_keys_one()) { + assert(pos == search_position_t::begin()); + } +#endif + if (!is_root() && impl->is_keys_one()) { + // we need to keep the root as an empty leaf node + // fast path without mutating the extent + // track_erase + logger().debug("OTree::Leaf::Erase: {} has one value left, erase ...", + get_name()); + assert(tracked_cursors.size() == 1); + auto iter = tracked_cursors.begin(); + assert(iter->first == pos); + iter->second->invalidate(); + tracked_cursors.clear(); + + // no cursor should be referencing this node now, this_ref is the last one. 
+ assert(this_ref->use_count() == 1); + Ref node_ref = this_ref; + this_ref.reset(); + return Node::erase_node(c, std::move(node_ref)); + } + + on_layout_change(); + impl->prepare_mutate(c); + auto [erase_stage, next_pos] = impl->erase(pos); + track_erase(pos, erase_stage); + validate_tracked_cursors(); + + if (is_root()) { + return node_ertr::now(); + } else { + bool update_parent_index; + if (impl->is_level_tail()) { + update_parent_index = false; + } else { + next_pos.is_end() ? update_parent_index = true + : update_parent_index = false; + } + return try_merge_adjacent(c, update_parent_index); + } + }).safe_then([next_cursor] { + return next_cursor; + }); + }); } node_future<> LeafNode::extend_value( @@ -1003,6 +1793,49 @@ node_future<> LeafNode::do_get_tree_stats(context_t, tree_stats_t& stats) return node_ertr::now(); } +void LeafNode::track_merge( + Ref _right_node, match_stage_t stage, search_position_t& left_last_pos) +{ + assert(level() == _right_node->level()); + // assert(impl->node_type() == _right_node->impl->node_type()); + auto& right_node = *static_cast(_right_node.get()); + if (right_node.tracked_cursors.empty()) { + return; + } + + match_stage_t curr_stage = STAGE_BOTTOM; + + // prepare the initial left_last_pos for offset + while (curr_stage < stage) { + left_last_pos.index_by_stage(curr_stage) = 0; + ++curr_stage; + } + ++left_last_pos.index_by_stage(curr_stage); + + // fix the tracked child nodes of right_node, stage by stage. + auto& right_tracked_cursors = right_node.tracked_cursors; + auto rit = right_tracked_cursors.begin(); + while (curr_stage <= STAGE_TOP) { + auto right_pos_until = search_position_t::begin(); + right_pos_until.index_by_stage(curr_stage) = INDEX_UPPER_BOUND; + auto rend = right_tracked_cursors.lower_bound(right_pos_until); + while (rit != rend) { + auto new_pos = rit->second->get_position(); + assert(new_pos == rit->first); + assert(rit->second->get_leaf_node().get() == &right_node); + new_pos += left_last_pos; + auto p_cursor = rit->second; + rit = right_tracked_cursors.erase(rit); + p_cursor->update_track(this, new_pos); + } + left_last_pos.index_by_stage(curr_stage) = 0; + ++curr_stage; + } + assert(right_tracked_cursors.empty()); + + validate_tracked_cursors(); +} + node_future<> LeafNode::test_clone_root( context_t c_other, RootNodeTracker& tracker_other) const { @@ -1078,7 +1911,7 @@ node_future> LeafNode::insert_value( validate_tracked_cursors(); right_node->validate_tracked_cursors(); - return apply_split_to_parent(c, right_node + return apply_split_to_parent(c, right_node, false ).safe_then([ret] { return ret; }); @@ -1172,6 +2005,33 @@ void LeafNode::track_split( } } +void LeafNode::track_erase( + const search_position_t& erase_pos, match_stage_t erase_stage) +{ + // erase tracking and invalidate the erased cursor + auto to_erase = tracked_cursors.find(erase_pos); + assert(to_erase != tracked_cursors.end()); + to_erase->second->invalidate(); + auto first = tracked_cursors.erase(to_erase); + + // update cursor position + assert(first == tracked_cursors.lower_bound(erase_pos)); + auto pos_upper_bound = erase_pos; + pos_upper_bound.index_by_stage(erase_stage) = INDEX_UPPER_BOUND; + auto last = tracked_cursors.lower_bound(pos_upper_bound); + std::vector p_cursors; + std::for_each(first, last, [&p_cursors](auto& kv) { + p_cursors.push_back(kv.second); + }); + tracked_cursors.erase(first, last); + for (auto& p_cursor : p_cursors) { + search_position_t new_pos = p_cursor->get_position(); + assert(new_pos.index_by_stage(erase_stage) > 0); + 
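+    // one slot at erase_stage was removed before this cursor, so its index
+    // at that stage shifts back by one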
--new_pos.index_by_stage(erase_stage); + p_cursor->update_track(this, new_pos); + } +} + node_future LeafNode::allocate( context_t c, field_type_t field_type, bool is_level_tail) { diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h index 2159ff51395..e5161d8e8f4 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h @@ -394,6 +394,10 @@ class Node context_t, const key_hobj_t&, MatchHistory&) = 0; virtual node_future<> do_get_tree_stats(context_t, tree_stats_t&) = 0; + virtual bool is_tracking() const = 0; + + virtual void track_merge(Ref, match_stage_t, search_position_t&) = 0; + protected: Node(NodeImplURef&&); bool is_root() const { @@ -425,8 +429,14 @@ class Node }; const parent_info_t& parent_info() const { return *_parent_info; } - node_future<> apply_split_to_parent(context_t, Ref); + node_future<> apply_split_to_parent(context_t, Ref, bool); node_future> get_next_cursor_from_parent(context_t); + node_future<> try_merge_adjacent(context_t, bool); + node_future<> erase_node(context_t, Ref&&); + node_future<> fix_parent_index(context_t); + node_future rebuild_extent(context_t); + node_future<> retire(context_t, Ref&&); + void make_tail(context_t); private: /** @@ -472,7 +482,7 @@ class InternalNode final : public Node { node_future> get_next_cursor(context_t, const search_position_t&); - node_future<> apply_child_split(context_t, Ref left, Ref right); + node_future<> apply_child_split(context_t, Ref left, Ref right, bool); template void do_track_child(Node& child) { @@ -501,6 +511,16 @@ class InternalNode final : public Node { } } + node_future, Ref>> get_child_peers( + context_t, const search_position_t&); + + node_future<> erase_child(context_t, Ref&&); + + node_future<> fix_index(context_t, Ref); + + node_future<> apply_children_merge( + context_t, Ref&& left, Ref&& right, bool update_index); + void validate_child_tracked(const Node& child) const { validate_child(child); assert(tracked_child_nodes.find(child.parent_info().position) != @@ -508,6 +528,19 @@ class InternalNode final : public Node { assert(tracked_child_nodes.find(child.parent_info().position)->second == &child); } + void validate_child_inconsistent(const Node& child) const; + + void validate_tracked_children() const { +#ifndef NDEBUG + for (auto& kv : tracked_child_nodes) { + assert(kv.first == kv.second->parent_info().position); + validate_child(*kv.second); + } +#endif + } + + void track_make_tail(const search_position_t&); + static node_future> allocate_root( context_t, level_t, laddr_t, Super::URef&&); @@ -517,25 +550,29 @@ class InternalNode final : public Node { node_future lower_bound_tracked( context_t, const key_hobj_t&, MatchHistory&) override; node_future<> do_get_tree_stats(context_t, tree_stats_t&) override; + bool is_tracking() const override { + return !tracked_child_nodes.empty(); + } + void track_merge(Ref, match_stage_t, search_position_t&) override; node_future<> test_clone_root(context_t, RootNodeTracker&) const override; private: + node_future<> try_downgrade_root(context_t, Ref&&); + + node_future> insert_or_split( + context_t, const search_position_t&, const key_view_t&, Ref, + Ref outdated_child=nullptr); + // XXX: extract a common tracker for InternalNode to track Node, // and LeafNode to track tree_cursor_t. 
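+  // A possible shape for that tracker (hypothetical sketch only; these
+  // names are not part of this change):
+  //
+  //   template <typename TrackedT>
+  //   struct position_tracker_t {
+  //     void do_track(const search_position_t& pos, TrackedT& t) {
+  //       tracked.emplace(pos, &t);
+  //     }
+  //     void do_untrack(const search_position_t& pos) {
+  //       [[maybe_unused]] auto erased = tracked.erase(pos);
+  //       assert(erased == 1);
+  //     }
+  //     std::map<search_position_t, TrackedT*> tracked;
+  //   };
+  //
+  // InternalNode would then hold a position_tracker_t<Node> and LeafNode a
+  // position_tracker_t<tree_cursor_t>.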
node_future> get_or_track_child(context_t, const search_position_t&, laddr_t); void track_insert( const search_position_t&, match_stage_t, Ref, Ref nxt_child = nullptr); - void replace_track(Ref new_child, Ref old_child); + void replace_track(Ref new_child, Ref old_child, bool); void track_split(const search_position_t&, Ref); - void validate_tracked_children() const { -#ifndef NDEBUG - for (auto& kv : tracked_child_nodes) { - assert(kv.first == kv.second->parent_info().position); - validate_child(*kv.second); - } -#endif - } + template + void track_erase(const search_position_t&, match_stage_t); void validate_child(const Node& child) const; struct fresh_node_t { @@ -630,6 +667,10 @@ class LeafNode final : public Node { node_future lower_bound_tracked( context_t, const key_hobj_t&, MatchHistory&) override; node_future<> do_get_tree_stats(context_t, tree_stats_t&) override; + bool is_tracking() const override { + return !tracked_cursors.empty(); + } + void track_merge(Ref, match_stage_t, search_position_t&) override; node_future<> test_clone_root(context_t, RootNodeTracker&) const override; @@ -650,6 +691,7 @@ class LeafNode final : public Node { Ref track_insert( const search_position_t&, match_stage_t, const value_header_t*); void track_split(const search_position_t&, Ref); + void track_erase(const search_position_t&, match_stage_t); void validate_tracked_cursors() const { #ifndef NDEBUG for (auto& kv : tracked_cursors) { diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h index 01a931e9143..db93d9570a9 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h @@ -329,6 +329,15 @@ class NodeExtentAccessorT { return state; } + bool is_valid() const { + if (extent) { + assert(extent->is_valid()); + return true; + } else { + return false; + } + } + // must be called before any mutate attempes. // for the safety of mixed read and mutate, call before read. 
void prepare_mutate(context_t c) { @@ -487,7 +496,49 @@ class NodeExtentAccessorT { std::memcpy(to.get_write(), extent->get_read(), extent->get_length()); } + using ertr = NodeExtentManager::tm_ertr; + ertr::future rebuild(context_t c) { + assert(extent->is_valid()); + if (state == nextent_state_t::FRESH) { + assert(extent->is_initial_pending()); + // already fresh and no need to record + return ertr::make_ready_future(*mut); + } + assert(!extent->is_initial_pending()); + return c.nm.alloc_extent(c.t, node_stage_t::EXTENT_SIZE + ).safe_then([this, c] (auto fresh_extent) { + logger().debug("OTree::Extent::Rebuild: update addr from {:#x} to {:#x} ...", + extent->get_laddr(), fresh_extent->get_laddr()); + assert(fresh_extent->is_initial_pending()); + assert(fresh_extent->get_recorder() == nullptr); + assert(extent->get_length() == fresh_extent->get_length()); + auto fresh_mut = fresh_extent->get_mutable(); + std::memcpy(fresh_mut.get_write(), extent->get_read(), extent->get_length()); + NodeExtentRef to_discard = extent; + + extent = fresh_extent; + node_stage = node_stage_t( + reinterpret_cast(extent->get_read())); + state = nextent_state_t::FRESH; + mut.emplace(fresh_mut); + recorder = nullptr; + + return c.nm.retire_extent(c.t, to_discard); + }).safe_then([this] { + return *mut; + }); + } + + ertr::future<> retire(context_t c) { + assert(extent->is_valid()); + return c.nm.retire_extent(c.t, std::move(extent)); + } + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + NodeExtentRef extent; node_stage_t node_stage; nextent_state_t state; diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc index fd3f93b53f1..39a94edbb43 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc @@ -11,7 +11,7 @@ last_split_info_t last_split = {}; #endif // XXX: branchless allocation -InternalNodeImpl::alloc_ertr::future +InternalNodeImpl::ertr::future InternalNodeImpl::allocate( context_t c, field_type_t type, bool is_level_tail, level_t level) { @@ -28,7 +28,7 @@ InternalNodeImpl::allocate( } } -LeafNodeImpl::alloc_ertr::future +LeafNodeImpl::ertr::future LeafNodeImpl::allocate( context_t c, field_type_t type, bool is_level_tail) { diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h index 6a74911d2d0..55c8de765aa 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h @@ -58,7 +58,7 @@ class NodeExtentMutable; */ class NodeImpl { public: - using alloc_ertr = crimson::errorator< + using ertr = crimson::errorator< crimson::ct_error::input_output_error, crimson::ct_error::invarg, crimson::ct_error::enoent, @@ -83,10 +83,22 @@ class NodeImpl { */ virtual void validate_non_empty() const = 0; virtual bool is_keys_empty() const = 0; + // under the assumption that keys are not empty, check whether num_keys == 1 + virtual bool is_keys_one() const = 0; virtual level_t level() const = 0; virtual node_offset_t free_size() const = 0; + virtual node_offset_t total_size() const = 0; + virtual bool is_extent_valid() const = 0; virtual std::optional get_pivot_index() const = 0; + virtual bool is_size_underflow() const = 0; + + virtual std::tuple erase(const search_position_t&) = 0; + virtual std::tuple 
evaluate_merge(NodeImpl&) = 0; + virtual search_position_t merge(NodeExtentMutable&, NodeImpl&, match_stage_t, node_offset_t) = 0; + virtual ertr::future rebuild_extent(context_t) = 0; + virtual ertr::future<> retire_extent(context_t) = 0; + virtual search_position_t make_tail() = 0; virtual node_stats_t get_stats() const = 0; virtual std::ostream& dump(std::ostream&) const = 0; @@ -118,6 +130,13 @@ class InternalNodeImpl : public NodeImpl { ceph_abort("impossible path"); } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual void get_prev_slot(search_position_t&, // IN&OUT + key_view_t* = nullptr, // OUT + const laddr_packed_t** = nullptr) const { // OUT + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" virtual void get_next_slot(search_position_t&, // IN&OUT key_view_t* = nullptr, // OUT @@ -166,7 +185,7 @@ class InternalNodeImpl : public NodeImpl { return {std::move(impl), mut}; } }; - static alloc_ertr::future allocate(context_t, field_type_t, bool, level_t); + static ertr::future allocate(context_t, field_type_t, bool, level_t); static InternalNodeImplURef load(NodeExtentRef, field_type_t, bool); @@ -191,6 +210,13 @@ class LeafNodeImpl : public NodeImpl { ceph_abort("impossible path"); } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual void get_prev_slot(search_position_t&, // IN&OUT + key_view_t* = nullptr, // OUT + const value_header_t** = nullptr) const { // OUT + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" virtual void get_next_slot(search_position_t&, // IN&OUT key_view_t* = nullptr, // OUT @@ -239,7 +265,7 @@ class LeafNodeImpl : public NodeImpl { return {std::move(impl), mut}; } }; - static alloc_ertr::future allocate(context_t, field_type_t, bool); + static ertr::future allocate(context_t, field_type_t, bool); static LeafNodeImplURef load(NodeExtentRef, field_type_t, bool); diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h index 047dd4cac91..949faa5a6fd 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h @@ -65,8 +65,7 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { return ret; } - using alloc_ertr = NodeExtentManager::tm_ertr; - static alloc_ertr::future allocate( + static ertr::future allocate( context_t c, bool is_level_tail, level_t level) { // NOTE: Currently, all the node types have the same size for simplicity. 
 // But depending on the requirement, we may need to make node size
@@ -103,9 +102,15 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl {
     assert(!is_keys_empty());
   }
   bool is_keys_empty() const override { return extent.read().keys() == 0; }
+  bool is_keys_one() const override {
+    assert(!is_keys_empty());
+    return STAGE_T::is_keys_one(extent.read());
+  }
   level_t level() const override { return extent.read().level(); }
   node_offset_t free_size() const override { return extent.read().free_size(); }
+  node_offset_t total_size() const override { return extent.read().total_size(); }
+  bool is_extent_valid() const override { return extent.is_valid(); }

   std::optional<key_view_t> get_pivot_index() const override {
     if (is_level_tail()) {
@@ -118,6 +123,112 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl {
     return {pivot_index};
   }

+  bool is_size_underflow() const override {
+    /**
+     * There might be 2 node-merge strategies:
+     *
+     * The first is to rebalance and merge nodes, preferring tree fullness
+     * as much as possible in order to save space and improve key density
+     * for lookup, in exchange for the effort of frequent merge, split and
+     * rebalance. These operations cannot benefit from seastore deltas,
+     * because they allocate fresh extents which need to be written into
+     * the journal as a whole, making write amplification much larger.
+     *
+     * The second is to delay rebalance and merge. When submitting the
+     * transaction, a simple insert or erase only needs to append a delta
+     * with just enough information about the inserted/erased item. The
+     * downside is that tree fullness is not as good as with the first
+     * strategy.
+     *
+     * Currently we take the second way and delay merge until the node is
+     * only 1/4 full, so that:
+     * - After a split operation (making the node at least 1/2 full):
+     *   - The next merge needs to erase items taking at least 1/4 space;
+     *   - The next split needs to insert items taking at most 1/2 space;
+     * - After a merge operation (making the node at least 1/2 full):
+     *   - The next merge needs to erase items taking at least 1/4 space;
+     *   - The next split needs to insert items taking at most 1/2 space;
+     * - TODO: before node rebalance is implemented, the node size can be
+     *   below the underflow limit if it cannot be merged with peers;
+     */
+    auto& node_stage = extent.read();
+    size_t empty_size = node_stage.size_before(0);
+    size_t filled_kv_size = filled_size() - empty_size;
+    size_t full_kv_size = node_stage.total_size() - empty_size;
+    return filled_kv_size <= full_kv_size / 4;
+  }
+
+  std::tuple<match_stage_t, search_position_t>
+  erase(const search_position_t& pos) override {
+    logger().debug("OTree::Layout::Erase: begin at erase_pos({}) ...", pos);
+    if (unlikely(logger().is_enabled(seastar::log_level::trace))) {
+      std::ostringstream sos;
+      dump(sos);
+      logger().trace("OTree::Layout::Erase: -- dump\n{}", sos.str());
+    }
+    auto [stage, next_or_last_pos] = extent.erase_replayable(cast_down<STAGE>(pos));
+    logger().debug("OTree::Layout::Erase: done at erase_stage={}, n/l_pos({})",
+                   stage, next_or_last_pos);
+    if (unlikely(logger().is_enabled(seastar::log_level::trace))) {
+      std::ostringstream sos;
+      dump(sos);
+      logger().trace("OTree::Layout::Erase: -- dump\n{}", sos.str());
+    }
+#ifndef NDEBUG
+    if (!is_keys_empty()) {
+      validate_layout();
+    }
+#endif
+    return {stage, normalize(std::move(next_or_last_pos))};
+  }
+
+  std::tuple<match_stage_t, node_offset_t> evaluate_merge(
+      NodeImpl& _right_node) override {
+    assert(NODE_TYPE == _right_node.node_type());
+    assert(FIELD_TYPE == _right_node.field_type());
+    auto&
left_node_stage = extent.read(); + auto& right_node = dynamic_cast(_right_node); + auto& right_node_stage = right_node.extent.read(); + assert(!is_level_tail()); + assert(!is_keys_empty()); + assert(!right_node.is_keys_empty()); + key_view_t left_pivot_index; + STAGE_T::template get_largest_slot( + left_node_stage, nullptr, &left_pivot_index, nullptr); + auto [merge_stage, size_comp] = STAGE_T::evaluate_merge( + left_pivot_index, right_node_stage); + auto size_left = filled_size(); + auto size_right = right_node.filled_size(); + assert(size_right > size_comp); + std::size_t merge_size = size_left + size_right - size_comp; + return {merge_stage, merge_size}; + } + + search_position_t merge(NodeExtentMutable& mut, NodeImpl& _right_node, + match_stage_t merge_stage, node_offset_t merge_size) override { + // TODO + ceph_abort("not implemented"); + } + + ertr::future + rebuild_extent(context_t c) override { + return extent.rebuild(c).safe_then([this] (auto mut) { + // addr may change + build_name(); + return mut; + }); + } + + ertr::future<> retire_extent(context_t c) override { + return extent.retire(c); + } + + search_position_t make_tail() override { + auto&& ret = extent.make_tail_replayable(); + // is_level_tail is changed + build_name(); + return normalize(std::move(ret)); + } + node_stats_t get_stats() const override { node_stats_t stats; auto& node_stage = extent.read(); @@ -217,6 +328,12 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { } } + void get_prev_slot(search_position_t& pos, + key_view_t* p_index_key = nullptr, + const value_t** pp_value = nullptr) const override { + ceph_abort("not implemented"); + } + void get_next_slot(search_position_t& pos, key_view_t* p_index_key = nullptr, const value_t** pp_value = nullptr) const override { @@ -247,6 +364,12 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { } else if (!p_pos && p_index_key && !pp_value) { STAGE_T::template get_largest_slot( extent.read(), nullptr, p_index_key, nullptr); + } else if (p_pos && !p_index_key && pp_value) { + STAGE_T::template get_largest_slot( + extent.read(), &cast_down_fill_0(*p_pos), nullptr, pp_value); + } else if (p_pos && !p_index_key && !pp_value) { + STAGE_T::template get_largest_slot( + extent.read(), &cast_down_fill_0(*p_pos), nullptr, nullptr); } else { ceph_abort("not implemented"); } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h index b9a4dcb9961..e3897a9ce3b 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h @@ -856,6 +856,12 @@ struct staged { * Lookup internals (hide?) 
*/ + static bool is_keys_one( + const container_t& container) { // IN + // TODO + ceph_abort("not implemented"); + } + template static result_t smallest_result( const iterator_t& iter, full_key_t* p_index_key) { @@ -2111,6 +2117,13 @@ struct staged { iter.trim_until(mut); } } + + static std::tuple evaluate_merge( + const full_key_t& left_pivot_index, + const container_t& right_container) { + // TODO + ceph_abort("not implemented"); + } }; /** diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h index 2a7de82f524..b2ca8d35577 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h @@ -194,6 +194,14 @@ struct staged_position_t { return *this; } + me_t& operator+=(const me_t& o) { + assert(is_valid_index(index)); + assert(is_valid_index(o.index)); + index += o.index; + nxt += o.nxt; + return *this; + } + void encode(ceph::bufferlist& encoded) const { ceph::encode(index, encoded); nxt.encode(encoded); @@ -269,6 +277,13 @@ struct staged_position_t { return *this; } + me_t& operator+=(const me_t& o) { + assert(is_valid_index(index)); + assert(is_valid_index(o.index)); + index += o.index; + return *this; + } + void assert_next_to(const me_t& prv) const { #ifndef NDEBUG if (is_end()) { diff --git a/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc b/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc index 7f3af8db188..44a701ae576 100644 --- a/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc +++ b/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc @@ -706,7 +706,13 @@ class DummyChildPool { laddr_t laddr() const override { return _laddr; } bool is_level_tail() const override { return _is_level_tail; } std::optional get_pivot_index() const override { return {key_view}; } + bool is_extent_valid() const override { return true; } const std::string& get_name() const override { return name; } + search_position_t make_tail() override { + _is_level_tail = true; + build_name(); + return search_position_t::end(); + } protected: node_type_t node_type() const override { return node_type_t::LEAF; } @@ -722,8 +728,24 @@ class DummyChildPool { ceph_abort("impossible path"); } bool is_keys_empty() const override { ceph_abort("impossible path"); } + bool is_keys_one() const override { + ceph_abort("impossible path"); } node_offset_t free_size() const override { ceph_abort("impossible path"); } + node_offset_t total_size() const override { + ceph_abort("impossible path"); } + bool is_size_underflow() const override { + ceph_abort("impossible path"); } + std::tuple erase(const search_position_t&) override { + ceph_abort("impossible path"); } + std::tuple evaluate_merge(NodeImpl&) override { + ceph_abort("impossible path"); } + search_position_t merge(NodeExtentMutable&, NodeImpl&, match_stage_t, node_offset_t) override { + ceph_abort("impossible path"); } + ertr::future rebuild_extent(context_t) override { + ceph_abort("impossible path"); } + ertr::future<> retire_extent(context_t) override { + ceph_abort("impossible path"); } node_stats_t get_stats() const override { ceph_abort("impossible path"); } std::ostream& dump(std::ostream&) const override { @@ -787,7 +809,7 @@ class DummyChildPool { if (right_child->can_split()) { splitable_nodes.insert(right_child); } - return apply_split_to_parent(c, right_child); + return apply_split_to_parent(c, right_child, false); } 
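+  // note: update_right_index is passed as false above because the right
+  // child is inserted into its parent with the correct pivot during the
+  // split, so there is no stale parent index to fix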
node_future<> insert_and_split( @@ -863,6 +885,10 @@ class DummyChildPool { ceph_abort("impossible path"); } node_future<> do_get_tree_stats(context_t, tree_stats_t&) override { ceph_abort("impossible path"); } + bool is_tracking() const override { + ceph_abort("impossible path"); } + void track_merge(Ref, match_stage_t, search_position_t&) override { + ceph_abort("impossible path"); } private: DummyChild(DummyChildImpl* impl, DummyChildImpl::URef&& ref, DummyChildPool& pool) -- 2.39.5
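
A quick standalone sketch of the delayed-merge policy documented in
NodeLayoutT::is_size_underflow() above; it is illustrative only (the names
and the 4096B extent size are assumptions, not part of this patch), and it
ignores the key compensation that the real evaluate_merge() accounts for:

#include <cassert>
#include <cstddef>

// hypothetical stand-in for a node: only the sizes matter here
struct fake_node_t {
  std::size_t total_size;   // space available for keys and values
  std::size_t filled_size;  // space currently occupied
};

// mirrors NodeLayoutT::is_size_underflow(): merge is delayed until the
// node is at most 1/4 full
bool is_size_underflow(const fake_node_t& n) {
  return n.filled_size <= n.total_size / 4;
}

// mirrors the guard in Node::try_merge_adjacent(): the merge only
// proceeds if the combined content fits into a single extent
bool can_merge(const fake_node_t& l, const fake_node_t& r) {
  return l.filled_size + r.filled_size <= l.total_size;
}

int main() {
  fake_node_t just_split{4096, 2048};   // at least 1/2 full after a split
  assert(!is_size_underflow(just_split));

  fake_node_t drained{4096, 1024};      // eroded down to the 1/4 threshold
  assert(is_size_underflow(drained));

  fake_node_t peer{4096, 2560};
  assert(can_merge(drained, peer));     // 3584 <= 4096: merge proceeds
  assert(!can_merge(peer, just_split)); // 4608 > 4096: merge is skipped

  return 0;
}

The takeaway is that a freshly split or merged node (at least 1/2 full)
must lose at least 1/4 of its capacity through erases before the next
merge is even evaluated, which keeps merge and split from thrashing under
mixed insert/erase workloads.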