From: Yingxin Cheng Date: Wed, 8 Apr 2020 02:35:40 +0000 (+0800) Subject: crimson/onode-staged-tree: add initial onode staged-fltree X-Git-Tag: v17.0.0~391^2~46 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=24202782fff35f1445168b75b7d642fd73775b37;p=ceph-ci.git crimson/onode-staged-tree: add initial onode staged-fltree Support basic insert and lookup operations, tested by an isolated dummy backend, and also a solution to integrate with seastore TransactionManager and LogicalCachedExtent. Signed-off-by: Yingxin Cheng --- diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt index 4c265bcf231..f13d7b5cb4c 100644 --- a/src/crimson/os/seastore/CMakeLists.txt +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -15,6 +15,16 @@ add_library(crimson-seastore STATIC onode_manager/simple-fltree/onode_block.cc onode_manager/simple-fltree/onode_delta.cc onode_manager/simple-fltree/onode_node.cc + onode_manager/staged-fltree/node.cc + onode_manager/staged-fltree/node_extent_manager.cc + onode_manager/staged-fltree/node_extent_mutable.cc + onode_manager/staged-fltree/stages/item_iterator_stage.cc + onode_manager/staged-fltree/stages/key_layout.cc + onode_manager/staged-fltree/stages/node_layout.cc + onode_manager/staged-fltree/stages/node_stage.cc + onode_manager/staged-fltree/stages/sub_items_stage.cc + onode_manager/staged-fltree/super.cc + onode_manager/staged-fltree/tree.cc extentmap_manager.cc extentmap_manager/btree/extentmap_btree_node_impl.cc extentmap_manager/btree/btree_extentmap_manager.cc diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 1b2a22391ce..6a406c1b85a 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -8,6 +8,7 @@ #include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h" #include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h" #include 
"crimson/os/seastore/onode_manager/simple-fltree/onode_block.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h" #include "test/crimson/seastore/test_block.h" namespace { @@ -467,6 +468,8 @@ Cache::get_root_ret Cache::get_root(Transaction &t) } } +using StagedOnodeBlock = crimson::os::seastore::onode::SeastoreNodeExtent; + Cache::get_extent_ertr::future Cache::get_extent_by_type( extent_types_t type, paddr_t offset, @@ -503,6 +506,11 @@ Cache::get_extent_ertr::future Cache::get_extent_by_type( ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); + case extent_types_t::ONODE_BLOCK_STAGED: + return get_extent(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); case extent_types_t::TEST_BLOCK: return get_extent(offset, length ).safe_then([](auto extent) { diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 8c9312a6a92..b1875ba0d3b 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -34,6 +34,9 @@ using TCachedExtentRef = boost::intrusive_ptr; /** * CachedExtent */ +namespace onode { + class DummyNodeExtent; +} class ExtentIndex; class CachedExtent : public boost::intrusive_ref_counter< CachedExtent, boost::thread_unsafe_counter> { @@ -47,6 +50,8 @@ class CachedExtent : public boost::intrusive_ref_counter< INVALID // Part of no ExtentIndex set } state = extent_state_t::INVALID; friend std::ostream &operator<<(std::ostream &, extent_state_t); + // allow a dummy onode extent to pretend it is a fresh block + friend class onode::DummyNodeExtent; uint32_t last_committed_crc = 0; diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h new file mode 100644 index 00000000000..f66fe57c1ff --- /dev/null +++ 
b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include +#include + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction.h" + +namespace crimson::os::seastore::onode { + +using crimson::os::seastore::Transaction; +using crimson::os::seastore::TransactionRef; +using crimson::os::seastore::make_transaction; +using crimson::os::seastore::laddr_t; +using crimson::os::seastore::L_ADDR_MIN; +using crimson::os::seastore::L_ADDR_NULL; +using crimson::os::seastore::extent_len_t; + +class NodeExtentManager; +class RootNodeTracker; +struct context_t { + NodeExtentManager& nm; + Transaction& t; +}; +using NodeExtentManagerURef = std::unique_ptr; +using RootNodeTrackerURef = std::unique_ptr; + +constexpr auto INDEX_END = std::numeric_limits::max(); + +// TODO: decide by NODE_BLOCK_SIZE +using node_offset_t = uint16_t; +constexpr node_offset_t DISK_BLOCK_SIZE = 1u << 12; +constexpr node_offset_t NODE_BLOCK_SIZE = DISK_BLOCK_SIZE * 1u; + +enum class MatchKindBS : int8_t { NE = -1, EQ = 0 }; + +enum class MatchKindCMP : int8_t { NE = -1, EQ = 0, PO }; +inline MatchKindCMP toMatchKindCMP(int value) { + if (value > 0) { + return MatchKindCMP::PO; + } else if (value < 0) { + return MatchKindCMP::NE; + } else { + return MatchKindCMP::EQ; + } +} +template +MatchKindCMP toMatchKindCMP(const Type& l, const Type& r) { + int match = l - r; + return toMatchKindCMP(match); +} + +template <> +inline MatchKindCMP toMatchKindCMP( + const std::string& l, const std::string& r) { + return toMatchKindCMP(l.compare(r)); +} + +inline MatchKindCMP toMatchKindCMP( + const char* l, size_t l_len, const char* r, size_t r_len) { + assert(l && l_len); + assert(r && r_len); + auto min_len = std::min(l_len, r_len); + auto match = 
toMatchKindCMP(std::strncmp(l, r, min_len)); + if (match == MatchKindCMP::EQ) { + return toMatchKindCMP(l_len, r_len); + } else { + return match; + } +} + +inline MatchKindCMP toMatchKindCMP( + const std::string& l, const char* r, size_t r_len) { + assert(r && r_len); + return toMatchKindCMP(l.compare(0u, l.length(), r, r_len)); +} + +inline MatchKindCMP toMatchKindCMP( + const char* l, size_t l_len, const std::string& r) { + assert(l && l_len); + return toMatchKindCMP(-r.compare(0u, r.length(), l, l_len)); +} + +inline MatchKindCMP reverse(MatchKindCMP cmp) { + if (cmp == MatchKindCMP::NE) { + return MatchKindCMP::PO; + } else if (cmp == MatchKindCMP::PO) { + return MatchKindCMP::NE; + } else { + return cmp; + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc new file mode 100644 index 00000000000..e6fcb382530 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc @@ -0,0 +1,184 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "node.h" + +#include +#include + +#include "node_impl.h" + +namespace crimson::os::seastore::onode { + +using node_ertr = Node::node_ertr; +template +using node_future = Node::node_future; + +tree_cursor_t::tree_cursor_t( + Ref node, const search_position_t& pos, const onode_t* p_value) + : leaf_node{node}, position{pos}, p_value{p_value} { + assert((!pos.is_end() && p_value) || (pos.is_end() && !p_value)); + if (!pos.is_end()) { + assert(p_value == leaf_node->get_p_value(position)); + leaf_node->do_track_cursor(*this); + } +} + +tree_cursor_t::~tree_cursor_t() { + if (!position.is_end()) { + leaf_node->do_untrack_cursor(*this); + } +} + +void tree_cursor_t::update_track( + Ref node, const search_position_t& pos) { + // already untracked + assert(!pos.is_end()); + assert(!is_end()); + leaf_node = node; + position = pos; + // p_value must be already 
invalidated + assert(!p_value); + leaf_node->do_track_cursor(*this); +} + +const onode_t* tree_cursor_t::get_p_value() const { + assert(!is_end()); + if (!p_value) { + // NOTE: the leaf node is always present when we hold its reference. + p_value = leaf_node->get_p_value(position); + } + assert(p_value); + return p_value; +} + +Node::~Node() { + // XXX: tolerate failure between allocate() and as_child() + if (is_root()) { + super->do_untrack_root(*this); + } else { + _parent_info->ptr->do_untrack_child(*this); + } +} + +node_future<> Node::upgrade_root(context_t c) { + assert(is_root()); + assert(is_level_tail()); + assert(field_type() == field_type_t::N0); + super->do_untrack_root(*this); + return InternalNode0::allocate_root(c, level(), laddr(), std::move(super) + ).safe_then([this](auto new_root) { + as_child(search_position_t::end(), new_root); + }); +} + +template +void Node::as_child(const search_position_t& pos, Ref parent_node) { + assert(!super); + _parent_info = parent_info_t{pos, parent_node}; + parent_info().ptr->do_track_child(*this); +} +template void Node::as_child(const search_position_t&, Ref); +template void Node::as_child(const search_position_t&, Ref); + + +node_future +Node::lower_bound(context_t c, const key_hobj_t& key) { + return seastar::do_with( + MatchHistory(), [this, c, &key](auto& history) { + return do_lower_bound(c, key, history); + } + ); +} + +node_future, bool>> +Node::insert(context_t c, const key_hobj_t& key, const onode_t& value) { + return seastar::do_with( + MatchHistory(), [this, c, &key, &value](auto& history) { + return do_lower_bound(c, key, history + ).safe_then([c, &key, &value, &history](auto result) { + if (result.match == MatchKindBS::EQ) { + return node_ertr::make_ready_future, bool>>( + std::make_pair(result.p_cursor, false)); + } else { + auto leaf_node = result.p_cursor->get_leaf_node(); + return leaf_node->insert_value( + c, key, value, result.p_cursor->get_position(), history + ).safe_then([](auto p_cursor) { + 
return node_ertr::make_ready_future, bool>>( + std::make_pair(p_cursor, true)); + }); + } + }); + } + ); +} + +node_future<> Node::mkfs(context_t c, RootNodeTracker& root_tracker) { + return LeafNode0::mkfs(c, root_tracker); +} + +node_future> +Node::load_root(context_t c, RootNodeTracker& root_tracker) { + return c.nm.get_super(c.t, root_tracker + ).safe_then([c, &root_tracker](auto&& _super) { + auto root_addr = _super->get_root_laddr(); + assert(root_addr != L_ADDR_NULL); + return load_node(c, root_addr, true + ).safe_then([c, _super = std::move(_super), + &root_tracker](auto root) mutable { + assert(root->field_type() == field_type_t::N0); + root->as_root(std::move(_super)); + assert(root == root_tracker.get_root(c.t)); + return node_ertr::make_ready_future>(root); + }); + }); +} + +node_future<> Node::insert_parent(context_t c, Ref right_node) { + assert(!is_root()); + // TODO(cross-node string dedup) + auto my_key = get_largest_key_view(); + return parent_info().ptr->apply_child_split( + c, parent_info().position, my_key, this, right_node); +} + +node_future> InternalNode::get_or_track_child( + context_t c, const search_position_t& position, laddr_t child_addr) { + bool level_tail = position.is_end(); + Ref child; + auto found = tracked_child_nodes.find(position); + return (found == tracked_child_nodes.end() + ? 
load_node(c, child_addr, level_tail + ).safe_then([this, position] (auto child) { + child->as_child(position, this); + return child; + }) + : node_ertr::make_ready_future>(found->second) + ).safe_then([this, position, child_addr] (auto child) { + assert(child_addr == child->laddr()); + assert(position == child->parent_info().position); + validate_child(*child); + return child; + }); +} + +void InternalNode::validate_child(const Node& child) const { +#ifndef NDEBUG + assert(this->level() - 1 == child.level()); + assert(this == child.parent_info().ptr); + auto& child_pos = child.parent_info().position; + assert(*get_p_value(child_pos) == child.laddr()); + if (child_pos.is_end()) { + assert(this->is_level_tail()); + assert(child.is_level_tail()); + } else { + assert(!child.is_level_tail()); + assert(get_key_view(child_pos) == child.get_largest_key_view()); + } + // XXX(multi-type) + assert(this->field_type() <= child.field_type()); +#endif +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h new file mode 100644 index 00000000000..c1c96eb5bbb --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h @@ -0,0 +1,420 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include + +#include "crimson/common/type_helpers.h" + +#include "node_types.h" +#include "stages/stage_types.h" +#include "super.h" +#include "tree_types.h" + +namespace crimson::os::seastore::onode { + +/** + * in-memory subtree management: + * + * resource management (bottom-up): + * USER --> Ref + * tree_cursor_t --> Ref + * Node (child) --> Ref (see parent_info_t) + * Node (root) --> Super::URef + * Super --> Btree + * + * tracked lookup (top-down): + * Btree --> Super* + * Super --> Node* (root) + * InternalNode --> Node* (children) + * LeafNode --> tree_cursor_t* + */ + +class LeafNode; +class 
InternalNode; +class NodeExtentMutable; + +class tree_cursor_t final + : public boost::intrusive_ref_counter< + tree_cursor_t, boost::thread_unsafe_counter> { + public: + ~tree_cursor_t(); + bool is_end() const { return position.is_end(); } + const onode_t* get_p_value() const; + + private: + tree_cursor_t(Ref, const search_position_t&, const onode_t*); + const search_position_t& get_position() const { return position; } + Ref get_leaf_node() { return leaf_node; } + // TODO: version based invalidation + void invalidate_p_value() { p_value = nullptr; } + void update_track(Ref, const search_position_t&); + void set_p_value(const onode_t* _p_value) { + if (!p_value) { + p_value = _p_value; + } else { + assert(p_value == _p_value); + } + } + + Ref leaf_node; + search_position_t position; + mutable const onode_t* p_value; + + friend class LeafNode; + friend class Node; // get_position(), get_leaf_node() +}; + +struct key_view_t; +struct key_hobj_t; + +class Node + : public boost::intrusive_ref_counter { + public: + using node_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + template + using node_future = node_ertr::future; + + struct search_result_t { + bool is_end() const { return p_cursor->is_end(); } + Ref p_cursor; + MatchKindBS match; + }; + + virtual ~Node(); + virtual level_t level() const = 0; + virtual node_future> lookup_smallest(context_t) = 0; + virtual node_future> lookup_largest(context_t) = 0; + node_future lower_bound(context_t, const key_hobj_t& key); + + node_future, bool>> + insert(context_t, const key_hobj_t&, const onode_t&); + + virtual std::ostream& dump(std::ostream&) const = 0; + virtual std::ostream& dump_brief(std::ostream&) const = 0; + + static node_future<> mkfs(context_t, RootNodeTracker&); + + static node_future> load_root(context_t, RootNodeTracker&); + + virtual void test_make_destructable( + context_t, NodeExtentMutable&, 
Super::URef&&) = 0; + virtual node_future<> test_clone_root(context_t, RootNodeTracker&) const { + assert(false && "impossible path"); + } + virtual node_future<> test_clone_non_root(context_t, Ref) const { + assert(false && "impossible path"); + } + + public: // used by node_impl.h, XXX: protected? + virtual bool is_level_tail() const = 0; + virtual field_type_t field_type() const = 0; + virtual laddr_t laddr() const = 0; + virtual key_view_t get_key_view(const search_position_t&) const = 0; + virtual key_view_t get_largest_key_view() const = 0; + virtual node_future + do_lower_bound(context_t, const key_hobj_t&, MatchHistory&) = 0; + + protected: + Node() {} + + struct parent_info_t { + search_position_t position; + Ref ptr; + }; + bool is_root() const { + assert((super && !_parent_info.has_value()) || + (!super && _parent_info.has_value())); + return !_parent_info.has_value(); + } + void make_root(context_t c, Super::URef&& _super) { + _super->write_root_laddr(c, laddr()); + as_root(std::move(_super)); + } + void make_root_new(context_t c, Super::URef&& _super) { + assert(_super->get_root_laddr() == L_ADDR_NULL); + make_root(c, std::move(_super)); + } + void make_root_from(context_t c, Super::URef&& _super, laddr_t from_addr) { + assert(_super->get_root_laddr() == from_addr); + make_root(c, std::move(_super)); + } + void as_root(Super::URef&& _super) { + assert(!super && !_parent_info); + assert(_super->get_root_laddr() == laddr()); + assert(is_level_tail()); + super = std::move(_super); + super->do_track_root(*this); + } + node_future<> upgrade_root(context_t); + template + void as_child(const search_position_t&, Ref); + const parent_info_t& parent_info() const { return *_parent_info; } + node_future<> insert_parent(context_t, Ref right_node); + + static node_future> load( + context_t, laddr_t, bool expect_is_level_tail); + + private: + // as child/non-root + std::optional _parent_info; + // as root + Super::URef super; + + friend class InternalNode; +}; 
+inline std::ostream& operator<<(std::ostream& os, const Node& node) { + return node.dump_brief(os); +} + +// TODO: remove virtual inheritance once decoupled with layout +class InternalNode : virtual public Node { + public: + virtual ~InternalNode() { assert(tracked_child_nodes.empty()); } + + protected: + // XXX: extract a common tracker for InternalNode to track Node, + // and LeafNode to track tree_cursor_t. + node_future> get_or_track_child( + context_t, const search_position_t&, laddr_t); + + void track_insert( + const search_position_t& insert_pos, match_stage_t insert_stage, + Ref insert_child, Ref nxt_child = nullptr) { + // update tracks + auto pos_upper_bound = insert_pos; + pos_upper_bound.index_by_stage(insert_stage) = INDEX_END; + auto first = tracked_child_nodes.lower_bound(insert_pos); + auto last = tracked_child_nodes.lower_bound(pos_upper_bound); + std::vector nodes; + std::for_each(first, last, [&nodes](auto& kv) { + nodes.push_back(kv.second); + }); + tracked_child_nodes.erase(first, last); + for (auto& node : nodes) { + auto _pos = node->parent_info().position; + assert(!_pos.is_end()); + ++_pos.index_by_stage(insert_stage); + node->as_child(_pos, this); + } + // track insert + insert_child->as_child(insert_pos, this); + +#ifndef NDEBUG + // validate left_child is before right_child + if (nxt_child) { + auto iter = tracked_child_nodes.find(insert_pos); + ++iter; + assert(iter->second == nxt_child); + } +#endif + } + + void replace_track( + const search_position_t& position, + Ref old_child, Ref new_child) { + assert(tracked_child_nodes[position] == old_child); + tracked_child_nodes.erase(position); + new_child->as_child(position, this); + assert(tracked_child_nodes[position] == new_child); + } + + void track_split( + const search_position_t& split_pos, Ref right_node) { + auto first = tracked_child_nodes.lower_bound(split_pos); + auto iter = first; + while (iter != tracked_child_nodes.end()) { + search_position_t new_pos = iter->first; + new_pos 
-= split_pos; + iter->second->as_child(new_pos, right_node); + ++iter; + } + tracked_child_nodes.erase(first, tracked_child_nodes.end()); + } + + void validate_tracked_children() const { +#ifndef NDEBUG + for (auto& kv : tracked_child_nodes) { + assert(kv.first == kv.second->parent_info().position); + validate_child(*kv.second); + } +#endif + } + + node_future<> test_clone_children( + context_t c_other, Ref clone) const { + Ref this_ref = this; + return crimson::do_for_each( + tracked_child_nodes.begin(), + tracked_child_nodes.end(), + [this_ref, c_other, clone](auto& kv) { + assert(kv.first == kv.second->parent_info().position); + return kv.second->test_clone_non_root(c_other, clone); + } + ); + } + + private: + virtual node_future<> apply_child_split( + context_t, const search_position_t&, const key_view_t&, Ref, Ref) = 0; + virtual const laddr_t* get_p_value(const search_position_t&) const = 0; + void validate_child(const Node& child) const; + template + void do_track_child(Node& child) { + if constexpr (VALIDATE) { + validate_child(child); + } + auto& child_pos = child.parent_info().position; + assert(tracked_child_nodes.find(child_pos) == tracked_child_nodes.end()); + tracked_child_nodes[child_pos] = &child; + } + void do_untrack_child(const Node& child) { + auto& child_pos = child.parent_info().position; + assert(tracked_child_nodes.find(child_pos)->second == &child); + auto removed = tracked_child_nodes.erase(child_pos); + assert(removed); + } + + // XXX: leverage intrusive data structure to control memory overhead + // track the current living child nodes by position + std::map tracked_child_nodes; + + friend class Node; +}; + +// TODO: remove virtual inheritance once decoupled with layout +class LeafNode : virtual public Node { + public: + virtual ~LeafNode() { assert(tracked_cursors.empty()); } + + protected: + // XXX: extract a common tracker for InternalNode to track Node, + // and LeafNode to track tree_cursor_t. 
+ Ref get_or_track_cursor( + const search_position_t& position, const onode_t* p_value) { + if (position.is_end()) { + assert(this->is_level_tail()); + assert(!p_value); + // we need to return the leaf node to insert + return new tree_cursor_t(this, position, p_value); + } + + Ref p_cursor; + auto found = tracked_cursors.find(position); + if (found == tracked_cursors.end()) { + p_cursor = new tree_cursor_t(this, position, p_value); + } else { + p_cursor = found->second; + assert(p_cursor->get_leaf_node() == this); + assert(p_cursor->get_position() == position); + p_cursor->set_p_value(p_value); + } + return p_cursor; + } + + Ref track_insert( + const search_position_t& insert_pos, match_stage_t insert_stage, + const onode_t* p_onode) { + // invalidate cursor value + // TODO: version based invalidation + auto pos_invalidate_begin = insert_pos; + pos_invalidate_begin.index_by_stage(STAGE_RIGHT) = 0; + auto begin_invalidate = tracked_cursors.lower_bound(pos_invalidate_begin); + std::for_each(begin_invalidate, tracked_cursors.end(), [](auto& kv) { + kv.second->invalidate_p_value(); + }); + + // update cursor position + auto pos_upper_bound = insert_pos; + pos_upper_bound.index_by_stage(insert_stage) = INDEX_END; + auto first = tracked_cursors.lower_bound(insert_pos); + auto last = tracked_cursors.lower_bound(pos_upper_bound); + std::vector p_cursors; + std::for_each(first, last, [&p_cursors](auto& kv) { + p_cursors.push_back(kv.second); + }); + tracked_cursors.erase(first, last); + for (auto& p_cursor : p_cursors) { + search_position_t new_pos = p_cursor->get_position(); + ++new_pos.index_by_stage(insert_stage); + p_cursor->update_track(this, new_pos); + } + + // track insert + return new tree_cursor_t(this, insert_pos, p_onode); + } + + void track_split( + const search_position_t& split_pos, Ref right_node) { + // invalidate cursor value + // TODO: version based invalidation + auto pos_invalidate_begin = split_pos; + pos_invalidate_begin.index_by_stage(STAGE_RIGHT) = 
0; + auto begin_invalidate = tracked_cursors.lower_bound(pos_invalidate_begin); + std::for_each(begin_invalidate, tracked_cursors.end(), [](auto& kv) { + kv.second->invalidate_p_value(); + }); + + // update cursor ownership and position + auto first = tracked_cursors.lower_bound(split_pos); + auto iter = first; + while (iter != tracked_cursors.end()) { + search_position_t new_pos = iter->first; + new_pos -= split_pos; + iter->second->update_track(right_node, new_pos); + ++iter; + } + tracked_cursors.erase(first, tracked_cursors.end()); + } + + void validate_tracked_cursors() const { +#ifndef NDEBUG + for (auto& kv : tracked_cursors) { + assert(kv.first == kv.second->get_position()); + validate_cursor(*kv.second); + } +#endif + } + + private: + virtual node_future> insert_value( + context_t, + const key_hobj_t&, + const onode_t&, + const search_position_t&, + const MatchHistory&) = 0; + friend class Node; + + virtual const onode_t* get_p_value(const search_position_t&) const = 0; + void validate_cursor(tree_cursor_t& cursor) const { + assert(this == cursor.get_leaf_node().get()); + assert(!cursor.is_end()); + assert(get_p_value(cursor.get_position()) == cursor.get_p_value()); + } + void do_track_cursor(tree_cursor_t& cursor) { + validate_cursor(cursor); + auto& cursor_pos = cursor.get_position(); + assert(tracked_cursors.find(cursor_pos) == tracked_cursors.end()); + tracked_cursors[cursor_pos] = &cursor; + } + void do_untrack_cursor(tree_cursor_t& cursor) { + validate_cursor(cursor); + auto& cursor_pos = cursor.get_position(); + assert(tracked_cursors.find(cursor_pos)->second == &cursor); + auto removed = tracked_cursors.erase(cursor_pos); + assert(removed); + } + // XXX: leverage intrusive data structure to control memory overhead + // track the current living cursors by position + std::map tracked_cursors; + friend class tree_cursor_t; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc 
b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc new file mode 100644 index 00000000000..667ae6a3607 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc @@ -0,0 +1,19 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "node_extent_manager.h" +#include "node_extent_manager/dummy.h" +#include "node_extent_manager/seastore.h" + +namespace crimson::os::seastore::onode { + +NodeExtentManagerURef NodeExtentManager::create_dummy() { + return NodeExtentManagerURef(new DummyNodeExtentManager()); +} + +NodeExtentManagerURef NodeExtentManager::create_seastore( + TransactionManager& tm, laddr_t min_laddr) { + return NodeExtentManagerURef(new SeastoreNodeExtentManager(tm, min_laddr)); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h new file mode 100644 index 00000000000..f5f37af4d9c --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h @@ -0,0 +1,69 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/type_helpers.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/transaction_manager.h" + +#include "fwd.h" +#include "super.h" +#include "node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +using crimson::os::seastore::LogicalCachedExtent; +class NodeExtent : public LogicalCachedExtent { + public: + using Ref = crimson::os::seastore::TCachedExtentRef; + virtual ~NodeExtent() = default; + const char* get_read() const { + return get_bptr().c_str(); + } + auto get_mutable() { + assert(is_pending()); + return NodeExtentMutable(*this); + } + virtual Ref mutate(context_t/* DeltaBuffer::Ref */) = 0; + + protected: + template + NodeExtent(T&&... 
t) : LogicalCachedExtent(std::forward(t)...) {} + + /** + * abstracted: + * - CacheExtent::duplicate_for_write() -> CachedExtentRef + * - CacheExtent::get_type() -> extent_types_t + * - CacheExtent::get_delta() -> ceph::bufferlist + * - LogicalCachedExtent::apply_delta(const ceph::bufferlist) -> void + */ + + private: + friend class NodeExtentMutable; +}; + +using crimson::os::seastore::TransactionManager; +class NodeExtentManager { + public: + virtual ~NodeExtentManager() = default; + using tm_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + template + using tm_future = tm_ertr::future; + + virtual bool is_read_isolated() const = 0; + virtual tm_future read_extent( + Transaction&, laddr_t, extent_len_t) = 0; + virtual tm_future alloc_extent(Transaction&, extent_len_t) = 0; + virtual tm_future get_super(Transaction&, RootNodeTracker&) = 0; + + static NodeExtentManagerURef create_dummy(); + static NodeExtentManagerURef create_seastore( + TransactionManager& tm, laddr_t min_laddr = L_ADDR_MIN); +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h new file mode 100644 index 00000000000..394db8c2e10 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/buffer_raw.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" + +namespace crimson::os::seastore::onode { + +class DummySuper final: public Super { + public: + DummySuper(Transaction& t, RootNodeTracker& tracker, laddr_t* p_root_laddr) + : Super(t, tracker), p_root_laddr{p_root_laddr} {} + ~DummySuper() override = default; + protected: + laddr_t 
get_root_laddr() const override { return *p_root_laddr; } + void write_root_laddr(context_t, laddr_t addr) override { *p_root_laddr = addr; } + laddr_t* p_root_laddr; +}; + +class DummyNodeExtent final: public NodeExtent { + public: + DummyNodeExtent(ceph::bufferptr &&ptr) : NodeExtent(std::move(ptr)) { + state = extent_state_t::INITIAL_WRITE_PENDING; + } + ~DummyNodeExtent() override = default; + protected: + Ref mutate(context_t) override { + assert(false && "impossible path"); } + CachedExtentRef duplicate_for_write() override { + assert(false && "impossible path"); } + extent_types_t get_type() const override { + assert(false && "impossible path"); } + ceph::bufferlist get_delta() override { + assert(false && "impossible path"); } + void apply_delta(const ceph::bufferlist&) override { + assert(false && "impossible path"); } +}; + +class DummyNodeExtentManager final: public NodeExtentManager { + static constexpr size_t ALIGNMENT = 4096; + public: + ~DummyNodeExtentManager() override = default; + protected: + bool is_read_isolated() const { return false; } + + tm_future read_extent( + Transaction& t, laddr_t addr, extent_len_t len) { + auto iter = allocate_map.find(addr); + assert(iter != allocate_map.end()); + assert(iter->second->get_length() == len); + return tm_ertr::make_ready_future(iter->second); + } + + tm_future alloc_extent( + Transaction& t, extent_len_t len) { + assert(len % ALIGNMENT == 0); + auto r = ceph::buffer::create_aligned(len, ALIGNMENT); + auto addr = reinterpret_cast(r->get_data()); + auto bp = ceph::bufferptr(std::move(r)); + auto extent = Ref(new DummyNodeExtent(std::move(bp))); + extent->set_laddr(addr); + assert(allocate_map.find(extent->get_laddr()) == allocate_map.end()); + allocate_map.insert({extent->get_laddr(), extent}); + return tm_ertr::make_ready_future(extent); + } + + tm_future get_super(Transaction& t, RootNodeTracker& tracker) { + return tm_ertr::make_ready_future( + Super::URef(new DummySuper(t, tracker, &root_laddr))); + 
} + + std::map> allocate_map; + laddr_t root_laddr = L_ADDR_NULL; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h new file mode 100644 index 00000000000..0ceae18dc67 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" + +namespace crimson::os::seastore::onode { + +class SeastoreSuper final: public Super { + public: + SeastoreSuper(Transaction& t, RootNodeTracker& tracker, + laddr_t root_addr, TransactionManager& tm) + : Super(t, tracker), root_addr{root_addr}, tm{tm} {} + ~SeastoreSuper() override = default; + protected: + laddr_t get_root_laddr() const override { + return root_addr; + } + void write_root_laddr(context_t c, laddr_t addr) override { + root_addr = addr; + //TODO + assert(false && "not implemented"); + } + laddr_t root_addr; + TransactionManager& tm; +}; + +class SeastoreNodeExtent final: public NodeExtent { + public: + SeastoreNodeExtent(ceph::bufferptr &&ptr) + : NodeExtent(std::move(ptr)) {} + SeastoreNodeExtent(const SeastoreNodeExtent& other) + : NodeExtent(other) {} + ~SeastoreNodeExtent() override = default; + protected: + Ref mutate(context_t c) override; + CachedExtentRef duplicate_for_write() override { + return CachedExtentRef(new SeastoreNodeExtent(*this)); + } + extent_types_t get_type() const override { + return extent_types_t::ONODE_BLOCK_STAGED; + } + ceph::bufferlist get_delta() override { + //TODO + assert(false && "not implemented"); + } + void apply_delta(const ceph::bufferlist&) override { + //TODO + assert(false && "not implemented"); + } + //TODO: recorder +}; + +class SeastoreNodeExtentManager final: public NodeExtentManager 
{ + public: + SeastoreNodeExtentManager(TransactionManager& tm, laddr_t min) + : tm{tm}, addr_min{min} {}; + ~SeastoreNodeExtentManager() override = default; + TransactionManager& get_tm() { return tm; } + protected: + bool is_read_isolated() const { return true; } + + tm_future read_extent( + Transaction& t, laddr_t addr, extent_len_t len) { + return tm.read_extents(t, addr, len + ).safe_then([](auto&& extents) { + assert(extents.size() == 1); + [[maybe_unused]] auto [laddr, e] = extents.front(); + return NodeExtent::Ref(e); + }); + } + + tm_future alloc_extent( + Transaction& t, extent_len_t len) { + return tm.alloc_extent(t, addr_min, len + ).safe_then([](auto extent) { + return NodeExtent::Ref(extent); + }); + } + + tm_future get_super(Transaction& t, RootNodeTracker& tracker) { + // TODO + return tm_ertr::make_ready_future( + Super::URef(new SeastoreSuper(t, tracker, L_ADDR_NULL, tm))); + } + + TransactionManager& tm; + const laddr_t addr_min; +}; + +inline NodeExtent::Ref SeastoreNodeExtent::mutate(context_t c) { + auto nm = static_cast(&c.nm); + auto ret = nm->get_tm().get_mutable_extent(c.t, this); + return ret->cast(); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc new file mode 100644 index 00000000000..de67500274e --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "node_extent_mutable.h" +#include "node_extent_manager.h" + +namespace crimson::os::seastore::onode { + +NodeExtentMutable::NodeExtentMutable(NodeExtent& extent) + : extent{extent} { + assert(extent.is_pending()); +} + +const char* NodeExtentMutable::get_read() const { + assert(extent.is_pending()); + return extent.get_bptr().c_str(); +} + +char* NodeExtentMutable::get_write() { + 
assert(extent.is_pending()); + return extent.get_bptr().c_str(); +} + +extent_len_t NodeExtentMutable::get_length() const { + return extent.get_length(); +} + +const char* NodeExtentMutable::buf_upper_bound() const { + return get_read() + get_length(); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h new file mode 100644 index 00000000000..a34bbc9509e --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "fwd.h" + +#pragma once + +namespace crimson::os::seastore::onode { + +class NodeExtent; + +// the wrapper of NodeExtent which is mutable and safe to be mutated +class NodeExtentMutable { + public: + void copy_in_absolute(void* dst, const void* src, extent_len_t len) { + assert((char*)dst >= get_write()); + assert((char*)dst + len <= buf_upper_bound()); + std::memcpy(dst, src, len); + } + template + void copy_in_absolute(void* dst, const T& src) { + copy_in_absolute(dst, &src, sizeof(T)); + } + + const void* copy_in_relative( + extent_len_t dst_offset, const void* src, extent_len_t len) { + auto dst = get_write() + dst_offset; + copy_in_absolute(dst, src, len); + return dst; + } + template + const T* copy_in_relative( + extent_len_t dst_offset, const T& src) { + auto dst = copy_in_relative(dst_offset, &src, sizeof(T)); + return static_cast(dst); + } + + void shift_absolute(const void* src, extent_len_t len, int offset) { + assert((const char*)src >= get_write()); + assert((const char*)src + len <= buf_upper_bound()); + char* to = (char*)src + offset; + assert(to >= get_write()); + assert(to + len <= buf_upper_bound()); + if (len != 0) { + std::memmove(to, src, len); + } + } + void shift_relative(extent_len_t src_offset, extent_len_t len, int offset) { + 
shift_absolute(get_write() + src_offset, len, offset); + } + + template + void validate_inplace_update(const T& updated) { + assert((const char*)&updated >= get_write()); + assert((const char*)&updated + sizeof(T) <= buf_upper_bound()); + } + + char* get_write(); + extent_len_t get_length() const; + + private: + explicit NodeExtentMutable(NodeExtent&); + const char* get_read() const; + const char* buf_upper_bound() const; + + NodeExtent& extent; + + friend class NodeExtent; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_visitor.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_visitor.h new file mode 100644 index 00000000000..96075468553 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_visitor.h @@ -0,0 +1,170 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "node_extent_manager.h" +#include "node_impl_replayable.h" + +namespace crimson::os::seastore::onode { + +template +class NodeExtentT { + enum class state_t { + NO_RECORDING, // extent_state_t::INITIAL_WRITE_PENDING + RECORDING, // extent_state_t::MUTATION_PENDING + PENDING_MUTATE // extent_state_t::CLEAN/DIRTY + }; + + public: + using layout_t = NodeLayoutReplayableT; + using node_stage_t = typename layout_t::node_stage_t; + using position_t = typename layout_t::position_t; + using StagedIterator = typename layout_t::StagedIterator; + using value_t = typename layout_t::value_t; + static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE; + + // TODO: remove + NodeExtentT() = default; + NodeExtentT(NodeExtentT&& other) noexcept { + *this = std::move(other); + } + NodeExtentT& operator=(NodeExtentT&& other) noexcept { + extent = std::move(other.extent); + state = std::move(other.state); + node_stage = std::move(other.node_stage); + mut.emplace(*other.mut); + return *this; + } + + const node_stage_t& read() const { return node_stage; } + 
laddr_t get_laddr() const { return extent->get_laddr(); } + + // must be called before any mutate attempts. + // for the safety of mixed read and mutate, call before read. + void prepare_mutate(context_t c) { + if (state == state_t::PENDING_MUTATE) { + assert(!extent->is_pending()); + // TODO: create and set recorder DeltaRecorderT + extent = extent->mutate(c/* recorder */); + assert(extent->is_mutation_pending()); + state = state_t::RECORDING; + node_stage = node_stage_t( + reinterpret_cast(extent->get_read())); + mut.emplace(extent->get_mutable()); + } + } + + // TODO: translate absolute modifications to relative + template + const value_t* insert_replayable( + const full_key_t& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + assert(state != state_t::PENDING_MUTATE); + // TODO: encode params to recorder as delta + return layout_t::template insert( + *mut, read(), key, value, + insert_pos, insert_stage, insert_size); + } + + void split_replayable(StagedIterator& split_at) { + assert(state != state_t::PENDING_MUTATE); + // TODO: encode params to recorder as delta + layout_t::split(*mut, read(), split_at); + } + + template + const value_t* split_insert_replayable( + StagedIterator& split_at, + const full_key_t& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + assert(state != state_t::PENDING_MUTATE); + // TODO: encode params to recorder as delta + return layout_t::template split_insert( + *mut, read(), split_at, key, value, + insert_pos, insert_stage, insert_size); + } + + void prepare_internal_split_replayable( + const laddr_t left_child_addr, + const laddr_t right_child_addr, + laddr_t* p_split_addr) { + assert(state != state_t::PENDING_MUTATE); + // TODO: encode params to recorder as delta + return layout_t::prepare_internal_split( + *mut, read(), left_child_addr, right_child_addr, p_split_addr); + } + + void 
test_copy_to(NodeExtentMutable& to) const { + assert(extent->get_length() == to.get_length()); + std::memcpy(to.get_write(), extent->get_read(), extent->get_length()); + } + + static NodeExtentT load(NodeExtent::Ref extent) { + state_t state; + if (extent->is_initial_pending()) { + state = state_t::NO_RECORDING; + } else if (extent->is_mutation_pending()) { + state = state_t::RECORDING; + } else if (!extent->is_valid()) { + state = state_t::PENDING_MUTATE; + } else { + ceph_abort("invalid extent"); + } + return NodeExtentT(extent, state); + } + + struct fresh_extent_t { + NodeExtentT extent; + NodeExtentMutable mut; + }; + using alloc_ertr = NodeExtentManager::tm_ertr; + static alloc_ertr::future + allocate(context_t c, level_t level, bool is_level_tail) { + // NOTE: + // *option1: all types of node have the same length; + // option2: length is defined by node/field types; + // option3: length is totally flexible; + return c.nm.alloc_extent(c.t, node_stage_t::EXTENT_SIZE + ).safe_then([level, is_level_tail](auto extent) { + assert(extent->is_initial_pending()); + auto mut = extent->get_mutable(); + node_stage_t::bootstrap_extent( + mut, FIELD_TYPE, NODE_TYPE, is_level_tail, level); + return fresh_extent_t{NodeExtentT(extent, state_t::NO_RECORDING), mut}; + }); + } + + private: + NodeExtentT(NodeExtent::Ref extent, state_t state) + : extent{extent}, state{state}, + node_stage{reinterpret_cast(extent->get_read())} { + if (state == state_t::NO_RECORDING) { + assert(!mut.has_value()); + mut.emplace(extent->get_mutable()); + // TODO: recorder = nullptr; + } else if (state == state_t::RECORDING) { + assert(!mut.has_value()); + mut.emplace(extent->get_mutable()); + // TODO: get recorder from extent + } else if (state == state_t::PENDING_MUTATE) { + // TODO: recorder = nullptr; + } else { + ceph_abort("impossible path"); + } + } + + NodeExtent::Ref extent; + state_t state; + node_stage_t node_stage; + std::optional mut; + // TODO: DeltaRecorderT* recorder; +}; + +} diff 
--git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h new file mode 100644 index 00000000000..f99c090b163 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h @@ -0,0 +1,664 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +// TODO: remove +#include +#include + +#include "common/likely.h" +#include "node.h" +#include "node_extent_visitor.h" +#include "stages/node_layout.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +// TODO: decouple NodeT with Node + +template +class NodeT : virtual public Node { + public: + using extent_t = NodeExtentT; + using node_ertr = Node::node_ertr; + template + using node_future = Node::node_future; + using node_stage_t = typename extent_t::node_stage_t; + using position_t = typename extent_t::position_t; + using value_t = typename extent_t::value_t; + static constexpr auto FIELD_TYPE = extent_t::FIELD_TYPE; + static constexpr auto NODE_TYPE = _NODE_TYPE; + + struct fresh_node_t { + Ref node; + NodeExtentMutable mut; + std::pair, NodeExtentMutable> make_pair() { + return std::make_pair(Ref(node), mut); + } + }; + + virtual ~NodeT() = default; + + bool is_level_tail() const override final { return extent.read().is_level_tail(); } + field_type_t field_type() const override final { return FIELD_TYPE; } + laddr_t laddr() const override final { return extent.get_laddr(); } + level_t level() const override final { return extent.read().level(); } + + full_key_t get_key_view( + const search_position_t& position) const override final { + full_key_t ret; + STAGE_T::get_key_view( + extent.read(), cast_down(position), ret); + return ret; + } + + full_key_t get_largest_key_view() const override final { + full_key_t key_view; + STAGE_T::lookup_largest_index(extent.read(), key_view); + return key_view; + } + + std::ostream& 
dump(std::ostream& os) const override final { + auto& node_stage = extent.read(); + auto p_start = node_stage.p_start(); + os << *this << ":"; + os << "\n header: " << node_stage_t::header_size() << "B"; + size_t size = 0u; + if (node_stage.keys()) { + STAGE_T::dump(node_stage, os, " ", size, p_start); + } else { + if constexpr (NODE_TYPE == node_type_t::LEAF) { + return os << " empty!"; + } else { // internal node + if (!is_level_tail()) { + return os << " empty!"; + } else { + size += node_stage_t::header_size(); + } + } + } + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (is_level_tail()) { + size += sizeof(laddr_t); + auto value_ptr = node_stage.get_end_p_laddr(); + int offset = reinterpret_cast(value_ptr) - p_start; + os << "\n tail value: 0x" + << std::hex << *value_ptr << std::dec + << " " << size << "B" + << " @" << offset << "B"; + } + } + return os; + } + + std::ostream& dump_brief(std::ostream& os) const override final { + auto& node_stage = extent.read(); + os << "Node" << NODE_TYPE << FIELD_TYPE + << "@0x" << std::hex << laddr() + << "+" << node_stage_t::EXTENT_SIZE << std::dec + << (is_level_tail() ? 
"$" : "") + << "(level=" << (unsigned)level() + << ", filled=" << node_stage.total_size() - node_stage.free_size() << "B" + << ", free=" << node_stage.free_size() << "B" + << ")"; + return os; + } + + const value_t* get_value_ptr(const search_position_t& position) const { + auto& node_stage = extent.read(); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (position.is_end()) { + assert(is_level_tail()); + return node_stage.get_end_p_laddr(); + } + } else { + assert(!position.is_end()); + } + return STAGE_T::get_p_value(node_stage, cast_down(position)); + } + + void test_make_destructable( + context_t c, NodeExtentMutable& mut, Super::URef&& _super) override final { + node_stage_t::update_is_level_tail(mut, extent.read(), true); + make_root(c, std::move(_super)); + } + + static Ref load(NodeExtent::Ref extent, bool expect_is_level_tail) { + Ref ret = new ConcreteType(); + ret->extent = extent_t::load(extent); + assert(ret->is_level_tail() == expect_is_level_tail); + return ret; + } + + protected: + // TODO: constructor + extent_t extent; +}; + +template +class InternalNodeT : public InternalNode, + public NodeT { + public: + using parent_t = NodeT; + using extent_t = typename parent_t::extent_t; + using fresh_node_t = typename parent_t::fresh_node_t; + using node_stage_t = typename parent_t::node_stage_t; + using position_t = typename parent_t::position_t; + + virtual ~InternalNodeT() = default; + + node_future do_lower_bound( + context_t c, const full_key_t& key, + MatchHistory& history) override final { + auto result = STAGE_T::lower_bound_normalized( + this->extent.read(), key, history); + auto& position = result.position; + laddr_t child_addr; + if (position.is_end()) { + assert(this->is_level_tail()); + child_addr = *this->get_value_ptr(position); + } else { + assert(result.p_value); + child_addr = *result.p_value; + } + return get_or_track_child(c, position, child_addr + ).safe_then([c, &key, &history](auto child) { + // XXX(multi-type): pass 
result.mstat to child + return child->do_lower_bound(c, key, history); + }); + } + + node_future> lookup_smallest(context_t c) override final { + auto position = search_position_t::begin(); + laddr_t child_addr = *this->get_value_ptr(position); + return get_or_track_child(c, position, child_addr).safe_then([c](auto child) { + return child->lookup_smallest(c); + }); + } + + node_future> lookup_largest(context_t c) override final { + // NOTE: unlike LeafNodeT::lookup_largest(), this only works for the tail + // internal node to return the tail child address. + auto position = search_position_t::end(); + laddr_t child_addr = *this->get_value_ptr(position); + return get_or_track_child(c, position, child_addr).safe_then([c](auto child) { + return child->lookup_largest(c); + }); + } + + node_future<> apply_child_split( + context_t c, const search_position_t& pos, + const full_key_t& left_key, Ref left_child, + Ref right_child) override final { + this->extent.prepare_mutate(c); + auto& node_stage = this->extent.read(); + + // update pos => l_addr to r_addr + auto left_laddr = left_child->laddr(); + auto right_laddr = right_child->laddr(); + const laddr_t* p_rvalue = this->get_value_ptr(pos); + this->extent.prepare_internal_split_replayable( + left_laddr, right_laddr, const_cast(p_rvalue)); + this->replace_track(pos, left_child, right_child); + + // evaluate insertion + position_t insert_pos = cast_down(pos); + match_stage_t insert_stage; + node_offset_t insert_size; + if (unlikely(!node_stage.keys())) { + assert(insert_pos.is_end()); + insert_stage = STAGE_T::STAGE; + insert_size = STAGE_T::template insert_size(left_key, left_laddr); + } else { + std::tie(insert_stage, insert_size) = + STAGE_T::evaluate_insert(node_stage, left_key, left_laddr, insert_pos, true); + } + + // TODO: common part begin, move to NodeT + auto free_size = node_stage.free_size(); + if (free_size >= insert_size) { + auto p_value = this->extent.template insert_replayable( + left_key, left_laddr, 
insert_pos, insert_stage, insert_size); + assert(node_stage.free_size() == free_size - insert_size); + // TODO: common part end, move to NodeT + + assert(*p_value == left_laddr); + auto insert_pos_normalized = normalize(std::move(insert_pos)); + assert(insert_pos_normalized <= pos); + assert(get_key_view(insert_pos_normalized) == left_key); + track_insert(insert_pos_normalized, insert_stage, left_child, right_child); + this->validate_tracked_children(); + return node_ertr::now(); + } + + std::cout << " try insert at: " << insert_pos + << ", insert_stage=" << (int)insert_stage + << ", insert_size=" << insert_size + << ", values=0x" << std::hex << left_laddr + << ",0x" << right_laddr << std::dec << std::endl; + + Ref this_ref = this; + return (is_root() ? this->upgrade_root(c) : node_ertr::now() + ).safe_then([this, c] { + return ConcreteType::allocate(c, this->level(), this->is_level_tail()); + }).safe_then([this_ref, this, c, left_key, left_child, right_child, left_laddr, + insert_pos, insert_stage, insert_size](auto fresh_right) mutable { + auto& node_stage = this->extent.read(); + size_t empty_size = node_stage.size_before(0); + size_t available_size = node_stage.total_size() - empty_size; + size_t target_split_size = empty_size + (available_size + insert_size) / 2; + // TODO adjust NODE_BLOCK_SIZE according to this requirement + assert(insert_size < available_size / 2); + typename STAGE_T::StagedIterator split_at; + bool insert_left = STAGE_T::locate_split( + node_stage, target_split_size, insert_pos, insert_stage, insert_size, split_at); + + std::cout << " split at: " << split_at << ", insert_left=" << insert_left + << ", now insert at: " << insert_pos + << std::endl; + + auto append_at = split_at; + // TODO(cross-node string dedup) + typename STAGE_T::template StagedAppender right_appender; + right_appender.init(&fresh_right.mut, fresh_right.mut.get_write()); + const laddr_t* p_value = nullptr; + if (!insert_left) { + // right node: append [start(append_at), 
insert_pos) + STAGE_T::template append_until( + append_at, right_appender, insert_pos, insert_stage); + std::cout << "insert to right: " << insert_pos + << ", insert_stage=" << (int)insert_stage << std::endl; + // right node: append [insert_pos(key, value)] + bool is_front_insert = (insert_pos == position_t::begin()); + bool is_end = STAGE_T::template append_insert( + left_key, left_laddr, append_at, right_appender, + is_front_insert, insert_stage, p_value); + assert(append_at.is_end() == is_end); + } + + // right node: append (insert_pos, end) + auto pos_end = position_t::end(); + STAGE_T::template append_until( + append_at, right_appender, pos_end, STAGE_T::STAGE); + assert(append_at.is_end()); + right_appender.wrap(); + fresh_right.node->dump(std::cout) << std::endl; + + // mutate left node + if (insert_left) { + p_value = this->extent.template split_insert_replayable( + split_at, left_key, left_laddr, insert_pos, insert_stage, insert_size); + } else { + this->extent.split_replayable(split_at); + } + this->dump(std::cout) << std::endl; + assert(p_value); + // TODO: common part end, move to NodeT + + auto split_pos_normalized = normalize(split_at.get_pos()); + auto insert_pos_normalized = normalize(std::move(insert_pos)); + std::cout << "split at " << split_pos_normalized + << ", insert at " << insert_pos_normalized + << ", insert_left=" << insert_left + << ", insert_stage=" << (int)insert_stage << std::endl; + track_split(split_pos_normalized, fresh_right.node); + if (insert_left) { + track_insert(insert_pos_normalized, insert_stage, left_child); + } else { + fresh_right.node->track_insert(insert_pos_normalized, insert_stage, left_child); + } + + this->validate_tracked_children(); + fresh_right.node->validate_tracked_children(); + + // propagate index to parent + return this->insert_parent(c, fresh_right.node); + // TODO (optimize) + // try to acquire space from siblings before split... 
see btrfs + }); + } + + static node_future allocate( + context_t c, level_t level, bool is_level_tail) { + assert(level != 0u); + return extent_t::allocate(c, level, is_level_tail + ).safe_then([](auto&& fresh_extent) { + auto ret = Ref(new ConcreteType()); + ret->extent = std::move(fresh_extent.extent); + return fresh_node_t{ret, fresh_extent.mut}; + }); + } + + private: + const laddr_t* get_p_value(const search_position_t& pos) const override final { + return this->get_value_ptr(pos); + } + +}; + +class InternalNode0 final : public InternalNodeT { + public: + node_future<> test_clone_root( + context_t c_other, RootNodeTracker& tracker_other) const override final { + assert(is_root()); + assert(is_level_tail()); + Ref this_ref = this; + return InternalNode0::allocate(c_other, level(), true + ).safe_then([this, c_other, &tracker_other](auto fresh_other) { + this->extent.test_copy_to(fresh_other.mut); + auto cloned_root = fresh_other.node; + return c_other.nm.get_super(c_other.t, tracker_other + ).safe_then([c_other, cloned_root](auto&& super_other) { + cloned_root->make_root_new(c_other, std::move(super_other)); + return cloned_root; + }); + }).safe_then([this_ref, this, c_other](auto cloned_root) { + // In some unit tests, the children are stubbed out that they + // don't exist in NodeExtentManager, and are only tracked in memory. 
+ return test_clone_children(c_other, cloned_root); + }); + } + + static node_future> allocate_root( + context_t c, level_t old_root_level, + laddr_t old_root_addr, Super::URef&& super) { + return allocate(c, old_root_level + 1, true + ).safe_then([c, old_root_addr, + super = std::move(super)](auto fresh_root) mutable { + auto root = fresh_root.node; + const laddr_t* p_value = root->get_value_ptr(search_position_t::end()); + fresh_root.mut.copy_in_absolute( + const_cast(p_value), old_root_addr); + root->make_root_from(c, std::move(super), old_root_addr); + return root; + }); + } +}; + +class InternalNode1 final : public InternalNodeT {}; +class InternalNode2 final : public InternalNodeT {}; +class InternalNode3 final : public InternalNodeT {}; + +template +class LeafNodeT: public LeafNode, + public NodeT { + public: + using parent_t = NodeT; + using extent_t = typename parent_t::extent_t; + using fresh_node_t = typename parent_t::fresh_node_t; + using node_stage_t = typename parent_t::node_stage_t; + using position_t = typename parent_t::position_t; + + virtual ~LeafNodeT() = default; + + node_future do_lower_bound( + context_t, const full_key_t& key, + MatchHistory& history) override final { + auto& node_stage = this->extent.read(); + if (unlikely(node_stage.keys() == 0)) { + assert(this->is_root()); + history.set(MatchKindCMP::NE); + auto p_cursor = get_or_track_cursor(search_position_t::end(), nullptr); + return node_ertr::make_ready_future( + search_result_t{p_cursor, MatchKindBS::NE}); + } + + auto result = STAGE_T::lower_bound_normalized(node_stage, key, history); + if (result.is_end()) { + assert(this->is_level_tail()); + } else { + assert(result.p_value); + } + auto p_cursor = get_or_track_cursor(result.position, result.p_value); + return node_ertr::make_ready_future( + search_result_t{p_cursor, result.match()}); + } + + node_future> lookup_smallest(context_t) override final { + auto& node_stage = this->extent.read(); + if (unlikely(node_stage.keys() == 0)) 
{ + assert(this->is_root()); + auto pos = search_position_t::end(); + return node_ertr::make_ready_future>( + get_or_track_cursor(pos, nullptr)); + } + + auto pos = search_position_t::begin(); + const onode_t* p_value = this->get_value_ptr(pos); + return node_ertr::make_ready_future>( + get_or_track_cursor(pos, p_value)); + } + + node_future> lookup_largest(context_t) override final { + auto& node_stage = this->extent.read(); + if (unlikely(node_stage.keys() == 0)) { + assert(this->is_root()); + auto pos = search_position_t::end(); + return node_ertr::make_ready_future>( + get_or_track_cursor(pos, nullptr)); + } + + search_position_t pos; + const onode_t* p_value = nullptr; + STAGE_T::lookup_largest_normalized(node_stage, pos, p_value); + return node_ertr::make_ready_future>( + get_or_track_cursor(pos, p_value)); + } + + node_future> insert_value( + context_t c, const full_key_t& key, const onode_t& value, + const search_position_t& pos, const MatchHistory& history) override final { +#ifndef NDEBUG + if (pos.is_end()) { + assert(this->is_level_tail()); + } +#endif + this->extent.prepare_mutate(c); + auto& node_stage = this->extent.read(); + + position_t insert_pos = cast_down(pos); + auto [insert_stage, insert_size] = + STAGE_T::evaluate_insert(key, value, history, insert_pos); + + // TODO: common part begin, move to NodeT + auto free_size = node_stage.free_size(); + if (free_size >= insert_size) { + auto p_value = this->extent.template insert_replayable( + key, value, insert_pos, insert_stage, insert_size); + assert(node_stage.free_size() == free_size - insert_size); + // TODO: common part end, move to NodeT + + assert(p_value->size == value.size); + auto insert_pos_normalized = normalize(std::move(insert_pos)); + assert(insert_pos_normalized <= pos); + assert(get_key_view(insert_pos_normalized) == key); + auto ret = track_insert(insert_pos_normalized, insert_stage, p_value); + this->validate_tracked_cursors(); + return node_ertr::make_ready_future>(ret); + } + + 
std::cout << " try insert at: " << insert_pos + << ", insert_stage=" << (int)insert_stage + << ", insert_size=" << insert_size + << std::endl; + + Ref this_ref = this; + return (is_root() ? this->upgrade_root(c) : node_ertr::now() + ).safe_then([this, c] { + return ConcreteType::allocate(c, this->is_level_tail()); + }).safe_then([this_ref, this, c, &key, &value, &history, + insert_pos, insert_stage, insert_size](auto fresh_right) mutable { + auto& node_stage = this->extent.read(); + size_t empty_size = node_stage.size_before(0); + size_t available_size = node_stage.total_size() - empty_size; + size_t target_split_size = empty_size + (available_size + insert_size) / 2; + // TODO adjust NODE_BLOCK_SIZE according to this requirement + assert(insert_size < available_size / 2); + typename STAGE_T::StagedIterator split_at; + bool insert_left = STAGE_T::locate_split( + node_stage, target_split_size, insert_pos, insert_stage, insert_size, split_at); + + std::cout << " split at: " << split_at << ", insert_left=" << insert_left + << ", now insert at: " << insert_pos + << std::endl; + + auto append_at = split_at; + // TODO(cross-node string dedup) + typename STAGE_T::template StagedAppender right_appender; + right_appender.init(&fresh_right.mut, fresh_right.mut.get_write()); + const onode_t* p_value = nullptr; + if (!insert_left) { + // right node: append [start(append_at), insert_pos) + STAGE_T::template append_until( + append_at, right_appender, insert_pos, insert_stage); + std::cout << "insert to right: " << insert_pos + << ", insert_stage=" << (int)insert_stage << std::endl; + // right node: append [insert_pos(key, value)] + bool is_front_insert = (insert_pos == position_t::begin()); + bool is_end = STAGE_T::template append_insert( + key, value, append_at, right_appender, + is_front_insert, insert_stage, p_value); + assert(append_at.is_end() == is_end); + } + + // right node: append (insert_pos, end) + auto pos_end = position_t::end(); + STAGE_T::template append_until( + 
append_at, right_appender, pos_end, STAGE_T::STAGE); + assert(append_at.is_end()); + right_appender.wrap(); + fresh_right.node->dump(std::cout) << std::endl; + + // mutate left node + if (insert_left) { + p_value = this->extent.template split_insert_replayable( + split_at, key, value, insert_pos, insert_stage, insert_size); + } else { + this->extent.split_replayable(split_at); + } + this->dump(std::cout) << std::endl; + assert(p_value); + // TODO: common part end, move to NodeT + + auto split_pos_normalized = normalize(split_at.get_pos()); + auto insert_pos_normalized = normalize(std::move(insert_pos)); + std::cout << "split at " << split_pos_normalized + << ", insert at " << insert_pos_normalized + << ", insert_left=" << insert_left + << ", insert_stage=" << (int)insert_stage << std::endl; + track_split(split_pos_normalized, fresh_right.node); + Ref ret; + if (insert_left) { + assert(this->get_key_view(insert_pos_normalized) == key); + ret = track_insert(insert_pos_normalized, insert_stage, p_value); + } else { + assert(fresh_right.node->get_key_view(insert_pos_normalized) == key); + ret = fresh_right.node->track_insert(insert_pos_normalized, insert_stage, p_value); + } + + this->validate_tracked_cursors(); + fresh_right.node->validate_tracked_cursors(); + + // propagate index to parent + return this->insert_parent(c, fresh_right.node).safe_then([ret] { + return ret; + }); + // TODO (optimize) + // try to acquire space from siblings before split... 
see btrfs + }); + } + + static node_future allocate(context_t c, bool is_level_tail) { + return extent_t::allocate(c, 0u, is_level_tail + ).safe_then([](auto&& fresh_extent) { + auto ret = Ref(new ConcreteType()); + ret->extent = std::move(fresh_extent.extent); + return fresh_node_t{ret, fresh_extent.mut}; + }); + } + + private: + const onode_t* get_p_value(const search_position_t& pos) const override final { + return this->get_value_ptr(pos); + } +}; +class LeafNode0 final : public LeafNodeT { + public: + node_future<> test_clone_root( + context_t c_other, RootNodeTracker& tracker_other) const override final { + assert(this->is_root()); + assert(is_level_tail()); + Ref this_ref = this; + return LeafNode0::allocate(c_other, true + ).safe_then([this, c_other, &tracker_other](auto fresh_other) { + this->extent.test_copy_to(fresh_other.mut); + auto cloned_root = fresh_other.node; + return c_other.nm.get_super(c_other.t, tracker_other + ).safe_then([c_other, cloned_root](auto&& super_other) { + cloned_root->make_root_new(c_other, std::move(super_other)); + }); + }).safe_then([this_ref]{}); + } + + static node_future<> mkfs(context_t c, RootNodeTracker& root_tracker) { + return allocate(c, true + ).safe_then([c, &root_tracker](auto fresh_node) { + auto root = fresh_node.node; + return c.nm.get_super(c.t, root_tracker + ).safe_then([c, root](auto&& super) { + root->make_root_new(c, std::move(super)); + }); + }); + } +}; +class LeafNode1 final : public LeafNodeT {}; +class LeafNode2 final : public LeafNodeT {}; +class LeafNode3 final : public LeafNodeT {}; + +inline Node::node_future> load_node( + context_t c, laddr_t addr, bool expect_is_level_tail) { + // NOTE: + // *option1: all types of node have the same length; + // option2: length is defined by node/field types; + // option3: length is totally flexible; + return c.nm.read_extent(c.t, addr, NODE_BLOCK_SIZE + ).safe_then([expect_is_level_tail](auto extent) { + const auto header = reinterpret_cast(extent->get_read()); 
+ auto _field_type = header->get_field_type(); + if (!_field_type.has_value()) { + throw std::runtime_error("load failed: bad field type"); + } + auto _node_type = header->get_node_type(); + if (_field_type == field_type_t::N0) { + if (_node_type == node_type_t::LEAF) { + return LeafNode0::load(extent, expect_is_level_tail); + } else { + return InternalNode0::load(extent, expect_is_level_tail); + } + } else if (_field_type == field_type_t::N1) { + if (_node_type == node_type_t::LEAF) { + return LeafNode1::load(extent, expect_is_level_tail); + } else { + return InternalNode1::load(extent, expect_is_level_tail); + } + } else if (_field_type == field_type_t::N2) { + if (_node_type == node_type_t::LEAF) { + return LeafNode2::load(extent, expect_is_level_tail); + } else { + return InternalNode2::load(extent, expect_is_level_tail); + } + } else if (_field_type == field_type_t::N3) { + if (_node_type == node_type_t::LEAF) { + return LeafNode3::load(extent, expect_is_level_tail); + } else { + return InternalNode3::load(extent, expect_is_level_tail); + } + } else { + assert(false); + } + }); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl_replayable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl_replayable.h new file mode 100644 index 00000000000..cc7b55e2119 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl_replayable.h @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +// TODO: remove +#include + +#include "node_extent_mutable.h" +#include "stages/node_stage.h" +#include "stages/stage.h" + +#define STAGE_T node_to_stage_t + +namespace crimson::os::seastore::onode { + +template +struct NodeLayoutReplayableT { + using node_stage_t = node_extent_t; + using position_t = typename STAGE_T::position_t; + using StagedIterator = typename STAGE_T::StagedIterator; + using value_t = value_type_t; + static constexpr 
auto FIELD_TYPE = FieldType::FIELD_TYPE; + + template + static const value_t* insert( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + const full_key_t& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + auto p_value = STAGE_T::template proceed_insert( + mut, node_stage, key, value, insert_pos, insert_stage, insert_size); + return p_value; + } + + static void split( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + StagedIterator& split_at) { + node_stage_t::update_is_level_tail(mut, node_stage, false); + STAGE_T::trim(mut, split_at); + } + + template + static const value_t* split_insert( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + StagedIterator& split_at, + const full_key_t& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + node_stage_t::update_is_level_tail(mut, node_stage, false); + STAGE_T::trim(mut, split_at); + std::cout << "insert to left: " << insert_pos + << ", insert_stage=" << (int)insert_stage << std::endl; + auto p_value = STAGE_T::template proceed_insert( + mut, node_stage, key, value, insert_pos, insert_stage, insert_size); + return p_value; + } + + static void prepare_internal_split( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + const laddr_t left_child_addr, + const laddr_t right_child_addr, + laddr_t* p_split_addr) { + assert(NODE_TYPE == node_type_t::INTERNAL); + assert(*p_split_addr == left_child_addr); + mut.copy_in_absolute(p_split_addr, right_child_addr); + } + +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h new file mode 100644 index 00000000000..17878d8e4b9 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 
sw=2 smarttab + +#pragma once + +#include +#include + +#include "fwd.h" + +namespace crimson::os::seastore::onode { + +constexpr uint8_t FIELD_TYPE_MAGIC = 0x25; +enum class field_type_t : uint8_t { + N0 = FIELD_TYPE_MAGIC, + N1, + N2, + N3, + _MAX +}; +inline uint8_t to_unsigned(field_type_t type) { + auto value = static_cast(type); + assert(value >= FIELD_TYPE_MAGIC); + assert(value < static_cast(field_type_t::_MAX)); + return value - FIELD_TYPE_MAGIC; +} +inline std::ostream& operator<<(std::ostream &os, field_type_t type) { + const char* const names[] = {"0", "1", "2", "3"}; + auto index = to_unsigned(type); + os << names[index]; + return os; +} + +enum class node_type_t : uint8_t { + LEAF = 0, + INTERNAL +}; +inline std::ostream& operator<<(std::ostream &os, const node_type_t& type) { + const char* const names[] = {"L", "I"}; + auto index = static_cast(type); + assert(index <= 1u); + os << names[index]; + return os; +} + +using level_t = uint8_t; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc new file mode 100644 index 00000000000..42160d2dd08 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "item_iterator_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +#define ITER_T item_iterator_t +#define ITER_INST(NT) item_iterator_t +#define ITER_TEMPLATE(NT) template class ITER_INST(NT) +ITER_TEMPLATE(node_type_t::LEAF); +ITER_TEMPLATE(node_type_t::INTERNAL); + +template +template +memory_range_t ITER_T::insert_prefix( + NodeExtentMutable& mut, const ITER_T& iter, const full_key_t& key, + bool is_end, node_offset_t size, const char* p_left_bound) { + // 1. 
insert range + char* p_insert; + if (is_end) { + assert(!iter.has_next()); + p_insert = const_cast(iter.p_start()); + } else { + p_insert = const_cast(iter.p_end()); + } + char* p_insert_front = p_insert - size; + + // 2. shift memory + const char* p_shift_start = p_left_bound; + const char* p_shift_end = p_insert; + mut.shift_absolute(p_shift_start, + p_shift_end - p_shift_start, + -(int)size); + + // 3. append header + p_insert -= sizeof(node_offset_t); + node_offset_t back_offset = (p_insert - p_insert_front); + mut.copy_in_absolute(p_insert, back_offset); + ns_oid_view_t::append(mut, key, p_insert); + + return {p_insert_front, p_insert}; +} +#define IP_TEMPLATE(NT, KT) \ + template memory_range_t ITER_INST(NT)::insert_prefix( \ + NodeExtentMutable&, const ITER_INST(NT)&, const full_key_t&, \ + bool, node_offset_t, const char*) +IP_TEMPLATE(node_type_t::LEAF, KeyT::VIEW); +IP_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW); +IP_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ); +IP_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ); + +template +void ITER_T::update_size( + NodeExtentMutable& mut, const ITER_T& iter, int change) { + node_offset_t offset = iter.get_back_offset(); + assert(change + offset > 0); + assert(change + offset < NODE_BLOCK_SIZE); + mut.copy_in_absolute( + (void*)iter.get_item_range().p_end, node_offset_t(offset + change)); +} + +template +size_t ITER_T::trim_until(NodeExtentMutable&, const ITER_T& iter) { + assert(iter.index() != 0); + return iter.p_end() - iter.p_items_start; +} + +template +size_t ITER_T::trim_at( + NodeExtentMutable& mut, const ITER_T& iter, size_t trimmed) { + size_t trim_size = iter.p_start() - iter.p_items_start + trimmed; + assert(iter.get_back_offset() > trimmed); + node_offset_t new_offset = iter.get_back_offset() - trimmed; + mut.copy_in_absolute((void*)iter.item_range.p_end, new_offset); + return trim_size; +} + +#define APPEND_T ITER_T::Appender +template class ITER_INST(node_type_t::LEAF)::Appender; +template class 
ITER_INST(node_type_t::INTERNAL)::Appender; +template class ITER_INST(node_type_t::LEAF)::Appender; +template class ITER_INST(node_type_t::INTERNAL)::Appender; + +template +template +bool APPEND_T::append(const ITER_T& src, size_t& items, index_t type) { + auto p_end = src.p_end(); + if (items != INDEX_END) { + for (auto i = 1u; i <= items; ++i) { + if (!src.has_next()) { + assert(i == items); + type = index_t::end; + break; + } + ++src; + } + } else if (type != index_t::none) { + items = 0; + while (src.has_next()) { + ++src; + ++items; + } + if (type == index_t::end) { + ++items; + } + } else { + assert(false); + } + const char* p_start; + if (type == index_t::end) { + // include last + p_start = src.p_start(); + } else { + // exclude last + p_start = src.p_end(); + } + assert(p_end >= p_start); + size_t append_size = p_end - p_start; + p_append -= append_size; + p_mut->copy_in_absolute(p_append, p_start, append_size); + return type == index_t::end; +} + +template +template +std::tuple +APPEND_T::open_nxt(const key_get_type& partial_key) { + p_append -= sizeof(node_offset_t); + p_offset_while_open = p_append; + ns_oid_view_t::append(*p_mut, partial_key, p_append); + return {p_mut, p_append}; +} + +template +template +std::tuple +APPEND_T::open_nxt(const full_key_t& key) { + p_append -= sizeof(node_offset_t); + p_offset_while_open = p_append; + ns_oid_view_t::append(*p_mut, key, p_append); + return {p_mut, p_append}; +} + +template +template +void APPEND_T::wrap_nxt(char* _p_append) { + assert(_p_append < p_append); + p_mut->copy_in_absolute( + p_offset_while_open, node_offset_t(p_offset_while_open - _p_append)); + p_append = _p_append; +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h new file mode 100644 index 00000000000..6c7e928b807 --- /dev/null +++ 
b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h @@ -0,0 +1,135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +/* + * internal/leaf node N0, N1 + * + * (_index) + * p_items_start + * | item_range ------------+ + * | | +----key---------+ + * | | | | + * V V V V + * | |sub |oid char|ns char|colli-| | + * |...|items|array & |array &|-sion |...| + * | |... |len |len |offset| | + * ^ | + * | | + * +---- back_offset -----+ + */ +template +class item_iterator_t { + using value_t = value_type_t; + public: + item_iterator_t(const memory_range_t& range) + : p_items_start(range.p_start) { next_item_range(range.p_end); } + + const char* p_start() const { return item_range.p_start; } + const char* p_end() const { return item_range.p_end + sizeof(node_offset_t); } + const memory_range_t& get_item_range() const { return item_range; } + node_offset_t get_back_offset() const { return back_offset; } + + // container type system + using key_get_type = const ns_oid_view_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::ITERATIVE; + size_t index() const { return _index; } + key_get_type get_key() const { + if (!key.has_value()) { + key = ns_oid_view_t(item_range.p_end); + assert(item_range.p_start < (*key).p_start()); + } + return *key; + } + size_t size() const { + return item_range.p_end - item_range.p_start + sizeof(node_offset_t); + }; + size_t size_to_nxt() const { + return get_key().size() + sizeof(node_offset_t); + } + memory_range_t get_nxt_container() const { + return {item_range.p_start, get_key().p_start()}; + } + bool has_next() const { + assert(p_items_start <= item_range.p_start); + return p_items_start < item_range.p_start; + } + const item_iterator_t& 
operator++() const { + assert(has_next()); + next_item_range(item_range.p_start); + key.reset(); + ++_index; + return *this; + } + + static node_offset_t header_size() { return 0u; } + + template + static node_offset_t estimate_insert( + const full_key_t& key, const value_t&) { + return ns_oid_view_t::estimate_size(key) + sizeof(node_offset_t); + } + + template + static memory_range_t insert_prefix( + NodeExtentMutable& mut, const item_iterator_t& iter, + const full_key_t& key, bool is_end, + node_offset_t size, const char* p_left_bound); + + static void update_size( + NodeExtentMutable& mut, const item_iterator_t& iter, int change); + + static size_t trim_until(NodeExtentMutable&, const item_iterator_t&); + static size_t trim_at( + NodeExtentMutable&, const item_iterator_t&, size_t trimmed); + + enum class index_t { none, last, end }; + template + class Appender; + + private: + void next_item_range(const char* p_end) const { + auto p_item_end = p_end - sizeof(node_offset_t); + assert(p_items_start < p_item_end); + back_offset = *reinterpret_cast(p_item_end); + assert(back_offset); + const char* p_item_start = p_item_end - back_offset; + assert(p_items_start <= p_item_start); + item_range = {p_item_start, p_item_end}; + } + + const char* p_items_start; + mutable memory_range_t item_range; + mutable node_offset_t back_offset; + mutable std::optional key; + mutable size_t _index = 0u; +}; + +template +template +class item_iterator_t::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} {} + bool append(const item_iterator_t& src, size_t& items, index_t type); + char* wrap() { return p_append; } + std::tuple open_nxt(const key_get_type&); + std::tuple open_nxt(const full_key_t&); + void wrap_nxt(char* _p_append); + + private: + NodeExtentMutable* p_mut; + char* p_append; + char* p_offset_while_open; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc 
b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc new file mode 100644 index 00000000000..725dad0b528 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc @@ -0,0 +1,31 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "key_layout.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +void string_key_view_t::append_str( + NodeExtentMutable& mut, const char* data, size_t len, char*& p_append) { + p_append -= sizeof(string_size_t); + assert(len < std::numeric_limits::max()); + mut.copy_in_absolute(p_append, (string_size_t)len); + p_append -= len; + mut.copy_in_absolute(p_append, data, len); +} + +void string_key_view_t::append_dedup( + NodeExtentMutable& mut, const Type& dedup_type, char*& p_append) { + p_append -= sizeof(string_size_t); + if (dedup_type == Type::MIN) { + mut.copy_in_absolute(p_append, (string_size_t)0u); + } else if (dedup_type == Type::MAX) { + mut.copy_in_absolute(p_append, std::numeric_limits::max()); + } else { + assert(false); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h new file mode 100644 index 00000000000..77d5e6705bd --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h @@ -0,0 +1,598 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include + +#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/tree_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; +class key_view_t; +class key_hobj_t; +enum class KeyT { VIEW, HOBJ }; +template struct _key_type; +template<> struct 
_key_type { using type = key_view_t; }; +template<> struct _key_type { using type = key_hobj_t; }; +template +using full_key_t = typename _key_type::type; + +// TODO: consider alignments +struct shard_pool_t { + bool operator==(const shard_pool_t& x) const { + return (shard == x.shard && pool == x.pool); + } + bool operator!=(const shard_pool_t& x) const { return !(*this == x); } + + template + static shard_pool_t from_key(const full_key_t& key); + + shard_t shard; + pool_t pool; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const shard_pool_t& sp) { + return os << (unsigned)sp.shard << "," << sp.pool; +} + +struct crush_t { + bool operator==(const crush_t& x) const { return crush == x.crush; } + bool operator!=(const crush_t& x) const { return !(*this == x); } + + template + static crush_t from_key(const full_key_t& key); + + crush_hash_t crush; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const crush_t& c) { + return os << c.crush; +} + +struct shard_pool_crush_t { + bool operator==(const shard_pool_crush_t& x) const { + return (shard_pool == x.shard_pool && crush == x.crush); + } + bool operator!=(const shard_pool_crush_t& x) const { return !(*this == x); } + + template + static shard_pool_crush_t from_key(const full_key_t& key); + + shard_pool_t shard_pool; + crush_t crush; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const shard_pool_crush_t& spc) { + return os << spc.shard_pool << "," << spc.crush; +} + +struct snap_gen_t { + bool operator==(const snap_gen_t& x) const { + return (snap == x.snap && gen == x.gen); + } + bool operator!=(const snap_gen_t& x) const { return !(*this == x); } + + template + static snap_gen_t from_key(const full_key_t& key); + + snap_t snap; + gen_t gen; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const snap_gen_t& sg) { + return os << sg.snap << "," << sg.gen; +} + +struct string_key_view_t { + enum 
class Type {MIN, STR, MAX}; + // presumably the maximum string length is 2KiB + using string_size_t = uint16_t; + string_key_view_t(const char* p_end) { + p_length = p_end - sizeof(string_size_t); + std::memcpy(&length, p_length, sizeof(string_size_t)); + if (length && length != std::numeric_limits::max()) { + auto _p_key = p_length - length; + p_key = static_cast(_p_key); + } else { + p_key = nullptr; + } + } + Type type() const { + if (length == 0u) { + return Type::MIN; + } else if (length == std::numeric_limits::max()) { + return Type::MAX; + } else { + return Type::STR; + } + } + const char* p_start() const { + if (p_key) { + return p_key; + } else { + return p_length; + } + } + const char* p_next_end() const { + if (p_key) { + return p_start(); + } else { + return p_length + sizeof(string_size_t); + } + } + size_t size() const { return length + sizeof(string_size_t); } + bool operator==(const string_key_view_t& x) const { + if (type() == x.type() && type() != Type::STR) + return true; + if (type() != x.type()) + return false; + if (length != x.length) + return false; + return (memcmp(p_key, x.p_key, length) == 0); + } + bool operator!=(const string_key_view_t& x) const { return !(*this == x); } + + static void append_str( + NodeExtentMutable&, const char* data, size_t len, char*& p_append); + + static void append_str(const char* data, size_t len, char*& p_append) { + p_append -= sizeof(string_size_t); + assert(len < std::numeric_limits::max()); + string_size_t _len = len; + std::memcpy(p_append, &_len, sizeof(string_size_t)); + p_append -= len; + std::memcpy(p_append, data, len); + } + + static void append_str(NodeExtentMutable& mut, + const std::string& str, + char*& p_append) { + append_str(mut, str.data(), str.length(), p_append); + } + + static void append_str(NodeExtentMutable& mut, + const string_key_view_t& view, + char*& p_append) { + assert(view.type() == Type::STR); + append_str(mut, view.p_key, view.length, p_append); + } + + static void 
append_str(const std::string& str, char*& p_append) { + append_str(str.data(), str.length(), p_append); + } + + static void append_dedup( + NodeExtentMutable&, const Type& dedup_type, char*& p_append); + + static void append_dedup(const Type& dedup_type, char*& p_append) { + p_append -= sizeof(string_size_t); + string_size_t len; + if (dedup_type == Type::MIN) { + len = 0u; + } else if (dedup_type == Type::MAX) { + len = std::numeric_limits::max(); + } else { + assert(false); + } + std::memcpy(p_append, &len, sizeof(string_size_t)); + } + + const char* p_key; + const char* p_length; + // TODO: remove if p_length is aligned + string_size_t length; + + friend std::ostream& operator<<(std::ostream&, const string_key_view_t&); +}; +inline MatchKindCMP compare_to(const string_key_view_t& l, const string_key_view_t& r) { + using Type = string_key_view_t::Type; + auto l_type = l.type(); + auto r_type = r.type(); + if (l_type == Type::STR && r_type == Type::STR) { + return toMatchKindCMP(l.p_key, l.length, r.p_key, r.length); + } else if (l_type == r_type) { + return MatchKindCMP::EQ; + } else if (l_type == Type::MIN || r_type == Type::MAX) { + return MatchKindCMP::NE; + } else { // l_type == Type::MAX || r_type == Type::MIN + return MatchKindCMP::PO; + } +} +inline MatchKindCMP compare_to(const std::string& key, const string_key_view_t& target) { + assert(key.length()); + if (target.type() == string_key_view_t::Type::MIN) { + return MatchKindCMP::PO; + } else if (target.type() == string_key_view_t::Type::MAX) { + return MatchKindCMP::NE; + } else { + return toMatchKindCMP(key, target.p_key, target.length); + } +} +inline MatchKindCMP compare_to(const string_key_view_t& key, const std::string& target) { + return reverse(compare_to(target, key)); +} +inline MatchKindCMP compare_to(const std::string& key, const std::string& target) { + return toMatchKindCMP(key, target); +} + +inline std::ostream& operator<<(std::ostream& os, const string_key_view_t& view) { + auto type = 
view.type(); + if (type == string_key_view_t::Type::MIN) { + return os << "MIN"; + } else if (type == string_key_view_t::Type::MAX) { + return os << "MAX"; + } else { + if (view.length <= 12) { + os << "\"" << std::string(view.p_key, 0, view.length) << "\""; + } else { + os << "\"" << std::string(view.p_key, 0, 4) << ".." + << std::string(view.p_key + view.length - 2, 0, 2) + << "/" << view.length << "B\""; + } + return os; + } +} + +struct ns_oid_view_t { + using string_size_t = string_key_view_t::string_size_t; + using Type = string_key_view_t::Type; + + ns_oid_view_t(const char* p_end) : nspace(p_end), oid(nspace.p_next_end()) {} + Type type() const { return oid.type(); } + const char* p_start() const { return oid.p_start(); } + size_t size() const { + if (type() == Type::STR) { + return nspace.size() + oid.size(); + } else { + return sizeof(string_size_t); + } + } + bool operator==(const ns_oid_view_t& x) const { + return (nspace == x.nspace && oid == x.oid); + } + bool operator!=(const ns_oid_view_t& x) const { return !(*this == x); } + + template + static node_offset_t estimate_size(const full_key_t& key); + + template + static void append(NodeExtentMutable&, + const full_key_t& key, + char*& p_append); + + static void append(NodeExtentMutable& mut, + const ns_oid_view_t& view, + char*& p_append) { + if (view.type() == Type::STR) { + string_key_view_t::append_str(mut, view.nspace, p_append); + string_key_view_t::append_str(mut, view.oid, p_append); + } else { + string_key_view_t::append_dedup(mut, view.type(), p_append); + } + } + + template + static void append(const full_key_t& key, char*& p_append); + + string_key_view_t nspace; + string_key_view_t oid; +}; +inline std::ostream& operator<<(std::ostream& os, const ns_oid_view_t& ns_oid) { + return os << ns_oid.nspace << "," << ns_oid.oid; +} + +class key_hobj_t { + public: + explicit key_hobj_t(const onode_key_t& key) : key{key} {} + + /* + * common interface as full_key_t + */ + shard_t shard() const { + 
return key.shard; + } + pool_t pool() const { + return key.pool; + } + crush_hash_t crush() const { + return key.crush; + } + const std::string& nspace() const { + return key.nspace; + } + const std::string& oid() const { + return key.oid; + } + ns_oid_view_t::Type dedup_type() const { + return _dedup_type; + } + snap_t snap() const { + return key.snap; + } + gen_t gen() const { + return key.gen; + } + + bool operator==(const full_key_t& o) const; + bool operator==(const full_key_t& o) const; + bool operator!=(const full_key_t& o) const { + return !operator==(o); + } + bool operator!=(const full_key_t& o) const { + return !operator==(o); + } + + private: + ns_oid_view_t::Type _dedup_type = ns_oid_view_t::Type::STR; + onode_key_t key; +}; +inline std::ostream& operator<<(std::ostream& os, const key_hobj_t& key) { + return os << "key_hobj(" + << (unsigned)key.shard() << "," + << key.pool() << "," << key.crush() << "; \"" + << key.nspace() << "\",\"" << key.oid() << "\"; " + << key.snap() << "," << key.gen() << ")"; +} + +class key_view_t { + public: + /* + * common interface as full_key_t + */ + shard_t shard() const { + return shard_pool_packed().shard; + } + pool_t pool() const { + return shard_pool_packed().pool; + } + crush_hash_t crush() const { + return crush_packed().crush; + } + const string_key_view_t& nspace() const { + return ns_oid_view().nspace; + } + const string_key_view_t& oid() const { + return ns_oid_view().oid; + } + ns_oid_view_t::Type dedup_type() const { + return ns_oid_view().type(); + } + snap_t snap() const { + return snap_gen_packed().snap; + } + gen_t gen() const { + return snap_gen_packed().gen; + } + + bool operator==(const full_key_t& o) const; + bool operator==(const full_key_t& o) const; + bool operator!=(const full_key_t& o) const { + return !operator==(o); + } + bool operator!=(const full_key_t& o) const { + return !operator==(o); + } + + /* + * key_view_t specific interfaces + */ + + bool has_shard_pool() const { + return 
p_shard_pool != nullptr; + } + bool has_crush() const { + return p_crush != nullptr; + } + bool has_ns_oid() const { + return p_ns_oid.has_value(); + } + bool has_snap_gen() const { + return p_snap_gen != nullptr; + } + + const shard_pool_t& shard_pool_packed() const { + assert(has_shard_pool()); + return *p_shard_pool; + } + const crush_t& crush_packed() const { + assert(has_crush()); + return *p_crush; + } + const ns_oid_view_t& ns_oid_view() const { + assert(has_ns_oid()); + return *p_ns_oid; + } + const snap_gen_t& snap_gen_packed() const { + assert(has_snap_gen()); + return *p_snap_gen; + } + + void set(const crush_t& key) { + assert(!has_crush()); + p_crush = &key; + } + void set(const shard_pool_crush_t& key) { + set(key.crush); + assert(!has_shard_pool()); + p_shard_pool = &key.shard_pool; + } + void set(const ns_oid_view_t& key) { + assert(!has_ns_oid()); + p_ns_oid = key; + } + void set(const snap_gen_t& key) { + assert(!has_snap_gen()); + p_snap_gen = &key; + } + + private: + const shard_pool_t* p_shard_pool = nullptr; + const crush_t* p_crush = nullptr; + std::optional p_ns_oid; + const snap_gen_t* p_snap_gen = nullptr; +}; + +template +bool compare_full_key(const full_key_t& l, const full_key_t& r) { + if (l.shard() != r.shard()) + return false; + if (l.pool() != r.pool()) + return false; + if (l.crush() != r.crush()) + return false; + if (compare_to(l.nspace(), r.nspace()) != MatchKindCMP::EQ) + return false; + if (compare_to(l.oid(), r.oid()) != MatchKindCMP::EQ) + return false; + if (l.snap() != r.snap()) + return false; + if (l.gen() != r.gen()) + return false; + return true; +} + +inline bool key_hobj_t::operator==(const full_key_t& o) const { + return compare_full_key(*this, o); +} +inline bool key_hobj_t::operator==(const full_key_t& o) const { + return compare_full_key(*this, o); +} +inline bool key_view_t::operator==(const full_key_t& o) const { + return compare_full_key(*this, o); +} +inline bool key_view_t::operator==(const full_key_t& o) 
const { + return compare_full_key(*this, o); +} + +inline std::ostream& operator<<(std::ostream& os, const key_view_t& key) { + os << "key_view("; + if (key.has_shard_pool()) { + os << (unsigned)key.shard() << "," << key.pool() << ","; + } else { + os << "X,X,"; + } + if (key.has_crush()) { + os << key.crush() << "; "; + } else { + os << "X; "; + } + if (key.has_ns_oid()) { + os << key.nspace() << "," << key.oid() << "; "; + } else { + os << "X,X; "; + } + if (key.has_snap_gen()) { + os << key.snap() << "," << key.gen() << ")"; + } else { + os << "X,X)"; + } + return os; +} + +template +MatchKindCMP compare_to(const full_key_t& key, const shard_pool_t& target) { + auto ret = toMatchKindCMP(key.shard(), target.shard); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(key.pool(), target.pool); +} + +template +MatchKindCMP compare_to(const full_key_t& key, const crush_t& target) { + return toMatchKindCMP(key.crush(), target.crush); +} + +template +MatchKindCMP compare_to(const full_key_t& key, const shard_pool_crush_t& target) { + auto ret = compare_to(key, target.shard_pool); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to(key, target.crush); +} + +template +MatchKindCMP compare_to(const full_key_t& key, const ns_oid_view_t& target) { + auto ret = compare_to(key.nspace(), target.nspace); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to(key.oid(), target.oid); +} + +template +MatchKindCMP compare_to(const full_key_t& key, const snap_gen_t& target) { + auto ret = toMatchKindCMP(key.snap(), target.snap); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(key.gen(), target.gen); +} + +template +shard_pool_t shard_pool_t::from_key(const full_key_t& key) { + if constexpr (KT == KeyT::VIEW) { + return key.shard_pool_packed(); + } else { + return {key.shard(), key.pool()}; + } +} + +template +crush_t crush_t::from_key(const full_key_t& key) { + if constexpr (KT == KeyT::VIEW) { + return key.crush_packed(); 
+ } else { + return {key.crush()}; + } +} + +template +shard_pool_crush_t shard_pool_crush_t::from_key(const full_key_t& key) { + return {shard_pool_t::from_key(key), crush_t::from_key(key)}; +} + +template +snap_gen_t snap_gen_t::from_key(const full_key_t& key) { + if constexpr (KT == KeyT::VIEW) { + return key.snap_gen_packed(); + } else { + return {key.snap(), key.gen()}; + } +} + +template +node_offset_t ns_oid_view_t::estimate_size(const full_key_t& key) { + if constexpr (KT == KeyT::VIEW) { + return key.ns_oid_view().size(); + } else { + if (key.dedup_type() != Type::STR) { + // size after deduplication + return sizeof(string_size_t); + } else { + return 2 * sizeof(string_size_t) + key.nspace().size() + key.oid().size(); + } + } +} + +template +void ns_oid_view_t::append( + NodeExtentMutable& mut, const full_key_t& key, char*& p_append) { + if (key.dedup_type() == Type::STR) { + string_key_view_t::append_str(mut, key.nspace(), p_append); + string_key_view_t::append_str(mut, key.oid(), p_append); + } else { + string_key_view_t::append_dedup(mut, key.dedup_type(), p_append); + } +} + +template +void ns_oid_view_t::append(const full_key_t& key, char*& p_append) { + if (key.dedup_type() == Type::STR) { + string_key_view_t::append_str(key.nspace(), p_append); + string_key_view_t::append_str(key.oid(), p_append); + } else { + string_key_view_t::append_dedup(key.dedup_type(), p_append); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_layout.cc new file mode 100644 index 00000000000..2dc828f69ed --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_layout.cc @@ -0,0 +1,95 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "node_layout.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace 
crimson::os::seastore::onode { + +void node_header_t::bootstrap_extent( + NodeExtentMutable& mut, + field_type_t field_type, node_type_t node_type, + bool is_level_tail, level_t level) { + node_header_t header; + header.set_field_type(field_type); + header.set_node_type(node_type); + header.set_is_level_tail(is_level_tail); + header.level = level; + mut.copy_in_relative(0, header); +} + +void node_header_t::update_is_level_tail( + NodeExtentMutable& mut, const node_header_t& header, bool value) { + auto& _header = const_cast(header); + _header.set_is_level_tail(value); + mut.validate_inplace_update(_header); +} + +#define F013_T _node_fields_013_t +#define F013_INST(ST) _node_fields_013_t +#define F013_TEMPLATE(ST) template struct F013_INST(ST) +F013_TEMPLATE(slot_0_t); +F013_TEMPLATE(slot_1_t); +F013_TEMPLATE(slot_3_t); + +template +void F013_T::update_size_at( + NodeExtentMutable& mut, const me_t& node, size_t index, int change) { + assert(index <= node.num_keys); + for (const auto* p_slot = &node.slots[index]; + p_slot < &node.slots[node.num_keys]; + ++p_slot) { + node_offset_t offset = p_slot->right_offset; + mut.copy_in_absolute( + (void*)&(p_slot->right_offset), + node_offset_t(offset - change)); + } +} + +template +void F013_T::append_key( + NodeExtentMutable& mut, const key_t& key, char*& p_append) { + mut.copy_in_absolute(p_append, key); + p_append += sizeof(key_t); +} + +template +void F013_T::append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) { + mut.copy_in_absolute(p_append, offset_to_right); + p_append += sizeof(node_offset_t); +} + +template +template +void F013_T::insert_at( + NodeExtentMutable& mut, const full_key_t& key, + const me_t& node, size_t index, node_offset_t size_right) { + assert(index <= node.num_keys); + update_size_at(mut, node, index, size_right); + auto p_insert = const_cast(fields_start(node)) + + node.get_key_start_offset(index); + auto p_shift_end = fields_start(node) + 
node.get_key_start_offset(node.num_keys); + mut.shift_absolute(p_insert, p_shift_end - p_insert, estimate_insert_one()); + mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys + 1)); + append_key(mut, key_t::template from_key(key), p_insert); + append_offset(mut, node.get_item_end_offset(index) - size_right, p_insert); +} +#define IA_TEMPLATE(ST, KT) template void F013_INST(ST):: \ + insert_at(NodeExtentMutable&, const full_key_t&, \ + const F013_INST(ST)&, size_t, node_offset_t) +IA_TEMPLATE(slot_0_t, KeyT::VIEW); +IA_TEMPLATE(slot_1_t, KeyT::VIEW); +IA_TEMPLATE(slot_3_t, KeyT::VIEW); +IA_TEMPLATE(slot_0_t, KeyT::HOBJ); +IA_TEMPLATE(slot_1_t, KeyT::HOBJ); +IA_TEMPLATE(slot_3_t, KeyT::HOBJ); + +void node_fields_2_t::append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) { + mut.copy_in_absolute(p_append, offset_to_right); + p_append += sizeof(node_offset_t); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_layout.h new file mode 100644 index 00000000000..67405a376a0 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_layout.h @@ -0,0 +1,434 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "key_layout.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +struct node_header_t { + static constexpr unsigned FIELD_TYPE_BITS = 6u; + static_assert(static_cast(field_type_t::_MAX) <= 1u << FIELD_TYPE_BITS); + static constexpr unsigned NODE_TYPE_BITS = 1u; + static constexpr unsigned B_LEVEL_TAIL_BITS = 1u; + using bits_t = uint8_t; + + node_header_t() {} + std::optional get_field_type() const { + if (field_type >= FIELD_TYPE_MAGIC && + field_type < static_cast(field_type_t::_MAX)) { + return 
static_cast(field_type); + } else { + return std::nullopt; + } + } + node_type_t get_node_type() const { + return static_cast(node_type); + } + bool get_is_level_tail() const { + return is_level_tail; + } + + static void bootstrap_extent( + NodeExtentMutable&, field_type_t, node_type_t, bool, level_t); + + static void update_is_level_tail(NodeExtentMutable&, const node_header_t&, bool); + + bits_t field_type : FIELD_TYPE_BITS; + bits_t node_type : NODE_TYPE_BITS; + bits_t is_level_tail : B_LEVEL_TAIL_BITS; + static_assert(sizeof(bits_t) * 8 == + FIELD_TYPE_BITS + NODE_TYPE_BITS + B_LEVEL_TAIL_BITS); + level_t level; + + private: + void set_field_type(field_type_t type) { + field_type = static_cast(type); + } + void set_node_type(node_type_t type) { + node_type = static_cast(type); + } + void set_is_level_tail(bool value) { + is_level_tail = static_cast(value); + } +} __attribute__((packed)); + +template +struct _slot_t { + using key_t = FixedKeyType; + static constexpr field_type_t FIELD_TYPE = _FIELD_TYPE; + + key_t key; + node_offset_t right_offset; +} __attribute__((packed)); +using slot_0_t = _slot_t; +using slot_1_t = _slot_t; +using slot_3_t = _slot_t; + +struct node_range_t { + node_offset_t start; + node_offset_t end; +}; + +template +const char* fields_start(const FieldType& node) { + return reinterpret_cast(&node); +} + +template +node_range_t fields_free_range_before( + const FieldType& node, size_t index) { + assert(index <= node.num_keys); + node_offset_t offset_start = node.get_key_start_offset(index); + node_offset_t offset_end = + (index == 0 ? 
FieldType::SIZE + : node.get_item_start_offset(index - 1)); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (node.is_level_tail() && index == node.num_keys) { + offset_end -= sizeof(laddr_t); + } + } + assert(offset_start <= offset_end); + assert(offset_end - offset_start < FieldType::SIZE); + return {offset_start, offset_end}; +} + +// internal/leaf node N0, N1; leaf node N3 +template +struct _node_fields_013_t { + // TODO: decide by NODE_BLOCK_SIZE, sizeof(SlotType), sizeof(laddr_t) + // and the minimal size of variable_key. + using num_keys_t = uint8_t; + using key_t = typename SlotType::key_t; + using key_get_type = const key_t&; + using me_t = _node_fields_013_t; + static constexpr field_type_t FIELD_TYPE = SlotType::FIELD_TYPE; + static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + + bool is_level_tail() const { return header.get_is_level_tail(); } + size_t total_size() const { return SIZE; } + key_get_type get_key(size_t index) const { + assert(index < num_keys); + return slots[index].key; + } + node_offset_t get_key_start_offset(size_t index) const { + assert(index <= num_keys); + auto offset = HEADER_SIZE + sizeof(SlotType) * index; + assert(offset < SIZE); + return offset; + } + node_offset_t get_item_start_offset(size_t index) const { + assert(index < num_keys); + auto offset = slots[index].right_offset; + assert(offset <= SIZE); + return offset; + } + const void* p_offset(size_t index) const { + assert(index < num_keys); + return &slots[index].right_offset; + } + node_offset_t get_item_end_offset(size_t index) const { + return index == 0 ? 
SIZE : get_item_start_offset(index - 1); + } + template + node_offset_t free_size_before(size_t index) const { + auto range = fields_free_range_before(*this, index); + return range.end - range.start; + } + +#if 0 + template + void fill_unused(NodeExtentMutable& mut) const { + auto range = fields_free_range_before(*this, num_keys); + for (auto i = range.start; i < range.end; ++i) { + mut.copy_in_relative(i, uint8_t(0xc5)); + } + } + + template + void validate_unused() const { + auto range = fields_free_range_before(*this, num_keys); + for (auto i = fields_start(*this) + range.start; + i < fields_start(*this) + range.end; + ++i) { + assert(*i == char(0xc5)); + } + } +#endif + + static node_offset_t estimate_insert_one() { return sizeof(SlotType); } + template + static void insert_at( + NodeExtentMutable&, const full_key_t& key, + const me_t& node, size_t index, node_offset_t size_right); + static void update_size_at( + NodeExtentMutable&, const me_t& node, size_t index, int change); + static void append_key( + NodeExtentMutable&, const key_t& key, char*& p_append); + template + static void append_key( + NodeExtentMutable& mut, const full_key_t& key, char*& p_append) { + append_key(mut, key_t::template from_key(key), p_append); + } + static void append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append); + + node_header_t header; + num_keys_t num_keys = 0u; + SlotType slots[]; +} __attribute__((packed)); +using node_fields_0_t = _node_fields_013_t; +using node_fields_1_t = _node_fields_013_t; + +// internal/leaf node N2 +struct node_fields_2_t { + // TODO: decide by NODE_BLOCK_SIZE, sizeof(node_off_t), sizeof(laddr_t) + // and the minimal size of variable_key. 
+ using num_keys_t = uint8_t; + using key_t = ns_oid_view_t; + using key_get_type = key_t; + static constexpr field_type_t FIELD_TYPE = field_type_t::N2; + static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + + bool is_level_tail() const { return header.get_is_level_tail(); } + size_t total_size() const { return SIZE; } + key_get_type get_key(size_t index) const { + assert(index < num_keys); + node_offset_t item_end_offset = + (index == 0 ? SIZE : offsets[index - 1]); + assert(item_end_offset <= SIZE); + const char* p_start = fields_start(*this); + return key_t(p_start + item_end_offset); + } + node_offset_t get_key_start_offset(size_t index) const { + assert(index <= num_keys); + auto offset = HEADER_SIZE + sizeof(node_offset_t) * num_keys; + assert(offset <= SIZE); + return offset; + } + node_offset_t get_item_start_offset(size_t index) const { + assert(index < num_keys); + auto offset = offsets[index]; + assert(offset <= SIZE); + return offset; + } + const void* p_offset(size_t index) const { + assert(index < num_keys); + return &offsets[index]; + } + node_offset_t get_item_end_offset(size_t index) const { + return index == 0 ? 
SIZE : get_item_start_offset(index - 1); + } + template + node_offset_t free_size_before(size_t index) const { + auto range = fields_free_range_before(*this, index); + return range.end - range.start; + } + +#if 0 + template + void fill_unused(NodeExtentMutable& mut) const { + auto range = fields_free_range_before(*this, num_keys); + for (auto i = range.start; i < range.end; ++i) { + mut.copy_in_relative(i, uint8_t(0xc5)); + } + } + + template + void validate_unused() const { + auto range = fields_free_range_before(*this, num_keys); + for (auto i = fields_start(*this) + range.start; + i < fields_start(*this) + range.end; + ++i) { + assert(*i == char(0xc5)); + } + } +#endif + + static node_offset_t estimate_insert_one() { return sizeof(node_offset_t); } + template + static void insert_at( + NodeExtentMutable& mut, const full_key_t& key, + const node_fields_2_t& node, size_t index, node_offset_t size_right) { + assert(false && "not implemented"); + } + static void update_size_at( + NodeExtentMutable& mut, const node_fields_2_t& node, size_t index, int change) { + assert(false && "not implemented"); + } + static void append_key( + NodeExtentMutable& mut, const key_t& key, char*& p_append) { + ns_oid_view_t::append(mut, key, p_append); + } + template + static void append_key( + NodeExtentMutable& mut, const full_key_t& key, char*& p_append) { + ns_oid_view_t::append(mut, key, p_append); + } + static void append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append); + + node_header_t header; + num_keys_t num_keys = 0u; + node_offset_t offsets[]; +} __attribute__((packed)); + +// TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t) +static constexpr unsigned MAX_NUM_KEYS_I3 = 170u; +template +struct _internal_fields_3_t { + using key_get_type = const snap_gen_t&; + using me_t = _internal_fields_3_t; + // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t) + using num_keys_t = uint8_t; + static constexpr 
field_type_t FIELD_TYPE = field_type_t::N3; + static constexpr node_offset_t SIZE = sizeof(me_t); + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + + bool is_level_tail() const { return header.get_is_level_tail(); } + size_t total_size() const { + if (is_level_tail()) { + return SIZE - sizeof(snap_gen_t); + } else { + return SIZE; + } + } + key_get_type get_key(size_t index) const { + assert(index < num_keys); + return keys[index]; + } + template + std::enable_if_t + free_size_before(size_t index) const { + assert(index <= num_keys); + auto allowed_num_keys = is_level_tail() ? MAX_NUM_KEYS - 1 : MAX_NUM_KEYS; + assert(num_keys <= allowed_num_keys); + auto free = (MAX_NUM_KEYS - index) * (sizeof(snap_gen_t) + sizeof(laddr_t)); + if (is_level_tail() && index == num_keys) { + free -= (sizeof(snap_gen_t) + sizeof(laddr_t)); + } + assert(free < SIZE); + return free; + } + +#if 0 + template + void fill_unused(NodeExtentMutable& mut) const { + node_offset_t begin = (const char*)&keys[num_keys] - fields_start(*this); + node_offset_t end = (const char*)&child_addrs[0] - fields_start(*this); + for (auto i = begin; i < end; ++i) { + mut.copy_in_relative(i, uint8_t(0xc5)); + } + begin = (const char*)&child_addrs[num_keys] - fields_start(*this); + end = NODE_BLOCK_SIZE; + if (is_level_tail()) { + begin += sizeof(laddr_t); + } + for (auto i = begin; i < end; ++i) { + mut.copy_in_relative(i, uint8_t(0xc5)); + } + } + + template + void validate_unused() const { + auto begin = (const char*)&keys[num_keys]; + auto end = (const char*)&child_addrs[0]; + for (auto i = begin; i < end; ++i) { + assert(*i == uint8_t(0xc5)); + } + begin = (const char*)&child_addrs[num_keys]; + end = fields_start(*this) + NODE_BLOCK_SIZE; + if (is_level_tail()) { + begin += sizeof(laddr_t); + } + for (auto i = begin; i < end; ++i) { + assert(*i == char(0xc5)); + } + } +#endif + + static node_offset_t estimate_insert_one() { + return sizeof(snap_gen_t) + 
sizeof(laddr_t); + } + template + static void insert_at( + NodeExtentMutable& mut, const full_key_t& key, + const me_t& node, size_t index, node_offset_t size_right) { + assert(false && "not implemented"); + } + static void update_size_at( + NodeExtentMutable& mut, const me_t& node, size_t index, int change) { + assert(false && "not implemented"); + } + + node_header_t header; + num_keys_t num_keys = 0u; + snap_gen_t keys[MAX_NUM_KEYS]; + laddr_t child_addrs[MAX_NUM_KEYS]; +} __attribute__((packed)); +static_assert(_internal_fields_3_t::SIZE <= NODE_BLOCK_SIZE && + _internal_fields_3_t::SIZE > NODE_BLOCK_SIZE); +using internal_fields_3_t = _internal_fields_3_t; + +using leaf_fields_3_t = _node_fields_013_t; + +/* + * block layout of a variable-sized item (right-side) + * + * for internal node type 0, 1: + * previous off (block boundary) -----------------------------+ + * current off --+ | + * | | + * V V + * <==== | sub |fix|sub |fix|oid char|ns char|colli-| + * (next-item) |...addr|key|addr|key|array & |array &|-sion |(prv-item)... + * <==== | 1 |1 |0 |0 |len |len |offset| + * ^ | + * | | + * +------------ next collision ----------+ + * see item_iterator_t + * + * for internal node type 2: + * previous off (block boundary) ----------------------+ + * current off --+ | + * | | + * V V + * <==== | sub |fix|sub |fix|oid char|ns char| + * (next-item) |...addr|key|addr|key|array & |array &|(prv-item)... 
+ * <==== | 1 |1 |0 |0 |len |len | + * see sub_items_t + * + * for leaf node type 0, 1: + * previous off (block boundary) ----------------------------------------+ + * current off --+ | + * | | + * V V + * <==== | fix|o- |fix| off|off|num |oid char|ns char|colli-| + * (next-item) |...key|node|key|...set|set|sub |array & |array &|-sion |(prv-item) + * <==== | 1 |0 |0 | 1 |0 |keys|len |len |offset| + * ^ | + * | | + * +------------ next collision ----------------------+ + * see item_iterator_t + * + * for leaf node type 2: + * previous off (block boundary) ---------------------------------+ + * current off --+ | + * | | + * V V + * <==== | fix|o- |fix| off|off|num |oid char|ns char| + * (next-item) |...key|node|key|...set|set|sub |array & |array &|(prv-item) + * <==== | 1 |0 |0 | 1 |0 |keys|len |len | + * see sub_items_t + */ + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc new file mode 100644 index 00000000000..8ee21db86c2 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc @@ -0,0 +1,315 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "node_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" +#include "node_layout.h" + +namespace crimson::os::seastore::onode { + +#define NODE_T node_extent_t +#define NODE_INST(FT, NT) node_extent_t +#define NODE_TEMPLATE(FT, NT) template class NODE_INST(FT, NT) +NODE_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL); +NODE_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_0_t, node_type_t::LEAF); +NODE_TEMPLATE(node_fields_1_t, node_type_t::LEAF); +NODE_TEMPLATE(node_fields_2_t, node_type_t::LEAF); +NODE_TEMPLATE(leaf_fields_3_t, 
node_type_t::LEAF); + +template +const char* NODE_T::p_left_bound() const { + if constexpr (std::is_same_v) { + // N3 internal node doesn't have the right part + return nullptr; + } else { + auto ret = p_start() + fields().get_item_end_offset(keys()); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (is_level_tail()) { + ret -= sizeof(laddr_t); + } + } + return ret; + } +} + +template +size_t NODE_T::size_to_nxt_at(size_t index) const { + assert(index < keys()); + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + return FieldType::estimate_insert_one(); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + auto p_end = p_start() + p_fields->get_item_end_offset(index); + return FieldType::estimate_insert_one() + ns_oid_view_t(p_end).size(); + } else { + assert(false && "N3 node is not nested"); + } +} + +template +memory_range_t NODE_T::get_nxt_container(size_t index) const { + if constexpr (std::is_same_v) { + assert(false && "N3 internal node doesn't have the right part"); + } else { + node_offset_t item_start_offset = p_fields->get_item_start_offset(index); + node_offset_t item_end_offset = p_fields->get_item_end_offset(index); + assert(item_start_offset < item_end_offset); + auto item_p_start = p_start() + item_start_offset; + auto item_p_end = p_start() + item_end_offset; + if constexpr (FIELD_TYPE == field_type_t::N2) { + // range for sub_items_t + item_p_end = ns_oid_view_t(item_p_end).p_start(); + assert(item_p_start < item_p_end); + } else { + // range for item_iterator_t + } + return {item_p_start, item_p_end}; + } +} + +template +void NODE_T::bootstrap_extent( + NodeExtentMutable& mut, + field_type_t field_type, node_type_t node_type, + bool is_level_tail, level_t level) { + node_header_t::bootstrap_extent( + mut, field_type, node_type, is_level_tail, level); + mut.copy_in_relative( + sizeof(node_header_t), typename FieldType::num_keys_t(0u)); +} + +template +void NODE_T::update_is_level_tail( + 
NodeExtentMutable& mut, const node_extent_t& extent, bool value) { + node_header_t::update_is_level_tail(mut, extent.p_fields->header, value); +} + +template +template +memory_range_t NODE_T::insert_prefix_at( + NodeExtentMutable& mut, const node_extent_t& node, const full_key_t& key, + size_t index, node_offset_t size, const char* p_left_bound) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + assert(index <= node.keys()); + assert(p_left_bound == node.p_left_bound()); + assert(size > FieldType::estimate_insert_one()); + auto size_right = size - FieldType::estimate_insert_one(); + const char* p_insert = node.p_start() + node.fields().get_item_end_offset(index); + const char* p_insert_front = p_insert - size_right; + FieldType::template insert_at(mut, key, node.fields(), index, size_right); + mut.shift_absolute(p_left_bound, + p_insert - p_left_bound, + -(int)size_right); + return {p_insert_front, p_insert}; + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + assert(false && "not implemented"); + } else { + assert(false && "impossible"); + } +} +#define IPA_TEMPLATE(FT, NT, KT) \ + template memory_range_t NODE_INST(FT, NT)::insert_prefix_at( \ + NodeExtentMutable&, const node_extent_t&, const full_key_t&, \ + size_t, node_offset_t, const char*) +IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_1_t, 
node_type_t::LEAF, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ); + +template +void NODE_T::update_size_at( + NodeExtentMutable& mut, const node_extent_t& node, size_t index, int change) { + assert(index < node.keys()); + FieldType::update_size_at(mut, node.fields(), index, change); +} + +template +size_t NODE_T::trim_until( + NodeExtentMutable& mut, const node_extent_t& node, size_t index) { + assert(!node.is_level_tail()); + auto keys = node.keys(); + assert(index <= keys); + if (index == keys) { + return 0; + } + if constexpr (std::is_same_v) { + assert(false && "not implemented"); + } else { + mut.copy_in_absolute( + (void*)&node.p_fields->num_keys, num_keys_t(index)); + } + // no need to calculate trim size for node + return 0; +} + +template +size_t NODE_T::trim_at( + NodeExtentMutable& mut, const node_extent_t& node, size_t index, size_t trimmed) { + assert(!node.is_level_tail()); + auto keys = node.keys(); + assert(index < keys); + if constexpr (std::is_same_v) { + assert(false && "not implemented"); + } else { + auto offset = node.p_fields->get_item_start_offset(index); + assert(offset + trimmed < node.p_fields->get_item_end_offset(index)); + mut.copy_in_absolute(const_cast(node.p_fields->p_offset(index)), + node_offset_t(offset + trimmed)); + mut.copy_in_absolute( + (void*)&node.p_fields->num_keys, num_keys_t(index + 1)); + } + // no need to calculate trim size for node + return 0; +} + +#define APPEND_T node_extent_t::Appender +#define APPEND_TEMPLATE(FT, NT, KT) template class node_extent_t::Appender +APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW); 
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::HOBJ); + +template +template +void APPEND_T::append(const node_extent_t& src, size_t from, size_t items) { + assert(from <= src.keys()); + if (p_src == nullptr) { + p_src = &src; + } else { + assert(p_src == &src); + } + if (items == 0) { + return; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + num_keys += items; + if constexpr (std::is_same_v) { + assert(false && "impossible path"); + } else { + // append left part forwards + node_offset_t offset_left_start = src.fields().get_key_start_offset(from); + node_offset_t offset_left_end = src.fields().get_key_start_offset(from + items); + node_offset_t left_size = offset_left_end - offset_left_start; + if (num_keys == 0) { + // no need to adjust offset + assert(from == 0); + assert(p_start + offset_left_start == p_append_left); + p_mut->copy_in_absolute(p_append_left, + src.p_start() + offset_left_start, left_size); + } else { + node_offset_t step_size = FieldType::estimate_insert_one(); + node_offset_t offset_base = src.fields().get_item_end_offset(from); + int offset_change = p_append_right - p_start - offset_base; + auto p_offset_dst = p_append_left; + if constexpr (FIELD_TYPE != field_type_t::N2) { + // copy keys + p_mut->copy_in_absolute(p_append_left, + src.p_start() + offset_left_start, left_size); + // point to offset for 
update + p_offset_dst += sizeof(typename FieldType::key_t); + } + for (auto i = from; i < from + items; ++i) { + p_mut->copy_in_absolute(p_offset_dst, + node_offset_t(src.fields().get_item_start_offset(i) + offset_change)); + p_offset_dst += step_size; + } + assert(p_append_left + left_size + sizeof(typename FieldType::key_t) == + p_offset_dst); + } + p_append_left += left_size; + + // append right part backwards + node_offset_t offset_right_start = src.fields().get_item_end_offset(from + items); + node_offset_t offset_right_end = src.fields().get_item_end_offset(from); + node_offset_t right_size = offset_right_end - offset_right_start; + p_append_right -= right_size; + p_mut->copy_in_absolute(p_append_right, + src.p_start() + offset_right_start, right_size); + } +} + +template +template +void APPEND_T::append( + const full_key_t& key, const value_t& value, const value_t*& p_value) { + if constexpr (FIELD_TYPE == field_type_t::N3) { + assert(false && "not implemented"); + } else { + assert(false && "should not happen"); + } +} + +template +template +std::tuple +APPEND_T::open_nxt(const key_get_type& partial_key) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + FieldType::append_key(*p_mut, partial_key, p_append_left); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + FieldType::append_key(*p_mut, partial_key, p_append_right); + } else { + assert(false && "impossible path"); + } + return {p_mut, p_append_right}; +} + +template +template +std::tuple +APPEND_T::open_nxt(const full_key_t& key) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + FieldType::template append_key(*p_mut, key, p_append_left); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + FieldType::template append_key(*p_mut, key, p_append_right); + } else { + assert(false && "impossible path"); + } + return {p_mut, p_append_right}; +} + +template +template +char* APPEND_T::wrap() { + assert(p_append_left <= 
p_append_right); + assert(p_src); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (p_src->is_level_tail()) { + laddr_t tail_value = *p_src->get_end_p_laddr(); + p_append_right -= sizeof(laddr_t); + assert(p_append_left <= p_append_right); + p_mut->copy_in_absolute(p_append_right, tail_value); + } + } + p_mut->copy_in_absolute(p_start + offsetof(FieldType, num_keys), num_keys); + return p_append_left; +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h new file mode 100644 index 00000000000..73703fd7d65 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h @@ -0,0 +1,203 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +template +class node_extent_t { + public: + using value_t = value_type_t<_NODE_TYPE>; + using num_keys_t = typename FieldType::num_keys_t; + static constexpr node_type_t NODE_TYPE = _NODE_TYPE; + static constexpr field_type_t FIELD_TYPE = FieldType::FIELD_TYPE; + static constexpr node_offset_t EXTENT_SIZE = + (FieldType::SIZE + DISK_BLOCK_SIZE - 1u) / DISK_BLOCK_SIZE * DISK_BLOCK_SIZE; + + // TODO: remove + node_extent_t() = default; + + node_extent_t(const FieldType* p_fields) : p_fields{p_fields} { + validate(*p_fields); + } + + const char* p_start() const { return fields_start(*p_fields); } + + const char* off_to_ptr(node_offset_t off) const { + assert(off <= FieldType::SIZE); + return p_start() + off; + } + + node_offset_t ptr_to_off(const void* ptr) const { + auto _ptr = static_cast(ptr); + assert(_ptr >= p_start()); + auto off = _ptr - p_start(); + assert(off <= FieldType::SIZE); + return off; + } + + bool 
is_level_tail() const { return p_fields->is_level_tail(); } + level_t level() const { return p_fields->header.level; } + size_t free_size() const { + return p_fields->template free_size_before(keys()); + } + size_t total_size() const { return p_fields->total_size(); } + const char* p_left_bound() const; + template + std::enable_if_t + get_end_p_laddr() const { + assert(is_level_tail()); + if constexpr (FIELD_TYPE == field_type_t::N3) { + #pragma GCC diagnostic ignored "-Waddress-of-packed-member" + return &p_fields->child_addrs[keys()]; + } else { + auto offset_start = p_fields->get_item_end_offset(keys()); + assert(offset_start <= FieldType::SIZE); + offset_start -= sizeof(laddr_t); + auto p_addr = p_start() + offset_start; + return reinterpret_cast(p_addr); + } + } + + // container type system + using key_get_type = typename FieldType::key_get_type; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + size_t keys() const { return p_fields->num_keys; } + key_get_type operator[] (size_t index) const { return p_fields->get_key(index); } + size_t size_before(size_t index) const { + auto free_size = p_fields->template free_size_before(index); + assert(total_size() >= free_size); + return total_size() - free_size; + } + size_t size_to_nxt_at(size_t index) const; + memory_range_t get_nxt_container(size_t index) const; + + template + std::enable_if_t + get_p_value(size_t index) const { + assert(index < keys()); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + #pragma GCC diagnostic ignored "-Waddress-of-packed-member" + return &p_fields->child_addrs[index]; + } else { + auto range = get_nxt_container(index); + auto ret = reinterpret_cast(range.p_start); + assert(range.p_start + ret->size == range.p_end); + return ret; + } + } + + static void validate(const FieldType& fields) { +#ifndef NDEBUG + assert(fields.header.get_node_type() == NODE_TYPE); + assert(fields.header.get_field_type() == FieldType::FIELD_TYPE); + if constexpr (NODE_TYPE == 
node_type_t::INTERNAL) { + assert(fields.header.level > 0u); + } else { + assert(fields.header.level == 0u); + } +#endif + } + + static void bootstrap_extent( + NodeExtentMutable&, field_type_t, node_type_t, bool, level_t); + + static void update_is_level_tail(NodeExtentMutable&, const node_extent_t&, bool); + + static node_offset_t header_size() { return FieldType::HEADER_SIZE; } + + template + static node_offset_t estimate_insert( + const full_key_t& key, const value_t& value) { + auto size = FieldType::estimate_insert_one(); + if constexpr (FIELD_TYPE == field_type_t::N2) { + size += ns_oid_view_t::estimate_size(key); + } else if constexpr (FIELD_TYPE == field_type_t::N3 && + NODE_TYPE == node_type_t::LEAF) { + size += value.size; + } + return size; + } + + template + static const value_t* insert_at( + NodeExtentMutable& mut, const node_extent_t&, + const full_key_t& key, const value_t& value, + size_t index, node_offset_t size, const char* p_left_bound) { + if constexpr (FIELD_TYPE == field_type_t::N3) { + assert(false && "not implemented"); + } else { + assert(false && "impossible"); + } + } + + template + static memory_range_t insert_prefix_at( + NodeExtentMutable&, const node_extent_t&, + const full_key_t& key, + size_t index, node_offset_t size, const char* p_left_bound); + + static void update_size_at( + NodeExtentMutable&, const node_extent_t&, size_t index, int change); + + static size_t trim_until(NodeExtentMutable&, const node_extent_t&, size_t index); + static size_t trim_at(NodeExtentMutable&, const node_extent_t&, + size_t index, size_t trimmed); + + template + class Appender; + + private: + const FieldType& fields() const { return *p_fields; } + const FieldType* p_fields; +}; + +template +template +class node_extent_t::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_start{p_append} { +#ifndef NDEBUG + auto p_fields = reinterpret_cast(p_append); + assert(*(p_fields->header.get_field_type()) == 
FIELD_TYPE); + assert(p_fields->header.get_node_type() == NODE_TYPE); + assert(p_fields->num_keys == 0); +#endif + p_append_left = p_start + FieldType::HEADER_SIZE; + p_append_right = p_start + FieldType::SIZE; + } + void append(const node_extent_t& src, size_t from, size_t items); + void append(const full_key_t&, const value_t&, const value_t*&); + char* wrap(); + std::tuple open_nxt(const key_get_type&); + std::tuple open_nxt(const full_key_t&); + void wrap_nxt(char* p_append) { + if constexpr (FIELD_TYPE != field_type_t::N3) { + assert(p_append < p_append_right); + assert(p_append_left < p_append); + p_append_right = p_append; + FieldType::append_offset(*p_mut, p_append - p_start, p_append_left); + ++num_keys; + } else { + assert(false); + } + } + + private: + const node_extent_t* p_src = nullptr; + NodeExtentMutable* p_mut; + char* p_start; + char* p_append_left; + char* p_append_right; + num_keys_t num_keys = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h new file mode 100644 index 00000000000..f6ea16bfd61 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h @@ -0,0 +1,1891 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +// TODO: remove +#include +#include +#include + +#include "common/likely.h" + +#include "sub_items_stage.h" +#include "item_iterator_stage.h" + +namespace crimson::os::seastore::onode { + +struct search_result_bs_t { + size_t index; + MatchKindBS match; +}; +template +search_result_bs_t binary_search( + const full_key_t& key, + size_t begin, size_t end, FGetKey&& f_get_key) { + assert(begin <= end); + while (begin < end) { + auto total = begin + end; + auto mid = total >> 1; + // do not copy if return value is reference + decltype(f_get_key(mid)) target = f_get_key(mid); + auto match = 
compare_to(key, target); + if (match == MatchKindCMP::NE) { + end = mid; + } else if (match == MatchKindCMP::PO) { + begin = mid + 1; + } else { + return {mid, MatchKindBS::EQ}; + } + } + return {begin , MatchKindBS::NE}; +} + +template +search_result_bs_t binary_search_r( + size_t rend, size_t rbegin, FGet&& f_get, const PivotType& key) { + assert(rend <= rbegin); + while (rend < rbegin) { + auto total = rend + rbegin + 1; + auto mid = total >> 1; + // do not copy if return value is reference + decltype(f_get(mid)) target = f_get(mid); + int match = target - key; + if (match < 0) { + rend = mid; + } else if (match > 0) { + rbegin = mid - 1; + } else { + return {mid, MatchKindBS::EQ}; + } + } + return {rbegin, MatchKindBS::NE}; +} + +using match_stat_t = int8_t; +constexpr match_stat_t MSTAT_END = -2; // index is search_position_t::end() +constexpr match_stat_t MSTAT_EQ = -1; // key == index +constexpr match_stat_t MSTAT_NE0 = 0; // key == index [pool/shard crush ns/oid]; key < index [snap/gen] +constexpr match_stat_t MSTAT_NE1 = 1; // key == index [pool/shard crush]; key < index [ns/oid] +constexpr match_stat_t MSTAT_NE2 = 2; // key < index [pool/shard crush ns/oid] || + // key == index [pool/shard]; key < index [crush] +constexpr match_stat_t MSTAT_NE3 = 3; // key < index [pool/shard] +constexpr match_stat_t MSTAT_MIN = MSTAT_END; +constexpr match_stat_t MSTAT_MAX = MSTAT_NE3; + +inline bool matchable(field_type_t type, match_stat_t mstat) { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX); + /* + * compressed prefix by field type: + * N0: NONE + * N1: pool/shard + * N2: pool/shard crush + * N3: pool/shard crush ns/oid + * + * if key matches the node's compressed prefix, return true + * else, return false + */ +#ifndef NDEBUG + if (mstat == MSTAT_END) { + assert(type == field_type_t::N0); + } +#endif + return mstat + to_unsigned(type) < 4; +} + +inline void assert_mstat( + const full_key_t& key, + const full_key_t& index, + match_stat_t mstat) { + assert(mstat 
>= MSTAT_MIN && mstat <= MSTAT_NE2);
+ // key < index ...
+ // first switch: assert the strict (key < index) part at the stage that
+ // mstat reports as the mismatch point
+ switch (mstat) {
+ case MSTAT_EQ:
+ break;
+ case MSTAT_NE0:
+ assert(compare_to(key, index.snap_gen_packed()) == MatchKindCMP::NE);
+ break;
+ case MSTAT_NE1:
+ assert(compare_to(key, index.ns_oid_view()) == MatchKindCMP::NE);
+ break;
+ case MSTAT_NE2:
+ if (index.has_shard_pool()) {
+ assert(compare_to(key, shard_pool_crush_t{
+ index.shard_pool_packed(), index.crush_packed()}) == MatchKindCMP::NE);
+ } else {
+ assert(compare_to(key, index.crush_packed()) == MatchKindCMP::NE);
+ }
+ break;
+ default:
+ assert(false);
+ }
+ // key == index ...
+ // second switch: the case labels fall through INTENTIONALLY -- a match at
+ // depth k implies every shallower prefix compares equal, so each case
+ // asserts its own equality and then continues into the next.
+ switch (mstat) {
+ case MSTAT_EQ:
+ assert(compare_to(key, index.snap_gen_packed()) == MatchKindCMP::EQ);
+ // fall through
+ case MSTAT_NE0:
+ if (!index.has_ns_oid())
+ break;
+ assert(index.ns_oid_view().type() == ns_oid_view_t::Type::MAX ||
+ compare_to(key, index.ns_oid_view()) == MatchKindCMP::EQ);
+ // fall through
+ case MSTAT_NE1:
+ if (!index.has_crush())
+ break;
+ assert(compare_to(key, index.crush_packed()) == MatchKindCMP::EQ);
+ if (!index.has_shard_pool())
+ break;
+ assert(compare_to(key, index.shard_pool_packed()) == MatchKindCMP::EQ);
+ // fall through
+ default:
+ break;
+ }
+}
+
+// Aggregated result of a staged lookup: the position inside this stage (and
+// recursively inside deeper stages), the located value pointer, and the
+// match depth. (Template parameter list lost in extraction -- presumably
+// <node_type_t NODE_TYPE, match_stage_t STAGE>.)
+template
+struct staged_result_t {
+ using me_t = staged_result_t;
+ bool is_end() const { return position.is_end(); }
+ MatchKindBS match() const {
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX);
+ return (mstat == MSTAT_EQ ?
MatchKindBS::EQ : MatchKindBS::NE);
+ }
+
+ // sentinel result: past-the-last position, no value, MSTAT_END
+ static me_t end() {
+ return {staged_position_t::end(), nullptr, MSTAT_END};
+ }
+ // lift a deeper stage's result into this stage by prepending 'index'
+ template
+ static std::enable_if_t from_nxt(
+ size_t index, const staged_result_t& nxt_stage_result) {
+ return {{index, nxt_stage_result.position},
+ nxt_stage_result.p_value,
+ nxt_stage_result.mstat};
+ }
+
+ staged_position_t position;
+ const value_type_t* p_value;
+ match_stat_t mstat;
+};
+
+// normalize() widens a stage-local result into the full search-position
+// shape expected by callers; this overload is the identity case where the
+// result is already at the top stage.
+template
+staged_result_t&& normalize(
+ staged_result_t&& result) { return std::move(result); }
+
+template >
+staged_result_t normalize(
+ staged_result_t&& result) {
+ // FIXME: assert result.mstat correct
+ return {normalize(std::move(result.position)), result.p_value, result.mstat};
+}
+
+/*
+ * staged infrastructure
+ */
+
+// staged_params_* bind a container type to its stage so the recursive
+// 'staged' template below can walk from the node layout down to sub-items.
+// (Template headers were stripped during extraction -- confirm against the
+// original patch.)
+template
+struct staged_params_subitems {
+ using container_t = sub_items_t<_NODE_TYPE>;
+ static constexpr auto NODE_TYPE = _NODE_TYPE;
+ static constexpr auto STAGE = STAGE_RIGHT;
+
+ // dummy type in order to make our type system work
+ // any better solution to get rid of this?
+ using next_param_t = staged_params_subitems;
+};
+
+// item-iterator stage (string stage); recurses into the sub-items stage
+template
+struct staged_params_item_iterator {
+ using container_t = item_iterator_t<_NODE_TYPE>;
+ static constexpr auto NODE_TYPE = _NODE_TYPE;
+ static constexpr auto STAGE = STAGE_STRING;
+
+ using next_param_t = staged_params_subitems;
+};
+
+// N0/N1 node layouts: full three-stage walk (left -> string -> right)
+template
+struct staged_params_node_01 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_LEFT;
+
+ using next_param_t = staged_params_item_iterator;
+};
+
+// N2 node layout: two-stage walk (string -> right)
+template
+struct staged_params_node_2 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_STRING;
+
+ using next_param_t = staged_params_subitems;
+};
+
+// N3 node layout: a single right stage, so next_param_t is self-referential
+template
+struct staged_params_node_3 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_RIGHT;
+
+ // dummy type in order to make our type system work
+ // any better solution to get rid of this?
+ using next_param_t = staged_params_node_3;
+};
+
+// NXT_STAGE_T names the staged<> instantiation of the next deeper stage
+// (the macro argument list was lost in extraction; presumably
+// staged<typename Params::next_param_t> -- confirm against the original).
+#define NXT_STAGE_T staged
+
+enum class TrimType { BEFORE, AFTER, AT };
+
+/*
+ * staged: the recursive engine that implements lookup, insert, split and
+ * dump over one stage of a node layout, recursing through NXT_STAGE_T
+ * until IS_BOTTOM. All members are static; the state lives in the
+ * containers and iterators passed in.
+ */
+template
+struct staged {
+ static_assert(Params::STAGE >= STAGE_BOTTOM);
+ static_assert(Params::STAGE <= STAGE_TOP);
+ using container_t = typename Params::container_t;
+ using key_get_type = typename container_t::key_get_type;
+ using next_param_t = typename Params::next_param_t;
+ using position_t = staged_position_t;
+ using result_t = staged_result_t;
+ using value_t = value_type_t;
+ static constexpr auto CONTAINER_TYPE = container_t::CONTAINER_TYPE;
+ static constexpr bool IS_BOTTOM = (Params::STAGE == STAGE_BOTTOM);
+ static constexpr auto NODE_TYPE = Params::NODE_TYPE;
+ static constexpr auto STAGE = Params::STAGE;
+
+ // Given a chosen split point s_index and a pending insertion at i_index,
+ // decide whether the insertion lands in the left or the right node
+ // (i_to_left). In the non-exclusive case the decision may legitimately
+ // stay std::nullopt when the insertion sits exactly at the split point.
+ // When is_exclusive and the insertion goes left, s_index is decremented
+ // to account for the inserted slot.
+ template
+ static void _left_or_right(size_t& s_index, size_t i_index,
+ std::optional& i_to_left) {
+ assert(!i_to_left.has_value());
+ assert(s_index != INDEX_END);
+ if constexpr (is_exclusive) {
+ if (s_index <= i_index) {
+ // ...[s_index-1] |!| (i_index) [s_index]...
+ // offset i_position to right
+ i_to_left = false;
+ } else {
+ // ...[s_index-1] (i_index)) |?[s_index]| ...
+ // ...(i_index)...[s_index-1] |?[s_index]| ...
+ i_to_left = true;
+ --s_index;
+ }
+ } else {
+ if (s_index < i_index) {
+ // ...[s_index-1] |?[s_index]| ...[(i_index)[s_index_k]...
+ i_to_left = false;
+ } else if (s_index > i_index) {
+ // ...[(i_index)s_index-1] |?[s_index]| ...
+ // ...[(i_index)s_index_k]...[s_index-1] |?[s_index]| ...
+ i_to_left = true;
+ } else {
+ // ...[s_index-1] |?[(i_index)s_index]| ...
+ // i_to_left = std::nullopt; + } + } + } + + template class _iterator_t; + template + class _iterator_t> { + /* + * indexable container type system: + * CONTAINER_TYPE = ContainerType::INDEXABLE + * keys() const -> size_t + * operator[](size_t) const -> key_get_type + * size_before(size_t) const -> size_t + * (IS_BOTTOM) get_p_value(size_t) const -> const value_t* + * (!IS_BOTTOM) size_to_nxt_at(size_t) const -> size_t + * (!IS_BOTTOM) get_nxt_container(size_t) const + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + * (IS_BOTTOM) insert_at(mut, src, key, value, + * index, size, p_left_bound) -> const value_t* + * (!IS_BOTTOM) insert_prefix_at(mut, src, key, + * index, size, p_left_bound) -> memory_range_t + * (!IS_BOTTOM) update_size_at(mut, src, index, size) + * trim_until(mut, container, index) -> trim_size + * (!IS_BOTTOM) trim_at(mut, container, index, trimmed) -> trim_size + * + * Appender::append(const container_t& src, from, items) + */ + public: + using me_t = _iterator_t; + + _iterator_t(const container_t& container) : container{container} { + assert(container.keys()); + } + + size_t index() const { + return _index; + } + key_get_type get_key() const { + assert(!is_end()); + return container[_index]; + } + size_t size_to_nxt() const { + assert(!is_end()); + return container.size_to_nxt_at(_index); + } + template + std::enable_if_t get_nxt_container() const { + assert(!is_end()); + return container.get_nxt_container(_index); + } + template + std::enable_if_t get_p_value() const { + assert(!is_end()); + return container.get_p_value(_index); + } + bool is_last() const { + return _index + 1 == container.keys(); + } + bool is_end() const { return _index == container.keys(); } + size_t size() const { + assert(!is_end()); + assert(header_size() == container.size_before(0)); + return container.size_before(_index + 1) - + container.size_before(_index); + } + + me_t& operator++() { + assert(!is_end()); + 
assert(!is_last()); + ++_index; + return *this; + } + void seek_at(size_t index) { + assert(index < container.keys()); + seek_till_end(index); + } + void seek_till_end(size_t index) { + assert(!is_end()); + assert(this->index() == 0); + assert(index <= container.keys()); + _index = index; + } + void seek_last() { + assert(!is_end()); + assert(index() == 0); + _index = container.keys() - 1; + } + void set_end() { + assert(!is_end()); + assert(is_last()); + ++_index; + } + // Note: possible to return an end iterator + MatchKindBS seek(const full_key_t& key, bool exclude_last) { + assert(!is_end()); + assert(index() == 0); + size_t end_index = container.keys(); + if (exclude_last) { + assert(end_index); + --end_index; + assert(compare_to(key, container[end_index]) == MatchKindCMP::NE); + } + auto ret = binary_search(key, _index, end_index, + [this] (size_t index) { return container[index]; }); + _index = ret.index; + return ret.match; + } + + template + std::enable_if_t insert( + NodeExtentMutable& mut, const full_key_t& key, + const value_t& value, node_offset_t insert_size, const char* p_left_bound) { + return container_t::template insert_at( + mut, container, key, value, _index, insert_size, p_left_bound); + } + + template + std::enable_if_t insert_prefix( + NodeExtentMutable& mut, const full_key_t& key, + node_offset_t size, const char* p_left_bound) { + return container_t::template insert_prefix_at( + mut, container, key, _index, size, p_left_bound); + } + + template + std::enable_if_t + update_size(NodeExtentMutable& mut, node_offset_t insert_size) { + assert(!is_end()); + container_t::update_size_at(mut, container, _index, insert_size); + } + + // Note: possible to return an end iterator when is_exclusive is true + template + size_t seek_split_inserted(size_t start_size, size_t extra_size, + size_t target_size, size_t& i_index, size_t i_size, + std::optional& i_to_left) { + assert(!is_end()); + assert(index() == 0); + assert(i_index <= container.keys() || 
i_index == INDEX_END); + // replace the unknown INDEX_END value + if (i_index == INDEX_END) { + if constexpr (!is_exclusive) { + i_index = container.keys() - 1; + } else { + i_index = container.keys(); + } + } + auto start_size_1 = start_size + extra_size; + auto f_get_used_size = [this, start_size, start_size_1, + i_index, i_size] (size_t index) { + size_t current_size; + if (unlikely(index == 0)) { + current_size = start_size; + } else { + current_size = start_size_1; + if (index > i_index) { + current_size += i_size; + if constexpr (is_exclusive) { + --index; + } + } + // already includes header size + current_size += container.size_before(index); + } + return current_size; + }; + size_t s_end; + if constexpr (is_exclusive) { + s_end = container.keys(); + } else { + s_end = container.keys() - 1; + } + _index = binary_search_r(0, s_end, f_get_used_size, target_size).index; + size_t current_size = f_get_used_size(_index); + assert(current_size <= target_size); + + _left_or_right(_index, i_index, i_to_left); + return current_size; + } + + size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { + assert(!is_end()); + assert(index() == 0); + auto start_size_1 = start_size + extra_size; + auto f_get_used_size = [this, start_size, start_size_1] (size_t index) { + size_t current_size; + if (unlikely(index == 0)) { + current_size = start_size; + } else { + // already includes header size + current_size = start_size_1 + container.size_before(index); + } + return current_size; + }; + _index = binary_search_r( + 0, container.keys() - 1, f_get_used_size, target_size).index; + size_t current_size = f_get_used_size(_index); + assert(current_size <= target_size); + return current_size; + } + + // Note: possible to return an end iterater if + // to_index == INDEX_END && to_stage == STAGE + template + void copy_out_until(typename container_t::template Appender& appender, + size_t& to_index, + match_stage_t to_stage) { + assert(to_stage <= STAGE); + auto 
num_keys = container.keys(); + size_t items; + if (to_index == INDEX_END) { + if (to_stage == STAGE) { + items = num_keys - _index; + appender.append(container, _index, items); + _index = num_keys; + } else { + assert(!is_end()); + items = num_keys - 1 - _index; + appender.append(container, _index, items); + _index = num_keys - 1; + } + to_index = _index; + } else { + assert(_index <= to_index); + items = to_index - _index; + appender.append(container, _index, items); + _index = to_index; + } + } + + size_t trim_until(NodeExtentMutable& mut) { + return container_t::trim_until(mut, container, _index); + } + + template + std::enable_if_t + trim_at(NodeExtentMutable& mut, size_t trimmed) { + return container_t::trim_at(mut, container, _index, trimmed); + } + + static node_offset_t header_size() { + return container_t::header_size(); + } + + template + static size_t estimate_insert(const full_key_t& key, const value_t& value) { + return container_t::template estimate_insert(key, value); + } + + private: + container_t container; + size_t _index = 0; + }; + + template + class _iterator_t> { + /* + * iterative container type system (!IS_BOTTOM): + * CONTAINER_TYPE = ContainerType::ITERATIVE + * index() const -> size_t + * get_key() const -> key_get_type + * size() const -> size_t + * size_to_nxt() const -> size_t + * get_nxt_container() const + * has_next() const -> bool + * operator++() + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + * insert_prefix(mut, src, key, is_end, size, p_left_bound) -> memory_range_t + * update_size(mut, src, size) + * trim_until(mut, container) -> trim_size + * trim_at(mut, container, trimmed) -> trim_size + */ + // currently the iterative iterator is only implemented with STAGE_STRING + // for in-node space efficiency + static_assert(STAGE == STAGE_STRING); + public: + using me_t = _iterator_t; + + _iterator_t(const container_t& container) : container{container} { + assert(index() == 0); + } + 
+ size_t index() const { + if (is_end()) { + return end_index; + } else { + return container.index(); + } + } + key_get_type get_key() const { + assert(!is_end()); + return container.get_key(); + } + size_t size_to_nxt() const { + assert(!is_end()); + return container.size_to_nxt(); + } + const typename NXT_STAGE_T::container_t get_nxt_container() const { + assert(!is_end()); + return container.get_nxt_container(); + } + bool is_last() const { + assert(!is_end()); + return !container.has_next(); + } + bool is_end() const { return _is_end; } + size_t size() const { + assert(!is_end()); + return container.size(); + } + + me_t& operator++() { + assert(!is_end()); + assert(!is_last()); + ++container; + return *this; + } + void seek_at(size_t index) { + assert(!is_end()); + assert(this->index() == 0); + while (index > 0) { + assert(container.has_next()); + ++container; + --index; + } + } + void seek_till_end(size_t index) { + assert(!is_end()); + assert(this->index() == 0); + while (index > 0) { + if (!container.has_next()) { + assert(index == 1); + set_end(); + break; + } + ++container; + --index; + } + } + void seek_last() { + assert(!is_end()); + assert(index() == 0); + while (container.has_next()) { + ++container; + } + } + void set_end() { + assert(!is_end()); + assert(is_last()); + _is_end = true; + end_index = container.index() + 1; + } + // Note: possible to return an end iterator + MatchKindBS seek(const full_key_t& key, bool exclude_last) { + assert(!is_end()); + assert(index() == 0); + do { + if (exclude_last && is_last()) { + assert(compare_to(key, get_key()) == MatchKindCMP::NE); + return MatchKindBS::NE; + } + auto match = compare_to(key, get_key()); + if (match == MatchKindCMP::NE) { + return MatchKindBS::NE; + } else if (match == MatchKindCMP::EQ) { + return MatchKindBS::EQ; + } else { + if (container.has_next()) { + ++container; + } else { + // end + break; + } + } + } while (true); + assert(!exclude_last); + set_end(); + return MatchKindBS::NE; + } + + 
template + memory_range_t insert_prefix( + NodeExtentMutable& mut, const full_key_t& key, + node_offset_t size, const char* p_left_bound) { + return container_t::template insert_prefix( + mut, container, key, is_end(), size, p_left_bound); + } + + void update_size(NodeExtentMutable& mut, node_offset_t insert_size) { + assert(!is_end()); + container_t::update_size(mut, container, insert_size); + } + + // Note: possible to return an end iterator when is_exclusive is true + template + size_t seek_split_inserted(size_t start_size, size_t extra_size, + size_t target_size, size_t& i_index, size_t i_size, + std::optional& i_to_left) { + assert(!is_end()); + assert(index() == 0); + size_t current_size = start_size; + size_t s_index = 0; + extra_size += header_size(); + do { + if constexpr (!is_exclusive) { + if (is_last()) { + assert(s_index == index()); + if (i_index == INDEX_END) { + i_index = index(); + } + assert(i_index <= index()); + break; + } + } + + size_t nxt_size = current_size; + if (s_index == 0) { + nxt_size += extra_size; + } + if (s_index == i_index) { + nxt_size += i_size; + if constexpr (is_exclusive) { + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + ++s_index; + } + } + nxt_size += size(); + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + + if constexpr (is_exclusive) { + if (is_last()) { + assert(s_index == index()); + set_end(); + s_index = index(); + if (i_index == INDEX_END) { + i_index = index(); + } + assert(i_index == index()); + break; + } else { + ++(*this); + ++s_index; + } + } else { + ++(*this); + ++s_index; + } + } while (true); + assert(current_size <= target_size); + + _left_or_right(s_index, i_index, i_to_left); + assert(s_index == index()); + return current_size; + } + + size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { + assert(!is_end()); + assert(index() == 0); + size_t current_size = start_size; + do { + if (is_last()) { + break; + } + + size_t nxt_size = 
current_size; + if (index() == 0) { + nxt_size += extra_size; + } + nxt_size += size(); + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + ++(*this); + } while (true); + assert(current_size <= target_size); + return current_size; + } + + // Note: possible to return an end iterater if + // to_index == INDEX_END && to_stage == STAGE + template + void copy_out_until(typename container_t::template Appender& appender, + size_t& to_index, + match_stage_t to_stage) { + assert(to_stage <= STAGE); + if (is_end()) { + assert(!container.has_next()); + assert(to_stage == STAGE); + assert(to_index == index() || to_index == INDEX_END); + to_index = index(); + return; + } + typename container_t::index_t type; + size_t items; + if (to_index == INDEX_END) { + if (to_stage == STAGE) { + type = container_t::index_t::end; + } else { + type = container_t::index_t::last; + } + items = INDEX_END; + } else { + assert(index() <= to_index); + type = container_t::index_t::none; + items = to_index - index(); + } + if (appender.append(container, items, type)) { + set_end(); + } + to_index = index(); + } + + size_t trim_until(NodeExtentMutable& mut) { + if (is_end()) { + return 0; + } + return container_t::trim_until(mut, container); + } + + size_t trim_at(NodeExtentMutable& mut, size_t trimmed) { + assert(!is_end()); + return container_t::trim_at(mut, container, trimmed); + } + + static node_offset_t header_size() { + return container_t::header_size(); + } + + template + static node_offset_t estimate_insert(const full_key_t& key, const value_t& value) { + return container_t::template estimate_insert(key, value); + } + + private: + container_t container; + bool _is_end = false; + size_t end_index; + }; + + /* + * iterator_t encapsulates both indexable and iterative implementations + * from a *non-empty* container. 
+ * cstr(const container_t&) + * access: + * index() -> size_t + * get_key() -> key_get_type (const reference or value type) + * is_last() -> bool + * is_end() -> bool + * size() -> size_t + * (IS_BOTTOM) get_p_value() -> const value_t* + * (!IS_BOTTOM) get_nxt_container() -> nxt_stage::container_t + * (!IS_BOTTOM) size_to_nxt() -> size_t + * seek: + * operator++() -> iterator_t& + * seek_at(index) + * seek_till_end(index) + * seek_last() + * set_end() + * seek(key, exclude_last) -> MatchKindBS + * insert: + * (IS_BOTTOM) insert(mut, key, value, size, p_left_bound) -> p_value + * (!IS_BOTTOM) insert_prefix(mut, key, size, p_left_bound) -> memory_range_t + * (!IS_BOTTOM) update_size(mut, size) + * split; + * seek_split_inserted( + * start_size, extra_size, target_size, i_index, i_size, + * std::optional& i_to_left) + * -> insert to left/right/unknown (!exclusive) + * -> insert to left/right (exclusive, can be end) + * -> split_size + * seek_split(start_size, extra_size, target_size) -> split_size + * copy_out_until(appender, to_index, to_stage) (can be end) + * trim_until(mut) -> trim_size + * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + */ + using iterator_t = _iterator_t; + + /* + * Lookup internals (hide?) 
+ */
+
+ // Result pointing at the left-most entry beneath 'iter': descends with a
+ // begin() position at every deeper stage. NOTE(review): STAGE is stored
+ // into the mstat field -- this appears to rely on match_stage_t values
+ // coinciding with the MSTAT_NE* encoding ("key < index at this stage");
+ // confirm against the original patch.
+ static result_t smallest_result(const iterator_t& iter) {
+ static_assert(!IS_BOTTOM);
+ assert(!iter.is_end());
+ auto pos_smallest = NXT_STAGE_T::position_t::begin();
+ auto nxt_container = iter.get_nxt_container();
+ auto value_ptr = NXT_STAGE_T::get_p_value(nxt_container, pos_smallest);
+ return result_t{{iter.index(), pos_smallest}, value_ptr, STAGE};
+ }
+
+ // Recurse the lower-bound into the next stage under 'iter'. If the deeper
+ // search runs off the end, the answer is the smallest entry of the next
+ // sibling (or end() if iter is already the last entry of this stage).
+ static result_t nxt_lower_bound(
+ const full_key_t& key, iterator_t& iter, MatchHistory& history) {
+ static_assert(!IS_BOTTOM);
+ assert(!iter.is_end());
+ auto nxt_container = iter.get_nxt_container();
+ auto nxt_result = NXT_STAGE_T::lower_bound(nxt_container, key, history);
+ if (nxt_result.is_end()) {
+ if (iter.is_last()) {
+ return result_t::end();
+ } else {
+ return smallest_result(++iter);
+ }
+ } else {
+ return result_t::from_nxt(iter.index(), nxt_result);
+ }
+ }
+
+ // Fill 'position'/'p_value' with the right-most entry of 'container',
+ // recursing through every deeper stage.
+ static void lookup_largest(
+ const container_t& container, position_t& position, const value_t*& p_value) {
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ position.index = iter.index();
+ if constexpr (IS_BOTTOM) {
+ p_value = iter.get_p_value();
+ } else {
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::lookup_largest(nxt_container, position.nxt, p_value);
+ }
+ }
+
+ // Build the full key of the right-most entry into 'output' by collecting
+ // the last key at each stage.
+ static void lookup_largest_index(
+ const container_t& container, full_key_t& output) {
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ output.set(iter.get_key());
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::lookup_largest_index(nxt_container, output);
+ }
+ }
+
+ // Resolve an already-known position to its value pointer.
+ static const value_t* get_p_value(
+ const container_t& container, const position_t& position) {
+ auto iter = iterator_t(container);
+ iter.seek_at(position.index);
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::get_p_value(nxt_container, position.nxt);
+ } else {
+ return iter.get_p_value();
+ }
+ }
+
+ // Reconstruct the full key at 'position' into 'output'.
+ static void get_key_view(
+ const container_t&
container, + const position_t& position, + full_key_t& output) { + auto iter = iterator_t(container); + iter.seek_at(position.index); + output.set(iter.get_key()); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::get_key_view(nxt_container, position.nxt, output); + } + } + + static result_t lower_bound( + const container_t& container, + const full_key_t& key, + MatchHistory& history) { + bool exclude_last = false; + if (history.get().has_value()) { + if (*history.get() == MatchKindCMP::EQ) { + // lookup is short-circuited + if constexpr (!IS_BOTTOM) { + assert(history.get().has_value()); + if (history.is_PO()) { + auto iter = iterator_t(container); + bool test_key_equal; + if constexpr (STAGE == STAGE_STRING) { + // TODO(cross-node string dedup) + // test_key_equal = (iter.get_key().type() == ns_oid_view_t::Type::MIN); + auto cmp = compare_to(key, iter.get_key()); + assert(cmp != MatchKindCMP::PO); + test_key_equal = (cmp == MatchKindCMP::EQ); + } else { + auto cmp = compare_to(key, iter.get_key()); + // From history, key[stage] == parent[stage][index - 1] + // which should be the smallest possible value for all + // index[stage][*] + assert(cmp != MatchKindCMP::PO); + test_key_equal = (cmp == MatchKindCMP::EQ); + } + if (test_key_equal) { + return nxt_lower_bound(key, iter, history); + } else { + // key[stage] < index[stage][left-most] + return smallest_result(iter); + } + } + } + // IS_BOTTOM || !history.is_PO() + auto iter = iterator_t(container); + iter.seek_last(); + if constexpr (STAGE == STAGE_STRING) { + // TODO(cross-node string dedup) + // assert(iter.get_key().type() == ns_oid_view_t::Type::MAX); + assert(compare_to(key, iter.get_key()) == MatchKindCMP::EQ); + } else { + assert(compare_to(key, iter.get_key()) == MatchKindCMP::EQ); + } + if constexpr (IS_BOTTOM) { + auto value_ptr = iter.get_p_value(); + return result_t{{iter.index()}, value_ptr, MSTAT_EQ}; + } else { + auto nxt_container = 
iter.get_nxt_container(); + auto nxt_result = NXT_STAGE_T::lower_bound(nxt_container, key, history); + // !history.is_PO() means + // key[stage+1 ...] <= index[stage+1 ...][*] + assert(!nxt_result.is_end()); + return result_t::from_nxt(iter.index(), nxt_result); + } + } else if (*history.get() == MatchKindCMP::NE) { + exclude_last = true; + } + } + auto iter = iterator_t(container); + auto bs_match = iter.seek(key, exclude_last); + if (iter.is_end()) { + assert(!exclude_last); + assert(bs_match == MatchKindBS::NE); + history.set(MatchKindCMP::PO); + return result_t::end(); + } + history.set(bs_match == MatchKindBS::EQ ? + MatchKindCMP::EQ : MatchKindCMP::NE); + if constexpr (IS_BOTTOM) { + auto value_ptr = iter.get_p_value(); + return result_t{{iter.index()}, value_ptr, + (bs_match == MatchKindBS::EQ ? MSTAT_EQ : MSTAT_NE0)}; + } else { + if (bs_match == MatchKindBS::EQ) { + return nxt_lower_bound(key, iter, history); + } else { + return smallest_result(iter); + } + } + } + + template + static node_offset_t insert_size(const full_key_t& key, const value_t& value) { + if constexpr (IS_BOTTOM) { + return iterator_t::template estimate_insert(key, value); + } else { + return iterator_t::template estimate_insert(key, value) + + NXT_STAGE_T::iterator_t::header_size() + + NXT_STAGE_T::template insert_size(key, value); + } + } + + template + static node_offset_t insert_size_at( + match_stage_t stage, const full_key_t& key, const value_t& value) { + if (stage == STAGE) { + return insert_size(key, value); + } else { + assert(stage < STAGE); + return NXT_STAGE_T::template insert_size_at(stage, key, value); + } + } + + template > + static std::enable_if_t evaluate_insert( + const container_t& container, const full_key_t& key, + const value_t& value, position_t& position, bool is_current) { + auto iter = iterator_t(container); + auto& index = position.index; + if (!is_current) { + index = INDEX_END; + } + if (index == INDEX_END) { + iter.seek_last(); + index = iter.index(); + 
// evaluate the previous index + } else { + // evaluate the current index + iter.seek_at(index); + auto match = compare_to(key, iter.get_key()); + if (match == MatchKindCMP::EQ) { + if constexpr (IS_BOTTOM) { + // ceph_abort? + assert(false && "insert conflict at current index!"); + } else { + // insert into the current index + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::evaluate_insert( + nxt_container, key, value, position.nxt, true); + } + } else { + assert(is_current && match == MatchKindCMP::NE); + if (index == 0) { + // already the first index, so insert at the current index + return {STAGE, insert_size(key, value)}; + } + --index; + iter = iterator_t(container); + iter.seek_at(index); + // proceed to evaluate the previous index + } + } + + // XXX(multi-type): when key is from a different type of node + auto match = compare_to(key, iter.get_key()); + if (match == MatchKindCMP::PO) { + // key doesn't match both indexes, so insert at the current index + ++index; + return {STAGE, insert_size(key, value)}; + } else { + assert(match == MatchKindCMP::EQ); + if constexpr (IS_BOTTOM) { + // ceph_abort? 
+ assert(false && "insert conflict at the previous index!"); + } else { + // insert into the previous index + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::evaluate_insert( + nxt_container, key, value, position.nxt, false); + } + } + } + + template + static std::enable_if_t + compensate_insert_position_at(match_stage_t stage, position_t& position) { + auto& index = position.index; + if (stage == STAGE) { + assert(index == 0); + // insert at the end of the current stage + index = INDEX_END; + return true; + } else { + if constexpr (IS_BOTTOM) { + assert(false && "impossible"); + } else { + assert(stage < STAGE); + bool compensate = NXT_STAGE_T:: + compensate_insert_position_at(stage, position.nxt); + if (compensate) { + assert(index != INDEX_END); + if (index == 0) { + // insert into the *last* index of the current stage + index = INDEX_END; + return true; + } else { + --index; + return false; + } + } else { + return false; + } + } + } + } + + template > + static std::enable_if_t evaluate_insert( + const full_key_t& key, const onode_t& value, + const MatchHistory& history, position_t& position) { + match_stage_t insert_stage = STAGE_TOP; + while (*history.get_by_stage(insert_stage) == MatchKindCMP::EQ) { + assert(insert_stage != STAGE_BOTTOM && "insert conflict!"); + --insert_stage; + } + + if (history.is_PO()) { + if (position.is_end()) { + // no need to compensate insert position + assert(insert_stage <= STAGE && "impossible insert stage"); + } else if (position == position_t::begin()) { + // I must be short-circuited by staged::smallest_result() + // in staged::lower_bound() + + // XXX(multi-type): need to upgrade node type before inserting an + // incompatible index at front. 
+ assert(insert_stage <= STAGE && "incompatible insert"); + + // insert at begin and at the top stage + insert_stage = STAGE; + } else { + assert(insert_stage <= STAGE && "impossible insert stage"); + bool ret = compensate_insert_position_at(insert_stage, position); + assert(!ret); + } + } + + node_offset_t insert_size = insert_size_at(insert_stage, key, value); + + return {insert_stage, insert_size}; + } + + template + static const value_t* insert_new( + NodeExtentMutable& mut, const memory_range_t& range, + const full_key_t& key, const value_t& value) { + char* p_insert = const_cast(range.p_end); + const value_t* p_value = nullptr; + StagedAppender appender; + appender.init(&mut, p_insert); + appender.append(key, value, p_value); + const char* p_insert_front = appender.wrap(); + assert(p_insert_front == range.p_start); + return p_value; + } + + template + static const value_t* proceed_insert_recursively( + NodeExtentMutable& mut, const container_t& container, + const full_key_t& key, const value_t& value, + position_t& position, match_stage_t& stage, + node_offset_t& _insert_size, const char* p_left_bound) { + // proceed insert from right to left + assert(stage <= STAGE); + auto iter = iterator_t(container); + auto& index = position.index; + if (index == INDEX_END) { + iter.seek_last(); + } else { + iter.seek_till_end(index); + } + + bool do_insert = false; + if (stage == STAGE) { + if (index == INDEX_END) { + iter.set_end(); + } + do_insert = true; + } else { // stage < STAGE + if constexpr (SPLIT) { + if (iter.is_end()) { + // insert at the higher stage due to split + do_insert = true; + _insert_size = insert_size(key, value); + stage = STAGE; + } + } else { + assert(!iter.is_end()); + } + } + if (index == INDEX_END) { + index = iter.index(); + } + + if (do_insert) { + if constexpr (!IS_BOTTOM) { + position.nxt = position_t::nxt_t::begin(); + } + assert(_insert_size == insert_size(key, value)); + if constexpr (IS_BOTTOM) { + return iter.template insert( + mut, 
key, value, _insert_size, p_left_bound); + } else { + auto range = iter.template insert_prefix( + mut, key, _insert_size, p_left_bound); + return NXT_STAGE_T::template insert_new(mut, range, key, value); + } + } else { + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + auto p_value = NXT_STAGE_T::template proceed_insert_recursively( + mut, nxt_container, key, value, + position.nxt, stage, _insert_size, p_left_bound); + iter.update_size(mut, _insert_size); + return p_value; + } else { + assert(false && "impossible path"); + } + } + } + + template + static const value_t* proceed_insert( + NodeExtentMutable& mut, const container_t& container, + const full_key_t& key, const value_t& value, + position_t& position, match_stage_t& stage, node_offset_t& _insert_size) { + auto p_left_bound = container.p_left_bound(); + if (unlikely(!container.keys())) { + assert(position == position_t::end()); + assert(stage == STAGE); + position = position_t::begin(); + if constexpr (IS_BOTTOM) { + return container_t::template insert_at( + mut, container, key, value, 0, _insert_size, p_left_bound); + } else { + auto range = container_t::template insert_prefix_at( + mut, container, key, 0, _insert_size, p_left_bound); + return NXT_STAGE_T::template insert_new(mut, range, key, value); + } + } else { + return proceed_insert_recursively( + mut, container, key, value, + position, stage, _insert_size, p_left_bound); + } + } + + /* + * Lookup interfaces + */ + + static void lookup_largest_normalized( + const container_t& container, + search_position_t& position, + const value_t*& p_value) { + if constexpr (STAGE == STAGE_LEFT) { + lookup_largest(container, position, p_value); + return; + } + position.index = 0; + auto& pos_nxt = position.nxt; + if constexpr (STAGE == STAGE_STRING) { + lookup_largest(container, pos_nxt, p_value); + return; + } + pos_nxt.index = 0; + auto& pos_nxt_nxt = pos_nxt.nxt; + if constexpr (STAGE == STAGE_RIGHT) { + lookup_largest(container, 
pos_nxt_nxt, p_value); + return; + } + assert(false); + } + + static staged_result_t lower_bound_normalized( + const container_t& container, + const full_key_t& key, + MatchHistory& history) { + auto&& result = lower_bound(container, key, history); +#ifndef NDEBUG + if (result.is_end()) { + assert(result.mstat == MSTAT_END); + } else { + full_key_t index; + get_key_view(container, result.position, index); + assert_mstat(key, index, result.mstat); + } +#endif + if constexpr (container_t::FIELD_TYPE == field_type_t::N0) { + // currently only internal node checks mstat + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (result.mstat == MSTAT_NE2) { + auto cmp = compare_to( + key, container[result.position.index].shard_pool); + assert(cmp != MatchKindCMP::PO); + if (cmp != MatchKindCMP::EQ) { + result.mstat = MSTAT_NE3; + } + } + } + } + return normalize(std::move(result)); + } + + static std::ostream& dump(const container_t& container, + std::ostream& os, + const std::string& prefix, + size_t& size, + const char* p_start) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + std::string prefix_blank(prefix.size(), ' '); + const std::string* p_prefix = &prefix; + size += iterator_t::header_size(); + do { + std::ostringstream sos; + sos << *p_prefix << iter.get_key() << ": "; + std::string i_prefix = sos.str(); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + size += iter.size_to_nxt(); + NXT_STAGE_T::dump(nxt_container, os, i_prefix, size, p_start); + } else { + auto value_ptr = iter.get_p_value(); + int offset = reinterpret_cast(value_ptr) - p_start; + size += iter.size(); + os << "\n" << i_prefix; + if constexpr (NODE_TYPE == node_type_t::LEAF) { + os << *value_ptr; + } else { + os << "0x" << std::hex << *value_ptr << std::dec; + } + os << " " << size << "B" + << " @" << offset << "B"; + } + if (iter.is_last()) { + break; + } else { + ++iter; + p_prefix = &prefix_blank; + } + } while (true); + return os; + } + + 
struct _BaseEmpty {}; + class _BaseWithNxtIterator { + protected: + typename NXT_STAGE_T::StagedIterator _nxt; + }; + class StagedIterator + : std::conditional_t { + public: + StagedIterator() = default; + bool valid() const { return iter.has_value(); } + size_t index() const { + return iter->index(); + } + bool is_end() const { return iter->is_end(); } + bool in_progress() const { + assert(valid()); + if constexpr (!IS_BOTTOM) { + if (this->_nxt.valid()) { + if (this->_nxt.index() == 0) { + return this->_nxt.in_progress(); + } else { + return true; + } + } else { + return false; + } + } else { + return false; + } + } + key_get_type get_key() const { return iter->get_key(); } + + iterator_t& get() { return *iter; } + void set(const container_t& container) { + assert(!valid()); + iter = iterator_t(container); + } + void set_end() { iter->set_end(); } + typename NXT_STAGE_T::StagedIterator& nxt() { + if constexpr (!IS_BOTTOM) { + if (!this->_nxt.valid()) { + auto nxt_container = iter->get_nxt_container(); + this->_nxt.set(nxt_container); + } + return this->_nxt; + } else { + assert(false); + } + } + typename NXT_STAGE_T::StagedIterator& get_nxt() { + if constexpr (!IS_BOTTOM) { + return this->_nxt; + } else { + assert(false); + } + } + StagedIterator& operator++() { + if (iter->is_last()) { + iter->set_end(); + } else { + ++(*iter); + } + if constexpr (!IS_BOTTOM) { + this->_nxt.reset(); + } + return *this; + } + void reset() { + if (valid()) { + iter.reset(); + if constexpr (!IS_BOTTOM) { + this->_nxt.reset(); + } + } + } + std::ostream& print(std::ostream& os, bool is_top) const { + if (valid()) { + if (iter->is_end()) { + return os << "END"; + } else { + os << index(); + } + } else { + if (is_top) { + return os << "invalid StagedIterator!"; + } else { + os << "0!"; + } + } + if constexpr (!IS_BOTTOM) { + os << ", "; + return this->_nxt.print(os, false); + } else { + return os; + } + } + position_t get_pos() const { + if (valid()) { + if constexpr (IS_BOTTOM) { + 
return position_t{index()}; + } else { + return position_t{index(), this->_nxt.get_pos()}; + } + } else { + return position_t::begin(); + } + } + friend std::ostream& operator<<(std::ostream& os, const StagedIterator& iter) { + return iter.print(os, true); + } + private: + std::optional iter; + size_t end_index; + }; + + static void recursively_locate_split( + size_t& current_size, size_t extra_size, + size_t target_size, StagedIterator& split_at) { + assert(current_size <= target_size); + iterator_t& iter = split_at.get(); + current_size = iter.seek_split(current_size, extra_size, target_size); + if constexpr (!IS_BOTTOM) { + NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + iter.size_to_nxt(), + target_size, split_at.nxt()); + } + } + + static void recursively_locate_split_inserted( + size_t& current_size, size_t extra_size, size_t target_size, + position_t& i_position, match_stage_t i_stage, size_t i_size, + std::optional& i_to_left, StagedIterator& split_at) { + assert(current_size <= target_size); + assert(!i_to_left.has_value()); + iterator_t& iter = split_at.get(); + auto& i_index = i_position.index; + if (i_stage == STAGE) { + current_size = iter.template seek_split_inserted( + current_size, extra_size, target_size, + i_index, i_size, i_to_left); + assert(i_to_left.has_value()); + if (*i_to_left == false && iter.index() == i_index) { + // ...[s_index-1] |!| (i_index) [s_index]... 
+ return; + } + assert(!iter.is_end()); + if (iter.index() == 0) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + } else { + if constexpr (!IS_BOTTOM) { + assert(i_stage < STAGE); + current_size = iter.template seek_split_inserted( + current_size, extra_size, target_size, + i_index, i_size, i_to_left); + assert(!iter.is_end()); + if (iter.index() == 0) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + if (!i_to_left.has_value()) { + assert(iter.index() == i_index); + NXT_STAGE_T::recursively_locate_split_inserted( + current_size, extra_size + iter.size_to_nxt(), target_size, + i_position.nxt, i_stage, i_size, i_to_left, split_at.nxt()); + assert(i_to_left.has_value()); + return; + } + } else { + assert(false && "impossible path"); + } + } + if constexpr (!IS_BOTTOM) { + NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + iter.size_to_nxt(), + target_size, split_at.nxt()); + } + return; + } + + static bool locate_split( + const container_t& container, size_t target_size, + position_t& i_position, match_stage_t i_stage, size_t i_size, + StagedIterator& split_at) { + split_at.set(container); + size_t current_size = 0; + std::optional i_to_left; + recursively_locate_split_inserted( + current_size, 0, target_size, + i_position, i_stage, i_size, i_to_left, split_at); + std::cout << " locate_split(): size_to_left=" << current_size + << ", target_split_size=" << target_size + << ", original_size=" << container.size_before(container.keys()) + << std::endl; + assert(current_size <= target_size); + return *i_to_left; + } + + /* + * container appender type system + * container_t::Appender(NodeExtentMutable& mut, char* p_append) + * append(const container_t& src, size_t from, size_t items) + * wrap() -> char* + * IF !IS_BOTTOM: + * open_nxt(const key_get_type&) + * open_nxt(const full_key_t&) + * -> std::tuple + * wrap_nxt(char* p_append) + * ELSE + * append(const full_key_t& key, const value_t& value) 
+ */ + template + struct _BaseWithNxtAppender { + typename NXT_STAGE_T::template StagedAppender _nxt; + }; + template + class StagedAppender + : std::conditional_t> { + public: + StagedAppender() = default; + ~StagedAppender() { + assert(!require_wrap_nxt); + assert(!valid()); + } + bool valid() const { return appender.has_value(); } + size_t index() const { + assert(valid()); + return _index; + } + bool in_progress() const { return require_wrap_nxt; } + // TODO: pass by reference + void init(NodeExtentMutable* p_mut, char* p_start) { + assert(!valid()); + appender = typename container_t::template Appender(p_mut, p_start); + _index = 0; + } + // possible to make src_iter end if + // to_index == INDEX_END && to_stage == STAGE + void append_until( + StagedIterator& src_iter, size_t& to_index, match_stage_t to_stage) { + assert(!require_wrap_nxt); + assert(to_stage <= STAGE); + auto s_index = src_iter.index(); + src_iter.get().template copy_out_until(*appender, to_index, to_stage); + assert(src_iter.index() == to_index); + assert(to_index >= s_index); + auto increment = (to_index - s_index); + if (increment) { + _index += increment; + if constexpr (!IS_BOTTOM) { + src_iter.get_nxt().reset(); + } + } + } + void append(const full_key_t& key, + const value_t& value, const value_t*& p_value) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + auto& nxt = open_nxt(key); + nxt.append(key, value, p_value); + wrap_nxt(); + } else { + appender->append(key, value, p_value); + ++_index; + } + } + char* wrap() { + assert(valid()); + assert(_index > 0); + if constexpr (!IS_BOTTOM) { + if (require_wrap_nxt) { + wrap_nxt(); + } + } + auto ret = appender->wrap(); + appender.reset(); + return ret; + } + typename NXT_STAGE_T::template StagedAppender& + open_nxt(key_get_type paritial_key) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + require_wrap_nxt = true; + auto [p_mut, p_append] = appender->open_nxt(paritial_key); + this->_nxt.init(p_mut, p_append); + 
return this->_nxt; + } else { + assert(false); + } + } + typename NXT_STAGE_T::template StagedAppender& + open_nxt(const full_key_t& key) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + require_wrap_nxt = true; + auto [p_mut, p_append] = appender->open_nxt(key); + this->_nxt.init(p_mut, p_append); + return this->_nxt; + } else { + assert(false); + } + } + typename NXT_STAGE_T::template StagedAppender& get_nxt() { + if constexpr (!IS_BOTTOM) { + assert(require_wrap_nxt); + return this->_nxt; + } else { + assert(false); + } + } + void wrap_nxt() { + if constexpr (!IS_BOTTOM) { + assert(require_wrap_nxt); + require_wrap_nxt = false; + auto p_append = this->_nxt.wrap(); + appender->wrap_nxt(p_append); + ++_index; + } else { + assert(false); + } + } + private: + std::optional> appender; + size_t _index; + bool require_wrap_nxt = false; + }; + + template + static void _append_range(StagedIterator& src_iter, StagedAppender& appender, + size_t& to_index, match_stage_t stage) { + if (src_iter.is_end()) { + assert(to_index == INDEX_END); + assert(stage == STAGE); + to_index = src_iter.index(); + } else if constexpr (!IS_BOTTOM) { + if (appender.in_progress()) { + // we are in the progress of appending + auto to_index_nxt = INDEX_END; + NXT_STAGE_T::template _append_range( + src_iter.nxt(), appender.get_nxt(), + to_index_nxt, STAGE - 1); + ++src_iter; + appender.wrap_nxt(); + } else if (src_iter.in_progress()) { + // cannot append the current item as-a-whole + auto to_index_nxt = INDEX_END; + NXT_STAGE_T::template _append_range( + src_iter.nxt(), appender.open_nxt(src_iter.get_key()), + to_index_nxt, STAGE - 1); + ++src_iter; + appender.wrap_nxt(); + } + } + appender.append_until(src_iter, to_index, stage); + } + + template + static void _append_into(StagedIterator& src_iter, StagedAppender& appender, + position_t& position, match_stage_t stage) { + // reaches the last item + if (stage == STAGE) { + // done, end recursion + if constexpr (!IS_BOTTOM) { + 
position.nxt = position_t::nxt_t::begin(); + } + } else { + assert(stage < STAGE); + // process append in the next stage + NXT_STAGE_T::template append_until( + src_iter.nxt(), appender.open_nxt(src_iter.get_key()), + position.nxt, stage); + } + } + + template + static void append_until(StagedIterator& src_iter, StagedAppender& appender, + position_t& position, match_stage_t stage) { + size_t from_index = src_iter.index(); + size_t& to_index = position.index; + assert(from_index <= to_index); + if constexpr (IS_BOTTOM) { + assert(stage == STAGE); + appender.append_until(src_iter, to_index, stage); + } else { + assert(stage <= STAGE); + if (src_iter.index() == to_index) { + _append_into(src_iter, appender, position, stage); + } else { + _append_range(src_iter, appender, to_index, stage); + _append_into(src_iter, appender, position, stage); + } + } + to_index -= from_index; + } + + template + static bool append_insert( + const full_key_t& key, const value_t& value, + StagedIterator& src_iter, StagedAppender& appender, + bool is_front_insert, match_stage_t& stage, const value_t*& p_value) { + assert(src_iter.valid()); + if (stage == STAGE) { + appender.append(key, value, p_value); + if (src_iter.is_end()) { + return true; + } else { + return false; + } + } else { + assert(stage < STAGE); + if constexpr (!IS_BOTTOM) { + auto nxt_is_end = NXT_STAGE_T::template append_insert( + key, value, src_iter.get_nxt(), appender.get_nxt(), + is_front_insert, stage, p_value); + if (nxt_is_end) { + appender.wrap_nxt(); + ++src_iter; + if (is_front_insert) { + stage = STAGE; + } + if (src_iter.is_end()) { + return true; + } + } + return false; + } else { + assert(false && "impossible path"); + } + } + } + + static std::tuple + recursively_trim(NodeExtentMutable& mut, StagedIterator& trim_at) { + if (!trim_at.valid()) { + return {TrimType::BEFORE, 0u}; + } + if (trim_at.is_end()) { + return {TrimType::AFTER, 0u}; + } + + auto& iter = trim_at.get(); + if constexpr (!IS_BOTTOM) { + auto 
[type, trimmed] = NXT_STAGE_T::recursively_trim( + mut, trim_at.get_nxt()); + size_t trim_size; + if (type == TrimType::AFTER) { + if (iter.is_last()) { + return {TrimType::AFTER, 0u}; + } + ++trim_at; + trim_size = iter.trim_until(mut); + } else if (type == TrimType::BEFORE) { + if (iter.index() == 0) { + return {TrimType::BEFORE, 0u}; + } + trim_size = iter.trim_until(mut); + } else { + trim_size = iter.trim_at(mut, trimmed); + } + return {TrimType::AT, trim_size}; + } else { + if (iter.index() == 0) { + return {TrimType::BEFORE, 0u}; + } else { + auto trimmed = iter.trim_until(mut); + return {TrimType::AT, trimmed}; + } + } + } + + static void trim(NodeExtentMutable& mut, StagedIterator& trim_at) { + auto [type, trimmed] = recursively_trim(mut, trim_at); + if (type == TrimType::AFTER) { + auto& iter = trim_at.get(); + assert(iter.is_end()); + iter.trim_until(mut); + } + } +}; + +template struct _node_to_stage_t; +template +struct _node_to_stage_t> { + using type = staged>; +}; +template +struct _node_to_stage_t> { + using type = staged>; +}; +template +struct _node_to_stage_t> { + using type = staged>; +}; +template +using node_to_stage_t = typename _node_to_stage_t::type; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h new file mode 100644 index 00000000000..c0df7856bc2 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h @@ -0,0 +1,277 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include + +#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h" + +namespace crimson::os::seastore::onode { + +using match_stage_t = uint8_t; +constexpr match_stage_t STAGE_LEFT = 2u; // shard/pool/crush +constexpr match_stage_t STAGE_STRING = 1u; // nspace/oid +constexpr match_stage_t STAGE_RIGHT = 0u; // snap/gen 
+constexpr auto STAGE_TOP = STAGE_LEFT; +constexpr auto STAGE_BOTTOM = STAGE_RIGHT; + +// TODO: replace by +// using match_history_t = int8_t; +// left_m, str_m, right_m +// 3: PO, +// 2: EQ, PO, +// 1: EQ, EQ, PO +// 0: EQ, EQ, EQ +// -1: EQ, EQ, NE +// -2: EQ, NE, +// -3: NE, + +struct MatchHistory { + template + const std::optional& get() const { + static_assert(STAGE >= STAGE_BOTTOM && STAGE <= STAGE_TOP); + if constexpr (STAGE == STAGE_RIGHT) { + return right_match; + } else if (STAGE == STAGE_STRING) { + return string_match; + } else { + return left_match; + } + } + + const std::optional& + get_by_stage(match_stage_t stage) const { + assert(stage >= STAGE_BOTTOM && stage <= STAGE_TOP); + if (stage == STAGE_RIGHT) { + return right_match; + } else if (stage == STAGE_STRING) { + return string_match; + } else { + return left_match; + } + } + + template + const bool is_PO() const; + + template + void set(MatchKindCMP match) { + static_assert(STAGE >= STAGE_BOTTOM && STAGE <= STAGE_TOP); + if constexpr (STAGE < STAGE_TOP) { + assert(*get() == MatchKindCMP::EQ); + } + assert(!get().has_value() || *get() != MatchKindCMP::EQ); + const_cast&>(get()) = match; + } + + std::optional left_match; + std::optional string_match; + std::optional right_match; +}; + +template +struct _check_PO_t { + static bool eval(const MatchHistory* history) { + return history->get() && + (*history->get() == MatchKindCMP::PO || + (*history->get() == MatchKindCMP::EQ && + _check_PO_t::eval(history))); + } +}; +template <> +struct _check_PO_t { + static bool eval(const MatchHistory* history) { + return history->get() && + *history->get() == MatchKindCMP::PO; + } +}; +template +const bool MatchHistory::is_PO() const { + static_assert(STAGE >= STAGE_BOTTOM && STAGE <= STAGE_TOP); + if constexpr (STAGE < STAGE_TOP) { + assert(get() == MatchKindCMP::EQ); + } + return _check_PO_t::eval(this); +} + +template +struct staged_position_t { + static_assert(STAGE > STAGE_BOTTOM && STAGE <= STAGE_TOP); + 
using me_t = staged_position_t; + using nxt_t = staged_position_t; + bool is_end() const { return index == INDEX_END; } + size_t& index_by_stage(match_stage_t stage) { + assert(stage <= STAGE); + if (STAGE == stage) { + return index; + } else { + return nxt.index_by_stage(stage); + } + } + + int cmp(const me_t& o) const { + if (index > o.index) { + return 1; + } else if (index < o.index) { + return -1; + } else { + return nxt.cmp(o.nxt); + } + } + bool operator>(const me_t& o) const { return cmp(o) > 0; } + bool operator>=(const me_t& o) const { return cmp(o) >= 0; } + bool operator<(const me_t& o) const { return cmp(o) < 0; } + bool operator<=(const me_t& o) const { return cmp(o) <= 0; } + bool operator==(const me_t& o) const { return cmp(o) == 0; } + bool operator!=(const me_t& o) const { return cmp(o) != 0; } + + me_t& operator-=(const me_t& o) { + assert(o.index != INDEX_END); + assert(index >= o.index); + if (index != INDEX_END) { + index -= o.index; + if (index == 0) { + nxt -= o.nxt; + } + } + return *this; + } + + static me_t begin() { return {0u, nxt_t::begin()}; } + static me_t end() { + return {INDEX_END, nxt_t::end()}; + } + + size_t index; + nxt_t nxt; +}; +template +std::ostream& operator<<(std::ostream& os, const staged_position_t& pos) { + if (pos.index == INDEX_END) { + os << "END"; + } else { + os << pos.index; + } + return os << ", " << pos.nxt; +} + +template <> +struct staged_position_t { + using me_t = staged_position_t; + bool is_end() const { return index == INDEX_END; } + size_t& index_by_stage(match_stage_t stage) { + assert(stage == STAGE_BOTTOM); + return index; + } + + int cmp(const staged_position_t& o) const { + if (index > o.index) { + return 1; + } else if (index < o.index) { + return -1; + } else { + return 0; + } + } + bool operator>(const me_t& o) const { return cmp(o) > 0; } + bool operator>=(const me_t& o) const { return cmp(o) >= 0; } + bool operator<(const me_t& o) const { return cmp(o) < 0; } + bool operator<=(const me_t& o) 
const { return cmp(o) <= 0; } + bool operator==(const me_t& o) const { return cmp(o) == 0; } + bool operator!=(const me_t& o) const { return cmp(o) != 0; } + + me_t& operator-=(const me_t& o) { + assert(o.index != INDEX_END); + assert(index >= o.index); + if (index != INDEX_END) { + index -= o.index; + } + return *this; + } + + static me_t begin() { return {0u}; } + static me_t end() { return {INDEX_END}; } + + size_t index; +}; +template <> +inline std::ostream& operator<<(std::ostream& os, const staged_position_t& pos) { + if (pos.index == INDEX_END) { + return os << "END"; + } else { + return os << pos.index; + } +} + +using search_position_t = staged_position_t; + +template > +const search_position_t& cast_down(const search_position_t& pos) { return pos; } + +template > +const staged_position_t& cast_down(const search_position_t& pos) { + if constexpr (STAGE == STAGE_STRING) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(pos.nxt.is_end()); + } else { + assert(pos.index == 0u); + } +#endif + return pos.nxt; + } else if (STAGE == STAGE_RIGHT) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(pos.nxt.nxt.is_end()); + } else { + assert(pos.index == 0u); + assert(pos.nxt.index == 0u); + } +#endif + return pos.nxt.nxt; + } else { + assert(false); + } +} + +template +staged_position_t& cast_down(search_position_t& pos) { + const search_position_t& _pos = pos; + return const_cast&>(cast_down(_pos)); +} + +inline search_position_t&& normalize(search_position_t&& pos) { return std::move(pos); } + +template > +search_position_t normalize(staged_position_t&& pos) { + if (pos.is_end()) { + return search_position_t::end(); + } + if constexpr (STAGE == STAGE_STRING) { + return {0u, std::move(pos)}; + } else if (STAGE == STAGE_RIGHT) { + return {0u, {0u, std::move(pos)}}; + } else { + assert(false); + } +} + +struct memory_range_t { + const char* p_start; + const char* p_end; +}; + +enum class ContainerType { ITERATIVE, INDEXABLE }; + +struct onode_t; + +template struct 
value_type; +template<> struct value_type { using type = laddr_t; }; +template<> struct value_type { using type = onode_t; }; +template +using value_type_t = typename value_type::type; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc new file mode 100644 index 00000000000..06263c53ebd --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc @@ -0,0 +1,201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "sub_items_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +template +const laddr_t* internal_sub_items_t::insert_at( + NodeExtentMutable& mut, const internal_sub_items_t& sub_items, + const full_key_t& key, const laddr_t& value, + size_t index, node_offset_t size, const char* p_left_bound) { + assert(index <= sub_items.keys()); + assert(size == estimate_insert(key, value)); + const char* p_shift_start = p_left_bound; + const char* p_shift_end = reinterpret_cast( + sub_items.p_first_item + 1 - index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size); + + auto p_insert = const_cast(p_shift_end) - size; + auto item = internal_sub_item_t{snap_gen_t::from_key(key), value}; + mut.copy_in_absolute(p_insert, item); + return &reinterpret_cast(p_insert)->value; +} +template const laddr_t* internal_sub_items_t::insert_at( + NodeExtentMutable&, const internal_sub_items_t&, const full_key_t&, + const laddr_t&, size_t, node_offset_t, const char*); + +size_t internal_sub_items_t::trim_until( + NodeExtentMutable&, internal_sub_items_t& items, size_t index) { + assert(index != 0); + auto keys = items.keys(); + assert(index <= keys); + return sizeof(internal_sub_item_t) * (keys - index); +} + +template class 
internal_sub_items_t::Appender; +template class internal_sub_items_t::Appender; + +template +void internal_sub_items_t::Appender::append( + const internal_sub_items_t& src, size_t from, size_t items) { + assert(from <= src.keys()); + if (items == 0) { + return; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + node_offset_t size = sizeof(internal_sub_item_t) * items; + p_append -= size; + p_mut->copy_in_absolute(p_append, src.p_first_item + 1 - from - items, size); +} + +template +void internal_sub_items_t::Appender::append( + const full_key_t& key, const laddr_t& value, const laddr_t*& p_value) { + assert(pp_value == nullptr); + p_append -= sizeof(internal_sub_item_t); + auto item = internal_sub_item_t{snap_gen_t::from_key(key), value}; + p_mut->copy_in_absolute(p_append, item); + p_value = &reinterpret_cast(p_append)->value; +} + +template +const onode_t* leaf_sub_items_t::insert_at( + NodeExtentMutable& mut, const leaf_sub_items_t& sub_items, + const full_key_t& key, const onode_t& value, + size_t index, node_offset_t size, const char* p_left_bound) { + assert(index <= sub_items.keys()); + assert(size == estimate_insert(key, value)); + // a. [... item(index)] << size + const char* p_shift_start = p_left_bound; + const char* p_shift_end = sub_items.get_item_end(index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size); + + // b. insert item + auto p_insert = const_cast(p_shift_end - size); + auto p_value = reinterpret_cast(p_insert); + mut.copy_in_absolute(p_insert, &value, value.size); + p_insert += value.size; + mut.copy_in_absolute(p_insert, snap_gen_t::template from_key(key)); + assert(p_insert + sizeof(snap_gen_t) + sizeof(node_offset_t) == p_shift_end); + + // c. 
compensate affected offsets + auto item_size = value.size + sizeof(snap_gen_t); + for (auto i = index; i < sub_items.keys(); ++i) { + const node_offset_t& offset_i = sub_items.get_offset(i); + mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i + item_size)); + } + + // d. [item(index-1) ... item(0) ... offset(index)] <<< sizeof(node_offset_t) + const char* p_offset = (index == 0 ? + (const char*)&sub_items.get_offset(0) + sizeof(node_offset_t) : + (const char*)&sub_items.get_offset(index - 1)); + p_shift_start = p_shift_end; + p_shift_end = p_offset; + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)sizeof(node_offset_t)); + + // e. insert offset + node_offset_t offset_to_item_start = item_size + sub_items.get_offset_to_end(index); + mut.copy_in_absolute( + const_cast(p_shift_end) - sizeof(node_offset_t), offset_to_item_start); + + // f. update num_sub_keys + mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() + 1)); + + return p_value; +} +template const onode_t* leaf_sub_items_t::insert_at( + NodeExtentMutable&, const leaf_sub_items_t&, const full_key_t&, + const onode_t&, size_t, node_offset_t, const char*); + +size_t leaf_sub_items_t::trim_until( + NodeExtentMutable& mut, leaf_sub_items_t& items, size_t index) { + assert(index != 0); + auto keys = items.keys(); + assert(index <= keys); + if (index == keys) { + return 0; + } + size_t trim_items = keys - index; + const char* p_items_start = items.p_start(); + const char* p_shift_start = items.get_item_end(index); + const char* p_shift_end = items.get_item_end(0); + size_t size_trim_offsets = sizeof(node_offset_t) * trim_items; + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, + size_trim_offsets); + mut.copy_in_absolute((void*)items.p_num_keys, num_keys_t(index)); + return size_trim_offsets + (p_shift_start - p_items_start); +} + +// helper type for the visitor +template struct overloaded : Ts... 
{ using Ts::operator()...; }; +// explicit deduction guide +template overloaded(Ts...) -> overloaded; + +template class leaf_sub_items_t::Appender; +template class leaf_sub_items_t::Appender; + +template +char* leaf_sub_items_t::Appender::wrap() { + auto p_cur = p_append; + num_keys_t num_keys = 0; + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { num_keys += arg.items; }, + [&] (const kv_item_t& arg) { ++num_keys; } + }, a); + } + assert(num_keys); + p_cur -= sizeof(num_keys_t); + p_mut->copy_in_absolute(p_cur, num_keys); + + node_offset_t last_offset = 0; + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { + int compensate = (last_offset - op_src->get_offset_to_end(arg.from)); + node_offset_t offset; + for (auto i = arg.from; i < arg.from + arg.items; ++i) { + offset = op_src->get_offset(i) + compensate; + p_cur -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur, offset); + } + last_offset = offset; + }, + [&] (const kv_item_t& arg) { + last_offset += sizeof(snap_gen_t) + arg.p_value->size; + p_cur -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur, last_offset); + } + }, a); + } + + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { + auto _p_start = op_src->get_item_end(arg.from + arg.items); + size_t _len = op_src->get_item_end(arg.from) - _p_start; + p_cur -= _len; + p_mut->copy_in_absolute(p_cur, _p_start, _len); + }, + [&] (const kv_item_t& arg) { + assert(pp_value); + p_cur -= sizeof(snap_gen_t); + p_mut->copy_in_absolute(p_cur, snap_gen_t::template from_key(*arg.p_key)); + p_cur -= arg.p_value->size; + p_mut->copy_in_absolute(p_cur, arg.p_value, arg.p_value->size); + *pp_value = reinterpret_cast(p_cur); + } + }, a); + } + return p_cur; +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h 
b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h new file mode 100644 index 00000000000..124a47b035d --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h @@ -0,0 +1,278 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +struct internal_sub_item_t { + const snap_gen_t& get_key() const { return key; } + #pragma GCC diagnostic ignored "-Waddress-of-packed-member" + const laddr_t* get_p_value() const { return &value; } + + snap_gen_t key; + laddr_t value; +} __attribute__((packed)); + +/* + * internal node N0, N1, N2 + * + * p_first_item + + * (num_items) | + * V + * | fix|sub |fix|sub | + * |...key|addr|key|addr| + * | 1 |1 |0 |0 | + */ +class internal_sub_items_t { + public: + using num_keys_t = size_t; + + internal_sub_items_t(const memory_range_t& range) { + assert(range.p_start < range.p_end); + assert((range.p_end - range.p_start) % sizeof(internal_sub_item_t) == 0); + num_items = (range.p_end - range.p_start) / sizeof(internal_sub_item_t); + assert(num_items > 0); + auto _p_first_item = range.p_end - sizeof(internal_sub_item_t); + p_first_item = reinterpret_cast(_p_first_item); + } + + // container type system + using key_get_type = const snap_gen_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + num_keys_t keys() const { return num_items; } + key_get_type operator[](size_t index) const { + assert(index < num_items); + return (p_first_item - index)->get_key(); + } + size_t size_before(size_t index) const { + return index * sizeof(internal_sub_item_t); + } + const laddr_t* get_p_value(size_t index) const { + assert(index < num_items); + return (p_first_item - index)->get_p_value(); + } + + 
static node_offset_t header_size() { return 0u; } + + template + static node_offset_t estimate_insert(const full_key_t&, const laddr_t&) { + return sizeof(internal_sub_item_t); + } + + template + static const laddr_t* insert_at( + NodeExtentMutable&, const internal_sub_items_t&, + const full_key_t&, const laddr_t&, + size_t index, node_offset_t size, const char* p_left_bound); + + static size_t trim_until(NodeExtentMutable&, internal_sub_items_t&, size_t); + + template + class Appender; + + private: + size_t num_items; + const internal_sub_item_t* p_first_item; +}; + +template +class internal_sub_items_t::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} {} + void append(const internal_sub_items_t& src, size_t from, size_t items); + void append(const full_key_t&, const laddr_t&, const laddr_t*&); + char* wrap() { return p_append; } + private: + const laddr_t** pp_value = nullptr; + NodeExtentMutable* p_mut; + char* p_append; +}; + +/* + * leaf node N0, N1, N2 + * + * p_num_keys -----------------+ + * p_offsets --------------+ | + * p_items_end -----+ | | + * | | | + * V V V + * | fix|o- |fix| off|off|num | + * |...key|node|key|...set|set|sub | + * | 1 |0 |0 | 1 |0 |keys| + * ^ | | + * | | | + * +--------+ <=====+ + */ +class leaf_sub_items_t { + public: + // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), + // and the minimal size of onode_t + using num_keys_t = uint8_t; + + leaf_sub_items_t(const memory_range_t& range) { + assert(range.p_start < range.p_end); + auto _p_num_keys = range.p_end - sizeof(num_keys_t); + assert(range.p_start < _p_num_keys); + p_num_keys = reinterpret_cast(_p_num_keys); + assert(keys()); + auto _p_offsets = _p_num_keys - sizeof(node_offset_t); + assert(range.p_start < _p_offsets); + p_offsets = reinterpret_cast(_p_offsets); + p_items_end = reinterpret_cast(&get_offset(keys() - 1)); + assert(range.p_start < p_items_end); + assert(range.p_start == p_start()); + } + + bool 
operator==(const leaf_sub_items_t& x) { + return (p_num_keys == x.p_num_keys && + p_offsets == x.p_offsets && + p_items_end == x.p_items_end); + } + + const char* p_start() const { return get_item_end(keys()); } + + const node_offset_t& get_offset(size_t index) const { + assert(index < keys()); + return *(p_offsets - index); + } + + const node_offset_t get_offset_to_end(size_t index) const { + assert(index <= keys()); + return index == 0 ? 0 : get_offset(index - 1); + } + + const char* get_item_start(size_t index) const { + return p_items_end - get_offset(index); + } + + const char* get_item_end(size_t index) const { + return p_items_end - get_offset_to_end(index); + } + + // container type system + using key_get_type = const snap_gen_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + num_keys_t keys() const { return *p_num_keys; } + key_get_type operator[](size_t index) const { + assert(index < keys()); + auto pointer = get_item_end(index); + assert(get_item_start(index) < pointer); + pointer -= sizeof(snap_gen_t); + assert(get_item_start(index) < pointer); + return *reinterpret_cast(pointer); + } + size_t size_before(size_t index) const { + assert(index <= keys()); + if (index == 0) { + return sizeof(num_keys_t); + } + --index; + auto ret = sizeof(num_keys_t) + + (index + 1) * sizeof(node_offset_t) + + get_offset(index); + return ret; + } + const onode_t* get_p_value(size_t index) const { + assert(index < keys()); + auto pointer = get_item_start(index); + auto value = reinterpret_cast(pointer); + assert(pointer + value->size + sizeof(snap_gen_t) == get_item_end(index)); + return value; + } + + static node_offset_t header_size() { return sizeof(num_keys_t); } + + template + static node_offset_t estimate_insert(const full_key_t&, const onode_t& value) { + return value.size + sizeof(snap_gen_t) + sizeof(node_offset_t); + } + + template + static const onode_t* insert_at( + NodeExtentMutable&, const leaf_sub_items_t&, + const full_key_t&, const 
onode_t&, + size_t index, node_offset_t size, const char* p_left_bound); + + static size_t trim_until(NodeExtentMutable&, leaf_sub_items_t&, size_t index); + + template + class Appender; + + private: + // TODO: support unaligned access + const num_keys_t* p_num_keys; + const node_offset_t* p_offsets; + const char* p_items_end; +}; + +auto constexpr APPENDER_LIMIT = 3u; + +template +class leaf_sub_items_t::Appender { + struct range_items_t { + size_t from; + size_t items; + }; + struct kv_item_t { + const full_key_t* p_key; + const onode_t* p_value; + }; + using var_t = std::variant; + + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} { + } + + void append(const leaf_sub_items_t& src, size_t from, size_t items) { + assert(cnt <= APPENDER_LIMIT); + assert(from <= src.keys()); + if (items == 0) { + return; + } + if (op_src) { + assert(*op_src == src); + } else { + op_src = src; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + appends[cnt] = range_items_t{from, items}; + ++cnt; + } + void append(const full_key_t& key, + const onode_t& value, const onode_t*& p_value) { + assert(pp_value == nullptr); + assert(cnt <= APPENDER_LIMIT); + appends[cnt] = kv_item_t{&key, &value}; + ++cnt; + pp_value = &p_value; + } + char* wrap(); + + private: + std::optional op_src; + const onode_t** pp_value = nullptr; + NodeExtentMutable* p_mut; + char* p_append; + var_t appends[APPENDER_LIMIT]; + size_t cnt = 0; +}; + +template struct _sub_items_t; +template<> struct _sub_items_t { using type = internal_sub_items_t; }; +template<> struct _sub_items_t { using type = leaf_sub_items_t; }; +template +using sub_items_t = typename _sub_items_t::type; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc new file mode 100644 index 00000000000..2828dd33e27 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc @@ 
-0,0 +1,26 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "super.h" +#include "node.h" + +namespace crimson::os::seastore::onode { + +Ref RootNodeTrackerIsolated::get_root(Transaction& t) const { + auto iter = tracked_supers.find(&t); + if (iter == tracked_supers.end()) { + return nullptr; + } else { + return iter->second->get_p_root(); + } +} + +Ref RootNodeTrackerShared::get_root(Transaction&) const { + if (is_clean()) { + return nullptr; + } else { + return tracked_super->get_p_root(); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.h b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h new file mode 100644 index 00000000000..04540d3db0e --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h @@ -0,0 +1,119 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include "crimson/common/type_helpers.h" + +#include "fwd.h" + +namespace crimson::os::seastore::onode { + +class Node; +class Super; +class RootNodeTracker { + public: + virtual ~RootNodeTracker() = default; + virtual bool is_clean() const = 0; + virtual Ref get_root(Transaction&) const = 0; + static RootNodeTrackerURef create(bool read_isolated); + protected: + RootNodeTracker() = default; + RootNodeTracker(const RootNodeTracker&) = delete; + RootNodeTracker(RootNodeTracker&&) = delete; + RootNodeTracker& operator=(const RootNodeTracker&) = delete; + RootNodeTracker& operator=(RootNodeTracker&&) = delete; + virtual void do_track_super(Transaction&, Super&) = 0; + virtual void do_untrack_super(Transaction&, Super&) = 0; + friend class Super; +}; + +class Super { + public: + using URef = std::unique_ptr; + Super(const Super&) = delete; + Super(Super&&) = delete; + Super& operator=(const Super&) = delete; + Super& operator=(Super&&) = delete; + virtual ~Super() { + assert(tracked_root_node == 
nullptr); + tracker.do_untrack_super(t, *this); + } + + virtual laddr_t get_root_laddr() const = 0; + virtual void write_root_laddr(context_t, laddr_t) = 0; + + void do_track_root(Node& root) { + assert(tracked_root_node == nullptr); + tracked_root_node = &root; + } + void do_untrack_root(Node& root) { + assert(tracked_root_node == &root); + tracked_root_node = nullptr; + } + Node* get_p_root() const { + assert(tracked_root_node != nullptr); + return tracked_root_node; + } + + protected: + Super(Transaction& t, RootNodeTracker& tracker) + : t{t}, tracker{tracker} { + tracker.do_track_super(t, *this); + } + + private: + Transaction& t; + RootNodeTracker& tracker; + Node* tracked_root_node = nullptr; +}; + +class RootNodeTrackerIsolated final : public RootNodeTracker { + public: + ~RootNodeTrackerIsolated() override { assert(is_clean()); } + protected: + bool is_clean() const override { + return tracked_supers.empty(); + } + void do_track_super(Transaction& t, Super& super) override { + assert(tracked_supers.find(&t) == tracked_supers.end()); + tracked_supers[&t] = &super; + } + void do_untrack_super(Transaction& t, Super& super) override { + auto removed = tracked_supers.erase(&t); + assert(removed); + } + ::Ref get_root(Transaction& t) const override; + std::map tracked_supers; +}; + +class RootNodeTrackerShared final : public RootNodeTracker { + public: + ~RootNodeTrackerShared() override { assert(is_clean()); } + protected: + bool is_clean() const override { + return tracked_super == nullptr; + } + void do_track_super(Transaction&, Super& super) override { + assert(is_clean()); + tracked_super = &super; + } + void do_untrack_super(Transaction&, Super& super) override { + assert(tracked_super == &super); + tracked_super = nullptr; + } + ::Ref get_root(Transaction&) const override; + Super* tracked_super = nullptr; +}; + +inline RootNodeTrackerURef RootNodeTracker::create(bool read_isolated) { + if (read_isolated) { + return RootNodeTrackerURef(new 
RootNodeTrackerIsolated()); + } else { + return RootNodeTrackerURef(new RootNodeTrackerShared()); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc new file mode 100644 index 00000000000..15ff343e1e6 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc @@ -0,0 +1,221 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tree.h" + +#include "node.h" +#include "node_extent_manager.h" +#include "stages/key_layout.h" // full_key_t +#include "super.h" + +namespace crimson::os::seastore::onode { + +using btree_ertr = Btree::btree_ertr; +template +using btree_future = Btree::btree_future; +using Cursor = Btree::Cursor; + +Cursor::Cursor(Btree* p_tree, Ref _p_cursor) + : p_tree(p_tree) { + if (_p_cursor->is_end()) { + // for cursors indicating end of tree untrack the leaf node + } else { + p_cursor = _p_cursor; + } +} +Cursor::Cursor(Btree* p_tree) : p_tree{p_tree} {} +Cursor::Cursor(const Cursor&) = default; +Cursor::Cursor(Cursor&&) noexcept = default; +Cursor& Cursor::operator=(const Cursor&) = default; +Cursor& Cursor::operator=(Cursor&&) = default; +Cursor::~Cursor() = default; + +bool Cursor::is_end() const { + if (p_cursor) { + assert(!p_cursor->is_end()); + return false; + } else { + return true; + } +} + +const onode_key_t& Cursor::key() { + // TODO + assert(false && "not implemented"); +} + +const onode_t* Cursor::value() const { + return p_cursor->get_p_value(); +} + +bool Cursor::operator==(const Cursor& x) const { + return p_cursor == x.p_cursor; +} + +Cursor& Cursor::operator++() { + // TODO + return *this; +} + +Cursor Cursor::operator++(int) { + Cursor tmp = *this; + ++*this; + return tmp; +} + +Cursor Cursor::make_end(Btree* p_tree) { + return {p_tree}; +} + +Btree::Btree(NodeExtentManagerURef&& _nm) + : nm{std::move(_nm)}, + 
root_tracker{RootNodeTracker::create(nm->is_read_isolated())} {} + +Btree::~Btree() { assert(root_tracker->is_clean()); } + +btree_future<> Btree::mkfs(Transaction& t) { + return Node::mkfs(get_context(t), *root_tracker); +} + +btree_future Btree::begin(Transaction& t) { + return get_root(t).safe_then([this, &t](auto root) { + return root->lookup_smallest(get_context(t)); + }).safe_then([this](auto cursor) { + return Cursor{this, cursor}; + }); +} + +btree_future Btree::last(Transaction& t) { + return get_root(t).safe_then([this, &t](auto root) { + return root->lookup_largest(get_context(t)); + }).safe_then([this](auto cursor) { + return Cursor(this, cursor); + }); +} + +Cursor Btree::end() { + return Cursor::make_end(this); +} + +btree_future +Btree::contains(Transaction& t, const onode_key_t& key) { + return seastar::do_with( + full_key_t(key), + [this, &t](auto& key) -> btree_future { + return get_root(t).safe_then([this, &t, &key](auto root) { + // TODO: improve lower_bound() + return root->lower_bound(get_context(t), key); + }).safe_then([](auto result) { + return MatchKindBS::EQ == result.match; + }); + } + ); +} + +btree_future +Btree::find(Transaction& t, const onode_key_t& key) { + return seastar::do_with( + full_key_t(key), + [this, &t](auto& key) -> btree_future { + return get_root(t).safe_then([this, &t, &key](auto root) { + // TODO: improve lower_bound() + return root->lower_bound(get_context(t), key); + }).safe_then([this](auto result) { + if (result.match == MatchKindBS::EQ) { + return Cursor(this, result.p_cursor); + } else { + return Cursor::make_end(this); + } + }); + } + ); +} + +btree_future +Btree::lower_bound(Transaction& t, const onode_key_t& key) { + return seastar::do_with( + full_key_t(key), + [this, &t](auto& key) -> btree_future { + return get_root(t).safe_then([this, &t, &key](auto root) { + return root->lower_bound(get_context(t), key); + }).safe_then([this](auto result) { + return Cursor(this, result.p_cursor); + }); + } + ); +} + 
+btree_future> +Btree::insert(Transaction& t, const onode_key_t& key, const onode_t& value) { + return seastar::do_with( + full_key_t(key), + [this, &t, &value](auto& key) -> btree_future> { + return get_root(t).safe_then([this, &t, &key, &value](auto root) { + return root->insert(get_context(t), key, value); + }).safe_then([this](auto ret) { + auto& [cursor, success] = ret; + return std::make_pair(Cursor(this, cursor), success); + }); + } + ); +} + +btree_future Btree::erase(Transaction& t, const onode_key_t& key) { + // TODO + return btree_ertr::make_ready_future(0u); +} + +btree_future Btree::erase(Cursor& pos) { + // TODO + return btree_ertr::make_ready_future( + Cursor::make_end(this)); +} + +btree_future +Btree::erase(Cursor& first, Cursor& last) { + // TODO + return btree_ertr::make_ready_future( + Cursor::make_end(this)); +} + +btree_future Btree::height(Transaction& t) { + return get_root(t).safe_then([](auto root) { + return size_t(root->level() + 1); + }); +} + +std::ostream& Btree::dump(Transaction& t, std::ostream& os) { + auto root = root_tracker->get_root(t); + if (root) { + root->dump(os); + } else { + os << "empty tree!"; + } + return os; +} + +btree_future> Btree::get_root(Transaction& t) { + auto root = root_tracker->get_root(t); + if (root) { + return btree_ertr::make_ready_future>(root); + } else { + return Node::load_root(get_context(t), *root_tracker); + } +} + +bool Btree::test_is_clean() const { + return root_tracker->is_clean(); +} + +btree_future<> Btree::test_clone_from( + Transaction& t, Transaction& t_from, Btree& from) { + // Note: assume the tree to clone is tracked correctly in memory. + // In some unit tests, parts of the tree are stubbed out that they + // should not be loaded from NodeExtentManager. 
+ return from.get_root(t_from + ).safe_then([this, &t](auto root_from) { + return root_from->test_clone_root(get_context(t), *root_tracker); + }); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h new file mode 100644 index 00000000000..c188e5c6848 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h @@ -0,0 +1,108 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include "crimson/common/type_helpers.h" + +#include "fwd.h" +#include "tree_types.h" + +namespace crimson::os::seastore::onode { + +/* + * btree interfaces + * requirements are based on: + * ceph::os::Transaction::create/touch/remove() + * ceph::ObjectStore::collection_list() + * ceph::BlueStore::get_onode() + * db->get_iterator(PREFIIX_OBJ) by ceph::BlueStore::fsck() + */ +class Node; +class Btree { + public: + using btree_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + template + using btree_future = btree_ertr::future; + + Btree(NodeExtentManagerURef&& nm); + Btree(const Btree&) = delete; + Btree(Btree&&) = delete; + Btree& operator=(const Btree&) = delete; + Btree& operator=(Btree&&) = delete; + ~Btree(); + + btree_future<> mkfs(Transaction&); + + class Cursor; + // lookup + btree_future begin(Transaction&); + btree_future last(Transaction&); + Cursor end(); + // TODO: replace onode_key_t + btree_future contains(Transaction&, const onode_key_t&); + btree_future find(Transaction&, const onode_key_t&); + btree_future lower_bound(Transaction&, const onode_key_t&); + + // modifiers + // TODO: replace onode_t + btree_future> + insert(Transaction&, const onode_key_t&, const onode_t&); + btree_future erase(Transaction&, const onode_key_t& key); + btree_future erase(Cursor& pos); + btree_future 
erase(Cursor& first, Cursor& last); + + // stats + btree_future height(Transaction&); + std::ostream& dump(Transaction&, std::ostream&); + + // test_only + bool test_is_clean() const; + btree_future<> test_clone_from(Transaction& t, Transaction& t_from, Btree& from); + + private: + context_t get_context(Transaction& t) { return {*nm, t}; } + btree_future> get_root(Transaction& t); + + NodeExtentManagerURef nm; + RootNodeTrackerURef root_tracker; + + friend class DummyChildPool; +}; + +class tree_cursor_t; +class Btree::Cursor { + public: + Cursor(const Cursor&); + Cursor(Cursor&&) noexcept; + Cursor& operator=(const Cursor&); + Cursor& operator=(Cursor&&); + ~Cursor(); + + bool is_end() const; + const onode_key_t& key(); + const onode_t* value() const; + bool operator==(const Cursor& x) const; + bool operator!=(const Cursor& x) const { return !(*this == x); } + Cursor& operator++(); + Cursor operator++(int); + + private: + Cursor(Btree*, Ref); + Cursor(Btree*); + + static Cursor make_end(Btree*); + + Btree* p_tree; + Ref p_cursor; + + friend class Btree; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h new file mode 100644 index 00000000000..058df3bd632 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +namespace crimson::os::seastore::onode { + +// TODO: replace +struct onode_t { + // onode should be smaller than a node + uint16_t size; // address up to 64 KiB sized node + uint16_t id; + // omap, extent_map, inline data + + bool operator==(const onode_t& o) const { return size == o.size && id == o.id; } + bool operator!=(const onode_t& o) const { return !(*this == o); } +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const onode_t& node) { + return os 
<< "onode(" << node.id << ", " << node.size << "B)"; +} + +using shard_t = int8_t; +using pool_t = int64_t; +using crush_hash_t = uint32_t; +using snap_t = uint64_t; +using gen_t = uint64_t; + +// TODO: replace with ghobject_t +struct onode_key_t { + shard_t shard; + pool_t pool; + crush_hash_t crush; + std::string nspace; + std::string oid; + snap_t snap; + gen_t gen; + + int cmp(const onode_key_t& o) const { + auto l = std::tie(shard, pool, crush, nspace, oid, snap, gen); + auto r = std::tie(o.shard, o.pool, o.crush, o.nspace, o.oid, o.snap, o.gen); + if (l < r) { + return -1; + } else if (l > r) { + return 1; + } else { + return 0; + } + } + bool operator>(const onode_key_t& o) const { return cmp(o) > 0; } + bool operator>=(const onode_key_t& o) const { return cmp(o) >= 0; } + bool operator<(const onode_key_t& o) const { return cmp(o) < 0; } + bool operator<=(const onode_key_t& o) const { return cmp(o) <= 0; } + bool operator==(const onode_key_t& o) const { return cmp(o) == 0; } + bool operator!=(const onode_key_t& o) const { return cmp(o) != 0; } +}; +inline std::ostream& operator<<(std::ostream& os, const onode_key_t& key) { + os << "key(" << (unsigned)key.shard << "," << key.pool << "," << key.crush << "; "; + if (key.nspace.size() <= 12) { + os << "\"" << key.nspace << "\","; + } else { + os << "\"" << key.nspace.substr(0, 4) << ".." + << key.nspace.substr(key.nspace.size() - 2, 2) + << "/" << key.nspace.size() << "B\","; + } + if (key.oid.size() <= 12) { + os << "\"" << key.oid << "\"; "; + } else { + os << "\"" << key.oid.substr(0, 4) << ".." 
+ << key.oid.substr(key.oid.size() - 2, 2) + << "/" << key.oid.size() << "B\"; "; + } + os << key.snap << "," << key.gen << ")"; + return os; +} + +} diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index 9c20fc3f0e8..ff43b1e515b 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -57,6 +57,8 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t) return out << "EXTMAP_INNER"; case extent_types_t::EXTMAP_LEAF: return out << "EXTMAP_LEAF"; + case extent_types_t::ONODE_BLOCK_STAGED: + return out << "ONODE_BLOCK_STAGED"; case extent_types_t::TEST_BLOCK: return out << "TEST_BLOCK"; case extent_types_t::TEST_BLOCK_PHYSICAL: diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 56bf53b72fb..26875cbb3f7 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -269,6 +269,7 @@ enum class extent_types_t : uint8_t { ONODE_BLOCK = 3, EXTMAP_INNER = 4, EXTMAP_LEAF = 5, + ONODE_BLOCK_STAGED = 6, // Test Block Types TEST_BLOCK = 0xF0, diff --git a/src/test/crimson/seastore/onode_tree/CMakeLists.txt b/src/test/crimson/seastore/onode_tree/CMakeLists.txt index 4d6f414d9d0..84dca33353e 100644 --- a/src/test/crimson/seastore/onode_tree/CMakeLists.txt +++ b/src/test/crimson/seastore/onode_tree/CMakeLists.txt @@ -6,3 +6,10 @@ target_link_libraries(test-seastore-onode-tree-node GTest::Main crimson-os crimson-common) + +add_executable(unittest_staged_fltree + test_staged_fltree.cc + ../../gtest_seastar.cc) +add_ceph_unittest(unittest_staged_fltree) +target_link_libraries(unittest_staged_fltree + crimson-seastore) diff --git a/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc b/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc new file mode 100644 index 00000000000..dbcb256f2e9 --- /dev/null +++ b/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc @@ -0,0 
+1,983 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include +#include +#include + +#include "crimson/common/log.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_impl.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/tree.h" + +#include "test/crimson/gtest_seastar.h" + +// TODO: use assert instead of logging + +using namespace crimson::os::seastore::onode; + +namespace { + [[maybe_unused]] seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_test); + } + + // return a key_view_t and its underlying memory buffer. + // the buffer needs to be freed manually. + std::pair build_key_view(const onode_key_t& hobj) { + key_hobj_t key_hobj(hobj); + size_t key_size = sizeof(shard_pool_crush_t) + sizeof(snap_gen_t) + + ns_oid_view_t::estimate_size(key_hobj); + void* p_mem = std::malloc(key_size); + + key_view_t key_view; + char* p_fill = (char*)p_mem + key_size; + + auto spc = shard_pool_crush_t::from_key(key_hobj); + p_fill -= sizeof(shard_pool_crush_t); + std::memcpy(p_fill, &spc, sizeof(shard_pool_crush_t)); + key_view.set(*reinterpret_cast(p_fill)); + + auto p_ns_oid = p_fill; + ns_oid_view_t::append(key_hobj, p_fill); + ns_oid_view_t ns_oid_view(p_ns_oid); + key_view.set(ns_oid_view); + + auto sg = snap_gen_t::from_key(key_hobj); + p_fill -= sizeof(snap_gen_t); + assert(p_fill == (char*)p_mem); + std::memcpy(p_fill, &sg, sizeof(snap_gen_t)); + key_view.set(*reinterpret_cast(p_fill)); + + return {key_view, p_mem}; + } +} + +struct a_basic_test_t : public seastar_test_suite_t {}; + +TEST_F(a_basic_test_t, 1_basic_sizes) +{ + logger().info("\n" + "Bytes of struct:\n" + " node_header_t: {}\n" + " 
shard_pool_t: {}\n" + " shard_pool_crush_t: {}\n" + " crush_t: {}\n" + " snap_gen_t: {}\n" + " slot_0_t: {}\n" + " slot_1_t: {}\n" + " slot_3_t: {}\n" + " node_fields_0_t: {}\n" + " node_fields_1_t: {}\n" + " node_fields_2_t: {}\n" + " internal_fields_3_t: {}\n" + " leaf_fields_3_t: {}\n" + " internal_sub_item_t: {}", + sizeof(node_header_t), sizeof(shard_pool_t), + sizeof(shard_pool_crush_t), sizeof(crush_t), sizeof(snap_gen_t), + sizeof(slot_0_t), sizeof(slot_1_t), sizeof(slot_3_t), + sizeof(node_fields_0_t), sizeof(node_fields_1_t), sizeof(node_fields_2_t), + sizeof(internal_fields_3_t), sizeof(leaf_fields_3_t), sizeof(internal_sub_item_t) + ); + + onode_key_t hobj = {0, 0, 0, "n", "o", 0, 0}; + key_hobj_t key(hobj); + auto [key_view, p_mem] = build_key_view(hobj); + onode_t value = {2}; +#define _STAGE_T(NodeType) node_to_stage_t +#define NXT_T(StageType) staged + logger().info("\n" + "Bytes of a key-value insertion (full-string):\n" + " s-p-c, 'n'-'o', s-g => onode_t{2}: typically internal 41B, leaf 35B\n" + " InternalNode0: {} {} {}\n" + " InternalNode1: {} {} {}\n" + " InternalNode2: {} {}\n" + " InternalNode3: {}\n" + " LeafNode0: {} {} {}\n" + " LeafNode1: {} {} {}\n" + " LeafNode2: {} {}\n" + " LeafNode3: {}", + _STAGE_T(InternalNode0)::template insert_size(key_view, 0), + NXT_T(_STAGE_T(InternalNode0))::template insert_size(key_view, 0), + NXT_T(NXT_T(_STAGE_T(InternalNode0)))::template insert_size(key_view, 0), + _STAGE_T(InternalNode1)::template insert_size(key_view, 0), + NXT_T(_STAGE_T(InternalNode1))::template insert_size(key_view, 0), + NXT_T(NXT_T(_STAGE_T(InternalNode1)))::template insert_size(key_view, 0), + _STAGE_T(InternalNode2)::template insert_size(key_view, 0), + NXT_T(_STAGE_T(InternalNode2))::template insert_size(key_view, 0), + _STAGE_T(InternalNode3)::template insert_size(key_view, 0), + _STAGE_T(LeafNode0)::template insert_size(key, value), + NXT_T(_STAGE_T(LeafNode0))::template insert_size(key, value), + 
NXT_T(NXT_T(_STAGE_T(LeafNode0)))::template insert_size(key, value), + _STAGE_T(LeafNode1)::template insert_size(key, value), + NXT_T(_STAGE_T(LeafNode1))::template insert_size(key, value), + NXT_T(NXT_T(_STAGE_T(LeafNode1)))::template insert_size(key, value), + _STAGE_T(LeafNode2)::template insert_size(key, value), + NXT_T(_STAGE_T(LeafNode2))::template insert_size(key, value), + _STAGE_T(LeafNode3)::template insert_size(key, value) + ); + std::free(p_mem); +} + +TEST_F(a_basic_test_t, 2_node_sizes) +{ + run_async([this] { + auto nm = NodeExtentManager::create_dummy(); + auto t = make_transaction(); + context_t c{*nm, *t}; + std::vector, NodeExtentMutable>> nodes = { + InternalNode0::allocate(c, 1u, false).unsafe_get0().make_pair(), + InternalNode1::allocate(c, 1u, false).unsafe_get0().make_pair(), + InternalNode2::allocate(c, 1u, false).unsafe_get0().make_pair(), + InternalNode3::allocate(c, 1u, false).unsafe_get0().make_pair(), + InternalNode0::allocate(c, 1u, true).unsafe_get0().make_pair(), + InternalNode1::allocate(c, 1u, true).unsafe_get0().make_pair(), + InternalNode2::allocate(c, 1u, true).unsafe_get0().make_pair(), + InternalNode3::allocate(c, 1u, true).unsafe_get0().make_pair(), + LeafNode0::allocate(c, false).unsafe_get0().make_pair(), + LeafNode1::allocate(c, false).unsafe_get0().make_pair(), + LeafNode2::allocate(c, false).unsafe_get0().make_pair(), + LeafNode3::allocate(c, false).unsafe_get0().make_pair(), + LeafNode0::allocate(c, true).unsafe_get0().make_pair(), + LeafNode1::allocate(c, true).unsafe_get0().make_pair(), + LeafNode2::allocate(c, true).unsafe_get0().make_pair(), + LeafNode3::allocate(c, true).unsafe_get0().make_pair() + }; + std::ostringstream oss; + oss << "\nallocated nodes:"; + auto node_tracker = RootNodeTracker::create(c.nm.is_read_isolated()); + assert(node_tracker->is_clean()); + for (auto iter = nodes.begin(); iter != nodes.end(); ++iter) { + auto& ref_node = iter->first; + auto& mut = iter->second; + oss << "\n " << *ref_node; 
+ ref_node->test_make_destructable( + c, mut, c.nm.get_super(c.t, *node_tracker).unsafe_get0()); + assert(!node_tracker->is_clean()); + iter->first.reset(); + assert(node_tracker->is_clean()); + } + logger().info("{}", oss.str()); + }); +} + +class Onodes { + public: + Onodes(size_t n) { + for (size_t i = 1; i <= n; ++i) { + auto p_onode = &create(i * 8); + onodes.push_back(p_onode); + } + } + + ~Onodes() { + std::for_each(tracked_onodes.begin(), tracked_onodes.end(), + [] (onode_t* onode) { + std::free(onode); + }); + } + + const onode_t& create(size_t size) { + assert(size >= sizeof(onode_t) + sizeof(uint32_t)); + uint32_t target = size * 137; + auto p_mem = (char*)std::malloc(size); + auto p_onode = (onode_t*)p_mem; + tracked_onodes.push_back(p_onode); + p_onode->size = size; + p_onode->id = id++; + p_mem += (size - sizeof(uint32_t)); + std::memcpy(p_mem, &target, sizeof(uint32_t)); + validate(*p_onode); + return *p_onode; + } + + const onode_t& pick() const { + auto index = rd() % onodes.size(); + return *onodes[index]; + } + + const onode_t& pick_largest() const { + return *onodes[onodes.size() - 1]; + } + + static void validate_cursor( + const Btree::Cursor& cursor, const onode_t& onode) { + assert(!cursor.is_end()); + assert(cursor.value()); + assert(cursor.value() != &onode); + assert(*cursor.value() == onode); + validate(*cursor.value()); + } + + private: + static void validate(const onode_t& node) { + auto p_target = (const char*)&node + node.size - sizeof(uint32_t); + uint32_t target; + std::memcpy(&target, p_target, sizeof(uint32_t)); + assert(target == node.size * 137); + } + + uint16_t id = 0; + mutable std::random_device rd; + std::vector onodes; + std::vector tracked_onodes; +}; + +struct b_dummy_tree_test_t : public seastar_test_suite_t { + NodeExtentManagerURef moved_nm; + TransactionRef ref_t; + Transaction& t; + context_t c; + Btree tree; + + b_dummy_tree_test_t() + : moved_nm{NodeExtentManager::create_dummy()}, + ref_t{make_transaction()}, + 
t{*ref_t}, + c{*moved_nm, t}, + tree{std::move(moved_nm)} {} + + seastar::future<> set_up_fut() override final { + return tree.mkfs(t).handle_error( + crimson::ct_error::all_same_way([] { + ASSERT_FALSE("Unable to mkfs"); + }) + ); + } +}; + +TEST_F(b_dummy_tree_test_t, 3_random_insert_leaf_node) +{ + run_async([this] { + logger().info("\n---------------------------------------------" + "\nrandomized leaf node insert:\n"); + auto key_s = onode_key_t{ + 0, 0, 0, "ns", "oid", 0, 0 + }; + auto key_e = onode_key_t{ + std::numeric_limits::max(), 0, 0, "ns", "oid", 0, 0 + }; + assert(tree.find(t, key_s).unsafe_get0().is_end()); + assert(tree.begin(t).unsafe_get0().is_end()); + assert(tree.last(t).unsafe_get0().is_end()); + + std::vector> insert_history; + auto f_validate_insert_new = [this, &insert_history] ( + const onode_key_t& key, const onode_t& value) { + auto [cursor, success] = tree.insert(t, key, value).unsafe_get0(); + assert(success == true); + insert_history.emplace_back(key, &value, cursor); + Onodes::validate_cursor(cursor, value); + auto cursor_ = tree.lower_bound(t, key).unsafe_get0(); + assert(cursor_.value() == cursor.value()); + return cursor.value(); + }; + auto onodes = Onodes(15); + + // insert key1, onode1 at STAGE_LEFT + auto key1 = onode_key_t{3, 3, 3, "ns3", "oid3", 3, 3}; + auto& onode1 = onodes.pick(); + auto p_value1 = f_validate_insert_new(key1, onode1); + + // validate lookup + { + auto cursor1_s = tree.lower_bound(t, key_s).unsafe_get0(); + assert(cursor1_s.value() == p_value1); + auto cursor1_e = tree.lower_bound(t, key_e).unsafe_get0(); + assert(cursor1_e.is_end()); + } + + // insert the same key1 with a different onode + { + auto& onode1_dup = onodes.pick(); + auto [cursor1_dup, ret1_dup] = tree.insert( + t, key1, onode1_dup).unsafe_get0(); + assert(ret1_dup == false); + Onodes::validate_cursor(cursor1_dup, onode1); + } + + // insert key2, onode2 to key1's left at STAGE_LEFT + // insert node front at STAGE_LEFT + auto key2 = 
onode_key_t{2, 2, 2, "ns3", "oid3", 3, 3}; + auto& onode2 = onodes.pick(); + f_validate_insert_new(key2, onode2); + + // insert key3, onode3 to key1's right at STAGE_LEFT + // insert node last at STAGE_LEFT + auto key3 = onode_key_t{4, 4, 4, "ns3", "oid3", 3, 3}; + auto& onode3 = onodes.pick(); + f_validate_insert_new(key3, onode3); + + // insert key4, onode4 to key1's left at STAGE_STRING (collision) + auto key4 = onode_key_t{3, 3, 3, "ns2", "oid2", 3, 3}; + auto& onode4 = onodes.pick(); + f_validate_insert_new(key4, onode4); + + // insert key5, onode5 to key1's right at STAGE_STRING (collision) + auto key5 = onode_key_t{3, 3, 3, "ns4", "oid4", 3, 3}; + auto& onode5 = onodes.pick(); + f_validate_insert_new(key5, onode5); + + // insert key6, onode6 to key1's left at STAGE_RIGHT + auto key6 = onode_key_t{3, 3, 3, "ns3", "oid3", 2, 2}; + auto& onode6 = onodes.pick(); + f_validate_insert_new(key6, onode6); + + // insert key7, onode7 to key1's right at STAGE_RIGHT + auto key7 = onode_key_t{3, 3, 3, "ns3", "oid3", 4, 4}; + auto& onode7 = onodes.pick(); + f_validate_insert_new(key7, onode7); + + // insert node front at STAGE_RIGHT + auto key8 = onode_key_t{2, 2, 2, "ns3", "oid3", 2, 2}; + auto& onode8 = onodes.pick(); + f_validate_insert_new(key8, onode8); + + // insert node front at STAGE_STRING (collision) + auto key9 = onode_key_t{2, 2, 2, "ns2", "oid2", 3, 3}; + auto& onode9 = onodes.pick(); + f_validate_insert_new(key9, onode9); + + // insert node last at STAGE_RIGHT + auto key10 = onode_key_t{4, 4, 4, "ns3", "oid3", 4, 4}; + auto& onode10 = onodes.pick(); + f_validate_insert_new(key10, onode10); + + // insert node last at STAGE_STRING (collision) + auto key11 = onode_key_t{4, 4, 4, "ns4", "oid4", 3, 3}; + auto& onode11 = onodes.pick(); + f_validate_insert_new(key11, onode11); + + // insert key, value randomly until a perfect 3-ary tree is formed + std::vector> kvs{ + {onode_key_t{2, 2, 2, "ns2", "oid2", 2, 2}, &onodes.pick()}, + {onode_key_t{2, 2, 2, "ns2", "oid2", 
4, 4}, &onodes.pick()}, + {onode_key_t{2, 2, 2, "ns3", "oid3", 4, 4}, &onodes.pick()}, + {onode_key_t{2, 2, 2, "ns4", "oid4", 2, 2}, &onodes.pick()}, + {onode_key_t{2, 2, 2, "ns4", "oid4", 3, 3}, &onodes.pick()}, + {onode_key_t{2, 2, 2, "ns4", "oid4", 4, 4}, &onodes.pick()}, + {onode_key_t{3, 3, 3, "ns2", "oid2", 2, 2}, &onodes.pick()}, + {onode_key_t{3, 3, 3, "ns2", "oid2", 4, 4}, &onodes.pick()}, + {onode_key_t{3, 3, 3, "ns4", "oid4", 2, 2}, &onodes.pick()}, + {onode_key_t{3, 3, 3, "ns4", "oid4", 4, 4}, &onodes.pick()}, + {onode_key_t{4, 4, 4, "ns2", "oid2", 2, 2}, &onodes.pick()}, + {onode_key_t{4, 4, 4, "ns2", "oid2", 3, 3}, &onodes.pick()}, + {onode_key_t{4, 4, 4, "ns2", "oid2", 4, 4}, &onodes.pick()}, + {onode_key_t{4, 4, 4, "ns3", "oid3", 2, 2}, &onodes.pick()}, + {onode_key_t{4, 4, 4, "ns4", "oid4", 2, 2}, &onodes.pick()}, + {onode_key_t{4, 4, 4, "ns4", "oid4", 4, 4}, &onodes.pick()}}; + auto& smallest_value = *kvs[0].second; + auto& largest_value = *kvs[kvs.size() - 1].second; + std::random_shuffle(kvs.begin(), kvs.end()); + std::for_each(kvs.begin(), kvs.end(), [&f_validate_insert_new] (auto& kv) { + f_validate_insert_new(kv.first, *kv.second); + }); + assert(tree.height(t).unsafe_get0() == 1); + assert(!tree.test_is_clean()); + + for (auto& [k, v, c] : insert_history) { + // validate values in tree keep intact + auto cursor = tree.lower_bound(t, k).unsafe_get0(); + Onodes::validate_cursor(cursor, *v); + // validate values in cursors keep intact + Onodes::validate_cursor(c, *v); + } + Onodes::validate_cursor( + tree.lower_bound(t, key_s).unsafe_get0(), smallest_value); + Onodes::validate_cursor( + tree.begin(t).unsafe_get0(), smallest_value); + Onodes::validate_cursor( + tree.last(t).unsafe_get0(), largest_value); + + std::ostringstream oss; + tree.dump(t, oss); + logger().info("\n{}\n", oss.str()); + + insert_history.clear(); + }); +} + +static std::set build_key_set( + std::pair range_2, + std::pair range_1, + std::pair range_0, + std::string padding = 
"", + bool is_internal = false) { + std::set ret; + onode_key_t key; + for (unsigned i = range_2.first; i < range_2.second; ++i) { + for (unsigned j = range_1.first; j < range_1.second; ++j) { + for (unsigned k = range_0.first; k < range_0.second; ++k) { + key.shard = i; + key.pool = i; + key.crush = i; + std::ostringstream os_ns; + os_ns << "ns" << j; + key.nspace = os_ns.str(); + std::ostringstream os_oid; + os_oid << "oid" << j << padding; + key.oid = os_oid.str(); + key.snap = k; + key.gen = k; + ret.insert(key); + } + } + } + if (is_internal) { + ret.insert(onode_key_t{9, 9, 9, "ns~last", "oid~last", 9, 9}); + } + return ret; +} + +TEST_F(b_dummy_tree_test_t, 4_split_leaf_node) +{ + run_async([this] { + logger().info("\n---------------------------------------------" + "\nbefore leaf node split:\n"); + auto keys = build_key_set({2, 5}, {2, 5}, {2, 5}); + std::vector> insert_history; + auto onodes = Onodes(0); + for (auto& key : keys) { + auto& value = onodes.create(120); + auto [cursor, success] = tree.insert(t, key, value).unsafe_get0(); + assert(success == true); + Onodes::validate_cursor(cursor, value); + insert_history.emplace_back(key, &value, cursor); + } + assert(tree.height(t).unsafe_get0() == 1); + assert(!tree.test_is_clean()); + std::ostringstream oss; + tree.dump(t, oss); + logger().info("\n{}\n", oss.str()); + + auto f_split = [this, &insert_history] ( + const onode_key_t& key, const onode_t& value) { + Btree tree_clone(NodeExtentManager::create_dummy()); + auto ref_t_clone = make_transaction(); + Transaction& t_clone = *ref_t_clone; + tree_clone.test_clone_from(t_clone, t, tree).unsafe_get0(); + + logger().info("insert {}:", key); + auto [cursor, success] = tree_clone.insert(t_clone, key, value).unsafe_get0(); + assert(success == true); + Onodes::validate_cursor(cursor, value); + + std::ostringstream oss; + tree_clone.dump(t_clone, oss); + logger().info("\n{}\n", oss.str()); + assert(tree_clone.height(t_clone).unsafe_get0() == 2); + + for (auto& 
[k, v, c] : insert_history) { + auto result = tree_clone.lower_bound(t_clone, k).unsafe_get0(); + Onodes::validate_cursor(result, *v); + } + auto result = tree_clone.lower_bound(t_clone, key).unsafe_get0(); + Onodes::validate_cursor(result, value); + }; + + auto& onode = onodes.create(1280); + logger().info("\n---------------------------------------------" + "\nsplit at stage 2; insert to left front at stage 2, 1, 0\n"); + f_split(onode_key_t{1, 1, 1, "ns3", "oid3", 3, 3}, onode); + f_split(onode_key_t{2, 2, 2, "ns1", "oid1", 3, 3}, onode); + f_split(onode_key_t{2, 2, 2, "ns2", "oid2", 1, 1}, onode); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 2; insert to left back at stage 0, 1, 2, 1, 0\n"); + f_split(onode_key_t{2, 2, 2, "ns4", "oid4", 5, 5}, onode); + f_split(onode_key_t{2, 2, 2, "ns5", "oid5", 3, 3}, onode); + f_split(onode_key_t{2, 3, 3, "ns3", "oid3", 3, 3}, onode); + f_split(onode_key_t{3, 3, 3, "ns1", "oid1", 3, 3}, onode); + f_split(onode_key_t{3, 3, 3, "ns2", "oid2", 1, 1}, onode); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 2; insert to right front at stage 0, 1, 2, 1, 0\n"); + f_split(onode_key_t{3, 3, 3, "ns4", "oid4", 5, 5}, onode); + f_split(onode_key_t{3, 3, 3, "ns5", "oid5", 3, 3}, onode); + f_split(onode_key_t{3, 4, 4, "ns3", "oid3", 3, 3}, onode); + f_split(onode_key_t{4, 4, 4, "ns1", "oid1", 3, 3}, onode); + f_split(onode_key_t{4, 4, 4, "ns2", "oid2", 1, 1}, onode); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 2; insert to right back at stage 0, 1, 2\n"); + f_split(onode_key_t{4, 4, 4, "ns4", "oid4", 5, 5}, onode); + f_split(onode_key_t{4, 4, 4, "ns5", "oid5", 3, 3}, onode); + f_split(onode_key_t{5, 5, 5, "ns3", "oid3", 3, 3}, onode); + + auto& onode1 = onodes.create(512); + logger().info("\n---------------------------------------------" + "\nsplit at stage 1; insert to left middle at stage 0, 1, 2, 1, 0\n"); + 
f_split(onode_key_t{2, 2, 2, "ns4", "oid4", 5, 5}, onode1); + f_split(onode_key_t{2, 2, 2, "ns5", "oid5", 3, 3}, onode1); + f_split(onode_key_t{2, 2, 3, "ns3", "oid3", 3, 3}, onode1); + f_split(onode_key_t{3, 3, 3, "ns1", "oid1", 3, 3}, onode1); + f_split(onode_key_t{3, 3, 3, "ns2", "oid2", 1, 1}, onode1); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 1; insert to left back at stage 0, 1, 0\n"); + f_split(onode_key_t{3, 3, 3, "ns2", "oid2", 5, 5}, onode1); + f_split(onode_key_t{3, 3, 3, "ns2", "oid3", 3, 3}, onode1); + f_split(onode_key_t{3, 3, 3, "ns3", "oid3", 1, 1}, onode1); + + auto& onode2 = onodes.create(256); + logger().info("\n---------------------------------------------" + "\nsplit at stage 1; insert to right front at stage 0, 1, 0\n"); + f_split(onode_key_t{3, 3, 3, "ns3", "oid3", 5, 5}, onode2); + f_split(onode_key_t{3, 3, 3, "ns3", "oid4", 3, 3}, onode2); + f_split(onode_key_t{3, 3, 3, "ns4", "oid4", 1, 1}, onode2); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 1; insert to right middle at stage 0, 1, 2, 1, 0\n"); + f_split(onode_key_t{3, 3, 3, "ns4", "oid4", 5, 5}, onode2); + f_split(onode_key_t{3, 3, 3, "ns5", "oid5", 3, 3}, onode2); + f_split(onode_key_t{3, 3, 4, "ns3", "oid3", 3, 3}, onode2); + f_split(onode_key_t{4, 4, 4, "ns1", "oid1", 3, 3}, onode2); + f_split(onode_key_t{4, 4, 4, "ns2", "oid2", 1, 1}, onode2); + + auto& onode3 = onodes.create(768); + logger().info("\n---------------------------------------------" + "\nsplit at stage 0; insert to right middle at stage 0, 1, 2, 1, 0\n"); + f_split(onode_key_t{3, 3, 3, "ns4", "oid4", 5, 5}, onode3); + f_split(onode_key_t{3, 3, 3, "ns5", "oid5", 3, 3}, onode3); + f_split(onode_key_t{3, 3, 4, "ns3", "oid3", 3, 3}, onode3); + f_split(onode_key_t{4, 4, 4, "ns1", "oid1", 3, 3}, onode3); + f_split(onode_key_t{4, 4, 4, "ns2", "oid2", 1, 1}, onode3); + + logger().info("\n---------------------------------------------" + 
"\nsplit at stage 0; insert to right front at stage 0\n"); + f_split(onode_key_t{3, 3, 3, "ns4", "oid4", 2, 3}, onode3); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 0; insert to left back at stage 0\n"); + f_split(onode_key_t{3, 3, 3, "ns2", "oid2", 3, 4}, onode3); + + // TODO: test split at {0, 0, 0} + // TODO: test split at {END, END, END} + }); +} + +namespace crimson::os::seastore::onode { + +class DummyChildPool { + class DummyChild final : public Node { + public: + virtual ~DummyChild() { + std::free(p_mem_key_view); + } + + node_future<> populate_split( + context_t c, std::set>& splitable_nodes) { + assert(can_split()); + assert(splitable_nodes.find(this) != splitable_nodes.end()); + + size_t index; + if (keys.size() == 2) { + index = 1; + } else { + index = rd() % (keys.size() - 2) + 1; + } + auto iter = keys.begin(); + std::advance(iter, index); + + std::set left_keys(keys.begin(), iter); + std::set right_keys(iter, keys.end()); + bool right_is_tail = _is_level_tail; + reset(left_keys, false); + auto right_child = DummyChild::create(right_keys, right_is_tail, pool); + if (!can_split()) { + splitable_nodes.erase(this); + } + if (right_child->can_split()) { + splitable_nodes.insert(right_child); + } + return this->insert_parent(c, right_child); + } + + node_future<> insert_and_split( + context_t c, const onode_key_t& insert_key, + std::set>& splitable_nodes) { + assert(keys.size() == 1); + auto& key = *keys.begin(); + assert(insert_key < key); + + std::set new_keys; + new_keys.insert(insert_key); + new_keys.insert(key); + reset(new_keys, _is_level_tail); + + splitable_nodes.clear(); + splitable_nodes.insert(this); + auto fut = populate_split(c, splitable_nodes); + assert(!splitable_nodes.size()); + return fut; + } + + bool match_pos(const search_position_t& pos) const { + assert(!is_root()); + return pos == parent_info().position; + } + + static Ref create( + const std::set& keys, bool is_level_tail, 
DummyChildPool& pool) { + static laddr_t seed = 0; + return new DummyChild(keys, is_level_tail, seed++, pool); + } + + static node_future> create_initial( + context_t c, const std::set& keys, + DummyChildPool& pool, RootNodeTracker& root_tracker) { + auto initial = create(keys, true, pool); + return c.nm.get_super(c.t, root_tracker + ).safe_then([c, &pool, initial](auto super) { + initial->make_root_new(c, std::move(super)); + return initial->upgrade_root(c).safe_then([initial] { + return initial; + }); + }); + } + + static Ref create_clone( + const std::set& keys, bool is_level_tail, + laddr_t addr, DummyChildPool& pool) { + return new DummyChild(keys, is_level_tail, addr, pool); + } + + protected: + bool is_level_tail() const override { return _is_level_tail; } + field_type_t field_type() const override { return field_type_t::N0; } + laddr_t laddr() const override { return _laddr; } + level_t level() const override { return 0u; } + key_view_t get_key_view(const search_position_t&) const override { + assert(false && "impossible path"); } + key_view_t get_largest_key_view() const override { return key_view; } + std::ostream& dump(std::ostream&) const override { + assert(false && "impossible path"); } + std::ostream& dump_brief(std::ostream&) const override { + assert(false && "impossible path"); } + node_future do_lower_bound( + context_t, const key_hobj_t&, MatchHistory&) override { + assert(false && "impossible path"); } + node_future> lookup_smallest(context_t) override { + assert(false && "impossible path"); } + node_future> lookup_largest(context_t) override { + assert(false && "impossible path"); } + void test_make_destructable( + context_t, NodeExtentMutable&, Super::URef&&) override { + assert(false && "impossible path"); } + node_future<> test_clone_non_root( + context_t, Ref new_parent) const override { + assert(!is_root()); + auto p_pool_clone = pool.pool_clone_in_progress; + assert(p_pool_clone); + auto clone = create_clone(keys, _is_level_tail, _laddr, 
*p_pool_clone); + clone->as_child(parent_info().position, new_parent); + clone->_laddr = _laddr; + return node_ertr::now(); + } + + private: + DummyChild(const std::set& keys, + bool is_level_tail, laddr_t laddr, DummyChildPool& pool) + : keys{keys}, _is_level_tail{is_level_tail}, _laddr{laddr}, pool{pool} { + std::tie(key_view, p_mem_key_view) = build_key_view(*keys.crbegin()); + pool.track_node(this); + } + + bool can_split() const { return keys.size() > 1; } + + void reset(const std::set& _keys, bool level_tail) { + keys = _keys; + _is_level_tail = level_tail; + std::free(p_mem_key_view); + std::tie(key_view, p_mem_key_view) = build_key_view(*keys.crbegin()); + } + + mutable std::random_device rd; + std::set keys; + bool _is_level_tail; + laddr_t _laddr; + DummyChildPool& pool; + + key_view_t key_view; + void* p_mem_key_view; + }; + + public: + using node_ertr = Node::node_ertr; + template + using node_future = Node::node_future; + + DummyChildPool() = default; + ~DummyChildPool() { reset(); } + + node_future<> build_tree(const std::set& keys) { + reset(); + + // create tree + auto ref_nm = NodeExtentManager::create_dummy(); + p_nm = ref_nm.get(); + p_btree.emplace(std::move(ref_nm)); + return DummyChild::create_initial(get_context(), keys, *this, *p_btree->root_tracker + ).safe_then([this](auto initial_child) { + // split + splitable_nodes.insert(initial_child); + return crimson::do_until([this] { + if (splitable_nodes.empty()) { + return node_ertr::make_ready_future(true); + } + auto index = rd() % splitable_nodes.size(); + auto iter = splitable_nodes.begin(); + std::advance(iter, index); + Ref child = *iter; + return child->populate_split(get_context(), splitable_nodes + ).safe_then([] { + return node_ertr::make_ready_future(false); + }); + }); + }).safe_then([this] { + std::ostringstream oss; + p_btree->dump(t(), oss); + logger().info("\n{}\n", oss.str()); + return p_btree->height(t()); + }).safe_then([](auto height) { + assert(height == 2); + }); + } + + 
seastar::future<> test_split(onode_key_t key, search_position_t pos) { + return seastar::async([this, key, pos] { + logger().info("insert {} at {}:", key, pos); + DummyChildPool pool_clone; + pool_clone_in_progress = &pool_clone; + auto ref_nm = NodeExtentManager::create_dummy(); + pool_clone.p_nm = ref_nm.get(); + pool_clone.p_btree.emplace(std::move(ref_nm)); + pool_clone.p_btree->test_clone_from( + pool_clone.t(), t(), *p_btree).unsafe_get0(); + pool_clone_in_progress = nullptr; + auto node_to_split = pool_clone.get_node_by_pos(pos); + node_to_split->insert_and_split( + pool_clone.get_context(), key, pool_clone.splitable_nodes).unsafe_get0(); + std::ostringstream oss; + pool_clone.p_btree->dump(pool_clone.t(), oss); + logger().info("\n{}\n", oss.str()); + assert(pool_clone.p_btree->height(pool_clone.t()).unsafe_get0() == 3); + }); + } + + private: + void reset() { + assert(!pool_clone_in_progress); + if (tracked_children.size()) { + assert(!p_btree->test_is_clean()); + tracked_children.clear(); + assert(p_btree->test_is_clean()); + p_nm = nullptr; + p_btree.reset(); + } else { + assert(!p_btree); + } + splitable_nodes.clear(); + } + + void track_node(Ref node) { + assert(tracked_children.find(node) == tracked_children.end()); + tracked_children.insert(node); + } + + Ref get_node_by_pos(const search_position_t& pos) const { + auto iter = std::find_if( + tracked_children.begin(), tracked_children.end(), [&pos](auto& child) { + return child->match_pos(pos); + }); + assert(iter != tracked_children.end()); + return *iter; + } + + context_t get_context() { + assert(p_nm != nullptr); + return {*p_nm, t()}; + } + + Transaction& t() const { return *ref_t; } + + std::set> tracked_children; + std::optional p_btree; + NodeExtentManager* p_nm = nullptr; + TransactionRef ref_t = make_transaction(); + + std::random_device rd; + std::set> splitable_nodes; + + DummyChildPool* pool_clone_in_progress = nullptr; +}; + +} + +struct c_dummy_children_test_t : public 
seastar_test_suite_t {}; + +TEST_F(c_dummy_children_test_t, 5_split_internal_node) +{ + run_async([this] { + DummyChildPool pool; + { + logger().info("\n---------------------------------------------" + "\nbefore internal node insert:\n"); + auto padding = std::string(250, '_'); + auto keys = build_key_set({2, 6}, {2, 5}, {2, 5}, padding, true); + pool.build_tree(keys).unsafe_get0(); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 2; insert to right front at stage 0, 1, 2, 1, 0\n"); + pool.test_split(onode_key_t{3, 3, 3, "ns4", "oid4" + padding, 5, 5}, {2, {0, {0}}}).get(); + pool.test_split(onode_key_t{3, 3, 3, "ns5", "oid5", 3, 3}, {2, {0, {0}}}).get(); + pool.test_split(onode_key_t{3, 4, 4, "ns3", "oid3", 3, 3}, {2, {0, {0}}}).get(); + pool.test_split(onode_key_t{4, 4, 4, "ns1", "oid1", 3, 3}, {2, {0, {0}}}).get(); + pool.test_split(onode_key_t{4, 4, 4, "ns2", "oid2" + padding, 1, 1}, {2, {0, {0}}}).get(); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 2; insert to right middle at stage 0, 1, 2, 1, 0\n"); + pool.test_split(onode_key_t{4, 4, 4, "ns4", "oid4" + padding, 5, 5}, {3, {0, {0}}}).get(); + pool.test_split(onode_key_t{4, 4, 4, "ns5", "oid5", 3, 3}, {3, {0, {0}}}).get(); + pool.test_split(onode_key_t{4, 4, 5, "ns3", "oid3", 3, 3}, {3, {0, {0}}}).get(); + pool.test_split(onode_key_t{5, 5, 5, "ns1", "oid1", 3, 3}, {3, {0, {0}}}).get(); + pool.test_split(onode_key_t{5, 5, 5, "ns2", "oid2" + padding, 1, 1}, {3, {0, {0}}}).get(); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 2; insert to right back at stage 0, 1, 2\n"); + pool.test_split( + onode_key_t{5, 5, 5, "ns4", "oid4" + padding, 5, 5}, search_position_t::end()).get(); + pool.test_split(onode_key_t{5, 5, 5, "ns5", "oid5", 3, 3}, search_position_t::end()).get(); + pool.test_split(onode_key_t{6, 6, 6, "ns3", "oid3", 3, 3}, search_position_t::end()).get(); + + 
logger().info("\n---------------------------------------------" + "\nsplit at stage 0; insert to left front at stage 2, 1, 0\n"); + pool.test_split(onode_key_t{1, 1, 1, "ns3", "oid3", 3, 3}, {0, {0, {0}}}).get(); + pool.test_split(onode_key_t{2, 2, 2, "ns1", "oid1", 3, 3}, {0, {0, {0}}}).get(); + pool.test_split(onode_key_t{2, 2, 2, "ns2", "oid2" + padding, 1, 1}, {0, {0, {0}}}).get(); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 0/1; insert to left middle at stage 0, 1, 2, 1, 0\n"); + pool.test_split(onode_key_t{2, 2, 2, "ns4", "oid4" + padding, 5, 5}, {1, {0, {0}}}).get(); + pool.test_split(onode_key_t{2, 2, 2, "ns5", "oid5", 3, 3}, {1, {0, {0}}}).get(); + pool.test_split( + onode_key_t{2, 2, 3, "ns3", "oid3" + std::string(80, '_'), 3, 3}, {1, {0, {0}}}).get(); + pool.test_split(onode_key_t{3, 3, 3, "ns1", "oid1", 3, 3}, {1, {0, {0}}}).get(); + pool.test_split(onode_key_t{3, 3, 3, "ns2", "oid2" + padding, 1, 1}, {1, {0, {0}}}).get(); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 0; insert to left back at stage 0\n"); + pool.test_split(onode_key_t{3, 3, 3, "ns4", "oid4" + padding, 3, 4}, {1, {2, {2}}}).get(); + } + + { + logger().info("\n---------------------------------------------" + "\nbefore internal node insert (1):\n"); + auto padding = std::string(245, '_'); + auto keys = build_key_set({2, 6}, {2, 5}, {2, 5}, padding, true); + keys.insert(onode_key_t{5, 5, 5, "ns4", "oid4" + padding, 5, 5}); + keys.insert(onode_key_t{5, 5, 5, "ns4", "oid4" + padding, 6, 6}); + pool.build_tree(keys).unsafe_get0(); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 2; insert to left back at stage 0, 1, 2, 1\n"); + pool.test_split(onode_key_t{3, 3, 3, "ns4", "oid4" + padding, 5, 5}, {2, {0, {0}}}).get(); + pool.test_split(onode_key_t{3, 3, 3, "ns5", "oid5", 3, 3}, {2, {0, {0}}}).get(); + pool.test_split(onode_key_t{3, 4, 4, "n", "o", 3, 3}, {2, {0, 
{0}}}).get(); + pool.test_split(onode_key_t{4, 4, 4, "n", "o", 3, 3}, {2, {0, {0}}}).get(); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 2; insert to left middle at stage 2\n"); + pool.test_split(onode_key_t{2, 3, 3, "n", "o", 3, 3}, {1, {0, {0}}}).get(); + } + + { + logger().info("\n---------------------------------------------" + "\nbefore internal node insert (2):\n"); + auto padding = std::string(245, '_'); + auto keys = build_key_set({2, 6}, {2, 5}, {2, 5}, padding, true); + keys.insert(onode_key_t{4, 4, 4, "n", "o", 3, 3}); + keys.insert(onode_key_t{5, 5, 5, "ns4", "oid4" + padding, 5, 5}); + pool.build_tree(keys).unsafe_get0(); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 2; insert to left back at stage (0, 1, 2, 1,) 0\n"); + pool.test_split(onode_key_t{4, 4, 4, "n", "o", 2, 2}, {2, {0, {0}}}).get(); + } + + { + logger().info("\n---------------------------------------------" + "\nbefore internal node insert (3):\n"); + auto padding = std::string(417, '_'); + auto keys = build_key_set({2, 5}, {2, 5}, {2, 5}, padding, true); + keys.insert(onode_key_t{4, 4, 4, "ns3", "oid3" + padding, 5, 5}); + keys.erase(onode_key_t{4, 4, 4, "ns4", "oid4" + padding, 2, 2}); + keys.erase(onode_key_t{4, 4, 4, "ns4", "oid4" + padding, 3, 3}); + keys.erase(onode_key_t{4, 4, 4, "ns4", "oid4" + padding, 4, 4}); + pool.build_tree(keys).unsafe_get0(); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 1; insert to right front at stage 0, 1, 0\n"); + pool.test_split(onode_key_t{3, 3, 3, "ns2", "oid2" + padding, 5, 5}, {1, {1, {0}}}).get(); + pool.test_split(onode_key_t{3, 3, 3, "ns2", "oid3", 3, 3}, {1, {1, {0}}}).get(); + pool.test_split(onode_key_t{3, 3, 3, "ns3", "oid3" + padding, 1, 1}, {1, {1, {0}}}).get(); + } + + { + logger().info("\n---------------------------------------------" + "\nbefore internal node insert (4):\n"); + auto padding = std::string(360, 
'_'); + auto keys = build_key_set({2, 5}, {2, 5}, {2, 5}, padding, true); + keys.insert(onode_key_t{4, 4, 4, "ns4", "oid4" + padding, 5, 5}); + pool.build_tree(keys).unsafe_get0(); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 1; insert to left back at stage 0, 1\n"); + pool.test_split(onode_key_t{3, 3, 3, "ns2", "oid2" + padding, 5, 5}, {1, {1, {0}}}).get(); + pool.test_split(onode_key_t{3, 3, 3, "ns2", "oid3", 3, 3}, {1, {1, {0}}}).get(); + } + + { + logger().info("\n---------------------------------------------" + "\nbefore internal node insert (5):\n"); + auto padding = std::string(412, '_'); + auto keys = build_key_set({2, 5}, {2, 5}, {2, 5}, padding); + keys.insert(onode_key_t{3, 3, 3, "ns2", "oid3", 3, 3}); + keys.insert(onode_key_t{4, 4, 4, "ns3", "oid3" + padding, 5, 5}); + keys.insert(onode_key_t{9, 9, 9, "ns~last", "oid~last", 9, 9}); + keys.erase(onode_key_t{4, 4, 4, "ns4", "oid4" + padding, 2, 2}); + keys.erase(onode_key_t{4, 4, 4, "ns4", "oid4" + padding, 3, 3}); + keys.erase(onode_key_t{4, 4, 4, "ns4", "oid4" + padding, 4, 4}); + pool.build_tree(keys).unsafe_get0(); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 1; insert to left back at stage (0, 1,) 0\n"); + pool.test_split(onode_key_t{3, 3, 3, "ns2", "oid3", 2, 2}, {1, {1, {0}}}).get(); + } + + { + logger().info("\n---------------------------------------------" + "\nbefore internal node insert (6):\n"); + auto padding = std::string(328, '_'); + auto keys = build_key_set({2, 5}, {2, 5}, {2, 5}, padding); + keys.insert(onode_key_t{5, 5, 5, "ns3", "oid3" + std::string(271, '_'), 3, 3}); + keys.insert(onode_key_t{9, 9, 9, "ns~last", "oid~last", 9, 9}); + pool.build_tree(keys).unsafe_get0(); + + logger().info("\n---------------------------------------------" + "\nsplit at stage 0; insert to right front at stage 0\n"); + pool.test_split(onode_key_t{3, 3, 3, "ns3", "oid3" + padding, 2, 3}, {1, {1, {1}}}).get(); + } + + // 
TODO: test split at {0, 0, 0} + // TODO: test split at {END, END, END} + }); +}