From 1b832f4f28c424273244df15262adb8ddca587b9 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 13 Oct 2022 11:50:17 +0800 Subject: [PATCH] crimson/os/seastore/btree: search fixed-kv-btree by parent<->child pointers Signed-off-by: Xuehan Xu (cherry picked from commit 7c3305f0149808f800dc6c852d4f1f755a490ec4) --- .../os/seastore/btree/fixed_kv_btree.h | 217 ++++++++++++++---- src/crimson/os/seastore/btree/fixed_kv_node.h | 52 +++++ src/crimson/os/seastore/cache.h | 25 +- src/crimson/os/seastore/cached_extent.h | 75 ++++-- src/crimson/os/seastore/transaction.h | 11 +- 5 files changed, 305 insertions(+), 75 deletions(-) diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index 71e77a5ae97..ca6e9359e7a 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -219,6 +219,12 @@ public: typename NodeType::Ref node; uint16_t pos = INVALID; + node_position_t() = default; + node_position_t( + typename NodeType::Ref node, + uint16_t pos) + : node(node), pos(pos) {} + void reset() { *this = node_position_t{}; } @@ -1009,6 +1015,9 @@ private: phy_tree_root_t root; bool root_dirty = false; + template + using node_position_t = typename iterator::template node_position_t; + using get_internal_node_iertr = base_iertr; using get_internal_node_ret = get_internal_node_iertr::future; static get_internal_node_ret get_internal_node( @@ -1016,7 +1025,8 @@ private: depth_t depth, paddr_t offset, node_key_t begin, - node_key_t end) + node_key_t end, + typename std::optional> parent_pos) { LOG_PREFIX(FixedKVBtree::get_internal_node); SUBTRACET( @@ -1028,10 +1038,16 @@ private: begin, end); assert(depth > 1); - auto init_internal = [c, depth, begin, end](internal_node_t &node) { + auto init_internal = [c, depth, begin, end, + parent_pos=std::move(parent_pos)] + (internal_node_t &node) { assert(!node.is_pending()); assert(!node.pin.is_linked()); node.pin.set_range(fixed_kv_node_meta_t{begin, end, depth}); + if (parent_pos) { + auto &parent = parent_pos->node; + parent->link_child(&node, parent_pos->pos); + } if (c.pins) { c.pins->add_pin(node.pin); } @@ -1078,7 +1094,8 @@ private: op_context_t c, paddr_t offset, node_key_t begin, - node_key_t end) + node_key_t end, + typename std::optional> parent_pos) { LOG_PREFIX(FixedKVBtree::get_leaf_node); SUBTRACET( @@ -1088,10 +1105,16 @@ private: offset, begin, end); - auto init_leaf = [c, begin, end](leaf_node_t &node) { + auto init_leaf = [c, begin, end, + parent_pos=std::move(parent_pos)] + (leaf_node_t &node) { assert(!node.is_pending()); assert(!node.pin.is_linked()); node.pin.set_range(fixed_kv_node_meta_t{begin, end, 1}); + if (parent_pos) { + auto &parent = parent_pos->node; + parent->link_child(&node, parent_pos->pos); + } if (c.pins) { c.pins->add_pin(node.pin); } @@ -1143,7 +1166,8 @@ private: root.get_depth(), root.get_location(), min_max_t::min, - min_max_t::max + min_max_t::max, + std::nullopt ).si_then([this, visitor, &iter](InternalNodeRef root_node) { iter.get_internal(root.get_depth()).node = root_node; if (visitor) (*visitor)( @@ -1158,7 +1182,8 @@ private: c, root.get_location(), min_max_t::min, - min_max_t::max + min_max_t::max, + std::nullopt ).si_then([visitor, &iter, this](LeafNodeRef root_node) { iter.leaf.node = root_node; if (visitor) (*visitor)( @@ -1181,22 +1206,13 @@ private: F &f, mapped_space_visitor_t *visitor ) { + LOG_PREFIX(FixedKVBtree::lookup_internal_level); assert(depth > 1); auto &parent_entry = iter.get_internal(depth + 1); auto parent = parent_entry.node; auto node_iter = parent->iter_idx(parent_entry.pos); - auto next_iter = node_iter + 1; - auto begin = node_iter->get_key(); - auto end = next_iter == parent->end() - ? parent->get_node_meta().end - : next_iter->get_key(); - return get_internal_node( - c, - depth, - node_iter->get_val().maybe_relative_to(parent->get_paddr()), - begin, - end - ).si_then([depth, visitor, &iter, &f](InternalNodeRef node) { + + auto on_found = [depth, visitor, &iter, &f](InternalNodeRef node) { auto &entry = iter.get_internal(depth); entry.node = node; auto node_iter = f(*node); @@ -1209,6 +1225,50 @@ private: depth, node->get_type()); return seastar::now(); + }; + + auto child_pos = parent->get_child(c.trans, node_iter); + auto &child = child_pos.child; + if (child) { + SUBTRACET(seastore_fixedkv_tree, + "got child on {}, pos: {}, res: {}", + c.trans, + *parent_entry.node, + parent_entry.pos, + *child); + + ceph_assert(child->is_valid()); + if (!child->is_pending_in_trans(c.trans.get_trans_id())) { + c.trans.add_to_read_set(child); + if (!child->is_mutation_pending()) { + c.cache.touch_extent(*child); + } + } + return child->wait_io().then( + [child, on_found=std::move(on_found), node_iter]() mutable { + auto &cnode = (typename internal_node_t::base_t &)*child; + assert(cnode.get_node_meta().begin == node_iter.get_key()); + assert(cnode.get_node_meta().end > node_iter.get_key()); + return on_found(child->template cast()); + }); + } + + auto next_iter = node_iter + 1; + auto begin = node_iter->get_key(); + auto end = next_iter == parent->end() + ? parent->get_node_meta().end + : next_iter->get_key(); + return get_internal_node( + c, + depth, + node_iter->get_val().maybe_relative_to(parent->get_paddr()), + begin, + end, + std::make_optional>( + child_pos.stable_parent->template cast(), + child_pos.pos) + ).si_then([on_found=std::move(on_found)](InternalNodeRef node) { + return on_found(node); }); } @@ -1221,22 +1281,13 @@ private: F &f, mapped_space_visitor_t *visitor ) { + LOG_PREFIX(FixedKVBtree::lookup_leaf); auto &parent_entry = iter.get_internal(2); auto parent = parent_entry.node; assert(parent); auto node_iter = parent->iter_idx(parent_entry.pos); - auto next_iter = node_iter + 1; - auto begin = node_iter->get_key(); - auto end = next_iter == parent->end() - ? parent->get_node_meta().end - : next_iter->get_key(); - return get_leaf_node( - c, - node_iter->get_val().maybe_relative_to(parent->get_paddr()), - begin, - end - ).si_then([visitor, &iter, &f](LeafNodeRef node) { + auto on_found = [visitor, &iter, &f](LeafNodeRef node) { iter.leaf.node = node; auto node_iter = f(*node); iter.leaf.pos = node_iter->get_offset(); @@ -1247,6 +1298,50 @@ private: 1, node->get_type()); return seastar::now(); + }; + + auto child_pos = parent->get_child(c.trans, node_iter); + auto &child = child_pos.child; + if (child) { + SUBTRACET(seastore_fixedkv_tree, + "got child on {}, pos: {}, res: {}", + c.trans, + *parent_entry.node, + parent_entry.pos, + *child); + + ceph_assert(child->is_valid()); + if (!child->is_pending_in_trans(c.trans.get_trans_id())) { + c.trans.add_to_read_set(child); + if (!child->is_mutation_pending()) { + c.cache.touch_extent(*child); + } + } + return child->wait_io().then( + [child, on_found=std::move(on_found), node_iter]() mutable { + auto &cnode = (typename internal_node_t::base_t &)*child; + assert(cnode.get_node_meta().begin == node_iter.get_key()); + assert(cnode.get_node_meta().end > node_iter.get_key()); + return on_found(child->template cast()); + }); + } + + auto next_iter = node_iter + 1; + auto begin = node_iter->get_key(); + auto end = next_iter == parent->end() + ? parent->get_node_meta().end + : next_iter->get_key(); + + return get_leaf_node( + c, + node_iter->get_val().maybe_relative_to(parent->get_paddr()), + begin, + end, + std::make_optional>( + child_pos.stable_parent->template cast(), + child_pos.pos) + ).si_then([on_found=std::move(on_found)](LeafNodeRef node) { + return on_found(node); }); } @@ -1627,9 +1722,6 @@ private: }); } - template - using node_position_t = typename iterator::template node_position_t; - template , int> = 0> base_iertr::future get_node( @@ -1637,9 +1729,10 @@ private: depth_t depth, paddr_t addr, node_key_t begin, - node_key_t end) { + node_key_t end, + typename std::optional> parent_pos) { assert(depth == 1); - return get_leaf_node(c, addr, begin, end); + return get_leaf_node(c, addr, begin, end, std::move(parent_pos)); } template > parent_pos) { + return get_internal_node(c, depth, addr, begin, end, std::move(parent_pos)); } template @@ -1678,13 +1772,7 @@ private: : next_iter->get_key(); SUBTRACET(seastore_fixedkv_tree, "parent: {}, node: {}", c.trans, *parent_pos.node, *pos.node); - return get_node( - c, - depth, - donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()), - begin, - end - ).si_then([c, iter, donor_iter, donor_is_left, &parent_pos, &pos]( + auto do_merge = [c, iter, donor_iter, donor_is_left, &parent_pos, &pos]( typename NodeType::Ref donor) { LOG_PREFIX(FixedKVBtree::merge_level); auto [l, r] = donor_is_left ? @@ -1756,6 +1844,49 @@ private: } return seastar::now(); + }; + + auto child_pos = parent_pos.node->get_child(c.trans, donor_iter); + auto &child = child_pos.child; + if (child) { + SUBTRACET(seastore_fixedkv_tree, + "got child on {}, pos: {}, res: {}", + c.trans, + *parent_pos.node, + donor_iter.get_offset(), + *child); + + ceph_assert(child->is_valid()); + if (!child->is_pending_in_trans(c.trans.get_trans_id())) { + c.trans.add_to_read_set(child); + if (!child->is_mutation_pending()) { + c.cache.touch_extent(*child); + } + } + return child->wait_io().then( + [child, do_merge=std::move(do_merge), &pos, + donor_iter, donor_is_left]() mutable { + auto &node = (typename internal_node_t::base_t&)*child; + assert(donor_is_left ? + node.get_node_meta().end == pos.node->get_node_meta().begin : + node.get_node_meta().begin == pos.node->get_node_meta().end); + assert(node.get_node_meta().begin == donor_iter.get_key()); + assert(node.get_node_meta().end > donor_iter.get_key()); + return do_merge(child->template cast()); + }); + } + + return get_node( + c, + depth, + donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()), + begin, + end, + std::make_optional>( + child_pos.stable_parent->template cast(), + child_pos.pos) + ).si_then([do_merge=std::move(do_merge)](typename NodeType::Ref donor) { + return do_merge(donor); }); } }; diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index 101ad4945d3..b171db6965b 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -216,6 +216,47 @@ struct FixedKVNode : CachedExtent { } } + struct child_pos_t { + FixedKVNodeRef stable_parent; + uint16_t pos = std::numeric_limits::max(); + CachedExtentRef child; + child_pos_t(CachedExtentRef child) : child(child) {} + child_pos_t(FixedKVNodeRef stable_parent, uint16_t pos) + : stable_parent(stable_parent), pos(pos) {} + }; + + void link_child(FixedKVNode* child, uint16_t pos) { + assert(pos < get_node_size()); + assert(child); + ceph_assert(!is_pending()); + ceph_assert(child->is_valid() && !child->is_pending()); + assert(!children[pos]); + children[pos] = child; + set_child_ptracker(child); + } + + template + child_pos_t get_child(Transaction &t, iter_t iter) { + auto pos = iter.get_offset(); + assert(children.capacity()); + auto child = children[pos]; + if (child) { + return child_pos_t(child->get_transactional_view(t)); + } else if (is_pending()) { + auto key = iter.get_key(); + auto &sparent = get_stable_for_key(key); + auto spos = sparent.child_pos_for_key(key); + auto child = sparent.children[spos]; + if (child) { + return child_pos_t(child->get_transactional_view(t)); + } else { + return child_pos_t(&sparent, spos); + } + } else { + return child_pos_t(this, pos); + } + } + void split_child_ptrs( FixedKVNode &left, FixedKVNode &right) @@ -585,6 +626,13 @@ struct FixedKVInternalNode return this->upper_bound(key).get_offset(); } + uint16_t child_pos_for_key(NODE_KEY key) const final { + auto it = this->upper_bound(key); + assert(it != this->begin()); + --it; + return it.get_offset(); + } + NODE_KEY get_key_from_idx(uint16_t idx) const final { return this->iter_idx(idx).get_key(); } @@ -919,6 +967,10 @@ struct FixedKVLeafNode return this->upper_bound(key).get_offset(); } + uint16_t child_pos_for_key(NODE_KEY key) const final { + return lower_bound_offset(key); + } + NODE_KEY get_key_from_idx(uint16_t idx) const final { return this->iter_idx(idx).get_key(); } diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 3b90516e614..32ff392b62c 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -985,6 +985,19 @@ public: uint64_t get_omap_tree_depth() { return stats.omap_tree_depth; } + + /// Update lru for access to ref + void touch_extent( + CachedExtent &ext, + const Transaction::src_t* p_src=nullptr) + { + if (p_src && is_background_transaction(*p_src)) + return; + if (ext.is_clean() && !ext.is_placeholder()) { + lru.move_to_top(ext); + } + } + private: ExtentPlacementManager& epm; RootBlockRef root; ///< ref to current root @@ -1268,18 +1281,6 @@ private: return bp; } - /// Update lru for access to ref - void touch_extent( - CachedExtent &ext, - const Transaction::src_t* p_src=nullptr) - { - if (p_src && is_background_transaction(*p_src)) - return; - if (ext.is_clean() && !ext.is_placeholder()) { - lru.move_to_top(ext); - } - } - void backref_batch_update( std::vector &&, const journal_seq_t &); diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 70ebc394bd5..170a8137746 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -24,6 +24,15 @@ class SegmentedAllocator; class TransactionManager; class ExtentPlacementManager; +template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size> +class FixedKVBtree; + // #define DEBUG_CACHED_EXTENT_REF #ifdef DEBUG_CACHED_EXTENT_REF @@ -45,11 +54,14 @@ namespace onode { template class read_set_item_t { - boost::intrusive::list_member_hook<> list_hook; - using list_hook_options = boost::intrusive::member_hook< + using set_hook_t = boost::intrusive::set_member_hook< + boost::intrusive::link_mode< + boost::intrusive::auto_unlink>>; + set_hook_t trans_hook; + using set_hook_options = boost::intrusive::member_hook< read_set_item_t, - boost::intrusive::list_member_hook<>, - &read_set_item_t::list_hook>; + set_hook_t, + &read_set_item_t::trans_hook>; public: struct cmp_t { @@ -59,9 +71,29 @@ public: bool operator()(const read_set_item_t &lhs, const paddr_t &rhs) const; }; - using list = boost::intrusive::list< + struct trans_cmp_t { + bool operator()( + const read_set_item_t &lhs, + const read_set_item_t &rhs) const { + return lhs.t < rhs.t; + } + bool operator()( + const Transaction *lhs, + const read_set_item_t &rhs) const { + return lhs < rhs.t; + } + bool operator()( + const read_set_item_t &lhs, + const Transaction *rhs) const { + return lhs.t < rhs; + } + }; + + using trans_set_t = boost::intrusive::set< read_set_item_t, - list_hook_options>; + set_hook_options, + boost::intrusive::constant_time_size, + boost::intrusive::compare>; T *t = nullptr; CachedExtentRef ref; @@ -69,7 +101,7 @@ public: read_set_item_t(T *t, CachedExtentRef ref); read_set_item_t(const read_set_item_t &) = delete; read_set_item_t(read_set_item_t &&) = default; - ~read_set_item_t(); + ~read_set_item_t() = default; }; template using read_set_t = std::set< @@ -151,6 +183,14 @@ class CachedExtent friend class onode::DummyNodeExtent; friend class onode::TestReplayExtent; + template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size> + friend class FixedKVBtree; uint32_t last_committed_crc = 0; // Points at current version while in state MUTATION_PENDING @@ -530,6 +570,11 @@ private: return extent_index_hook.is_linked(); } + /// Returns true if the extent part of the open transaction + bool is_pending_in_trans(transaction_id_t id) const { + return is_pending() && pending_for_transaction == id; + } + /// hook for intrusive ref list (mainly dirty or lru list) boost::intrusive::list_member_hook<> primary_ref_list_hook; using primary_ref_list_member_options = boost::intrusive::member_hook< @@ -580,7 +625,9 @@ private: } } - read_set_item_t::list transactions; + CachedExtent* get_transactional_view(Transaction &t); + + read_set_item_t::trans_set_t transactions; placement_hint_t user_hint = PLACEMENT_HINT_NULL; @@ -611,8 +658,6 @@ protected: struct retired_placeholder_t{}; CachedExtent(retired_placeholder_t) : state(extent_state_t::INVALID) {} - CachedExtent& get_transactional_view(Transaction &t); - friend class Cache; template static TCachedExtentRef make_cached_extent_ref( @@ -1000,15 +1045,7 @@ struct ref_laddr_cmp { template read_set_item_t::read_set_item_t(T *t, CachedExtentRef ref) : t(t), ref(ref) -{ - ref->transactions.push_back(*this); -} - -template -read_set_item_t::~read_set_item_t() -{ - ref->transactions.erase(ref->transactions.s_iterator_to(*this)); -} +{} template inline bool read_set_item_t::cmp_t::operator()( diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 14ba61cee07..ed9d1d1a0c4 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -140,8 +140,16 @@ public: void add_to_read_set(CachedExtentRef ref) { if (is_weak()) return; + assert(ref->is_valid()); + + auto it = ref->transactions.lower_bound( + this, read_set_item_t::trans_cmp_t()); + if (it != ref->transactions.end() && it->t == this) return; + auto [iter, inserted] = read_set.emplace(this, ref); ceph_assert(inserted); + ref->transactions.insert_before( + it, const_cast&>(*iter)); } void add_fresh_extent( @@ -231,7 +239,8 @@ public: assert(where != read_set.end()); assert(where->ref.get() == &placeholder); where = read_set.erase(where); - read_set.emplace_hint(where, this, &extent); + auto it = read_set.emplace_hint(where, this, &extent); + extent.transactions.insert(const_cast&>(*it)); } { auto where = retired_set.find(&placeholder); -- 2.39.5