From 22bfce956b670df20f23d4395462dd2ab864180f Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Fri, 14 Feb 2025 15:48:01 +0800 Subject: [PATCH] crimson/os/seastore/omap_manager: apply linked_tree_node to omap trees Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/cached_extent.h | 5 +- src/crimson/os/seastore/linked_tree_node.h | 13 +- src/crimson/os/seastore/logical_child_node.h | 2 + .../omap_manager/btree/btree_omap_manager.cc | 36 ++- .../omap_manager/btree/btree_omap_manager.h | 9 - .../omap_manager/btree/omap_btree_node.h | 36 ++- .../btree/omap_btree_node_impl.cc | 271 ++++++++++++------ .../omap_manager/btree/omap_btree_node_impl.h | 225 +++++++++++++-- .../btree/string_kv_node_layout.h | 2 +- src/crimson/os/seastore/transaction_manager.h | 4 + 10 files changed, 482 insertions(+), 121 deletions(-) diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index d6324f2ff15a3..d7b47d3cae126 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -1388,12 +1388,15 @@ public: template LogicalCachedExtent(T&&... t) : CachedExtent(std::forward(t)...) {} - void on_rewrite(Transaction&, CachedExtent &extent, extent_len_t off) final { + void on_rewrite(Transaction &t, CachedExtent &extent, extent_len_t off) final { assert(get_type() == extent.get_type()); auto &lextent = (LogicalCachedExtent&)extent; set_laddr((lextent.get_laddr() + off).checked_to_laddr()); + do_on_rewrite(t, lextent); } + virtual void do_on_rewrite(Transaction &t, LogicalCachedExtent &extent) {} + bool has_laddr() const { return laddr != L_ADDR_NULL; } diff --git a/src/crimson/os/seastore/linked_tree_node.h b/src/crimson/os/seastore/linked_tree_node.h index b19c053cdbb38..572b7e1ccfcb5 100644 --- a/src/crimson/os/seastore/linked_tree_node.h +++ b/src/crimson/os/seastore/linked_tree_node.h @@ -149,7 +149,11 @@ private: friend class TreeRootLinker; }; -// The link from the ChildNodes to the ParentNodes +// The sharable linker from the ChildNodes to the same ParentNode +// +// The indirection of child.parent_tracker.parent is necessary +// because otherwise we'll need to update every child's parent +// upon commiting a mutated extent. template class parent_tracker_t : public boost::intrusive_ref_counter< @@ -400,6 +404,10 @@ protected: : children(capacity, nullptr) {} ParentNode(const ParentNode &rhs) : children(rhs.children.capacity(), nullptr) {} + void sync_children_capacity() { + auto &me = down_cast(); + maybe_expand_children(me.get_size()); + } void add_copy_dest(Transaction &t, TCachedExtentRef dest) { ceph_assert(down_cast().is_stable()); ceph_assert(dest->is_pending()); @@ -1000,6 +1008,9 @@ protected: void on_replace_prior() { take_parent_from_prior(); } + // destroy() is allowed to be skipped for the pending extents, + // because they will be destroyed altogether with their parents + // upon transaction invalidation. void destroy() { assert(!down_cast().is_btree_root()); assert(this->has_parent_tracker()); diff --git a/src/crimson/os/seastore/logical_child_node.h b/src/crimson/os/seastore/logical_child_node.h index 03f2b73f975c4..e1895ada8468c 100644 --- a/src/crimson/os/seastore/logical_child_node.h +++ b/src/crimson/os/seastore/logical_child_node.h @@ -42,7 +42,9 @@ public: protected: void on_replace_prior() final { child_node_t::on_replace_prior(); + do_on_replace_prior(); } + virtual void do_on_replace_prior() {} }; using LogicalChildNodeRef = TCachedExtentRef; } // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc index ae197097c6506..451e17bd8fdee 100644 --- a/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc +++ b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc @@ -26,6 +26,7 @@ BtreeOMapManager::initialize_omap(Transaction &t, laddr_t hint, return tm.alloc_non_data_extent(t, hint, get_leaf_size(type)) .si_then([hint, &t, type](auto&& root_extent) { root_extent->set_size(0); + root_extent->init_range(BEGIN_KEY, END_KEY); omap_node_meta_t meta{1}; root_extent->set_meta(meta); omap_root_t omap_root; @@ -44,7 +45,19 @@ BtreeOMapManager::get_omap_root(omap_context_t oc, const omap_root_t &omap_root) { assert(omap_root.get_location() != L_ADDR_NULL); laddr_t laddr = omap_root.get_location(); - return omap_load_extent(oc, laddr, omap_root.get_depth()); + if (omap_root.get_depth() > 1) { + return omap_load_extent( + oc, laddr, omap_root.get_depth(), BEGIN_KEY, END_KEY + ).si_then([](auto extent) { + return extent->template cast(); + }); + } else { + return omap_load_extent( + oc, laddr, omap_root.get_depth(), BEGIN_KEY, END_KEY + ).si_then([](auto extent) { + return extent->template cast(); + }); + } } BtreeOMapManager::handle_root_split_ret @@ -57,18 +70,29 @@ BtreeOMapManager::handle_root_split( DEBUGT("{}", oc.t, omap_root); return oc.tm.alloc_non_data_extent(oc.t, omap_root.hint, OMAP_INNER_BLOCK_SIZE) - .si_then([&omap_root, mresult, oc](auto&& nroot) -> handle_root_split_ret { + .si_then([&omap_root, mresult, oc, FNAME] + (auto&& nroot) -> handle_root_split_ret { auto [left, right, pivot] = *(mresult.split_tuple); omap_node_meta_t meta{omap_root.depth + 1}; + nroot->init_range(BEGIN_KEY, END_KEY); nroot->set_meta(meta); + left->init_range(BEGIN_KEY, pivot); + nroot->insert_child_ptr( + nroot->iter_begin().get_offset(), + dynamic_cast*>(left.get())); nroot->journal_inner_insert(nroot->iter_begin(), left->get_laddr(), - "", nroot->maybe_get_delta_buffer()); + BEGIN_KEY, nroot->maybe_get_delta_buffer()); + nroot->insert_child_ptr( + (nroot->iter_begin() + 1).get_offset(), + dynamic_cast*>(right.get())); + right->init_range(pivot, END_KEY); nroot->journal_inner_insert(nroot->iter_begin() + 1, right->get_laddr(), pivot, nroot->maybe_get_delta_buffer()); omap_root.update(nroot->get_laddr(), omap_root.get_depth() + 1, omap_root.hint, omap_root.get_type()); oc.t.get_omap_tree_stats().depth = omap_root.depth; ++(oc.t.get_omap_tree_stats().extents_num_delta); + DEBUGT("l {}, r {}", oc.t, *left, *right); return seastar::now(); }).handle_error_interruptible( crimson::ct_error::enospc::assert_failure{"unexpected enospc"}, @@ -84,8 +108,8 @@ BtreeOMapManager::handle_root_merge( { LOG_PREFIX(BtreeOMapManager::handle_root_merge); DEBUGT("{}", oc.t, omap_root); - auto root = *(mresult.need_merge); - auto iter = root->cast()->iter_begin(); + auto old_root = *(mresult.need_merge); + auto iter = old_root->cast()->iter_begin(); omap_root.update( iter->get_val(), omap_root.depth -= 1, @@ -93,7 +117,7 @@ BtreeOMapManager::handle_root_merge( omap_root.get_type()); oc.t.get_omap_tree_stats().depth = omap_root.depth; oc.t.get_omap_tree_stats().extents_num_delta--; - return oc.tm.remove(oc.t, root->get_laddr() + return oc.tm.remove(oc.t, old_root->get_laddr() ).si_then([](auto &&ret) -> handle_root_merge_ret { return seastar::now(); }).handle_error_interruptible( diff --git a/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h index 90dbccf63eedf..6de3ade3f7b76 100644 --- a/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h +++ b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h @@ -106,15 +106,6 @@ public: omap_clear_ret omap_clear( omap_root_t &omap_root, Transaction &t) final; - - static extent_len_t get_leaf_size(omap_type_t type) { - if (type == omap_type_t::LOG) { - return LOG_LEAF_BLOCK_SIZE; - } - ceph_assert(type == omap_type_t::OMAP || - type == omap_type_t::XATTR); - return OMAP_LEAF_BLOCK_SIZE; - } }; using BtreeOMapManagerRef = std::unique_ptr; diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h index 3de6017096c54..cbdacd28601e9 100644 --- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h +++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h @@ -15,6 +15,9 @@ namespace crimson::os::seastore::omap_manager{ +const std::string BEGIN_KEY = ""; +const std::string END_KEY(64, (char)(-1)); + struct omap_context_t { TransactionManager &tm; Transaction &t; @@ -52,7 +55,10 @@ struct OMapNode : LogicalChildNode { explicit OMapNode(ceph::bufferptr &&ptr) : LogicalChildNode(std::move(ptr)) {} explicit OMapNode(extent_len_t length) : LogicalChildNode(length) {} OMapNode(const OMapNode &other) - : LogicalChildNode(other) {} + : LogicalChildNode(other), + root(other.root), + begin(other.begin), + end(other.end) {} using get_value_iertr = base_iertr; using get_value_ret = OMapManager::omap_get_value_ret; @@ -109,14 +115,34 @@ struct OMapNode : LogicalChildNode { virtual uint32_t get_node_size() = 0; virtual ~OMapNode() = default; + + void init_range(std::string begin, std::string end) { + if (!this->end.empty()) { + // this is a duplicated init. + assert(end == this->end); + return; + } + if (begin == BEGIN_KEY && end == END_KEY) { + root = true; + } + this->begin = std::move(begin); + this->end = std::move(end); + } + const std::string &get_begin() const { + return begin; + } + const std::string &get_end() const { + return end; + } + bool is_btree_root() const { return root; } +private: + bool root = false; + std::string begin; + std::string end; }; using OMapNodeRef = OMapNode::OMapNodeRef; -using omap_load_extent_iertr = OMapNode::base_iertr; -omap_load_extent_iertr::future -omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth); - } #if FMT_VERSION >= 90000 diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc index 042b22d189300..746a6cdcdf572 100644 --- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc +++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc @@ -29,8 +29,14 @@ std::ostream &operator<<(std::ostream &out, const omap_leaf_key_t &rhs) std::ostream &OMapInnerNode::print_detail_l(std::ostream &out) const { - return out << ", size=" << get_size() - << ", depth=" << get_meta().depth; + out << ", size=" << get_size() + << ", depth=" << get_meta().depth + << ", is_root=" << is_btree_root(); + if (get_size() > 0) { + out << ", begin=" << get_begin() + << ", end=" << get_end(); + } + return out; } using dec_ref_iertr = OMapInnerNode::base_iertr; @@ -54,23 +60,35 @@ dec_ref_ret dec_ref(omap_context_t oc, T&& addr) { OMapInnerNode::make_split_insert_ret OMapInnerNode::make_split_insert( omap_context_t oc, - internal_iterator_t iter, + internal_const_iterator_t iter, std::string key, - laddr_t laddr) + OMapNodeRef &node) { LOG_PREFIX(OMapInnerNode::make_split_insert); DEBUGT("this: {}, key: {}", oc.t, *this, key); - return make_split_children(oc).si_then([=] (auto tuple) { + return make_split_children(oc).si_then( + [FNAME, oc, iter=std::move(iter), + key=std::move(key), node, this] (auto tuple) { auto [left, right, pivot] = tuple; + DEBUGT("this: {}, key: {}, pivot {}", oc.t, *this, key, pivot); + left->init_range(get_begin(), pivot); + right->init_range(pivot, get_end()); if (pivot > key) { auto liter = left->iter_idx(iter.get_offset()); - left->journal_inner_insert(liter, laddr, key, + left->insert_child_ptr( + liter.get_offset(), + dynamic_cast(node.get())); + left->journal_inner_insert(liter, node->get_laddr(), key, left->maybe_get_delta_buffer()); } else { //right auto riter = right->iter_idx(iter.get_offset() - left->get_node_size()); - right->journal_inner_insert(riter, laddr, key, + right->insert_child_ptr( + riter.get_offset(), + dynamic_cast(node.get())); + right->journal_inner_insert(riter, node->get_laddr(), key, right->maybe_get_delta_buffer()); } + this->adjust_copy_src_dest_on_split(oc.t, *left, *right); ++(oc.t.get_omap_tree_stats().extents_num_delta); return make_split_insert_ret( interruptible::ready_future_marker{}, @@ -83,7 +101,7 @@ OMapInnerNode::make_split_insert( OMapInnerNode::handle_split_ret OMapInnerNode::handle_split( omap_context_t oc, - internal_iterator_t iter, + internal_const_iterator_t iter, mutation_result_t mresult) { LOG_PREFIX(OMapInnerNode::handle_split); @@ -94,25 +112,34 @@ OMapInnerNode::handle_split( return mut->handle_split(oc, mut_iter, mresult); } auto [left, right, pivot] = *(mresult.split_tuple); + DEBUGT("this: {} {} {}", oc.t, *left, *right, pivot); //update operation will not cause node overflow, so we can do it first. + this->update_child_ptr( + iter.get_offset(), + dynamic_cast(left.get())); journal_inner_update(iter, left->get_laddr(), maybe_get_delta_buffer()); bool overflow = extent_will_overflow(pivot.size(), std::nullopt); + auto right_iter = iter + 1; if (!overflow) { - journal_inner_insert(iter + 1, right->get_laddr(), pivot, + this->insert_child_ptr( + right_iter.get_offset(), + dynamic_cast(right.get())); + journal_inner_insert(right_iter, right->get_laddr(), pivot, maybe_get_delta_buffer()); return insert_ret( interruptible::ready_future_marker{}, mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt)); } else { - return make_split_insert(oc, iter + 1, pivot, right->get_laddr()) - .si_then([this, oc] (auto m_result) { - return dec_ref(oc, get_laddr()) - .si_then([m_result = std::move(m_result)] { - return insert_ret( - interruptible::ready_future_marker{}, - m_result); - }); - }); + return make_split_insert( + oc, right_iter, pivot, right + ).si_then([this, oc] (auto m_result) { + return dec_ref(oc, get_laddr() + ).si_then([m_result = std::move(m_result)] { + return insert_ret( + interruptible::ready_future_marker{}, + m_result); + }); + }); } } @@ -123,11 +150,9 @@ OMapInnerNode::get_value( { LOG_PREFIX(OMapInnerNode::get_value); DEBUGT("key = {}, this: {}", oc.t, key, *this); - auto child_pt = get_containing_child(key); - assert(child_pt != iter_cend()); - auto laddr = child_pt->get_val(); - return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then( + return get_child_node(oc, key).si_then( [oc, &key] (auto extent) { + ceph_assert(!extent->is_btree_root()); return extent->get_value(oc, key); }).finally([ref = OMapNodeRef(this)] {}); } @@ -141,10 +166,9 @@ OMapInnerNode::insert( LOG_PREFIX(OMapInnerNode::insert); DEBUGT("{}->{}, this: {}", oc.t, key, value, *this); auto child_pt = get_containing_child(key); - assert(child_pt != iter_cend()); - auto laddr = child_pt->get_val(); - return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then( + return get_child_node(oc, child_pt).si_then( [oc, &key, &value] (auto extent) { + ceph_assert(!extent->is_btree_root()); return extent->insert(oc, key, value); }).si_then([this, oc, child_pt] (auto mresult) { if (mresult.status == mutation_status_t::SUCCESS) { @@ -165,10 +189,9 @@ OMapInnerNode::rm_key(omap_context_t oc, const std::string &key) LOG_PREFIX(OMapInnerNode::rm_key); DEBUGT("key={}, this: {}", oc.t, key, *this); auto child_pt = get_containing_child(key); - assert(child_pt != iter_cend()); - auto laddr = child_pt->get_val(); - return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then( + return get_child_node(oc, child_pt).si_then( [this, oc, &key, child_pt] (auto extent) { + ceph_assert(!extent->is_btree_root()); return extent->rm_key(oc, key) .si_then([this, oc, child_pt, extent = std::move(extent)] (auto mresult) { switch (mresult.status) { @@ -242,11 +265,9 @@ OMapInnerNode::list( seastar::stop_iteration::yes); } assert(result.size() < config.max_result_size); - auto laddr = iter->get_val(); - return omap_load_extent( - oc, laddr, - get_meta().depth - 1 + return get_child_node(oc, iter ).si_then([&, config, oc](auto &&extent) { + ceph_assert(!extent->is_btree_root()); return seastar::do_with( iter == fiter ? first : std::optional(std::nullopt), iter == liter - 1 ? last : std::optional(std::nullopt), @@ -306,8 +327,9 @@ OMapInnerNode::clear(omap_context_t oc) auto laddr = iter->get_val(); auto ndepth = get_meta().depth - 1; if (ndepth > 1) { - return omap_load_extent(oc, laddr, ndepth + return get_child_node(oc, iter ).si_then([oc](auto &&extent) { + ceph_assert(!extent->is_btree_root()); return extent->clear(oc); }).si_then([oc, laddr] { return dec_ref(oc, laddr); @@ -325,7 +347,7 @@ OMapInnerNode::clear(omap_context_t oc) } OMapInnerNode::split_children_ret -OMapInnerNode:: make_split_children(omap_context_t oc) +OMapInnerNode::make_split_children(omap_context_t oc) { LOG_PREFIX(OMapInnerNode::make_split_children); DEBUGT("this: {}", oc.t, *this); @@ -336,6 +358,7 @@ OMapInnerNode:: make_split_children(omap_context_t oc) auto left = ext_pair.front(); auto right = ext_pair.back(); DEBUGT("this: {}, split into: l {} r {}", oc.t, *this, *left, *right); + this->split_child_ptrs(oc.t, *left, *right); return split_children_ret( interruptible::ready_future_marker{}, std::make_tuple(left, right, split_into(*left, *right))); @@ -352,7 +375,9 @@ OMapInnerNode::make_full_merge(omap_context_t oc, OMapNodeRef right) DEBUGT("", oc.t); return oc.tm.alloc_non_data_extent(oc.t, oc.hint, OMAP_INNER_BLOCK_SIZE) - .si_then([this, right] (auto &&replacement) { + .si_then([this, right, oc] (auto &&replacement) { + replacement->merge_child_ptrs( + oc.t, *this, *right->cast()); replacement->merge_from(*this, *right->cast()); return full_merge_ret( interruptible::ready_future_marker{}, @@ -371,10 +396,12 @@ OMapInnerNode::make_balanced(omap_context_t oc, OMapNodeRef _right) ceph_assert(_right->get_type() == TYPE); return oc.tm.alloc_extents(oc.t, oc.hint, OMAP_INNER_BLOCK_SIZE, 2) - .si_then([this, _right] (auto &&replacement_pair){ + .si_then([this, _right, oc] (auto &&replacement_pair){ auto replacement_left = replacement_pair.front(); auto replacement_right = replacement_pair.back(); auto &right = *_right->cast(); + this->balance_child_ptrs(oc.t, *this, right, true, + *replacement_left, *replacement_right); return make_balanced_ret( interruptible::ready_future_marker{}, std::make_tuple(replacement_left, replacement_right, @@ -389,7 +416,7 @@ OMapInnerNode::make_balanced(omap_context_t oc, OMapNodeRef _right) OMapInnerNode::merge_entry_ret OMapInnerNode::merge_entry( omap_context_t oc, - internal_iterator_t iter, + internal_const_iterator_t iter, OMapNodeRef entry) { LOG_PREFIX(OMapInnerNode::merge_entry); @@ -401,36 +428,53 @@ OMapInnerNode::merge_entry( } auto is_left = (iter + 1) == iter_cend(); auto donor_iter = is_left ? iter - 1 : iter + 1; - return omap_load_extent(oc, donor_iter->get_val(), get_meta().depth - 1 + return get_child_node(oc, donor_iter ).si_then([=, this](auto &&donor) mutable { + ceph_assert(!donor->is_btree_root()); LOG_PREFIX(OMapInnerNode::merge_entry); auto [l, r] = is_left ? std::make_pair(donor, entry) : std::make_pair(entry, donor); auto [liter, riter] = is_left ? std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter); if (l->can_merge(r)) { - DEBUGT("make_full_merge l {} r {}", oc.t, *l, *r); + DEBUGT("make_full_merge l {} r {} liter {} riter {}", + oc.t, *l, *r, liter->get_key(), riter->get_key()); assert(entry->extent_is_below_min()); return l->make_full_merge(oc, r ).si_then([liter=liter, riter=riter, l=l, r=r, oc, this] (auto &&replacement) { LOG_PREFIX(OMapInnerNode::merge_entry); DEBUGT("to update parent: {}", oc.t, *this); + this->update_child_ptr( + liter.get_offset(), + dynamic_cast(replacement.get())); journal_inner_update( liter, replacement->get_laddr(), maybe_get_delta_buffer()); + this->remove_child_ptr(riter.get_offset()); journal_inner_remove(riter, maybe_get_delta_buffer()); //retire extent std::vector dec_laddrs {l->get_laddr(), r->get_laddr()}; + auto next = liter + 1; + auto end = next == iter_cend() ? get_end() : next.get_key(); + assert(end == r->get_end()); + replacement->init_range(liter.get_key(), std::move(end)); + if (get_meta().depth > 2) { // replacement is an inner node + auto &rep = *replacement->template cast(); + rep.adjust_copy_src_dest_on_merge( + oc.t, + *l->template cast(), + *r->template cast()); + } return dec_ref(oc, dec_laddrs - ).si_then([this, oc] { + ).si_then([this, oc, r=std::move(replacement)] { --(oc.t.get_omap_tree_stats().extents_num_delta); if (extent_is_below_min()) { return merge_entry_ret( interruptible::ready_future_marker{}, - mutation_result_t(mutation_status_t::NEED_MERGE, - std::nullopt, this)); + mutation_result_t(mutation_status_t::NEED_MERGE, + std::nullopt, this)); } else { return merge_entry_ret( interruptible::ready_future_marker{}, @@ -439,14 +483,30 @@ OMapInnerNode::merge_entry( } }); }); - } else { - DEBUGT("balanced l {} r {}", oc.t, *l, *r); + } else { // !l->can_merge(r) + DEBUGT("balanced l {} r {} liter {} riter {}", + oc.t, *l, *r, liter->get_key(), riter->get_key()); return l->make_balanced(oc, r ).si_then([liter=liter, riter=riter, l=l, r=r, oc, this](auto tuple) { LOG_PREFIX(OMapInnerNode::merge_entry); - DEBUGT("to update parent: {}", oc.t, *this); auto [replacement_l, replacement_r, replacement_pivot] = tuple; + DEBUGT("to update parent: {} {} {}", + oc.t, *this, *replacement_l, *replacement_r); + replacement_l->init_range(l->get_begin(), replacement_pivot); + replacement_r->init_range(replacement_pivot, r->get_end()); + if (get_meta().depth > 2) { // l and r are inner nodes + auto &left = *l->template cast(); + auto &right = *r->template cast(); + auto &rep_left = *replacement_l->template cast(); + auto &rep_right = *replacement_r->template cast(); + this->adjust_copy_src_dest_on_balance( + oc.t, left, right, true, rep_left, rep_right); + } + //update operation will not cuase node overflow, so we can do it first + this->update_child_ptr( + liter.get_offset(), + dynamic_cast(replacement_l.get())); journal_inner_update( liter, replacement_l->get_laddr(), @@ -454,6 +514,9 @@ OMapInnerNode::merge_entry( bool overflow = extent_will_overflow(replacement_pivot.size(), std::nullopt); if (!overflow) { + this->update_child_ptr( + riter.get_offset(), + dynamic_cast(replacement_r.get())); journal_inner_remove(riter, maybe_get_delta_buffer()); journal_inner_insert( riter, @@ -469,12 +532,14 @@ OMapInnerNode::merge_entry( std::nullopt, std::nullopt)); }); } else { - DEBUGT("balanced and split {} r {}", oc.t, *l, *r); + DEBUGT("balanced and split {} r {} riter {}", + oc.t, *l, *r, riter.get_key()); //use remove and insert to instead of replace, //remove operation will not cause node split, so we can do it first + this->remove_child_ptr(riter.get_offset()); journal_inner_remove(riter, maybe_get_delta_buffer()); - return make_split_insert(oc, riter, replacement_pivot, - replacement_r->get_laddr() + return make_split_insert( + oc, riter, replacement_pivot, replacement_r ).si_then([this, oc, l = l, r = r](auto mresult) { std::vector dec_laddrs{ l->get_laddr(), @@ -493,21 +558,28 @@ OMapInnerNode::merge_entry( } -OMapInnerNode::internal_iterator_t +OMapInnerNode::internal_const_iterator_t OMapInnerNode::get_containing_child(const std::string &key) { - auto iter = string_lower_bound(key); - if (iter == iter_end() || (iter != iter_begin() && iter.get_key() > key)) { - iter--; - } + auto iter = string_upper_bound(key); + iter--; assert(iter.contains(key)); return iter; } std::ostream &OMapLeafNode::print_detail_l(std::ostream &out) const { - return out << ", size=" << get_size() - << ", depth=" << get_meta().depth; + out << ", size=" << get_size() + << ", depth=" << get_meta().depth + << ", is_root=" << is_btree_root(); + if (get_size() > 0) { + out << ", begin=" << get_begin() + << ", end=" << get_end(); + } + if (this->child_node_t::is_parent_valid()) + return out << ", parent=" << (void*)this->child_node_t::get_parent_node().get(); + else + return out; } OMapLeafNode::get_value_ret @@ -559,6 +631,8 @@ OMapLeafNode::insert( } else { return make_split_children(oc).si_then([this, oc, &key, &value] (auto tuple) { auto [left, right, pivot] = tuple; + left->init_range(get_begin(), pivot); + right->init_range(pivot, get_end()); auto replace_pt = find_string_key(key); if (replace_pt != iter_end()) { ++(oc.t.get_omap_tree_stats().num_updates); @@ -732,36 +806,71 @@ OMapLeafNode::make_balanced(omap_context_t oc, OMapNodeRef _right) ); } - -omap_load_extent_iertr::future -omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth) +OMapInnerNode::get_child_node_ret +OMapInnerNode::get_child_node( + omap_context_t oc, + internal_const_iterator_t child_pt) { - ceph_assert(depth > 0); - if (depth > 1) { - return oc.tm.read_extent( - oc.t, laddr, OMAP_INNER_BLOCK_SIZE - ).handle_error_interruptible( - omap_load_extent_iertr::pass_further{}, - crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" } - ).si_then([](auto maybe_indirect_extent) { - assert(!maybe_indirect_extent.is_indirect()); - assert(!maybe_indirect_extent.is_clone); - return seastar::make_ready_future( - std::move(maybe_indirect_extent.extent)); + assert(get_meta().depth > 1); + child_pos_t child_pos(nullptr, 0); + auto laddr = child_pt->get_val(); + auto next = child_pt + 1; + if (get_meta().depth == 2) { + auto ret = this->get_child( + oc.t, oc.tm.get_etvr(), child_pt.get_offset(), child_pt.get_key()); + if (ret.has_child()) { + return ret.get_child_fut( + ).si_then([](auto extent) { + return extent->template cast(); + }); + } else { + child_pos = ret.get_child_pos(); + } + return omap_load_extent( + oc, + laddr, + get_meta().depth - 1, + child_pt->get_key(), + next == iter_cend() + ? this->get_end() + : next->get_key(), + std::move(child_pos) + ).si_then([](auto extent) { + return extent->template cast(); }); } else { - return oc.tm.read_extent( - oc.t, laddr, - BtreeOMapManager::get_leaf_size(oc.type) - ).handle_error_interruptible( - omap_load_extent_iertr::pass_further{}, - crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" } - ).si_then([](auto maybe_indirect_extent) { - assert(!maybe_indirect_extent.is_indirect()); - assert(!maybe_indirect_extent.is_clone); - return seastar::make_ready_future( - std::move(maybe_indirect_extent.extent)); + auto ret = this->get_child( + oc.t, oc.tm.get_etvr(), child_pt.get_offset(), child_pt.get_key()); + if (ret.has_child()) { + return ret.get_child_fut( + ).si_then([](auto extent) { + return extent->template cast(); + }); + } else { + child_pos = ret.get_child_pos(); + } + return omap_load_extent( + oc, + laddr, + get_meta().depth - 1, + child_pt->get_key(), + next == iter_cend() + ? this->get_end() + : next->get_key(), + std::move(child_pos) + ).si_then([](auto extent) { + return extent->template cast(); }); } } + +extent_len_t get_leaf_size(omap_type_t type) { + if (type == omap_type_t::LOG) { + return LOG_LEAF_BLOCK_SIZE; + } + ceph_assert(type == omap_type_t::OMAP || + type == omap_type_t::XATTR); + return OMAP_LEAF_BLOCK_SIZE; +} + } diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h index d1dd00f182f47..32fb3f871464a 100644 --- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h +++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h @@ -16,6 +16,8 @@ namespace crimson::os::seastore::omap_manager { +extent_len_t get_leaf_size(omap_type_t type); + /** * OMapInnerNode * @@ -28,20 +30,86 @@ namespace crimson::os::seastore::omap_manager { struct OMapInnerNode : OMapNode, - StringKVInnerNodeLayout { + StringKVInnerNodeLayout, + ParentNode, + ChildNode { using OMapInnerNodeRef = TCachedExtentRef; - using internal_iterator_t = const_iterator; + using internal_const_iterator_t = const_iterator; + using internal_iterator_t = iterator; + using parent_node_t = ParentNode; + using base_child_t = BaseChildNode; + using child_node_t = ChildNode; + static constexpr uint32_t CHILD_VEC_UNIT = 128; explicit OMapInnerNode(ceph::bufferptr &&ptr) - : OMapNode(std::move(ptr)) { - this->set_layout_buf(this->get_bptr().c_str()); + : OMapNode(std::move(ptr)), + StringKVInnerNodeLayout(get_bptr().c_str()), + parent_node_t(0) { + this->parent_node_t::sync_children_capacity(); } // Must be identical with OMapInnerNode(ptr) after on_fully_loaded() explicit OMapInnerNode(extent_len_t length) - : OMapNode(length) {} + : OMapNode(length), + StringKVInnerNodeLayout(nullptr), + parent_node_t(0) {} OMapInnerNode(const OMapInnerNode &rhs) - : OMapNode(rhs) { - this->set_layout_buf(this->get_bptr().c_str()); + : OMapNode(rhs), + StringKVInnerNodeLayout(get_bptr().c_str()), + parent_node_t(0) { + this->parent_node_t::sync_children_capacity(); + } + + iterator begin() { + return iter_begin(); + } + + iterator end() { + return iter_end(); + } + + void do_on_rewrite(Transaction &t, LogicalCachedExtent &extent) final { + auto &ext = static_cast(extent); + this->parent_node_t::on_rewrite(t, ext); + auto &other = static_cast(extent); + this->init_range(other.get_begin(), other.get_end()); + this->sync_children_capacity(); + } + + void prepare_commit() final { + this->parent_node_t::prepare_commit(); + if (is_rewrite() && !is_btree_root()) { + auto &prior = *get_prior_instance()->template cast(); + if (prior.base_child_t::has_parent_tracker()) { + // unlike fixed-kv nodes, rewriting child nodes of the omap tree + // won't affect parent nodes, so we have to manually take prior + // instances' parent trackers here. + this->child_node_t::take_parent_from_prior(); + } + } + } + + void do_on_replace_prior() final { + this->parent_node_t::on_replace_prior(); + if (!this->is_btree_root()) { + auto &prior = *get_prior_instance()->template cast(); + assert(prior.base_child_t::has_parent_tracker()); + this->child_node_t::on_replace_prior(); + } + } + + void on_invalidated(Transaction &t) final { + this->child_node_t::on_invalidated(); + } + + void on_initial_write() final { + if (this->is_btree_root()) { + //TODO: should involve RootChildNode + this->child_node_t::reset_parent_tracker(); + } + } + + btreenode_pos_t get_node_split_pivot() const { + return this->get_split_pivot().get_offset(); } omap_node_meta_t get_node_meta() const final { return get_meta(); } @@ -58,6 +126,10 @@ struct OMapInnerNode this->set_layout_buf(this->get_bptr().c_str()); } + void on_clean_read() final { + this->sync_children_capacity(); + } + CachedExtentRef duplicate_for_write(Transaction&) final { assert(delta_buffer.empty()); return CachedExtentRef(new OMapInnerNode(*this)); @@ -101,19 +173,19 @@ struct OMapInnerNode using make_split_insert_iertr = base_iertr; using make_split_insert_ret = make_split_insert_iertr::future; make_split_insert_ret make_split_insert( - omap_context_t oc, internal_iterator_t iter, - std::string key, laddr_t laddr); + omap_context_t oc, internal_const_iterator_t iter, + std::string key, OMapNodeRef &node); using merge_entry_iertr = base_iertr; using merge_entry_ret = merge_entry_iertr::future; merge_entry_ret merge_entry( omap_context_t oc, - internal_iterator_t iter, OMapNodeRef entry); + internal_const_iterator_t iter, OMapNodeRef entry); using handle_split_iertr = base_iertr; using handle_split_ret = handle_split_iertr::future; handle_split_ret handle_split( - omap_context_t oc, internal_iterator_t iter, + omap_context_t oc, internal_const_iterator_t iter, mutation_result_t mresult); std::ostream &print_detail_l(std::ostream &out) const final; @@ -138,9 +210,43 @@ struct OMapInnerNode auto bptr = bl.cbegin(); decode(buffer, bptr); buffer.replay(*this); + this->parent_node_t::sync_children_capacity(); + } + + internal_const_iterator_t get_containing_child(const std::string &key); + + internal_iterator_t lower_bound(const std::string &key) { + return string_lower_bound(key); } - internal_iterator_t get_containing_child(const std::string &key); + internal_iterator_t upper_bound(const std::string &key) { + return string_upper_bound(key); + } + + bool is_in_range(const std::string &key) const { + return get_begin() <= key && get_end() > key; + } + + ~OMapInnerNode() { + if (this->is_valid() + && !this->is_pending() + && !this->is_btree_root() + && this->base_child_t::has_parent_tracker()) { + this->child_node_t::destroy(); + } + } +private: + using get_child_node_iertr = OMapNode::base_iertr; + using get_child_node_ret = get_child_node_iertr::future; + get_child_node_ret get_child_node( + omap_context_t oc, + internal_const_iterator_t child_pt); + + get_child_node_ret get_child_node(omap_context_t oc, const std::string &key) { + auto child_pt = get_containing_child(key); + assert(child_pt != iter_cend()); + return get_child_node(oc, child_pt); + } }; using OMapInnerNodeRef = OMapInnerNode::OMapInnerNodeRef; @@ -156,10 +262,51 @@ using OMapInnerNodeRef = OMapInnerNode::OMapInnerNodeRef; struct OMapLeafNode : OMapNode, - StringKVLeafNodeLayout { - + StringKVLeafNodeLayout, + ChildNode { using OMapLeafNodeRef = TCachedExtentRef; - using internal_iterator_t = const_iterator; + using internal_const_iterator_t = const_iterator; + using base_child_t = BaseChildNode; + using child_node_t = ChildNode; + + void do_on_rewrite(Transaction &t, LogicalCachedExtent &extent) final { + auto &other = static_cast(extent); + this->init_range(other.get_begin(), other.get_end()); + } + + void on_invalidated(Transaction &t) final { + this->child_node_t::on_invalidated(); + } + + void prepare_commit() final { + if (is_rewrite() && !is_btree_root()) { + auto &prior = *get_prior_instance()->template cast(); + if (prior.base_child_t::has_parent_tracker()) { + // unlike fixed-kv nodes, rewriting child nodes of the omap tree + // won't affect parent nodes, so we have to manually take prior + // instances' parent trackers here. + this->child_node_t::take_parent_from_prior(); + } + } + } + + void do_on_replace_prior() final { + ceph_assert(!this->is_rewrite()); + if (!this->is_btree_root()) { + auto &prior = *get_prior_instance()->template cast(); + assert(prior.base_child_t::has_parent_tracker()); + this->child_node_t::on_replace_prior(); + } + } + + ~OMapLeafNode() { + if (this->is_valid() + && !this->is_pending() + && !this->is_btree_root() + && this->base_child_t::has_parent_tracker()) { + this->child_node_t::destroy(); + } + } explicit OMapLeafNode(ceph::bufferptr &&ptr) : OMapNode(std::move(ptr)) { @@ -256,14 +403,58 @@ struct OMapLeafNode buffer.replay(*this); } + btreenode_pos_t get_node_split_pivot() const { + return this->get_split_pivot().get_offset(); + } + std::ostream &print_detail_l(std::ostream &out) const final; - std::pair + std::pair get_leaf_entries(std::string &key); - }; using OMapLeafNodeRef = OMapLeafNode::OMapLeafNodeRef; +using omap_load_extent_iertr = OMapNode::base_iertr; +template +requires std::is_same_v || std::is_same_v +omap_load_extent_iertr::future> +omap_load_extent( + omap_context_t oc, + laddr_t laddr, + depth_t depth, + std::string begin, + std::string end, + std::optional> chp = std::nullopt) +{ + LOG_PREFIX(omap_load_extent); + assert(end <= END_KEY); + auto size = std::is_same_v + ? OMAP_INNER_BLOCK_SIZE : get_leaf_size(oc.type); + return oc.tm.read_extent( + oc.t, laddr, size, + [begin=std::move(begin), end=std::move(end), FNAME, + oc, chp=std::move(chp)](T &extent) mutable { + extent.init_range(std::move(begin), std::move(end)); + if (extent.T::base_child_t::is_parent_valid() + || extent.is_btree_root()) { + return; + } + assert(chp); + SUBDEBUGT(seastore_omap, "linking {} to {}", + oc.t, extent, *chp->get_parent()); + chp->link_child(&extent); + } + ).handle_error_interruptible( + omap_load_extent_iertr::pass_further{}, + crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" } + ).si_then([](auto maybe_indirect_extent) { + assert(!maybe_indirect_extent.is_indirect()); + assert(!maybe_indirect_extent.is_clone); + return seastar::make_ready_future>( + std::move(maybe_indirect_extent.extent)); + }); +} + std::ostream &operator<<(std::ostream &out, const omap_inner_key_t &rhs); std::ostream &operator<<(std::ostream &out, const omap_leaf_key_t &rhs); } diff --git a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h index 69f5601132181..d0991de5023e2 100644 --- a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h +++ b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h @@ -504,7 +504,7 @@ public: inner_remove(iter); } - StringKVInnerNodeLayout() : buf(nullptr) {} + StringKVInnerNodeLayout(char *buf) : buf(buf) {} void set_layout_buf(char *_buf) { assert(buf == nullptr); diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index bc553e955b275..e7d449cdd5e6d 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -931,6 +931,10 @@ public: return epm->get_stat(); } + ExtentTransViewRetriever& get_etvr() { + return *cache; + } + ~TransactionManager(); private: -- 2.39.5