From: Xuehan Xu Date: Tue, 25 Oct 2022 06:03:43 +0000 (+0800) Subject: crimson/os/seastore/lba_manager: link lba leaf nodes with logical extents by pointers X-Git-Tag: v18.1.0~34^2~6 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=9e3a2eb228b9420985b08033667373b31af14c01;p=ceph-ci.git crimson/os/seastore/lba_manager: link lba leaf nodes with logical extents by pointers Signed-off-by: Xuehan Xu (cherry picked from commit cce850d75609c7c34bce0920e4e12ba9b9513229) --- diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt index baa2e7ca954..5b1c6187ca2 100644 --- a/src/crimson/os/seastore/CMakeLists.txt +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -18,6 +18,8 @@ set(crimson_seastore_srcs omap_manager.cc omap_manager/btree/btree_omap_manager.cc omap_manager/btree/omap_btree_node_impl.cc + btree/btree_range_pin.cc + btree/fixed_kv_node.cc onode.cc onode_manager/staged-fltree/node.cc onode_manager/staged-fltree/node_extent_manager.cc diff --git a/src/crimson/os/seastore/backref/backref_tree_node.h b/src/crimson/os/seastore/backref/backref_tree_node.h index db9f1febff1..c3ff52520ce 100644 --- a/src/crimson/os/seastore/backref/backref_tree_node.h +++ b/src/crimson/os/seastore/backref/backref_tree_node.h @@ -92,7 +92,8 @@ public: const_iterator insert( const_iterator iter, paddr_t key, - backref_map_val_t val) final { + backref_map_val_t val, + LogicalCachedExtent*) final { journal_insert( iter, key, @@ -103,7 +104,8 @@ public: void update( const_iterator iter, - backref_map_val_t val) final { + backref_map_val_t val, + LogicalCachedExtent*) final { return journal_update( iter, val, diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc index 7db8318cb9d..0980cb2ed2b 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.cc +++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc @@ -242,7 +242,8 @@ 
BtreeBackrefManager::new_mapping( c, *state.insert_iter, state.last_end, - val + val, + nullptr ).si_then([&state, c, addr, len, key](auto &&p) { LOG_PREFIX(BtreeBackrefManager::new_mapping); auto [iter, inserted] = std::move(p); diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h index 0306d0e8bbe..1f3347c8cdd 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.h +++ b/src/crimson/os/seastore/backref/btree_backref_manager.h @@ -17,10 +17,12 @@ public: BtreeBackrefPin() = default; BtreeBackrefPin( CachedExtentRef parent, + uint16_t pos, backref_map_val_t &val, backref_node_meta_t &&meta) : BtreeNodePin( parent, + pos, val.laddr, val.len, std::forward(meta)), diff --git a/src/crimson/os/seastore/btree/btree_range_pin.cc b/src/crimson/os/seastore/btree/btree_range_pin.cc new file mode 100644 index 00000000000..9565a853b83 --- /dev/null +++ b/src/crimson/os/seastore/btree/btree_range_pin.cc @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/btree/btree_range_pin.h" +#include "crimson/os/seastore/btree/fixed_kv_node.h" + +namespace crimson::os::seastore { + +template +void BtreeNodePin::link_extent(LogicalCachedExtent *ref) { + assert(ref->is_valid()); + // it's only when reading logical extents from disk that we need to + // link them to lba leaves + if (!ref->is_pending() && !ref->is_exist_clean()) { + assert(parent); + assert(pos != std::numeric_limits::max()); + if (parent->is_initial_pending()) { + auto &p = ((FixedKVNode&)*parent).get_stable_for_key( + pin.range.begin); + p.link_child(ref, pos); + } else if (parent->is_mutation_pending()) { + auto &p = (FixedKVNode&)*parent->get_prior_instance(); + p.link_child(ref, pos); + } else { + assert(!parent->is_pending() && parent->is_valid()); + auto &p = (FixedKVNode&)*parent; + p.link_child(ref, pos); + } + pos = 
std::numeric_limits::max(); + } + pin.set_extent(ref); +} + +template void BtreeNodePin::link_extent(LogicalCachedExtent*); +template void BtreeNodePin::link_extent(LogicalCachedExtent*); +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h index 5942e85f317..c1d116fc986 100644 --- a/src/crimson/os/seastore/btree/btree_range_pin.h +++ b/src/crimson/os/seastore/btree/btree_range_pin.h @@ -453,6 +453,7 @@ class BtreeNodePin : public PhysicalNodePin { val_t value; extent_len_t len; btree_range_pin_t pin; + uint16_t pos = std::numeric_limits::max(); public: using val_type = val_t; @@ -460,13 +461,18 @@ public: BtreeNodePin( CachedExtentRef parent, + uint16_t pos, val_t &value, extent_len_t len, fixed_kv_node_meta_t &&meta) - : parent(parent), value(value), len(len) { + : parent(parent), value(value), len(len), pos(pos) { pin.set_range(std::move(meta)); } + CachedExtentRef get_parent() const final { + return parent; + } + btree_range_pin_t& get_range_pin() { return pin; } @@ -479,9 +485,7 @@ public: parent = pin; } - void link_extent(LogicalCachedExtent *ref) final { - pin.set_extent(ref); - } + void link_extent(LogicalCachedExtent *ref) final; extent_len_t get_length() const final { ceph_assert(pin.range.end > pin.range.begin); diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index 6c3372819f3..9044d7d5936 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -15,12 +15,16 @@ #include "crimson/os/seastore/btree/btree_range_pin.h" #include "crimson/os/seastore/root_block.h" +#define RESERVATION_PTR reinterpret_cast(0x1) + namespace crimson::os::seastore::lba_manager::btree { struct lba_map_val_t; } namespace crimson::os::seastore { +bool is_valid_child_ptr(ChildableCachedExtent* child); + template phy_tree_root_t& get_phy_tree_root(root_t& r); @@ -223,6 
+227,7 @@ public: auto key = get_key(); return std::make_unique( leaf.node, + leaf.pos, val, fixed_kv_node_meta_t{ key, key + val.len, 0 }); } @@ -545,7 +550,8 @@ public: op_context_t c, iterator iter, node_key_t laddr, - node_val_t val + node_val_t val, + LogicalCachedExtent* nextent ) { LOG_PREFIX(FixedKVBtree::insert); SUBTRACET( @@ -556,10 +562,10 @@ public: iter.is_end() ? min_max_t::max : iter.get_key()); return seastar::do_with( iter, - [this, c, laddr, val](auto &ret) { + [this, c, laddr, val, nextent](auto &ret) { return find_insertion( c, laddr, ret - ).si_then([this, c, laddr, val, &ret] { + ).si_then([this, c, laddr, val, &ret, nextent] { if (!ret.at_boundary() && ret.get_key() == laddr) { return insert_ret( interruptible::ready_future_marker{}, @@ -568,7 +574,7 @@ public: ++(get_tree_stats(c.trans).num_inserts); return handle_split( c, ret - ).si_then([c, laddr, val, &ret] { + ).si_then([c, laddr, val, &ret, nextent] { if (!ret.leaf.node->is_mutable()) { CachedExtentRef mut = c.cache.duplicate_for_write( c.trans, ret.leaf.node @@ -581,7 +587,7 @@ public: assert(iter == ret.leaf.node->end() || iter->get_key() > laddr); assert(laddr >= ret.leaf.node->get_meta().begin && laddr < ret.leaf.node->get_meta().end); - ret.leaf.node->insert(iter, laddr, val); + ret.leaf.node->insert(iter, laddr, val, nextent); return insert_ret( interruptible::ready_future_marker{}, std::make_pair(ret, true)); @@ -594,11 +600,12 @@ public: insert_ret insert( op_context_t c, node_key_t laddr, - node_val_t val) { + node_val_t val, + LogicalCachedExtent* nextent) { return lower_bound( c, laddr - ).si_then([this, c, laddr, val](auto iter) { - return this->insert(c, iter, laddr, val); + ).si_then([this, c, laddr, val, nextent](auto iter) { + return this->insert(c, iter, laddr, val, nextent); }); } @@ -617,7 +624,8 @@ public: update_ret update( op_context_t c, iterator iter, - node_val_t val) + node_val_t val, + LogicalCachedExtent* nextent) { LOG_PREFIX(FixedKVBtree::update); 
SUBTRACET( @@ -634,7 +642,8 @@ public: ++(get_tree_stats(c.trans).num_updates); iter.leaf.node->update( iter.leaf.node->iter_idx(iter.leaf.pos), - val); + val, + nextent); return update_ret( interruptible::ready_future_marker{}, iter); diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.cc b/src/crimson/os/seastore/btree/fixed_kv_node.cc new file mode 100644 index 00000000000..00aceab92b3 --- /dev/null +++ b/src/crimson/os/seastore/btree/fixed_kv_node.cc @@ -0,0 +1,12 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/btree/fixed_kv_node.h" + +namespace crimson::os::seastore { + +bool is_valid_child_ptr(ChildableCachedExtent* child) { + return child != nullptr && child != RESERVATION_PTR; +} + +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index 202a270a336..70135210af0 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -28,25 +28,8 @@ namespace crimson::os::seastore { * Base class enabling recursive lookup between internal and leaf nodes. 
*/ template -struct FixedKVNode : CachedExtent { +struct FixedKVNode : ChildableCachedExtent { using FixedKVNodeRef = TCachedExtentRef; - struct parent_tracker_t - : public boost::intrusive_ref_counter< - parent_tracker_t, boost::thread_unsafe_counter> { - parent_tracker_t(FixedKVNodeRef parent) - : parent(parent) {} - parent_tracker_t(FixedKVNode* parent) - : parent(parent) {} - FixedKVNodeRef parent = nullptr; - ~parent_tracker_t() { - // this is parent's tracker, reset it - if (parent->my_tracker == this) { - parent->my_tracker = nullptr; - } - } - }; - - using parent_tracker_ref = boost::intrusive_ptr; btree_range_pin_t pin; struct copy_source_cmp_t { @@ -98,20 +81,24 @@ struct FixedKVNode : CachedExtent { * its "prior_instance" if the node is the result of a rewrite), with which * the lba range of this node overlaps. */ - std::vector children; + std::vector children; std::set copy_sources; uint16_t capacity = 0; parent_tracker_t* my_tracker = nullptr; - parent_tracker_ref parent_tracker; RootBlockRef root_block; + bool is_linked() { + assert(!has_parent_tracker() || !(bool)root_block); + return (bool)has_parent_tracker() || (bool)root_block; + } + FixedKVNode(uint16_t capacity, ceph::bufferptr &&ptr) - : CachedExtent(std::move(ptr)), + : ChildableCachedExtent(std::move(ptr)), pin(this), children(capacity, nullptr), capacity(capacity) {} FixedKVNode(const FixedKVNode &rhs) - : CachedExtent(rhs), + : ChildableCachedExtent(rhs), pin(rhs.pin, this), children(rhs.capacity, nullptr), capacity(rhs.capacity) {} @@ -128,6 +115,8 @@ struct FixedKVNode : CachedExtent { set_child_ptracker(child); } + virtual bool is_leaf_and_has_children() const = 0; + template void insert_child_ptr(iter_t iter, ChildableCachedExtent* child) { auto raw_children = children.data(); @@ -136,8 +125,18 @@ struct FixedKVNode : CachedExtent { &raw_children[offset + 1], &raw_children[offset], (get_node_size() - offset) * sizeof(ChildableCachedExtent*)); - children[offset] = child; - 
set_child_ptracker(child); + if (child) { + children[offset] = child; + set_child_ptracker(child); + } else { + // this can only happen when reserving lba spaces + ceph_assert(is_leaf_and_has_children()); + // this is to avoid mistakenly copying pointers from + // copy sources when committing this lba node, because + // we rely on pointers' "nullness" to avoid copying + // pointers for updated values + children[offset] = RESERVATION_PTR; + } } template @@ -227,7 +226,7 @@ struct FixedKVNode : CachedExtent { : stable_parent(stable_parent), pos(pos) {} }; - void link_child(FixedKVNode* child, uint16_t pos) { + void link_child(ChildableCachedExtent* child, uint16_t pos) { assert(pos < get_node_size()); assert(child); ceph_assert(!is_pending()); @@ -242,14 +241,14 @@ struct FixedKVNode : CachedExtent { auto pos = iter.get_offset(); assert(children.capacity()); auto child = children[pos]; - if (child) { + if (is_valid_child_ptr(child)) { return child_pos_t(child->get_transactional_view(t)); } else if (is_pending()) { auto key = iter.get_key(); auto &sparent = get_stable_for_key(key); auto spos = sparent.child_pos_for_key(key); auto child = sparent.children[spos]; - if (child) { + if (is_valid_child_ptr(child)) { return child_pos_t(child->get_transactional_view(t)); } else { return child_pos_t(&sparent, spos); @@ -357,10 +356,9 @@ struct FixedKVNode : CachedExtent { return; } ceph_assert(!root_block); - parent_tracker = prior.parent_tracker; - auto &parent = parent_tracker->parent; - assert(parent); - assert(parent->is_valid()); + take_prior_parent_tracker(); + assert(is_parent_valid()); + auto parent = get_parent_node(); //TODO: can this search be avoided? 
auto off = parent->lower_bound_offset(get_node_meta().begin); assert(parent->get_key_from_idx(off) == get_node_meta().begin); @@ -385,7 +383,7 @@ struct FixedKVNode : CachedExtent { assert(prior.my_tracker || prior.is_children_empty()); if (prior.my_tracker) { - prior.my_tracker->parent.reset(this); + prior.my_tracker->reset_parent(this); my_tracker = prior.my_tracker; // All my initial pending children is pointing to the original // tracker which has been dropped by the above line, so need @@ -401,8 +399,8 @@ struct FixedKVNode : CachedExtent { ceph_assert(end <= children.end()); for (auto it = begin; it != end; it++) { auto child = *it; - if (child) { - set_child_ptracker((FixedKVNode*)child); + if (is_valid_child_ptr(child)) { + set_child_ptracker(child); } } } @@ -485,7 +483,7 @@ struct FixedKVNode : CachedExtent { } void on_invalidated(Transaction &t) final { - parent_tracker.reset(); + reset_parent_tracker(); } bool is_rewrite() { @@ -495,17 +493,17 @@ struct FixedKVNode : CachedExtent { void on_initial_write() final { // All in-memory relative addrs are necessarily block-relative resolve_relative_addrs(get_paddr()); - ceph_assert( - parent_tracker - ? (parent_tracker->parent && parent_tracker->parent->is_valid()) - : true); + if (pin.is_root()) { + reset_parent_tracker(); + } + assert(has_parent_tracker() ? 
(is_parent_valid()) : true); } - void set_child_ptracker(FixedKVNode *child) { - if (!my_tracker) { - my_tracker = new parent_tracker_t(this); + void set_child_ptracker(ChildableCachedExtent *child) { + if (!this->my_tracker) { + this->my_tracker = new parent_tracker_t(this); } - child->parent_tracker.reset(my_tracker); + child->reset_parent_tracker(this->my_tracker); } void on_clean_read() final { @@ -564,6 +562,10 @@ struct FixedKVInternalNode : FixedKVNode(rhs), node_layout_t(this->get_bptr().c_str()) {} + bool is_leaf_and_has_children() const final { + return false; + } + uint16_t get_node_split_pivot() final { return this->get_split_pivot().get_offset(); } @@ -617,9 +619,8 @@ struct FixedKVInternalNode ceph_assert(this->root_block); unlink_phy_tree_root_node(this->root_block); } else { - ceph_assert(this->parent_tracker); - auto &parent = this->parent_tracker->parent; - ceph_assert(parent); + ceph_assert(this->is_parent_valid()); + auto parent = this->template get_parent_node>(); auto off = parent->lower_bound_offset(this->get_meta().begin); assert(parent->get_key_from_idx(off) == this->get_meta().begin); assert(parent->children[off] == this); @@ -853,17 +854,13 @@ struct FixedKVInternalNode } } - std::ostream &print_detail(std::ostream &out) const + std::ostream &_print_detail(std::ostream &out) const { out << ", size=" << this->get_size() << ", meta=" << this->get_meta() - << ", parent_tracker=" << (void*)this->parent_tracker.get(); - if (this->parent_tracker) { - out << ", parent=" << (void*)this->parent_tracker->parent.get(); - } - out << ", my_tracker=" << (void*)this->my_tracker; + << ", my_tracker=" << (void*)this->my_tracker; if (this->my_tracker) { - out << ", my_tracker->parent=" << (void*)this->my_tracker->parent.get(); + out << ", my_tracker->parent=" << (void*)this->my_tracker->get_parent().get(); } return out << ", root_block=" << (void*)this->root_block.get(); } @@ -936,8 +933,18 @@ struct FixedKVLeafNode VAL, VAL_LE>; using 
internal_const_iterator_t = typename node_layout_t::const_iterator; + using this_type_t = FixedKVLeafNode< + CAPACITY, + NODE_KEY, + NODE_KEY_LE, + VAL, + VAL_LE, + node_size, + node_type_t, + has_children>; + using base_t = FixedKVNode; FixedKVLeafNode(ceph::bufferptr &&ptr) - : FixedKVNode(0, std::move(ptr)), + : FixedKVNode(has_children ? CAPACITY : 0, std::move(ptr)), node_layout_t(this->get_bptr().c_str()) {} FixedKVLeafNode(const FixedKVLeafNode &rhs) : FixedKVNode(rhs), @@ -945,11 +952,15 @@ struct FixedKVLeafNode static constexpr bool do_has_children = has_children; + bool is_leaf_and_has_children() const final { + return has_children; + } + uint16_t get_node_split_pivot() final { return this->get_split_pivot().get_offset(); } - bool validate_stable_children() final { + bool validate_stable_children() override { return true; } @@ -959,9 +970,8 @@ struct FixedKVLeafNode ceph_assert(this->root_block); unlink_phy_tree_root_node(this->root_block); } else { - ceph_assert(this->parent_tracker); - auto &parent = this->parent_tracker->parent; - ceph_assert(parent); + ceph_assert(this->is_parent_valid()); + auto parent = this->template get_parent_node>(); auto off = parent->lower_bound_offset(this->get_meta().begin); assert(parent->get_key_from_idx(off) == this->get_meta().begin); assert(parent->children[off] == this); @@ -970,9 +980,49 @@ struct FixedKVLeafNode } } - void on_replace_prior(Transaction &t) final { - this->set_parent_tracker(); - assert(this->mutate_state.empty()); + void prepare_write() final { + if constexpr (has_children) { + if (this->is_initial_pending()) { + if (this->is_rewrite()) { + this->set_children_from_prior_instance(); + } + this->copy_children_from_stable_sources( + [this](base_t &node, uint16_t pos) { + ceph_assert(node.get_type() == this->get_type()); + auto &n = static_cast(node); + return n.iter_idx(pos); + } + ); + if (this->is_rewrite()) { + this->reset_prior_instance(); + } else { + this->adjust_ptracker_for_children(); + } + 
assert(this->validate_stable_children()); + this->copy_sources.clear(); + } + } + assert(this->is_initial_pending() + ? this->copy_sources.empty(): + true); + } + + void on_replace_prior(Transaction&) final { + ceph_assert(!this->is_rewrite()); + if constexpr (has_children) { + this->set_children_from_prior_instance(); + auto &prior = (this_type_t&)(*this->get_prior_instance()); + auto copied = this->copy_children_from_stable_source( + prior, + prior.begin(), + prior.end(), + this->begin()); + ceph_assert(copied <= get_node_size()); + assert(this->validate_stable_children()); + this->set_parent_tracker_from_prior_instance(); + } else { + this->set_parent_tracker_from_prior_instance(); + } } uint16_t lower_bound_offset(NODE_KEY key) const final { @@ -1011,11 +1061,13 @@ struct FixedKVLeafNode virtual void update( internal_const_iterator_t iter, - VAL val) = 0; + VAL val, + LogicalCachedExtent* nextent) = 0; virtual internal_const_iterator_t insert( internal_const_iterator_t iter, NODE_KEY addr, - VAL val) = 0; + VAL val, + LogicalCachedExtent* nextent) = 0; virtual void remove(internal_const_iterator_t iter) = 0; std::tuple @@ -1024,6 +1076,9 @@ struct FixedKVLeafNode c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); auto right = c.cache.template alloc_new_extent( c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + if constexpr (has_children) { + this->split_child_ptrs(*left, *right); + } auto pivot = this->split_into(*left, *right); left->pin.set_range(left->get_meta()); right->pin.set_range(right->get_meta()); @@ -1038,6 +1093,9 @@ struct FixedKVLeafNode Ref &right) { auto replacement = c.cache.template alloc_new_extent( c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + if constexpr (has_children) { + replacement->merge_child_ptrs(*this, *right); + } replacement->merge_from(*this, *right->template cast()); replacement->pin.set_range(replacement->get_meta()); return replacement; @@ -1061,6 +1119,14 @@ struct FixedKVLeafNode 
prefer_left, *replacement_left, *replacement_right); + if constexpr (has_children) { + this->balance_child_ptrs( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + } replacement_left->pin.set_range(replacement_left->get_meta()); replacement_right->pin.set_range(replacement_right->get_meta()); @@ -1090,15 +1156,10 @@ struct FixedKVLeafNode this->resolve_relative_addrs(base); } - std::ostream &print_detail(std::ostream &out) const + std::ostream &_print_detail(std::ostream &out) const { - out << ", size=" << this->get_size() - << ", meta=" << this->get_meta() - << ", parent_tracker=" << (void*)this->parent_tracker.get(); - if (this->parent_tracker) { - out << ", parent=" << (void*)this->parent_tracker->parent.get(); - } - return out; + return out << ", size=" << this->get_size() + << ", meta=" << this->get_meta(); } constexpr static size_t get_min_capacity() { diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index dfa4c656168..78ea5a465bf 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -6,6 +6,8 @@ #include "crimson/common/log.h" +#include "crimson/os/seastore/btree/fixed_kv_node.h" + namespace { [[maybe_unused]] seastar::logger& logger() { return crimson::get_logger(ceph_subsys_seastore_tm); @@ -91,7 +93,22 @@ CachedExtent* CachedExtent::get_transactional_view(transaction_id_t tid) { } } -std::ostream &LogicalCachedExtent::print_detail(std::ostream &out) const +std::ostream &operator<<(std::ostream &out, const parent_tracker_t &tracker) { + return out << "parent_tracker=" << (void*)&tracker + << ", parent=" << (void*)tracker.get_parent().get(); +} + +std::ostream &ChildableCachedExtent::print_detail(std::ostream &out) const { + if (parent_tracker) { + out << *parent_tracker; + } else { + out << ", parent_tracker=" << (void*)nullptr; + } + _print_detail(out); + return out; +} + +std::ostream 
&LogicalCachedExtent::_print_detail(std::ostream &out) const { out << ", laddr=" << laddr; if (pin) { @@ -110,6 +127,36 @@ void CachedExtent::set_invalid(Transaction &t) { on_invalidated(t); } +LogicalCachedExtent::~LogicalCachedExtent() { + if (has_parent_tracker() && is_valid() && !is_pending()) { + assert(get_parent_node()); + auto parent = get_parent_node>(); + auto off = parent->lower_bound_offset(laddr); + assert(parent->get_key_from_idx(off) == laddr); + assert(parent->children[off] == this); + parent->children[off] = nullptr; + } +} + +void LogicalCachedExtent::on_replace_prior(Transaction &t) { + assert(is_mutation_pending()); + take_prior_parent_tracker(); + assert(get_parent_node()); + auto parent = get_parent_node>(); + //TODO: can this search be avoided? + auto off = parent->lower_bound_offset(laddr); + assert(parent->get_key_from_idx(off) == laddr); + parent->children[off] = this; +} + +parent_tracker_t::~parent_tracker_t() { + // this is parent's tracker, reset it + auto &p = (FixedKVNode&)*parent; + if (p.my_tracker == this) { + p.my_tracker = nullptr; + } +} + std::ostream &operator<<(std::ostream &out, const LBAPin &rhs) { return out << "LBAPin(" << rhs.get_key() << "~" << rhs.get_length() diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 5ee08f9bbb6..10161fe2201 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -33,6 +33,8 @@ template < size_t node_size, bool leaf_has_children> class FixedKVBtree; +template +class BtreeNodePin; // #define DEBUG_CACHED_EXTENT_REF #ifdef DEBUG_CACHED_EXTENT_REF @@ -543,6 +545,8 @@ public: void set_invalid(Transaction &t); + // a rewrite extent has an invalid prior_instance, + // and a mutation_pending extent has a valid prior_instance CachedExtentRef get_prior_instance() { return prior_instance; } @@ -715,6 +719,8 @@ protected: friend class crimson::os::seastore::SegmentedAllocator; friend class 
crimson::os::seastore::TransactionManager; friend class crimson::os::seastore::ExtentPlacementManager; + template + friend class BtreeNodePin; }; std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t); @@ -885,6 +891,7 @@ public: virtual key_t get_key() const = 0; virtual PhysicalNodePinRef duplicate() const = 0; virtual bool has_been_invalidated() const = 0; + virtual CachedExtentRef get_parent() const = 0; virtual ~PhysicalNodePin() {} }; @@ -957,6 +964,67 @@ public: } }; +class parent_tracker_t + : public boost::intrusive_ref_counter< + parent_tracker_t, boost::thread_unsafe_counter> { +public: + parent_tracker_t(CachedExtentRef parent) + : parent(parent) {} + parent_tracker_t(CachedExtent* parent) + : parent(parent) {} + ~parent_tracker_t(); + template + TCachedExtentRef get_parent() const { + ceph_assert(parent); + if constexpr (std::is_same_v) { + return parent; + } else { + return parent->template cast(); + } + } + void reset_parent(CachedExtentRef p) { + parent = p; + } + bool is_valid() const { + return parent && parent->is_valid(); + } +private: + CachedExtentRef parent; +}; + +std::ostream &operator<<(std::ostream &, const parent_tracker_t &); + +using parent_tracker_ref = boost::intrusive_ptr; + +class ChildableCachedExtent : public CachedExtent { +public: + template + ChildableCachedExtent(T&&... t) : CachedExtent(std::forward(t)...) 
{} + bool has_parent_tracker() const { + return (bool)parent_tracker; + } + void reset_parent_tracker(parent_tracker_t *p = nullptr) { + parent_tracker.reset(p); + } + bool is_parent_valid() const { + return parent_tracker && parent_tracker->is_valid(); + } + template + TCachedExtentRef get_parent_node() const { + assert(parent_tracker); + return parent_tracker->template get_parent(); + } + void take_prior_parent_tracker() { + auto &prior = (ChildableCachedExtent&)(*get_prior_instance()); + parent_tracker = prior.parent_tracker; + } + std::ostream &print_detail(std::ostream &out) const final; +private: + parent_tracker_ref parent_tracker; + virtual std::ostream &_print_detail(std::ostream &out) const { + return out; + } +}; /** * LogicalCachedExtent * @@ -965,10 +1033,12 @@ public: * Users of TransactionManager should be using extents derived from * LogicalCachedExtent. */ -class LogicalCachedExtent : public CachedExtent { +class LogicalCachedExtent : public ChildableCachedExtent { public: template - LogicalCachedExtent(T&&... t) : CachedExtent(std::forward(t)...) {} + LogicalCachedExtent(T&&... t) + : ChildableCachedExtent(std::forward(t)...) 
+ {} void set_pin(LBAPinRef &&npin) { assert(!pin); @@ -1005,8 +1075,13 @@ public: return true; } - std::ostream &print_detail(std::ostream &out) const final; + std::ostream &_print_detail(std::ostream &out) const final; + + void on_replace_prior(Transaction &t) final; + + virtual ~LogicalCachedExtent(); protected: + virtual void apply_delta(const ceph::bufferlist &bl) = 0; virtual std::ostream &print_detail_l(std::ostream &out) const { return out; @@ -1026,6 +1101,16 @@ protected: private: laddr_t laddr = L_ADDR_NULL; LBAPinRef pin; + + template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size, + bool leaf_has_children> + friend class FixedKVBtree; }; using LogicalCachedExtentRef = TCachedExtentRef; diff --git a/src/crimson/os/seastore/lba_manager.cc b/src/crimson/os/seastore/lba_manager.cc index b35e2d0ead8..d113bbd1e95 100644 --- a/src/crimson/os/seastore/lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager.cc @@ -17,17 +17,15 @@ LBAManager::update_mappings( t, extent->get_laddr(), extent->get_prior_paddr_and_reset(), - extent->get_paddr() + extent->get_paddr(), + nullptr // all the extents should have already been + // added to the fixed_kv_btree ); }); } -template LBAManagerRef lba_manager::create_lba_manager(Cache &cache) { - return LBAManagerRef(new btree::BtreeLBAManager(cache)); + return LBAManagerRef(new btree::BtreeLBAManager(cache)); } -template LBAManagerRef lba_manager::create_lba_manager(Cache &cache); -template LBAManagerRef lba_manager::create_lba_manager(Cache &cache); - } diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h index f495eb07534..af11cac7cc4 100644 --- a/src/crimson/os/seastore/lba_manager.h +++ b/src/crimson/os/seastore/lba_manager.h @@ -80,7 +80,8 @@ public: Transaction &t, laddr_t hint, extent_len_t len, - paddr_t addr) = 0; + paddr_t addr, + LogicalCachedExtent *nextent) = 0; struct ref_update_result_t 
{ unsigned refcount = 0; @@ -166,7 +167,8 @@ public: Transaction& t, laddr_t laddr, paddr_t prev_addr, - paddr_t paddr) = 0; + paddr_t paddr, + LogicalCachedExtent *nextent) = 0; /** * update_mappings @@ -206,7 +208,6 @@ using LBAManagerRef = std::unique_ptr; class Cache; namespace lba_manager { -template LBAManagerRef create_lba_manager(Cache &cache); } diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index 64dd3103ce2..df123d2ee05 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -209,7 +209,8 @@ BtreeLBAManager::alloc_extent( Transaction &t, laddr_t hint, extent_len_t len, - paddr_t addr) + paddr_t addr, + LogicalCachedExtent* nextent) { struct state_t { laddr_t last_end; @@ -229,7 +230,8 @@ BtreeLBAManager::alloc_extent( cache, c, hint, - [this, FNAME, c, hint, len, addr, lookup_attempts, &t](auto &btree, auto &state) { + [this, FNAME, c, hint, len, addr, lookup_attempts, + &t, nextent](auto &btree, auto &state) { return LBABtree::iterate_repeat( c, btree.upper_bound_right(c, hint), @@ -265,12 +267,13 @@ BtreeLBAManager::alloc_extent( interruptible::ready_future_marker{}, seastar::stop_iteration::no); } - }).si_then([FNAME, c, addr, len, hint, &btree, &state] { + }).si_then([FNAME, c, addr, len, hint, &btree, &state, nextent] { return btree.insert( c, *state.insert_iter, state.last_end, - lba_map_val_t{len, addr, 1, 0} + lba_map_val_t{len, addr, 1, 0}, + nextent ).si_then([&state, FNAME, c, addr, len, hint](auto &&p) { auto [iter, inserted] = std::move(p); TRACET("{}~{}, hint={}, inserted at {}", @@ -473,7 +476,8 @@ BtreeLBAManager::update_mapping( Transaction& t, laddr_t laddr, paddr_t prev_addr, - paddr_t addr) + paddr_t addr, + LogicalCachedExtent *nextent) { LOG_PREFIX(BtreeLBAManager::update_mapping); TRACET("laddr={}, paddr {} => {}", t, laddr, prev_addr, addr); @@ 
-487,7 +491,8 @@ BtreeLBAManager::update_mapping( ceph_assert(in.paddr == prev_addr); ret.paddr = addr; return ret; - } + }, + nextent ).si_then([&t, laddr, prev_addr, addr, FNAME](auto result) { DEBUGT("laddr={}, paddr {} => {} done -- {}", t, laddr, prev_addr, addr, result); @@ -566,7 +571,8 @@ BtreeLBAManager::update_refcount( ceph_assert((int)out.refcount + delta >= 0); out.refcount += delta; return out; - } + }, + nullptr ).si_then([&t, addr, delta, FNAME](auto result) { DEBUGT("laddr={}, delta={} done -- {}", t, addr, delta, result); return ref_update_result_t{ @@ -581,16 +587,17 @@ BtreeLBAManager::_update_mapping_ret BtreeLBAManager::_update_mapping( Transaction &t, laddr_t addr, - update_func_t &&f) + update_func_t &&f, + LogicalCachedExtent* nextent) { auto c = get_context(t); return with_btree_ret( cache, c, - [f=std::move(f), c, addr](auto &btree) mutable { + [f=std::move(f), c, addr, nextent](auto &btree) mutable { return btree.lower_bound( c, addr - ).si_then([&btree, f=std::move(f), c, addr](auto iter) + ).si_then([&btree, f=std::move(f), c, addr, nextent](auto iter) -> _update_mapping_ret { if (iter.is_end() || iter.get_key() != addr) { LOG_PREFIX(BtreeLBAManager::_update_mapping); @@ -610,7 +617,8 @@ BtreeLBAManager::_update_mapping( return btree.update( c, iter, - ret + ret, + nextent ).si_then([ret](auto) { return ret; }); diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index 58dbe1e0581..1535ef93129 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -30,10 +30,12 @@ public: BtreeLBAPin() = default; BtreeLBAPin( CachedExtentRef parent, + uint16_t pos, lba_map_val_t &val, lba_node_meta_t &&meta) : BtreeNodePin( parent, + pos, val.paddr, val.len, std::forward(meta)) @@ -88,7 +90,8 @@ public: Transaction &t, laddr_t hint, extent_len_t len, - paddr_t addr) final; + 
paddr_t addr, + LogicalCachedExtent*) final; ref_ret decref_extent( Transaction &t, @@ -133,7 +136,8 @@ public: Transaction& t, laddr_t laddr, paddr_t prev_addr, - paddr_t paddr) final; + paddr_t paddr, + LogicalCachedExtent*) final; get_physical_extent_if_live_ret get_physical_extent_if_live( Transaction &t, @@ -198,7 +202,8 @@ private: _update_mapping_ret _update_mapping( Transaction &t, laddr_t addr, - update_func_t &&f); + update_func_t &&f, + LogicalCachedExtent*); }; using BtreeLBAManagerRef = std::unique_ptr; diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc index c502ef338a1..a33f75917c1 100644 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc @@ -27,15 +27,15 @@ std::ostream& operator<<(std::ostream& out, const lba_map_val_t& v) << ")"; } -std::ostream &LBALeafNode::print_detail(std::ostream &out) const +std::ostream &LBALeafNode::_print_detail(std::ostream &out) const { - out << ", size=" << get_size() - << ", meta=" << get_meta() - << ", parent_tracker=" << (void*)parent_tracker.get(); - if (parent_tracker) { - return out << ", parent=" << (void*)parent_tracker->parent.get(); + out << ", size=" << this->get_size() + << ", meta=" << this->get_meta() + << ", my_tracker=" << (void*)this->my_tracker; + if (this->my_tracker) { + out << ", my_tracker->parent=" << (void*)this->my_tracker->get_parent().get(); } - return out << ", root_block=" << (void*)root_block.get(); + return out << ", root_block=" << (void*)this->root_block.get(); } void LBALeafNode::resolve_relative_addrs(paddr_t base) diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h index ff61829cb2e..62ceae6cc46 100644 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h @@ -145,64 
+145,125 @@ struct LBALeafNode LBALeafNode, true> { using Ref = TCachedExtentRef; - using internal_iterator_t = const_iterator; + using parent_type_t = FixedKVLeafNode< + LEAF_NODE_CAPACITY, + laddr_t, laddr_le_t, + lba_map_val_t, lba_map_val_le_t, + LBA_BLOCK_SIZE, + LBALeafNode, + true>; + using internal_const_iterator_t = + typename parent_type_t::node_layout_t::const_iterator; + using internal_iterator_t = + typename parent_type_t::node_layout_t::iterator; template LBALeafNode(T&&... t) : - FixedKVLeafNode(std::forward(t)...) {} + parent_type_t(std::forward(t)...) {} static constexpr extent_types_t TYPE = extent_types_t::LADDR_LEAF; + bool validate_stable_children() final { + LOG_PREFIX(LBALeafNode::validate_stable_children); + if (this->children.empty()) { + return false; + } + + for (auto i : *this) { + auto child = (LogicalCachedExtent*)this->children[i.get_offset()]; + if (is_valid_child_ptr(child) && child->get_laddr() != i.get_key()) { + SUBERROR(seastore_fixedkv_tree, + "stable child not valid: child {}, key {}", + *child, + i.get_key()); + ceph_abort(); + return false; + } + } + return true; + } + void update( - const_iterator iter, - lba_map_val_t val) final { - val.paddr = maybe_generate_relative(val.paddr); - return journal_update( + internal_const_iterator_t iter, + lba_map_val_t val, + LogicalCachedExtent* nextent) final { + LOG_PREFIX(LBALeafNode::update); + if (nextent) { + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, {}", + this->pending_for_transaction, + iter.get_offset(), + *nextent); + // child-ptr may already be correct, see LBAManager::update_mappings() + this->update_child_ptr(iter, nextent); + } + val.paddr = this->maybe_generate_relative(val.paddr); + return this->journal_update( iter, val, - maybe_get_delta_buffer()); + this->maybe_get_delta_buffer()); } - const_iterator insert( - const_iterator iter, + internal_const_iterator_t insert( + internal_const_iterator_t iter, laddr_t addr, - lba_map_val_t val) final { - val.paddr = 
maybe_generate_relative(val.paddr); - journal_insert( + lba_map_val_t val, + LogicalCachedExtent* nextent) final { + LOG_PREFIX(LBALeafNode::insert); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}, extent {}", + this->pending_for_transaction, + iter.get_offset(), + addr, + (void*)nextent); + this->insert_child_ptr(iter, nextent); + val.paddr = this->maybe_generate_relative(val.paddr); + this->journal_insert( iter, addr, val, - maybe_get_delta_buffer()); + this->maybe_get_delta_buffer()); return iter; } - void remove(const_iterator iter) final { - return journal_remove( + void remove(internal_const_iterator_t iter) final { + LOG_PREFIX(LBALeafNode::remove); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}", + this->pending_for_transaction, + iter.get_offset(), + iter.get_key()); + assert(iter != this->end()); + this->remove_child_ptr(iter); + return this->journal_remove( iter, - maybe_get_delta_buffer()); + this->maybe_get_delta_buffer()); } // See LBAInternalNode, same concept void resolve_relative_addrs(paddr_t base); - void node_resolve_vals(iterator from, iterator to) const final { - if (is_initial_pending()) { + void node_resolve_vals( + internal_iterator_t from, + internal_iterator_t to) const final + { + if (this->is_initial_pending()) { for (auto i = from; i != to; ++i) { auto val = i->get_val(); if (val.paddr.is_relative()) { assert(val.paddr.is_block_relative()); - val.paddr = get_paddr().add_relative(val.paddr); + val.paddr = this->get_paddr().add_relative(val.paddr); i->set_val(val); } } } } - void node_unresolve_vals(iterator from, iterator to) const final { - if (is_initial_pending()) { + void node_unresolve_vals( + internal_iterator_t from, + internal_iterator_t to) const final + { + if (this->is_initial_pending()) { for (auto i = from; i != to; ++i) { auto val = i->get_val(); if (val.paddr.is_relative()) { auto val = i->get_val(); assert(val.paddr.is_record_relative()); - val.paddr = val.paddr.block_relative_to(get_paddr()); + 
val.paddr = val.paddr.block_relative_to(this->get_paddr()); i->set_val(val); } } @@ -213,7 +274,7 @@ struct LBALeafNode return TYPE; } - std::ostream &print_detail(std::ostream &out) const final; + std::ostream &_print_detail(std::ostream &out) const final; }; using LBALeafNodeRef = TCachedExtentRef; diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index a14201cbcf7..9328a03094c 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -219,6 +219,8 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t) return out << "LADDR_INTERNAL"; case extent_types_t::LADDR_LEAF: return out << "LADDR_LEAF"; + case extent_types_t::DINK_LADDR_LEAF: + return out << "LADDR_LEAF"; case extent_types_t::ONODE_BLOCK_STAGED: return out << "ONODE_BLOCK_STAGED"; case extent_types_t::OMAP_INNER: diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 2f91eba524d..7ec75775644 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -1062,7 +1062,7 @@ enum class extent_types_t : uint8_t { ROOT = 0, LADDR_INTERNAL = 1, LADDR_LEAF = 2, - DINK_LADDR_LEAF = 3, + DINK_LADDR_LEAF = 3, // should only be used for unittests OMAP_INNER = 4, OMAP_LEAF = 5, ONODE_BLOCK_STAGED = 6, diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 0f083340ce2..bde4df6ba38 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -486,7 +486,8 @@ TransactionManager::rewrite_logical_extent( t, lextent->get_laddr(), lextent->get_paddr(), - nlextent->get_paddr()); + nlextent->get_paddr(), + nlextent.get()); } TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index
e00290d88e2..aa4127db46b 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -142,6 +142,7 @@ public: assert(!extent.has_pin()); assert(!extent.has_been_invalidated()); assert(!pin->has_been_invalidated()); + assert(pin->get_parent()); extent.set_pin(std::move(pin)); lba_manager->add_pin(extent.get_pin()); } @@ -325,7 +326,8 @@ public: t, laddr_hint, len, - ext->get_paddr() + ext->get_paddr(), + ext.get() ).si_then([ext=std::move(ext), laddr_hint, &t, FNAME](auto &&ref) mutable { ext->set_pin(std::move(ref)); SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint); @@ -380,7 +382,8 @@ public: t, laddr_hint, length, - existing_paddr + existing_paddr, + ext.get() ).si_then([ext=std::move(ext), laddr_hint, this](auto &&ref) { ceph_assert(laddr_hint == ref->get_key()); ext->set_pin(std::move(ref)); @@ -409,7 +412,8 @@ public: t, hint, len, - P_ADDR_ZERO); + P_ADDR_ZERO, + nullptr); } /* alloc_extents diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index 64a847f30a7..8ca18fe3b95 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -257,7 +257,7 @@ struct lba_btree_test : btree_test_base { check.emplace(addr, get_map_val(len)); lba_btree_update([=, this](auto &btree, auto &t) { return btree.insert( - get_op_context(t), addr, get_map_val(len) + get_op_context(t), addr, get_map_val(len), nullptr ).si_then([](auto){}); }); } @@ -324,7 +324,7 @@ TEST_F(lba_btree_test, basic) } struct btree_lba_manager_test : btree_test_base { - BtreeLBAManagerRef lba_manager; + BtreeLBAManagerRef lba_manager; btree_lba_manager_test() = default; @@ -426,7 +426,7 @@ struct btree_lba_manager_test : btree_test_base { auto ret = with_trans_intr( *t.t, [=, this](auto &t) { - return lba_manager->alloc_extent(t, hint, len, paddr); + return lba_manager->alloc_extent(t, hint, 
len, paddr, nullptr); }).unsafe_get0(); logger().debug("alloc'd: {}", *ret); EXPECT_EQ(len, ret->get_length());