From: Yingxin Cheng Date: Mon, 8 Mar 2021 08:13:46 +0000 (+0800) Subject: crimson/onode-staged-tree: fix tree_cursor_t::Cache to be aware of extent duplication X-Git-Tag: v17.1.0~2573^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b6f35fe018f722c15565fe1bb8b70b45af152209;p=ceph.git crimson/onode-staged-tree: fix tree_cursor_t::Cache to be aware of extent duplication Signed-off-by: Yingxin Cheng --- diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc index 6ba04af81283..1aecbb3a2bbe 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc @@ -33,7 +33,7 @@ using node_future = Node::node_future; */ tree_cursor_t::tree_cursor_t(Ref node, const search_position_t& pos) - : ref_leaf_node{node}, position{pos} + : ref_leaf_node{node}, position{pos}, cache{ref_leaf_node} { assert(!is_end()); ref_leaf_node->do_track_cursor(*this); @@ -42,15 +42,15 @@ tree_cursor_t::tree_cursor_t(Ref node, const search_position_t& pos) tree_cursor_t::tree_cursor_t( Ref node, const search_position_t& pos, const key_view_t& key_view, const value_header_t* p_value_header) - : ref_leaf_node{node}, position{pos} + : ref_leaf_node{node}, position{pos}, cache{ref_leaf_node} { assert(!is_end()); - update_cache(*node, key_view, p_value_header); + update_cache_same_node(key_view, p_value_header); ref_leaf_node->do_track_cursor(*this); } tree_cursor_t::tree_cursor_t(Ref node) - : ref_leaf_node{node}, position{search_position_t::end()} + : ref_leaf_node{node}, position{search_position_t::end()}, cache{ref_leaf_node} { assert(is_end()); assert(ref_leaf_node->is_level_tail()); @@ -133,81 +133,112 @@ void tree_cursor_t::update_track( assert(!is_end()); ref_leaf_node = node; position = pos; + // we lazy update the key/value information until user asked cache.invalidate(); ref_leaf_node->do_track_cursor(*this); } template void tree_cursor_t::update_track(Ref, const search_position_t&); template void tree_cursor_t::update_track(Ref, const search_position_t&); -void tree_cursor_t::update_cache(LeafNode& node, - const key_view_t& key_view, - const value_header_t* p_value_header) const +void tree_cursor_t::update_cache_same_node(const key_view_t& key_view, + const value_header_t* p_value_header) const { assert(!is_end()); - assert(ref_leaf_node.get() == &node); - cache.update(node, key_view, p_value_header); - cache.validate_is_latest(node, position); + cache.update_all(ref_leaf_node->get_version(), key_view, p_value_header); + cache.validate_is_latest(position); } -void tree_cursor_t::maybe_update_cache(value_magic_t magic) const -{ - assert(!is_end()); - if (!cache.is_latest()) { - auto [key_view, p_value_header] = ref_leaf_node->get_kv(position); - if (p_value_header->magic != magic) { - logger().error("OTree::Value::Load: magic mismatch, expect {} but got {}", - magic, p_value_header->magic); - ceph_abort(); - } - cache.update(*ref_leaf_node, key_view, p_value_header); - } - cache.validate_is_latest(*ref_leaf_node, position); -} +/* + * tree_cursor_t::Cache + */ -tree_cursor_t::Cache::Cache() = default; +tree_cursor_t::Cache::Cache(Ref& ref_leaf_node) + : ref_leaf_node{ref_leaf_node} {} -bool tree_cursor_t::Cache::is_latest() const -{ - return (valid && (version == p_leaf_node->get_layout_version())); -} - -void tree_cursor_t::Cache::update(LeafNode& node, - const key_view_t& _key_view, - const value_header_t* _p_value_header) +void tree_cursor_t::Cache::update_all(const node_version_t& current_version, + const key_view_t& _key_view, + const value_header_t* _p_value_header) { assert(_p_value_header); - p_leaf_node = &node; - version = node.get_layout_version(); + + needs_update_all = false; + version = current_version; + key_view = _key_view; p_value_header = _p_value_header; + + auto p_node_base = ref_leaf_node->read(); + assert((const char*)_p_value_header > p_node_base); + offset_value_header = (const char*)_p_value_header - p_node_base; + assert(offset_value_header < NODE_BLOCK_SIZE); + value_payload_mut.reset(); p_value_recorder = nullptr; - valid = true; - assert(is_latest()); } -void tree_cursor_t::Cache::validate_is_latest(const LeafNode& node, - const search_position_t& pos) const +void tree_cursor_t::Cache::maybe_duplicate(const node_version_t& current_version) +{ + assert(version.layout == current_version.layout); + if (!version.is_duplicate && current_version.is_duplicate) { + assert(p_value_header); + auto _p_value_header = reinterpret_cast( + ref_leaf_node->read() + offset_value_header); + assert(_p_value_header != p_value_header); + assert(*_p_value_header == *p_value_header); + + assert(!value_payload_mut); + assert(!p_value_recorder); + + version.is_duplicate = true; + p_value_header = _p_value_header; + } else { + // cache must be latest. + // node cannot change is_duplicate from true to false. + assert(!(version.is_duplicate && !current_version.is_duplicate)); + } +} + +void tree_cursor_t::Cache::make_latest( + value_magic_t magic, const search_position_t& pos) +{ + auto current_version = ref_leaf_node->get_version(); + if (needs_update_all || version.layout != current_version.layout) { + auto [_key_view, _p_value_header] = ref_leaf_node->get_kv(pos); + update_all(current_version, _key_view, _p_value_header); + } else { + maybe_duplicate(current_version); + } + assert(p_value_header->magic == magic); + validate_is_latest(pos); +} + +void tree_cursor_t::Cache::validate_is_latest(const search_position_t& pos) const { - assert(p_leaf_node == &node); - assert(is_latest()); #ifndef NDEBUG - auto [_key_view, _p_value_header] = node.get_kv(pos); + assert(!needs_update_all); + assert(version == ref_leaf_node->get_version()); + + auto [_key_view, _p_value_header] = ref_leaf_node->get_kv(pos); assert(key_view->compare_to(_key_view) == MatchKindCMP::EQ); assert(p_value_header == _p_value_header); + auto p_node_base = ref_leaf_node->read(); + assert((const char*)_p_value_header - p_node_base == offset_value_header); #endif } std::pair -tree_cursor_t::Cache::prepare_mutate_value_payload(context_t c) +tree_cursor_t::Cache::prepare_mutate_value_payload( + context_t c, const search_position_t& pos) { - assert(is_latest()); - assert(p_leaf_node && p_value_header); - assert(p_value_header->magic == c.vb.get_header_magic()); + make_latest(c.vb.get_header_magic(), pos); if (!value_payload_mut.has_value()) { - auto value_mutable = p_leaf_node->prepare_mutate_value_payload(c); + assert(!p_value_recorder); + auto value_mutable = ref_leaf_node->prepare_mutate_value_payload(c); + auto current_version = ref_leaf_node->get_version(); + maybe_duplicate(current_version); value_payload_mut = p_value_header->get_payload_mutable(value_mutable.first); p_value_recorder = value_mutable.second; + validate_is_latest(pos); } return {*value_payload_mut, p_value_recorder}; } @@ -770,6 +801,16 @@ bool LeafNode::is_level_tail() const return impl->is_level_tail(); } +node_version_t LeafNode::get_version() const +{ + return {layout_version, impl->is_duplicate()}; +} + +const char* LeafNode::read() const +{ + return impl->read(); +} + std::tuple LeafNode::get_kv(const search_position_t& pos) const { @@ -994,7 +1035,7 @@ Ref LeafNode::get_or_track_cursor( p_cursor = found->second; assert(p_cursor->get_leaf_node() == this); assert(p_cursor->get_position() == position); - p_cursor->update_cache(*this, key, p_value_header); + p_cursor->update_cache_same_node(key, p_value_header); } return p_cursor; } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h index edbde7541888..ea4c741e7d56 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h @@ -47,6 +47,19 @@ namespace crimson::os::seastore::onode { class LeafNode; class InternalNode; +using layout_version_t = uint32_t; +struct node_version_t { + layout_version_t layout; + bool is_duplicate; + + bool operator==(const node_version_t& rhs) const { + return (layout == rhs.layout && is_duplicate == rhs.is_duplicate); + } + bool operator!=(const node_version_t& rhs) const { + return !(*this == rhs); + } +}; + /** * tree_cursor_t * @@ -57,7 +70,6 @@ class InternalNode; * * Exposes public interfaces for Btree::Cursor. */ -using layout_version_t = uint32_t; class tree_cursor_t final : public boost::intrusive_ref_counter< tree_cursor_t, boost::thread_unsafe_counter> { @@ -90,8 +102,8 @@ class tree_cursor_t final /// Returns the key view in tree if it is not an end cursor. const key_view_t& get_key_view(value_magic_t magic) const { - maybe_update_cache(magic); - return cache.get_key_view(); + assert(!is_end()); + return cache.get_key_view(magic, position); } /// Returns the next tree_cursor_t in tree, can be end if there's no next. @@ -106,15 +118,15 @@ class tree_cursor_t final /// Get the latest value_header_t pointer for read. const value_header_t* read_value_header(value_magic_t magic) const { - maybe_update_cache(magic); - return cache.get_p_value_header(); + assert(!is_end()); + return cache.get_p_value_header(magic, position); } /// Prepare the node extent to be mutable and recorded. std::pair prepare_mutate_value_payload(context_t c) { - maybe_update_cache(c.vb.get_header_magic()); - return cache.prepare_mutate_value_payload(c); + assert(!is_end()); + return cache.prepare_mutate_value_payload(c, position); } /// Extends the size of value payload. @@ -134,8 +146,8 @@ class tree_cursor_t final Ref get_leaf_node() const { return ref_leaf_node; } template void update_track(Ref, const search_position_t&); - void update_cache(LeafNode&, const key_view_t&, const value_header_t*) const; - void maybe_update_cache(value_magic_t magic) const; + void update_cache_same_node(const key_view_t&, + const value_header_t*) const; static Ref create(Ref node, const search_position_t& pos) { return new tree_cursor_t(node, pos); @@ -162,40 +174,44 @@ class tree_cursor_t final /** Cache * * Cached memory pointers or views which may be outdated due to - * asynchronous leaf node updates. + * extent copy-on-write or asynchronous leaf node updates. */ class Cache { public: - Cache(); - bool is_latest() const; - void invalidate() { valid = false; } - void update(LeafNode&, const key_view_t&, const value_header_t*); - void validate_is_latest(const LeafNode&, const search_position_t&) const; - - const key_view_t& get_key_view() const { - assert(is_latest()); - assert(key_view.has_value()); + Cache(Ref&); + void validate_is_latest(const search_position_t&) const; + void invalidate() { needs_update_all = true; } + void update_all(const node_version_t&, const key_view_t&, const value_header_t*); + const key_view_t& get_key_view( + value_magic_t magic, const search_position_t& pos) { + make_latest(magic, pos); return *key_view; } - const value_header_t* get_p_value_header() const { - assert(is_latest()); - assert(p_value_header); + const value_header_t* get_p_value_header( + value_magic_t magic, const search_position_t& pos) { + make_latest(magic, pos); return p_value_header; } std::pair - prepare_mutate_value_payload(context_t); + prepare_mutate_value_payload(context_t, const search_position_t&); private: - LeafNode* p_leaf_node = nullptr; - std::optional key_view; + void maybe_duplicate(const node_version_t&); + void make_latest(value_magic_t, const search_position_t&); + + // metadata about how cache is valid + Ref& ref_leaf_node; + bool needs_update_all = true; + node_version_t version; + + // cached key value info + std::optional key_view; // maybe still point to the read_only extent const value_header_t* p_value_header = nullptr; + node_offset_t offset_value_header; - // to update value payload + // cached data-structures to update value payload std::optional value_payload_mut; ValueDeltaRecorder* p_value_recorder = nullptr; - - layout_version_t version; - bool valid = false; }; mutable Cache cache; @@ -486,7 +502,8 @@ class LeafNode final : public Node { LeafNode& operator=(LeafNode&&) = delete; bool is_level_tail() const; - layout_version_t get_layout_version() const { return layout_version; } + node_version_t get_version() const; + const char* read() const; std::tuple get_kv(const search_position_t&) const; node_future> get_next_cursor(context_t, const search_position_t&); diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h index 29b13b9ea715..ed58cfcd1c07 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h @@ -294,6 +294,12 @@ class NodeExtentAccessorT { const node_stage_t& read() const { return node_stage; } laddr_t get_laddr() const { return extent->get_laddr(); } + bool is_duplicate() const { + // we cannot rely on whether the extent state is MUTATION_PENDING because + // we may access the extent (internally) after it becomes DIRTY after + // transaction submission. + return recorder != nullptr; + } // must be called before any mutate attempes. // for the safety of mixed read and mutate, call before read. diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h index 63946f798ede..7775c2b67fc3 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h @@ -70,6 +70,8 @@ class NodeImpl { virtual node_type_t node_type() const = 0; virtual field_type_t field_type() const = 0; virtual laddr_t laddr() const = 0; + virtual const char* read() const = 0; + virtual bool is_duplicate() const = 0; virtual void prepare_mutate(context_t) = 0; virtual bool is_level_tail() const = 0; virtual bool is_empty() const = 0; diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h index 17a6b6b882b3..aeb3735b27c1 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h @@ -89,6 +89,8 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { node_type_t node_type() const override { return NODE_TYPE; } field_type_t field_type() const override { return FIELD_TYPE; } laddr_t laddr() const override { return extent.get_laddr(); } + const char* read() const override { return extent.read().p_start(); } + bool is_duplicate() const override { return extent.is_duplicate(); } void prepare_mutate(context_t c) override { return extent.prepare_mutate(c); } bool is_level_tail() const override { return extent.read().is_level_tail(); } bool is_empty() const override { return extent.read().keys() == 0; } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/value.h b/src/crimson/os/seastore/onode_manager/staged-fltree/value.h index 9813d6ea8cb0..4f50ae6f820b 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/value.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/value.h @@ -73,6 +73,13 @@ struct value_header_t { value_magic_t magic; value_size_t payload_size; + bool operator==(const value_header_t& rhs) const { + return (magic == rhs.magic && payload_size == rhs.payload_size); + } + bool operator!=(const value_header_t& rhs) const { + return !(*this == rhs); + } + value_size_t allocation_size() const { return payload_size + sizeof(value_header_t); } diff --git a/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc b/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc index 365ef60a3a91..61344951ff83 100644 --- a/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc +++ b/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc @@ -712,6 +712,10 @@ class DummyChildPool { protected: node_type_t node_type() const override { return node_type_t::LEAF; } field_type_t field_type() const override { return field_type_t::N0; } + const char* read() const override { + ceph_abort("impossible path"); } + bool is_duplicate() const override { + ceph_abort("impossible path"); } level_t level() const override { return 0u; } void prepare_mutate(context_t) override { ceph_abort("impossible path"); }