From: Yingxin Cheng Date: Wed, 16 Dec 2020 08:03:33 +0000 (+0800) Subject: crimson/onode-staged-tree: implement encode/decode delta with replay X-Git-Tag: v16.1.0~63^2~8 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=cef06f37c03f00e843fdb057174b3c3838be5a4a;p=ceph.git crimson/onode-staged-tree: implement encode/decode delta with replay Signed-off-by: Yingxin Cheng --- diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index b1875ba0d3b7..9749884896ae 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -36,6 +36,7 @@ using TCachedExtentRef = boost::intrusive_ptr; */ namespace onode { class DummyNodeExtent; + class TestReplayExtent; } class ExtentIndex; class CachedExtent : public boost::intrusive_ref_counter< @@ -50,8 +51,9 @@ class CachedExtent : public boost::intrusive_ref_counter< INVALID // Part of no ExtentIndex set } state = extent_state_t::INVALID; friend std::ostream &operator<<(std::ostream &, extent_state_t); - // allow a dummy onode extent to pretend it is a fresh block + // allow a dummy extent to pretend it is at a specific state friend class onode::DummyNodeExtent; + friend class onode::TestReplayExtent; uint32_t last_committed_crc = 0; diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h index 685055727f25..9c6a40e18257 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h @@ -3,10 +3,15 @@ #pragma once +#include "crimson/common/log.h" #include "node_extent_manager.h" #include "node_delta_recorder.h" #include "node_layout_replayable.h" +#ifndef NDEBUG +#include "node_extent_manager/test_replay.h" +#endif + namespace crimson::os::seastore::onode { /** @@ -17,8 +22,16 @@ namespace crimson::os::seastore::onode { */ template class DeltaRecorderT final: public DeltaRecorder { + enum class op_t : uint8_t { + INSERT, + SPLIT, + SPLIT_INSERT, + UPDATE_CHILD_ADDR, + }; + public: using layout_t = NodeLayoutReplayableT; + using node_stage_t = typename layout_t::node_stage_t; using position_t = typename layout_t::position_t; using StagedIterator = typename layout_t::StagedIterator; using value_t = typename layout_t::value_t; @@ -33,13 +46,19 @@ class DeltaRecorderT final: public DeltaRecorder { const position_t& insert_pos, const match_stage_t& insert_stage, const node_offset_t& insert_size) { - // TODO encode to encoded + ceph::encode(op_t::INSERT, encoded); + encode_key(key, encoded); + encode_value(value, encoded); + insert_pos.encode(encoded); + ceph::encode(insert_stage, encoded); + ceph::encode(insert_size, encoded); } void encode_split( const StagedIterator& split_at, - const char* p_start) { - // TODO encode to encoded + const char* p_node_start) { + ceph::encode(op_t::SPLIT, encoded); + split_at.encode(p_node_start, encoded); } template @@ -50,30 +69,150 @@ class DeltaRecorderT final: public DeltaRecorder { const position_t& insert_pos, const match_stage_t& insert_stage, const node_offset_t& insert_size, - const char* p_start) { - // TODO encode to encoded + const char* p_node_start) { + ceph::encode(op_t::SPLIT_INSERT, encoded); + split_at.encode(p_node_start, encoded); + encode_key(key, encoded); + encode_value(value, encoded); + insert_pos.encode(encoded); + ceph::encode(insert_stage, encoded); + ceph::encode(insert_size, encoded); } void encode_update_child_addr( const laddr_t new_addr, const laddr_packed_t* p_addr, - const char* p_start) { - // TODO encode to encoded + const char* p_node_start) { + ceph::encode(op_t::UPDATE_CHILD_ADDR, encoded); + ceph::encode(new_addr, encoded); + int node_offset = reinterpret_cast(p_addr) - p_node_start; + assert(node_offset > 0 && node_offset <= NODE_BLOCK_SIZE); + ceph::encode(static_cast(node_offset), encoded); } static DeltaRecorderURef create() { return std::unique_ptr(new DeltaRecorderT()); } - private: + protected: DeltaRecorderT() = default; node_type_t node_type() const override { return NODE_TYPE; } field_type_t field_type() const override { return FIELD_TYPE; } void apply_delta(ceph::bufferlist::const_iterator& delta, NodeExtentMutable& node) override { assert(is_empty()); - // TODO decode and apply - assert(false && "not implemented"); + node_stage_t stage(reinterpret_cast(node.get_read())); + op_t op; + try { + ceph::decode(op, delta); + switch (op) { + case op_t::INSERT: { + logger().debug("OTree::Extent::Replay: decoding INSERT ..."); + auto key = key_hobj_t::decode(delta); + + std::unique_ptr value_storage_heap; + value_t value_storage_stack; + auto p_value = decode_value(delta, value_storage_heap, value_storage_stack); + + auto insert_pos = position_t::decode(delta); + match_stage_t insert_stage; + ceph::decode(insert_stage, delta); + node_offset_t insert_size; + ceph::decode(insert_size, delta); + logger().debug("OTree::Extent::Replay: apply {}, {}, " + "insert_pos({}), insert_stage={}, insert_size={}B ...", + key, *p_value, insert_pos, insert_stage, insert_size); + layout_t::template insert( + node, stage, key, *p_value, insert_pos, insert_stage, insert_size); + break; + } + case op_t::SPLIT: { + logger().debug("OTree::Extent::Replay: decoding SPLIT ..."); + auto split_at = StagedIterator::decode(stage.p_start(), delta); + logger().debug("OTree::Extent::Replay: apply split_at={} ...", split_at); + layout_t::split(node, stage, split_at); + break; + } + case op_t::SPLIT_INSERT: { + logger().debug("OTree::Extent::Replay: decoding SPLIT_INSERT ..."); + auto split_at = StagedIterator::decode(stage.p_start(), delta); + auto key = key_hobj_t::decode(delta); + + std::unique_ptr value_storage_heap; + value_t value_storage_stack; + auto p_value = decode_value(delta, value_storage_heap, value_storage_stack); + + auto insert_pos = position_t::decode(delta); + match_stage_t insert_stage; + ceph::decode(insert_stage, delta); + node_offset_t insert_size; + ceph::decode(insert_size, delta); + logger().debug("OTree::Extent::Replay: apply split_at={}, {}, {}, " + "insert_pos({}), insert_stage={}, insert_size={}B ...", + split_at, key, *p_value, insert_pos, insert_stage, insert_size); + layout_t::template split_insert( + node, stage, split_at, key, *p_value, insert_pos, insert_stage, insert_size); + break; + } + case op_t::UPDATE_CHILD_ADDR: { + logger().debug("OTree::Extent::Replay: decoding UPDATE_CHILD_ADDR ..."); + laddr_t new_addr; + ceph::decode(new_addr, delta); + node_offset_t update_offset; + ceph::decode(update_offset, delta); + auto p_addr = reinterpret_cast( + node.get_write() + update_offset); + logger().debug("OTree::Extent::Replay: apply {:#x} to offset {:#x} ...", + new_addr, update_offset); + layout_t::update_child_addr(node, new_addr, p_addr); + break; + } + default: + logger().error("OTree::Extent::Replay: got unknown op {} when replay {:#x}", + op, node.get_laddr()); + ceph_abort(); + } + } catch (buffer::error& e) { + logger().error("OTree::Extent::Replay: got decode error {} when replay {:#x}", + e, node.get_laddr()); + ceph_abort(); + } + } + + private: + static void encode_value(const value_t& value, ceph::bufferlist& encoded) { + if constexpr (std::is_same_v) { + // NODE_TYPE == node_type_t::INTERNAL + ceph::encode(value.value, encoded); + } else if constexpr (std::is_same_v) { + // NODE_TYPE == node_type_t::LEAF + value.encode(encoded); + } else { + ceph_abort("impossible path"); + } + } + + static value_t* decode_value(ceph::bufferlist::const_iterator& delta, + std::unique_ptr& value_storage_heap, + value_t& value_storage_stack) { + if constexpr (std::is_same_v) { + // NODE_TYPE == node_type_t::INTERNAL + laddr_t value; + ceph::decode(value, delta); + value_storage_stack.value = value; + return &value_storage_stack; + } else if constexpr (std::is_same_v) { + // NODE_TYPE == node_type_t::LEAF + auto value_config = onode_t::decode(delta); + value_storage_heap = onode_t::allocate(value_config); + return reinterpret_cast(value_storage_heap.get()); + } else { + ceph_abort("impossible path"); + } + } + + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); } }; @@ -117,6 +256,12 @@ class NodeExtentAccessorT { } else { ceph_abort("impossible path"); } +#ifndef NDEBUG + auto ref_recorder = recorder_t::create(); + test_recorder = static_cast(ref_recorder.get()); + test_extent = TestReplayExtent::create( + extent->get_length(), std::move(ref_recorder)); +#endif } ~NodeExtentAccessorT() = default; NodeExtentAccessorT(const NodeExtentAccessorT&) = delete; @@ -154,9 +299,18 @@ class NodeExtentAccessorT { recorder->template encode_insert( key, value, insert_pos, insert_stage, insert_size); } - return layout_t::template insert( +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_insert( + key, value, insert_pos, insert_stage, insert_size); +#endif + auto ret = layout_t::template insert( *mut, read(), key, value, insert_pos, insert_stage, insert_size); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + return ret; } void split_replayable(StagedIterator& split_at) { @@ -164,7 +318,14 @@ class NodeExtentAccessorT { if (needs_recording()) { recorder->encode_split(split_at, read().p_start()); } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_split(split_at, read().p_start()); +#endif layout_t::split(*mut, read(), split_at); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif } template @@ -181,9 +342,19 @@ class NodeExtentAccessorT { split_at, key, value, insert_pos, insert_stage, insert_size, read().p_start()); } - return layout_t::template split_insert( +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_split_insert( + split_at, key, value, insert_pos, insert_stage, insert_size, + read().p_start()); +#endif + auto ret = layout_t::template split_insert( *mut, read(), split_at, key, value, insert_pos, insert_stage, insert_size); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + return ret; } void update_child_addr_replayable( @@ -192,7 +363,14 @@ class NodeExtentAccessorT { if (needs_recording()) { recorder->encode_update_child_addr(new_addr, p_addr, read().p_start()); } - return layout_t::update_child_addr(*mut, new_addr, p_addr); +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->encode_update_child_addr(new_addr, p_addr, read().p_start()); +#endif + layout_t::update_child_addr(*mut, new_addr, p_addr); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif } void test_copy_to(NodeExtentMutable& to) const { @@ -224,6 +402,12 @@ class NodeExtentAccessorT { std::optional mut; // owned by extent recorder_t* recorder; + +#ifndef NDEBUG + // verify record replay using a different memory block + TestReplayExtent::Ref test_extent; + recorder_t* test_recorder; +#endif }; } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h new file mode 100644 index 000000000000..85ed5a448d04 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" + +/** test_replay.h + * + * A special version of NodeExtent to help verify delta encode, decode and + * replay in recorder_t under debug build. + */ + +namespace crimson::os::seastore::onode { + +class TestReplayExtent final: public NodeExtent { + public: + using Ref = crimson::os::seastore::TCachedExtentRef; + + void prepare_replay(NodeExtentRef from_extent) { + assert(get_length() == from_extent->get_length()); + auto mut = do_get_mutable(); + std::memcpy(mut.get_write(), from_extent->get_read(), get_length()); + } + + void replay_and_verify(NodeExtentRef replayed_extent) { + assert(get_length() == replayed_extent->get_length()); + auto mut = do_get_mutable(); + auto bl = recorder->get_delta(); + assert(bl.length()); + auto p = bl.cbegin(); + recorder->apply_delta(p, mut); + assert(p == bl.end()); + auto cmp = std::memcmp(get_read(), replayed_extent->get_read(), get_length()); + ceph_assert(cmp == 0 && "replay mismatch!"); + } + + static Ref create(extent_len_t length, DeltaRecorderURef&& recorder) { + auto r = ceph::buffer::create_aligned(length, 4096); + auto bp = ceph::bufferptr(std::move(r)); + return new TestReplayExtent(std::move(bp), std::move(recorder)); + } + + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override { + ceph_abort("impossible path"); } + DeltaRecorder* get_recorder() const override { + ceph_abort("impossible path"); } + CachedExtentRef duplicate_for_write() override { + ceph_abort("impossible path"); } + extent_types_t get_type() const override { + ceph_abort("impossible path"); } + ceph::bufferlist get_delta() override { + ceph_abort("impossible path"); } + void apply_delta(const ceph::bufferlist&) override { + ceph_abort("impossible path"); } + + private: + TestReplayExtent(ceph::bufferptr&& ptr, DeltaRecorderURef&& recorder) + : NodeExtent(std::move(ptr)), recorder(std::move(recorder)) { + state = extent_state_t::MUTATION_PENDING; + } + DeltaRecorderURef recorder; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc index de67500274e5..edc6324caf39 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc @@ -25,6 +25,10 @@ extent_len_t NodeExtentMutable::get_length() const { return extent.get_length(); } +laddr_t NodeExtentMutable::get_laddr() const { + return extent.get_laddr(); +} + const char* NodeExtentMutable::buf_upper_bound() const { return get_read() + get_length(); } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h index 6e58421c4e73..e4c6990e1983 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h @@ -63,12 +63,13 @@ class NodeExtentMutable { assert((const char*)&updated + sizeof(T) <= buf_upper_bound()); } + const char* get_read() const; char* get_write(); extent_len_t get_length() const; + laddr_t get_laddr() const; private: explicit NodeExtentMutable(NodeExtent&); - const char* get_read() const; const char* buf_upper_bound() const; NodeExtent& extent; diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h index 8452168e40cc..595d6a1f6c11 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h @@ -46,6 +46,9 @@ inline std::ostream& operator<<(std::ostream &os, const node_type_t& type) { struct laddr_packed_t { laddr_t value; } __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const laddr_packed_t& laddr) { + return os << "laddr_packed(0x" << std::hex << laddr.value << std::dec << ")"; +} using match_stat_t = int8_t; constexpr match_stat_t MSTAT_END = -2; // index is search_position_t::end() diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h index b0ff95c57fe8..91435f2182c9 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h @@ -28,17 +28,20 @@ class NodeExtentMutable; * # # next-stage | ns-oid | back_ # # * # # contaner | strings | offset # # * #...# range | | #...# - * ^ ^ | - * | | | - * | +---------------------------+ - * + p_items_start + * ^ ^ | ^ + * | | | | + * | +---------------------------+ | + * + p_items_start p_items_end + */ template class item_iterator_t { using value_t = value_type_t; public: item_iterator_t(const memory_range_t& range) - : p_items_start(range.p_start) { next_item_range(range.p_end); } + : p_items_start(range.p_start), p_items_end(range.p_end) { + assert(p_items_start < p_items_end); + next_item_range(p_items_end); + } const char* p_start() const { return item_range.p_start; } const char* p_end() const { return item_range.p_end + sizeof(node_offset_t); } @@ -83,6 +86,35 @@ class item_iterator_t { ++_index; return *this; } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + int start_offset = p_items_start - p_node_start; + int end_offset = p_items_end - p_node_start; + assert(start_offset > 0 && start_offset < NODE_BLOCK_SIZE); + assert(end_offset > 0 && end_offset <= NODE_BLOCK_SIZE); + ceph::encode(static_cast(start_offset), encoded); + ceph::encode(static_cast(end_offset), encoded); + ceph::encode(_index, encoded); + } + + static item_iterator_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + index_t index; + ceph::decode(index, delta); + + item_iterator_t ret({p_node_start + start_offset, + p_node_start + end_offset}); + while (index > 0) { + ++ret; + --index; + } + return ret; + } static node_offset_t header_size() { return 0u; } @@ -120,6 +152,7 @@ class item_iterator_t { } const char* p_items_start; + const char* p_items_end; mutable memory_range_t item_range; mutable node_offset_t back_offset; mutable std::optional key; diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h index 64d182e4ea44..fa243834a0d4 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h @@ -288,12 +288,36 @@ class string_view_masked_t { return (memcmp(view.data(), x.view.data(), size()) == 0); } bool operator!=(const string_view_masked_t& x) const { return !(*this == x); } + void encode(ceph::bufferlist& bl) const { + if (get_type() == Type::MIN) { + ceph::encode(string_key_view_t::MIN, bl); + } else if (get_type() == Type::MAX) { + ceph::encode(string_key_view_t::MAX, bl); + } else { + ceph::encode(size(), bl); + ceph::encode_nohead(view, bl); + } + } static auto min() { return string_view_masked_t{Type::MIN}; } static auto max() { return string_view_masked_t{Type::MAX}; } + static string_view_masked_t decode( + std::string& str_storage, ceph::bufferlist::const_iterator& delta) { + string_size_t size; + ceph::decode(size, delta); + if (size == string_key_view_t::MIN) { + return min(); + } else if (size == string_key_view_t::MAX) { + return max(); + } else { + ceph::decode_nohead(size, str_storage, delta); + return string_view_masked_t(str_storage); + } + } private: explicit string_view_masked_t(Type type) : type{type} {} + Type type; std::string_view view; }; @@ -438,11 +462,21 @@ class key_hobj_t { return ghobj.hobj.get_hash(); } std::string_view nspace() const { + // TODO(cross-node string dedup) return ghobj.hobj.nspace; } + string_view_masked_t nspace_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{nspace()}; + } std::string_view oid() const { + // TODO(cross-node string dedup) return ghobj.hobj.oid.name; } + string_view_masked_t oid_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{oid()}; + } ns_oid_view_t::Type dedup_type() const { return _dedup_type; } @@ -471,6 +505,29 @@ class key_hobj_t { return os; } + static key_hobj_t decode(ceph::bufferlist::const_iterator& delta) { + shard_t shard; + ceph::decode(shard, delta); + pool_t pool; + ceph::decode(pool, delta); + crush_hash_t crush; + ceph::decode(crush, delta); + std::string nspace; + auto nspace_masked = string_view_masked_t::decode(nspace, delta); + // TODO(cross-node string dedup) + assert(nspace_masked.get_type() == string_view_masked_t::Type::STR); + std::string oid; + auto oid_masked = string_view_masked_t::decode(oid, delta); + // TODO(cross-node string dedup) + assert(oid_masked.get_type() == string_view_masked_t::Type::STR); + snap_t snap; + ceph::decode(snap, delta); + gen_t gen; + ceph::decode(gen, delta); + return key_hobj_t(ghobject_t( + shard_id_t(shard), pool, crush, nspace, oid, snap, gen)); + } + private: ns_oid_view_t::Type _dedup_type = ns_oid_view_t::Type::STR; ghobject_t ghobj; @@ -500,11 +557,21 @@ class key_view_t { return crush_packed().crush; } std::string_view nspace() const { + // TODO(cross-node string dedup) return ns_oid_view().nspace.to_string_view(); } + string_view_masked_t nspace_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{ns_oid_view().nspace}; + } std::string_view oid() const { + // TODO(cross-node string dedup) return ns_oid_view().oid.to_string_view(); } + string_view_masked_t oid_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{ns_oid_view().oid}; + } ns_oid_view_t::Type dedup_type() const { return ns_oid_view().type(); } @@ -563,15 +630,9 @@ class key_view_t { } ghobject_t to_ghobj() const { - ghobject_t ghobj; - ghobj.shard_id.id = shard(); - ghobj.hobj.pool = pool(); - ghobj.hobj.set_hash(crush()); - ghobj.hobj.nspace = nspace(); - ghobj.hobj.oid.name = oid(); - ghobj.hobj.snap = snap(); - ghobj.generation = gen(); - return ghobj; + return ghobject_t( + shard_id_t(shard()), pool(), crush(), + std::string(nspace()), std::string(oid()), snap(), gen()); } void replace(const crush_t& key) { p_crush = &key; } @@ -628,6 +689,17 @@ class key_view_t { const snap_gen_t* p_snap_gen = nullptr; }; +template +void encode_key(const full_key_t& key, ceph::bufferlist& bl) { + ceph::encode(key.shard(), bl); + ceph::encode(key.pool(), bl); + ceph::encode(key.crush(), bl); + key.nspace_masked().encode(bl); + key.oid_masked().encode(bl); + ceph::encode(key.snap(), bl); + ceph::encode(key.gen(), bl); +} + inline MatchKindCMP compare_to(std::string_view l, std::string_view r) { return toMatchKindCMP(l, r); } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h index 1aabea15efca..b765463b389b 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h @@ -105,6 +105,17 @@ class node_extent_t { } } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + assert(p_node_start == p_start()); + // nothing to encode as the container range is the entire extent + } + + static node_extent_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + // nothing to decode + return node_extent_t(reinterpret_cast(p_node_start)); + } + static void validate(const FieldType& fields) { #ifndef NDEBUG assert(fields.header.get_node_type() == NODE_TYPE); diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h index 269999f06ed9..68d4d9092ade 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h @@ -212,6 +212,8 @@ struct staged { * (IS_BOTTOM) get_p_value(index_t) const -> const value_t* * (!IS_BOTTOM) size_to_nxt_at(index_t) const -> node_offset_t * (!IS_BOTTOM) get_nxt_container(index_t) const + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> container_t * static: * header_size() -> node_offset_t * estimate_insert(key, value) -> node_offset_t @@ -444,6 +446,21 @@ struct staged { return container_t::trim_at(mut, container, _index, trimmed); } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + container.encode(p_node_start, encoded); + ceph::encode(_index, encoded); + } + + static me_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + auto container = container_t::decode(p_node_start, delta); + auto ret = me_t(container); + index_t index; + ceph::decode(index, delta); + ret.seek_till_end(index); + return ret; + } + static node_offset_t header_size() { return container_t::header_size(); } @@ -471,6 +488,8 @@ struct staged { * size_overhead() const -> node_offset_t * get_nxt_container() const * has_next() const -> bool + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> container_t * operator++() * static: * header_size() -> node_offset_t @@ -486,9 +505,7 @@ struct staged { public: using me_t = _iterator_t; - _iterator_t(const container_t& container) : container{container} { - assert(index() == 0); - } + _iterator_t(const container_t& container) : container{container} {} index_t index() const { if (is_end()) { @@ -743,6 +760,24 @@ struct staged { return container_t::trim_at(mut, container, trimmed); } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + container.encode(p_node_start, encoded); + uint8_t is_end = _is_end; + ceph::encode(is_end, encoded); + } + + static me_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + auto container = container_t::decode(p_node_start, delta); + auto ret = me_t(container); + uint8_t is_end; + ceph::decode(is_end, delta); + if (is_end) { + ret.set_end(); + } + return ret; + } + static node_offset_t header_size() { return container_t::header_size(); } @@ -782,7 +817,7 @@ struct staged { * (IS_BOTTOM) insert(mut, key, value, size, p_left_bound) -> p_value * (!IS_BOTTOM) insert_prefix(mut, key, size, p_left_bound) -> memory_range_t * (!IS_BOTTOM) update_size(mut, size) - * split; + * split: * seek_split_inserted( * start_size, extra_size, target_size, insert_index, insert_size, * std::optional& is_insert_left) @@ -793,6 +828,9 @@ struct staged { * copy_out_until(appender, to_index) (can be end) * trim_until(mut) -> trim_size * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size + * denc: + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> iterator_t * static: * header_size() -> node_offset_t * estimate_insert(key, value) -> node_offset_t @@ -1512,6 +1550,29 @@ struct staged { return position_t::begin(); } } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + uint8_t present = static_cast(iter); + ceph::encode(present, encoded); + if (iter.has_value()) { + iter->encode(p_node_start, encoded); + if constexpr (!IS_BOTTOM) { + this->_nxt.encode(p_node_start, encoded); + } + } + } + static StagedIterator decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + StagedIterator ret; + uint8_t present; + ceph::decode(present, delta); + if (present) { + ret.iter = iterator_t::decode(p_node_start, delta); + if constexpr (!IS_BOTTOM) { + ret._nxt = NXT_STAGE_T::StagedIterator::decode(p_node_start, delta); + } + } + return ret; + } friend std::ostream& operator<<(std::ostream& os, const StagedIterator& iter) { return iter.print(os, true); } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h index eec2788abc95..7125bd3d24c5 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h @@ -178,6 +178,18 @@ struct staged_position_t { return *this; } + void encode(ceph::bufferlist& encoded) const { + ceph::encode(index, encoded); + nxt.encode(encoded); + } + + static me_t decode(ceph::bufferlist::const_iterator& delta) { + me_t ret; + ceph::decode(ret.index, delta); + ret.nxt = nxt_t::decode(delta); + return ret; + } + static me_t begin() { return {0u, nxt_t::begin()}; } static me_t end() { return {INDEX_END, nxt_t::end()}; @@ -241,6 +253,16 @@ struct staged_position_t { return *this; } + void encode(ceph::bufferlist& encoded) const { + ceph::encode(index, encoded); + } + + static me_t decode(ceph::bufferlist::const_iterator& delta) { + me_t ret; + ceph::decode(ret.index, delta); + return ret; + } + static me_t begin() { return {0u}; } static me_t end() { return {INDEX_END}; } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc index 447a35c97d27..9e430fa9b4e2 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc @@ -24,9 +24,12 @@ const laddr_packed_t* internal_sub_items_t::insert_at( mut.copy_in_absolute(p_insert, item); return &reinterpret_cast(p_insert)->value; } -template const laddr_packed_t* internal_sub_items_t::insert_at( - NodeExtentMutable&, const internal_sub_items_t&, const full_key_t&, - const laddr_packed_t&, index_t, node_offset_t, const char*); +#define IA_TEMPLATE(KT) \ + template const laddr_packed_t* internal_sub_items_t::insert_at( \ + NodeExtentMutable&, const internal_sub_items_t&, const full_key_t&, \ + const laddr_packed_t&, index_t, node_offset_t, const char*) +IA_TEMPLATE(KeyT::VIEW); +IA_TEMPLATE(KeyT::HOBJ); node_offset_t internal_sub_items_t::trim_until( NodeExtentMutable&, internal_sub_items_t& items, index_t index) { diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h index d9c1eceaa5a8..bb17a1f5826d 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h @@ -69,6 +69,30 @@ class internal_sub_items_t { return (p_first_item - index)->get_p_value(); } node_offset_t size_overhead_at(index_t index) const { return 0u; } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + auto p_end = reinterpret_cast(p_first_item) + + sizeof(internal_sub_item_t); + auto p_start = p_end - num_items * sizeof(internal_sub_item_t); + int start_offset = p_start - p_node_start; + int end_offset = p_end - p_node_start; + assert(start_offset > 0 && + start_offset < end_offset && + end_offset < NODE_BLOCK_SIZE); + ceph::encode(static_cast(start_offset), encoded); + ceph::encode(static_cast(end_offset), encoded); + } + + static internal_sub_items_t decode( + const char* p_node_start, ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + return internal_sub_items_t({p_node_start + start_offset, + p_node_start + end_offset}); + } static node_offset_t header_size() { return 0u; } @@ -205,6 +229,29 @@ class leaf_sub_items_t { assert(pointer + value->size + sizeof(snap_gen_t) == get_item_end(index)); return value; } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + auto p_end = reinterpret_cast(p_num_keys) + + sizeof(num_keys_t); + int start_offset = p_start() - p_node_start; + int end_offset = p_end - p_node_start; + assert(start_offset > 0 && + start_offset < end_offset && + end_offset < NODE_BLOCK_SIZE); + ceph::encode(static_cast(start_offset), encoded); + ceph::encode(static_cast(end_offset), encoded); + } + + static leaf_sub_items_t decode( + const char* p_node_start, ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + return leaf_sub_items_t({p_node_start + start_offset, + p_node_start + end_offset}); + } static node_offset_t header_size() { return sizeof(num_keys_t); } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h index c9e731b3f337..5eaa0f31b52a 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h @@ -16,6 +16,33 @@ struct onode_t { bool operator==(const onode_t& o) const { return size == o.size && id == o.id; } bool operator!=(const onode_t& o) const { return !(*this == o); } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(size, encoded); + ceph::encode(id, encoded); + } + static onode_t decode(ceph::bufferlist::const_iterator& delta) { + uint16_t size; + ceph::decode(size, delta); + uint16_t id; + ceph::decode(id, delta); + onode_t ret{size, id}; + return ret; + } + static std::unique_ptr allocate(onode_t config) { + ceph_assert(config.size >= sizeof(onode_t) + sizeof(uint32_t)); + + auto ret = std::make_unique(config.size); + char* p_mem = ret.get(); + auto p_onode = reinterpret_cast(p_mem); + *p_onode = config; + + uint32_t tail_magic = config.size * 137; + p_mem += (config.size - sizeof(uint32_t)); + std::memcpy(p_mem, &tail_magic, sizeof(uint32_t)); + + return ret; + } } __attribute__((packed)); inline std::ostream& operator<<(std::ostream& os, const onode_t& node) { return os << "onode(" << node.id << ", " << node.size << "B)"; diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h index e532d8fad6dc..e904e0cb6c67 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h @@ -50,6 +50,7 @@ class Onodes { ceph_assert(size >= sizeof(onode_t) + sizeof(uint32_t)); uint32_t target = size * 137; auto p_mem = (char*)std::malloc(size); + std::memset(p_mem, 0, size); auto p_onode = (onode_t*)p_mem; tracked_onodes.push_back(p_onode); p_onode->size = size;