From 7f73bbfe7d0667f4243958ecd9f07c9874d0a637 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Wed, 14 Apr 2021 09:39:50 +0800 Subject: [PATCH] crimson/onode-staged-tree: implement layout-level merge Signed-off-by: Yingxin Cheng --- .../onode_manager/staged-fltree/node_layout.h | 53 +++++++++- .../staged-fltree/node_layout_replayable.h | 11 ++- .../stages/item_iterator_stage.cc | 16 +++ .../stages/item_iterator_stage.h | 1 + .../staged-fltree/stages/node_stage.cc | 33 ++++++- .../staged-fltree/stages/node_stage.h | 1 + .../staged-fltree/stages/stage.h | 97 +++++++++++++++++-- .../staged-fltree/stages/sub_items_stage.cc | 67 +++++++++++++ .../staged-fltree/stages/sub_items_stage.h | 36 +++---- 9 files changed, 284 insertions(+), 31 deletions(-) diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h index 949faa5a6fd..9097c4bf754 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h @@ -205,8 +205,55 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { search_position_t merge(NodeExtentMutable& mut, NodeImpl& _right_node, match_stage_t merge_stage, node_offset_t merge_size) override { - // TODO - ceph_abort("not implemented"); + assert(NODE_TYPE == _right_node.node_type()); + assert(FIELD_TYPE == _right_node.field_type()); + auto& right_node = dynamic_cast(_right_node); + if (unlikely(logger().is_enabled(seastar::log_level::debug))) { + { + std::ostringstream sos; + dump(sos); + logger().debug("OTree::Layout::Merge: -- left node dump\n{}", sos.str()); + } + { + std::ostringstream sos; + right_node.dump(sos); + logger().debug("OTree::Layout::Merge: -- right node dump\n{}", sos.str()); + } + } + + assert(!is_level_tail()); + assert(!is_keys_empty()); + auto& left_node_stage = extent.read(); + position_t left_last_pos; + STAGE_T::template get_largest_slot( + left_node_stage, &left_last_pos, nullptr, nullptr); + + typename STAGE_T::template StagedAppender left_appender; + left_appender.init_tail(&mut, left_node_stage, merge_stage); + + assert(!right_node.is_keys_empty()); + auto& right_node_stage = right_node.extent.read(); + typename STAGE_T::StagedIterator right_append_at; + right_append_at.set(right_node_stage); + + auto pos_end = position_t::end(); + STAGE_T::template append_until( + right_append_at, left_appender, pos_end, STAGE); + assert(right_append_at.is_end()); + left_appender.wrap(); + + if (right_node.is_level_tail()) { + node_stage_t::update_is_level_tail(mut, left_node_stage, true); + build_name(); + } + + if (unlikely(logger().is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + dump(sos); + logger().debug("OTree::Layout::Merge: -- merged node dump\n{}", sos.str()); + } + assert(merge_size == filled_size()); + return normalize(std::move(left_last_pos)); } ertr::future @@ -601,7 +648,7 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { auto append_at = split_at; // TODO(cross-node string dedup) typename STAGE_T::template StagedAppender right_appender; - right_appender.init(&right_mut, right_mut.get_write()); + right_appender.init_empty(&right_mut, right_mut.get_write()); const value_t* p_value = nullptr; if (!is_insert_left) { // right node: append [start(append_at), insert_pos) diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h index df54e8f7c73..ece058111f4 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h @@ -94,8 +94,15 @@ struct NodeLayoutReplayableT { NodeExtentMutable& mut, const node_stage_t& node_stage) { assert(!node_stage.is_level_tail()); - // TODO - ceph_abort("not implemented"); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + auto [r_stage, r_last_pos] = update_last_to_tail(mut, node_stage); + std::ignore = r_stage; + return r_last_pos; + } else { + node_stage_t::update_is_level_tail(mut, node_stage, true); + // no need to calculate the last pos + return position_t::end(); + } } private: diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc index b322caa3db6..5d399c3ccb2 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc @@ -101,6 +101,22 @@ ITER_TEMPLATE(node_type_t::INTERNAL); #define APPEND_T ITER_T::Appender +template +template +APPEND_T::Appender(NodeExtentMutable* p_mut, + const item_iterator_t& iter, + bool open) : p_mut{p_mut} +{ + assert(!iter.has_next()); + if (open) { + p_append = const_cast(iter.get_key().p_start()); + p_offset_while_open = const_cast(iter.item_range.p_end); + } else { + // XXX: this doesn't need to advance the iter to last + p_append = const_cast(iter.p_items_start); + } +} + template template bool APPEND_T::append(const ITER_T& src, index_t& items) diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h index cdfa6427239..58129845907 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h @@ -169,6 +169,7 @@ class item_iterator_t::Appender { public: Appender(NodeExtentMutable* p_mut, char* p_append) : p_mut{p_mut}, p_append{p_append} {} + Appender(NodeExtentMutable*, const item_iterator_t&, bool open); bool append(const item_iterator_t& src, index_t& items); char* wrap() { return p_append; } std::tuple open_nxt(const key_get_type&); diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc index b617023b83d..20d5ac89395 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc @@ -204,6 +204,37 @@ NODE_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF); #define APPEND_T node_extent_t::Appender +template +template +APPEND_T::Appender(NodeExtentMutable* p_mut, const node_extent_t& node, bool open) + : p_mut{p_mut}, p_start{p_mut->get_write()} +{ + assert(p_start == node.p_start()); + assert(node.keys()); + if (open) { + // seek as open_nxt() + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + p_append_left = p_start + node.fields().get_key_start_offset(node.keys() - 1); + p_append_left += sizeof(typename FieldType::key_t); + p_append_right = p_start + node.fields().get_item_end_offset(node.keys() - 1); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + ceph_abort("not implemented"); + } else { + ceph_abort("impossible path"); + } + num_keys = node.keys() - 1; + } else { + if constexpr (std::is_same_v) { + ceph_abort("not implemented"); + } else { + p_append_left = p_start + node.fields().get_key_start_offset(node.keys()); + p_append_right = p_start + node.fields().get_item_end_offset(node.keys()); + } + num_keys = node.keys(); + } +} + template template void APPEND_T::append(const node_extent_t& src, index_t from, index_t items) @@ -221,7 +252,7 @@ void APPEND_T::append(const node_extent_t& src, index_t from, index_t items) assert(from + items <= src.keys()); num_keys += items; if constexpr (std::is_same_v) { - ceph_abort("impossible path"); + ceph_abort("not implemented"); } else { // append left part forwards node_offset_t offset_left_start = src.fields().get_key_start_offset(from); diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h index 14f022714b1..d0786ee48fe 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h @@ -201,6 +201,7 @@ class node_extent_t::Appender { p_append_left = p_start + FieldType::HEADER_SIZE; p_append_right = p_start + FieldType::SIZE; } + Appender(NodeExtentMutable*, const node_extent_t&, bool open = false); void append(const node_extent_t& src, index_t from, index_t items); void append(const full_key_t&, const value_input_t&, const value_t*&); char* wrap(); diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h index c3f270ef744..a6f439ca63b 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h @@ -456,6 +456,24 @@ struct staged { return container_t::erase_at(mut, container, _index, p_left_bound); } + template + typename container_t::template Appender + get_appender(NodeExtentMutable* p_mut) { + assert(_index + 1 == container.keys()); + return typename container_t::template Appender(p_mut, container); + } + + template + typename container_t::template Appender + get_appender_opened(NodeExtentMutable* p_mut) { + if constexpr (!IS_BOTTOM) { + assert(_index + 1 == container.keys()); + return typename container_t::template Appender(p_mut, container, true); + } else { + ceph_abort("impossible path"); + } + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { container.encode(p_node_start, encoded); ceph::encode(_index, encoded); @@ -776,6 +794,22 @@ struct staged { return container_t::erase(mut, container, p_left_bound); } + template + typename container_t::template Appender + get_appender(NodeExtentMutable* p_mut) { + return typename container_t::template Appender(p_mut, container, false); + } + + template + typename container_t::template Appender + get_appender_opened(NodeExtentMutable* p_mut) { + if constexpr (!IS_BOTTOM) { + return typename container_t::template Appender(p_mut, container, true); + } else { + ceph_abort("impossible path"); + } + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { container.encode(p_node_start, encoded); uint8_t is_end = _is_end; @@ -847,6 +881,9 @@ struct staged { * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size * erase: * erase(mut, p_left_bound) -> erase_size + * merge: + * get_appender(p_mut) -> Appender + * (!IS_BOTTOM)get_appender_opened(p_mut) -> Appender * denc: * encode(p_node_start, encoded) * decode(p_node_start, delta) -> iterator_t @@ -1266,7 +1303,7 @@ struct staged { char* p_insert = const_cast(range.p_end); const value_t* p_value = nullptr; StagedAppender appender; - appender.init(&mut, p_insert); + appender.init_empty(&mut, p_insert); appender.append(key, value, p_value); [[maybe_unused]] const char* p_insert_front = appender.wrap(); assert(p_insert_front == range.p_start); @@ -1524,6 +1561,7 @@ struct staged { bool is_end() const { return iter->is_end(); } bool in_progress() const { assert(valid()); + assert(!is_end()); if constexpr (!IS_BOTTOM) { if (this->_nxt.valid()) { if (this->_nxt.index() == 0) { @@ -1883,11 +1921,36 @@ struct staged { } bool in_progress() const { return require_wrap_nxt; } // TODO: pass by reference - void init(NodeExtentMutable* p_mut, char* p_start) { + void init_empty(NodeExtentMutable* p_mut, char* p_start) { assert(!valid()); appender = typename container_t::template Appender(p_mut, p_start); _index = 0; } + void init_tail(NodeExtentMutable* p_mut, + const container_t& container, + match_stage_t stage) { + assert(!valid()); + auto iter = iterator_t(container); + iter.seek_last(); + if (stage == STAGE) { + appender = iter.template get_appender(p_mut); + _index = iter.index() + 1; + if constexpr (!IS_BOTTOM) { + assert(!this->_nxt.valid()); + } + } else { + assert(stage < STAGE); + if constexpr (!IS_BOTTOM) { + appender = iter.template get_appender_opened(p_mut); + _index = iter.index(); + require_wrap_nxt = true; + auto nxt_container = iter.get_nxt_container(); + this->_nxt.init_tail(p_mut, nxt_container, stage); + } else { + ceph_abort("impossible path"); + } + } + } // possible to make src_iter end if to_index == INDEX_END void append_until(StagedIterator& src_iter, index_t& to_index) { assert(!require_wrap_nxt); @@ -1933,7 +1996,7 @@ struct staged { if constexpr (!IS_BOTTOM) { require_wrap_nxt = true; auto [p_mut, p_append] = appender->open_nxt(paritial_key); - this->_nxt.init(p_mut, p_append); + this->_nxt.init_empty(p_mut, p_append); return this->_nxt; } else { ceph_abort("impossible path"); @@ -1945,7 +2008,7 @@ struct staged { if constexpr (!IS_BOTTOM) { require_wrap_nxt = true; auto [p_mut, p_append] = appender->open_nxt(key); - this->_nxt.init(p_mut, p_append); + this->_nxt.init_empty(p_mut, p_append); return this->_nxt; } else { ceph_abort("impossible path"); @@ -1997,7 +2060,7 @@ struct staged { // cannot append the current item as-a-whole index_t to_index_nxt = INDEX_END; NXT_STAGE_T::template _append_range( - src_iter.nxt(), appender.open_nxt(src_iter.get_key()), to_index_nxt); + src_iter.get_nxt(), appender.open_nxt(src_iter.get_key()), to_index_nxt); ++src_iter; appender.wrap_nxt(); } else { @@ -2232,8 +2295,28 @@ struct staged { static std::tuple evaluate_merge( const full_key_t& left_pivot_index, const container_t& right_container) { - // TODO - ceph_abort("not implemented"); + auto r_iter = iterator_t(right_container); + r_iter.seek_at(0); + node_offset_t compensate = r_iter.header_size(); + auto cmp = compare_to(left_pivot_index, r_iter.get_key()); + if (cmp == MatchKindCMP::EQ) { + if constexpr (!IS_BOTTOM) { + // the index is equal, compensate and look at the lower stage + compensate += r_iter.size_to_nxt(); + auto r_nxt_container = r_iter.get_nxt_container(); + auto [ret_stage, ret_compensate] = NXT_STAGE_T::evaluate_merge( + left_pivot_index, r_nxt_container); + compensate += ret_compensate; + return {ret_stage, compensate}; + } else { + ceph_abort("impossible path: left_pivot_key == right_first_key"); + } + } else if (cmp == MatchKindCMP::LT) { + // ok, do merge here + return {STAGE, compensate}; + } else { + ceph_abort("impossible path: left_pivot_key < right_first_key"); + } } }; diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc index 84e6b49c5a2..729cab4c3e7 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc @@ -195,9 +195,76 @@ template struct overloaded : Ts... { using Ts::operator()...; }; // explicit deduction guide template overloaded(Ts...) -> overloaded; +template +void leaf_sub_items_t::Appender::append( + const leaf_sub_items_t& src, index_t from, index_t items) +{ + if (p_append) { + // append from empty + assert(cnt <= APPENDER_LIMIT); + assert(from <= src.keys()); + if (items == 0) { + return; + } + if (op_src) { + assert(*op_src == src); + } else { + op_src = src; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + appends[cnt] = range_items_t{from, items}; + ++cnt; + } else { + // append from existing + assert(op_dst.has_value()); + assert(!p_appended); + assert(from == 0); + assert(items); + assert(items == src.keys()); + + num_keys_t num_keys = op_dst->keys(); + node_offset_t compensate = op_dst->get_offset(num_keys - 1).value; + const char* p_items_start = op_dst->p_start(); + const char* p_items_end = op_dst->p_items_end; + + // update dst num_keys + num_keys += items; + p_mut->copy_in_absolute((char*)op_dst->p_num_keys, num_keys); + + // shift dst items + std::size_t src_offsets_size = sizeof(node_offset_t) * items; + p_mut->shift_absolute(p_items_start, + p_items_end - p_items_start, + -(int)src_offsets_size); + + // fill offsets from src + node_offset_t offset; + char* p_cur_offset = const_cast(p_items_end); + for (auto i = from; i < from + items; ++i) { + offset = src.get_offset(i).value + compensate; + p_cur_offset -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur_offset, offset); + } + + // fill items from src + auto p_src_items_start = src.get_item_end(from + items); + std::size_t src_items_size = src.get_item_end(from) - p_src_items_start; + p_appended = const_cast(p_items_start) - src_offsets_size - src_items_size; + p_mut->copy_in_absolute(p_appended, p_src_items_start, src_items_size); + } +} + template char* leaf_sub_items_t::Appender::wrap() { + if (op_dst.has_value()) { + // append from existing + assert(p_appended); + return p_appended; + } + // append from empty + assert(p_append); auto p_cur = p_append; num_keys_t num_keys = 0; for (auto i = 0u; i < cnt; ++i) { diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h index 51c67bf35c2..aedd5f0fa41 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h @@ -126,6 +126,11 @@ class internal_sub_items_t::Appender { public: Appender(NodeExtentMutable* p_mut, char* p_append) : p_mut{p_mut}, p_append{p_append} {} + Appender(NodeExtentMutable* p_mut, const internal_sub_items_t& sub_items) + : p_mut{p_mut}, + p_append{(char*)(sub_items.p_first_item + 1 - sub_items.keys())} { + assert(sub_items.keys()); + } void append(const internal_sub_items_t& src, index_t from, index_t items); void append(const full_key_t&, const laddr_t&, const laddr_packed_t*&); char* wrap() { return p_append; } @@ -304,25 +309,16 @@ class leaf_sub_items_t::Appender { Appender(NodeExtentMutable* p_mut, char* p_append) : p_mut{p_mut}, p_append{p_append} { } - - void append(const leaf_sub_items_t& src, index_t from, index_t items) { - assert(cnt <= APPENDER_LIMIT); - assert(from <= src.keys()); - if (items == 0) { - return; - } - if (op_src) { - assert(*op_src == src); - } else { - op_src = src; - } - assert(from < src.keys()); - assert(from + items <= src.keys()); - appends[cnt] = range_items_t{from, items}; - ++cnt; + Appender(NodeExtentMutable* p_mut, const leaf_sub_items_t& sub_items) + : p_mut{p_mut} , op_dst(sub_items) { + assert(sub_items.keys()); } + + void append(const leaf_sub_items_t& src, index_t from, index_t items); void append(const full_key_t& key, const value_config_t& value, const value_header_t*& p_value) { + // append from empty + assert(p_append); assert(pp_value == nullptr); assert(cnt <= APPENDER_LIMIT); appends[cnt] = kv_item_t{&key, value}; @@ -332,12 +328,16 @@ class leaf_sub_items_t::Appender { char* wrap(); private: + NodeExtentMutable* p_mut; + // append from empty std::optional op_src; const value_header_t** pp_value = nullptr; - NodeExtentMutable* p_mut; - char* p_append; + char* p_append = nullptr; var_t appends[APPENDER_LIMIT]; index_t cnt = 0; + // append from existing + std::optional op_dst; + char* p_appended = nullptr; }; template struct _sub_items_t; -- 2.39.5