From: Xuehan Xu Date: Fri, 28 Jan 2022 05:04:03 +0000 (+0800) Subject: crimson/os/seastore: extract fixed kv btree implementation out of lba manager X-Git-Tag: v18.0.0~1254^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bd307d2c6dbce9b69c02ce2e1301a4fe087da624;p=ceph.git crimson/os/seastore: extract fixed kv btree implementation out of lba manager Basically, this pr moves the current LBABtree and lba_range_pin out of lba manager, and rename LBABtree to FixedKVBtree. This is the preparation for implementing backrefs Signed-off-by: Xuehan Xu --- diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt index cec75b10471..96e9384b6c1 100644 --- a/src/crimson/os/seastore/CMakeLists.txt +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -11,8 +11,6 @@ set(crimson_seastore_srcs lba_manager.cc segment_cleaner.cc lba_manager/btree/btree_lba_manager.cc - lba_manager/btree/btree_range_pin.cc - lba_manager/btree/lba_btree.cc lba_manager/btree/lba_btree_node.cc omap_manager.cc omap_manager/btree/btree_omap_manager.cc diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h new file mode 100644 index 00000000000..4791a9b457f --- /dev/null +++ b/src/crimson/os/seastore/btree/btree_range_pin.h @@ -0,0 +1,447 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include "crimson/common/log.h" + +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore { + +template +struct fixed_kv_node_meta_t { + bound_t begin = 0; + bound_t end = 0; + depth_t depth = 0; + + bool is_parent_of(const fixed_kv_node_meta_t &other) const { + return (depth == other.depth + 1) && + (begin <= other.begin) && + (end > other.begin); + } + + std::pair split_into(bound_t pivot) const { + return std::make_pair( + fixed_kv_node_meta_t{begin, pivot, depth}, + fixed_kv_node_meta_t{pivot, end, depth}); + } + + static fixed_kv_node_meta_t merge_from( + const fixed_kv_node_meta_t &lhs, const fixed_kv_node_meta_t &rhs) { + ceph_assert(lhs.depth == rhs.depth); + return fixed_kv_node_meta_t{lhs.begin, rhs.end, lhs.depth}; + } + + static std::pair + rebalance(const fixed_kv_node_meta_t &lhs, const fixed_kv_node_meta_t &rhs, bound_t pivot) { + ceph_assert(lhs.depth == rhs.depth); + return std::make_pair( + fixed_kv_node_meta_t{lhs.begin, pivot, lhs.depth}, + fixed_kv_node_meta_t{pivot, rhs.end, lhs.depth}); + } + + bool is_root() const { + return begin == 0 && end == L_ADDR_MAX; + } +}; + +template +inline std::ostream &operator<<( + std::ostream &lhs, + const fixed_kv_node_meta_t &rhs) +{ + return lhs << "btree_node_meta_t(" + << "begin=" << rhs.begin + << ", end=" << rhs.end + << ", depth=" << rhs.depth + << ")"; +} +/** + * btree_range_pin_t + * + * Element tracked by btree_pin_set_t below. Encapsulates the intrusive_set + * hook, the fixed_kv_node_meta_t representing the key range covered by a node, + * and extent and ref members intended to hold a reference when the extent + * should be pinned. 
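+ *
+ * Pins sort by (-depth, begin) -- see get_tuple() below -- so deeper
+ * nodes precede shallower ones and, within a depth, pins are ordered
+ * by the start of their range.  A minimal sketch of how a node's pin
+ * is wired up (cf. init_internal in FixedKVBtree::get_internal_node;
+ * variable names illustrative):
+ *
+ *   node.pin.set_range(fixed_kv_node_meta_t<laddr_t>{begin, end, depth});
+ *   pin_set.add_pin(node.pin);  // may also take a ref on the parent's pin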
+ */ +template +class btree_pin_set_t; + +template +class btree_range_pin_t : public boost::intrusive::set_base_hook<> { + friend class btree_pin_set_t; + fixed_kv_node_meta_t range; + + btree_pin_set_t *pins = nullptr; + + // We need to be able to remember extent without holding a reference, + // but we can do it more compactly -- TODO + CachedExtent *extent = nullptr; + CachedExtentRef ref; + + using index_t = boost::intrusive::set; + + static auto get_tuple(const fixed_kv_node_meta_t &meta) { + return std::make_tuple(-meta.depth, meta.begin); + } + + void acquire_ref() { + ref = CachedExtentRef(extent); + } + + void drop_ref() { + ref.reset(); + } + +public: + btree_range_pin_t() = default; + btree_range_pin_t(CachedExtent *extent) + : extent(extent) {} + btree_range_pin_t(const btree_range_pin_t &rhs, CachedExtent *extent) + : range(rhs.range), extent(extent) {} + + bool has_ref() const { + return !!ref; + } + + bool is_root() const { + return range.is_root(); + } + + void set_range(const fixed_kv_node_meta_t &nrange) { + range = nrange; + } + void set_extent(CachedExtent *nextent) { + ceph_assert(!extent); + extent = nextent; + } + + CachedExtent &get_extent() { + assert(extent); + return *extent; + } + + bool has_ref() { + return !!ref; + } + + void take_pin(btree_range_pin_t &other) + { + ceph_assert(other.extent); + if (other.pins) { + other.pins->replace_pin(*this, other); + pins = other.pins; + other.pins = nullptr; + + if (other.has_ref()) { + other.drop_ref(); + acquire_ref(); + } + } + } + + friend bool operator<( + const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { + return get_tuple(lhs.range) < get_tuple(rhs.range); + } + friend bool operator>( + const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { + return get_tuple(lhs.range) > get_tuple(rhs.range); + } + friend bool operator==( + const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { + return get_tuple(lhs.range) == rhs.get_tuple(rhs.range); + } + + struct meta_cmp_t { + bool operator()( + const btree_range_pin_t &lhs, const fixed_kv_node_meta_t &rhs) const { + return get_tuple(lhs.range) < get_tuple(rhs); + } + bool operator()( + const fixed_kv_node_meta_t &lhs, const btree_range_pin_t &rhs) const { + return get_tuple(lhs) < get_tuple(rhs.range); + } + }; + + friend std::ostream &operator<<( + std::ostream &lhs, + const btree_range_pin_t &rhs) { + return lhs << "btree_range_pin_t(" + << "begin=" << rhs.range.begin + << ", end=" << rhs.range.end + << ", depth=" << rhs.range.depth + << ", extent=" << rhs.extent + << ")"; + } + + template + friend class BtreeNodePin; + ~btree_range_pin_t() + { + ceph_assert(!pins == !is_linked()); + ceph_assert(!ref); + if (pins) { + crimson::get_logger(ceph_subsys_seastore_lba + ).debug("{}: removing {}", __func__, *this); + pins->remove_pin(*this, true); + } + extent = nullptr; + } + +}; + +/** + * btree_pin_set_t + * + * Ensures that for every cached node, all parent btree nodes required + * to map it are present in cache. Relocating these nodes can + * therefore be done without further reads or cache space. + * + * Contains a btree_range_pin_t for every clean or dirty btree node + * or LogicalCachedExtent instance in cache at any point in time. + * For any btree node, the contained btree_range_pin_t will hold + * a reference to that node pinning it in cache as long as that + * node has children in the set. This invariant can be violated + * only by calling retire_extent and is repaired by calling + * check_parent synchronously after adding any new extents. 
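+ *
+ * For example: once a depth-1 leaf covering [0, 100) is added, add_pin
+ * will have taken a reference on the depth-2 internal node whose range
+ * contains 0, keeping that parent (and, transitively, the whole path
+ * to the root) resident for as long as the leaf remains in the set.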
+ */ +template +class btree_pin_set_t { + friend class btree_range_pin_t; + using pins_t = typename btree_range_pin_t::index_t; + pins_t pins; + + /// Removes pin from set optionally checking whether parent has other children + void remove_pin(btree_range_pin_t &pin, bool do_check_parent) + { + crimson::get_logger(ceph_subsys_seastore_lba).debug("{}: {}", __func__, pin); + ceph_assert(pin.is_linked()); + ceph_assert(pin.pins); + ceph_assert(!pin.ref); + + pins.erase(pin); + pin.pins = nullptr; + + if (do_check_parent) { + check_parent(pin); + } + } + + void replace_pin( + btree_range_pin_t &to, + btree_range_pin_t &from) + { + pins.replace_node(pins.iterator_to(from), to); + } + + /// Returns parent pin if exists + btree_range_pin_t *maybe_get_parent( + const fixed_kv_node_meta_t &meta) + { + auto cmeta = meta; + cmeta.depth++; + auto iter = pins.upper_bound( + cmeta, + typename btree_range_pin_t::meta_cmp_t()); + if (iter == pins.begin()) { + return nullptr; + } else { + --iter; + if (iter->range.is_parent_of(meta)) { + return &*iter; + } else { + return nullptr; + } + } + } + + /// Returns earliest child pin if exist + const btree_range_pin_t + *maybe_get_first_child(const fixed_kv_node_meta_t &meta) const + { + if (meta.depth == 0) { + return nullptr; + } + + auto cmeta = meta; + cmeta.depth--; + + auto iter = pins.lower_bound( + cmeta, + typename btree_range_pin_t::meta_cmp_t()); + if (iter == pins.end()) { + return nullptr; + } else if (meta.is_parent_of(iter->range)) { + return &*iter; + } else { + return nullptr; + } + } + + /// Releases pin if it has no children + void release_if_no_children(btree_range_pin_t &pin) + { + ceph_assert(pin.is_linked()); + if (maybe_get_first_child(pin.range) == nullptr) { + pin.drop_ref(); + } + } + +public: + /// Adds pin to set, assumes set is consistent + void add_pin(btree_range_pin_t &pin) + { + ceph_assert(!pin.is_linked()); + ceph_assert(!pin.pins); + ceph_assert(!pin.ref); + + auto [prev, inserted] = pins.insert(pin); + if (!inserted) { + crimson::get_logger(ceph_subsys_seastore_lba).error( + "{}: unable to add {} ({}), found {} ({})", + __func__, + pin, + *(pin.extent), + *prev, + *(prev->extent)); + ceph_assert(0 == "impossible"); + return; + } + pin.pins = this; + if (!pin.is_root()) { + auto *parent = maybe_get_parent(pin.range); + ceph_assert(parent); + if (!parent->has_ref()) { + crimson::get_logger(ceph_subsys_seastore_lba + ).debug("{}: acquiring parent {}", __func__, + static_cast(parent)); + parent->acquire_ref(); + } else { + crimson::get_logger(ceph_subsys_seastore_lba).debug( + "{}: parent has ref {}", __func__, + static_cast(parent)); + } + } + if (maybe_get_first_child(pin.range) != nullptr) { + crimson::get_logger(ceph_subsys_seastore_lba).debug( + "{}: acquiring self {}", __func__, pin); + pin.acquire_ref(); + } + } + + + /** + * retire/check_parent + * + * See BtreeLBAManager::complete_transaction. + * retire removes the specified pin from the set, but does not + * check parents. After any new extents are added to the set, + * the caller is required to call check_parent to restore the + * invariant. 
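+ *
+ * A sketch of the expected sequence (simplified; obtaining the pins,
+ * e.g. via BtreeLBAManager::get_pin, is elided):
+ *
+ *   pin_set.retire(retired_pin);        // parents deliberately unchecked
+ *   pin_set.add_pin(fresh_pin);
+ *   pin_set.check_parent(retired_pin);  // restore the invariant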
+ */ + void retire(btree_range_pin_t &pin) + { + pin.drop_ref(); + remove_pin(pin, false); + } + + void check_parent(btree_range_pin_t &pin) + { + auto parent = maybe_get_parent(pin.range); + if (parent) { + crimson::get_logger(ceph_subsys_seastore_lba + ).debug("{}: releasing parent {}", __func__, *parent); + release_if_no_children(*parent); + } + } + + template + void scan(F &&f) { + for (auto &i : pins) { + std::invoke(f, i); + } + } + + ~btree_pin_set_t() { + ceph_assert(pins.empty()); + } +}; + +template +class BtreeNodePin : public PhysicalNodePin { + + /** + * parent + * + * populated until link_extent is called to ensure cache residence + * until add_pin is called. + */ + CachedExtentRef parent; + + paddr_t paddr; + btree_range_pin_t pin; + +public: + BtreeNodePin() = default; + + BtreeNodePin( + CachedExtentRef parent, + paddr_t paddr, + fixed_kv_node_meta_t &&meta) + : parent(parent), paddr(paddr) { + pin.set_range(std::move(meta)); + } + + btree_range_pin_t& get_range_pin() { + return pin; + } + + CachedExtentRef get_parent() { + return parent; + } + + void set_parent(CachedExtentRef pin) { + parent = pin; + } + + void link_extent(LogicalCachedExtent *ref) final { + pin.set_extent(ref); + } + + extent_len_t get_length() const final { + ceph_assert(pin.range.end > pin.range.begin); + return pin.range.end - pin.range.begin; + } + + paddr_t get_paddr() const final { + return paddr; + } + + key_t get_key() const final { + return pin.range.begin; + } + + PhysicalNodePinRef duplicate() const final { + auto ret = std::unique_ptr>( + new BtreeNodePin); + ret->pin.set_range(pin.range); + ret->paddr = paddr; + ret->parent = parent; + return ret; + } + + void take_pin(PhysicalNodePin &opin) final { + pin.take_pin(static_cast&>(opin).pin); + } + + bool has_been_invalidated() const final { + return parent->has_been_invalidated(); + } +}; + +} diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h new file mode 100644 index 00000000000..1892992690e --- /dev/null +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -0,0 +1,1680 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include +#include +#include +#include + +#include "crimson/os/seastore/logging.h" + +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/btree/btree_range_pin.h" + +namespace crimson::os::seastore { + +template +struct op_context_t { + Cache &cache; + Transaction &trans; + btree_pin_set_t *pins = nullptr; +}; + +template +Transaction::tree_stats_t& get_tree_stats(Transaction &t); + +template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + size_t node_size> +class FixedKVBtree { + static constexpr size_t MAX_DEPTH = 16; + using self_type = FixedKVBtree< + node_key_t, + node_val_t, + internal_node_t, + leaf_node_t, + node_size>; +public: + using InternalNodeRef = TCachedExtentRef; + using LeafNodeRef = TCachedExtentRef; + + using base_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using base_iertr = trans_iertr; + + class iterator; + using iterator_fut = base_iertr::future; + + using mapped_space_visitor_t = std::function< + void(paddr_t, extent_len_t)>; + + class iterator { + public: + iterator(const iterator &rhs) noexcept : + internal(rhs.internal), leaf(rhs.leaf) {} + iterator(iterator &&rhs) noexcept : + 
internal(std::move(rhs.internal)), leaf(std::move(rhs.leaf)) {} + + iterator &operator=(const iterator &) = default; + iterator &operator=(iterator &&) = default; + + iterator_fut next( + op_context_t c, + mapped_space_visitor_t *visitor=nullptr) const + { + assert_valid(); + assert(!is_end()); + + auto ret = *this; + ret.leaf.pos++; + if (ret.at_boundary()) { + return seastar::do_with( + ret, + [c, visitor](auto &ret) mutable { + return ret.handle_boundary( + c, visitor + ).si_then([&ret] { + return std::move(ret); + }); + }); + } else { + return iterator_fut( + interruptible::ready_future_marker{}, + ret); + } + + } + + iterator_fut prev(op_context_t c) const + { + assert_valid(); + assert(!is_begin()); + + auto ret = *this; + + if (ret.leaf.pos > 0) { + ret.leaf.pos--; + return iterator_fut( + interruptible::ready_future_marker{}, + ret); + } + + depth_t depth_with_space = 2; + for (; depth_with_space <= get_depth(); ++depth_with_space) { + if (ret.get_internal(depth_with_space).pos > 0) { + break; + } + } + + assert(depth_with_space <= ret.get_depth()); // must not be begin() + return seastar::do_with( + std::move(ret), + [](const internal_node_t &internal) { return --internal.end(); }, + [](const leaf_node_t &leaf) { return --leaf.end(); }, + [c, depth_with_space](auto &ret, auto &li, auto &ll) { + for (depth_t depth = 2; depth < depth_with_space; ++depth) { + ret.get_internal(depth).reset(); + } + ret.leaf.reset(); + ret.get_internal(depth_with_space).pos--; + // note, cannot result in at_boundary() by construction + return lookup_depth_range( + c, ret, depth_with_space - 1, 0, li, ll, nullptr + ).si_then([&ret] { + assert(!ret.at_boundary()); + return std::move(ret); + }); + }); + } + + void assert_valid() const { + assert(leaf.node); + assert(leaf.pos <= leaf.node->get_size()); + + for (auto &i: internal) { + (void)i; + assert(i.node); + assert(i.pos < i.node->get_size()); + } + } + + depth_t get_depth() const { + return internal.size() + 1; + } + + auto &get_internal(depth_t depth) { + assert(depth > 1); + assert((depth - 2) < internal.size()); + return internal[depth - 2]; + } + + const auto &get_internal(depth_t depth) const { + assert(depth > 1); + assert((depth - 2) < internal.size()); + return internal[depth - 2]; + } + + node_key_t get_key() const { + assert(!is_end()); + return leaf.node->iter_idx(leaf.pos).get_key(); + } + node_val_t get_val() const { + assert(!is_end()); + auto ret = leaf.node->iter_idx(leaf.pos).get_val(); + ret.paddr = ret.paddr.maybe_relative_to(leaf.node->get_paddr()); + return ret; + } + + bool is_end() const { + // external methods may only resolve at a boundary if at end + return at_boundary(); + } + + bool is_begin() const { + for (auto &i: internal) { + if (i.pos != 0) + return false; + } + return leaf.pos == 0; + } + + PhysicalNodePinRef get_pin() const { + assert(!is_end()); + auto val = get_val(); + auto key = get_key(); + return std::make_unique>( + leaf.node, + val.paddr, + fixed_kv_node_meta_t{ key, key + val.len, 0 }); + } + + typename leaf_node_t::Ref get_leaf_node() { + return leaf.node; + } + + private: + iterator() noexcept {} + iterator(depth_t depth) noexcept : internal(depth - 1) {} + + friend class FixedKVBtree; + static constexpr uint16_t INVALID = std::numeric_limits::max(); + template + struct node_position_t { + typename NodeType::Ref node; + uint16_t pos = INVALID; + + void reset() { + *this = node_position_t{}; + } + + auto get_iter() { + assert(pos != INVALID); + assert(pos < node->get_size()); + return node->iter_idx(pos); 
+ } + }; + boost::container::static_vector< + node_position_t, MAX_DEPTH> internal; + node_position_t leaf; + + bool at_boundary() const { + assert(leaf.pos <= leaf.node->get_size()); + return leaf.pos == leaf.node->get_size(); + } + + using handle_boundary_ertr = base_iertr; + using handle_boundary_ret = handle_boundary_ertr::future<>; + handle_boundary_ret handle_boundary( + op_context_t c, + mapped_space_visitor_t *visitor) + { + assert(at_boundary()); + depth_t depth_with_space = 2; + for (; depth_with_space <= get_depth(); ++depth_with_space) { + if ((get_internal(depth_with_space).pos + 1) < + get_internal(depth_with_space).node->get_size()) { + break; + } + } + + if (depth_with_space <= get_depth()) { + return seastar::do_with( + [](const internal_node_t &internal) { return internal.begin(); }, + [](const leaf_node_t &leaf) { return leaf.begin(); }, + [this, c, depth_with_space, visitor](auto &li, auto &ll) { + for (depth_t depth = 2; depth < depth_with_space; ++depth) { + get_internal(depth).reset(); + } + leaf.reset(); + get_internal(depth_with_space).pos++; + // note, cannot result in at_boundary() by construction + return lookup_depth_range( + c, *this, depth_with_space - 1, 0, li, ll, visitor + ); + }); + } else { + // end + return seastar::now(); + } + } + + depth_t check_split() const { + if (!leaf.node->at_max_capacity()) { + return 0; + } + for (depth_t split_from = 1; split_from < get_depth(); ++split_from) { + if (!get_internal(split_from + 1).node->at_max_capacity()) + return split_from; + } + return get_depth(); + } + + depth_t check_merge() const { + if (!leaf.node->below_min_capacity()) { + return 0; + } + for (depth_t merge_from = 1; merge_from < get_depth(); ++merge_from) { + if (!get_internal(merge_from + 1).node->below_min_capacity()) + return merge_from; + } + return get_depth(); + } + }; + + FixedKVBtree(phy_tree_root_t root) : root(root) {} + + bool is_root_dirty() const { + return root_dirty; + } + phy_tree_root_t get_root_undirty() { + ceph_assert(root_dirty); + root_dirty = false; + return root; + } + + /// mkfs + using mkfs_ret = phy_tree_root_t; + static mkfs_ret mkfs(op_context_t c) { + auto root_leaf = c.cache.template alloc_new_extent( + c.trans, + node_size); + root_leaf->set_size(0); + fixed_kv_node_meta_t meta{0, L_ADDR_MAX, 1}; + root_leaf->set_meta(meta); + root_leaf->pin.set_range(meta); + c.trans.get_lba_tree_stats().depth = 1u; + return phy_tree_root_t{root_leaf->get_paddr(), 1u}; + } + + /** + * lower_bound + * + * @param c [in] context + * @param addr [in] ddr + * @return least iterator >= key + */ + iterator_fut lower_bound( + op_context_t c, + node_key_t addr, + mapped_space_visitor_t *visitor=nullptr) const + { + LOG_PREFIX(FixedKVBtree::lower_bound); + return lookup( + c, + [addr](const internal_node_t &internal) { + assert(internal.get_size() > 0); + auto iter = internal.upper_bound(addr); + assert(iter != internal.begin()); + --iter; + return iter; + }, + [FNAME, c, addr](const leaf_node_t &leaf) { + auto ret = leaf.lower_bound(addr); + SUBDEBUGT( + seastore_lba_details, + "leaf addr {}, got ret offset {}, size {}, end {}", + c.trans, + addr, + ret.get_offset(), + leaf.get_size(), + ret == leaf.end()); + return ret; + }, + visitor + ).si_then([FNAME, c](auto &&ret) { + SUBDEBUGT( + seastore_lba_details, + "ret.leaf.pos {}", + c.trans, + ret.leaf.pos); + ret.assert_valid(); + return std::move(ret); + }); + } + + + /** + * upper_bound + * + * @param c [in] context + * @param addr [in] ddr + * @return least iterator > key + */ + 
iterator_fut upper_bound( + op_context_t c, + node_key_t addr + ) const { + return lower_bound( + c, addr + ).si_then([c, addr](auto iter) { + if (!iter.is_end() && iter.get_key() == addr) { + return iter.next(c); + } else { + return iterator_fut( + interruptible::ready_future_marker{}, + iter); + } + }); + } + + /** + * upper_bound_right + * + * @param c [in] context + * @param addr [in] addr + * @return least iterator i s.t. i.get_key() + i.get_val().len > key + */ + iterator_fut upper_bound_right( + op_context_t c, + node_key_t addr) const + { + return lower_bound( + c, addr + ).si_then([c, addr](auto iter) { + if (iter.is_begin()) { + return iterator_fut( + interruptible::ready_future_marker{}, + iter); + } else { + return iter.prev( + c + ).si_then([iter, addr](auto prev) { + if ((prev.get_key() + prev.get_val().len) > addr) { + return iterator_fut( + interruptible::ready_future_marker{}, + prev); + } else { + return iterator_fut( + interruptible::ready_future_marker{}, + iter); + } + }); + } + }); + } + + iterator_fut begin(op_context_t c) const { + return lower_bound(c, 0); + } + iterator_fut end(op_context_t c) const { + return upper_bound(c, L_ADDR_MAX); + } + + using iterate_repeat_ret_inner = base_iertr::future< + seastar::stop_iteration>; + template + static base_iertr::future<> iterate_repeat( + op_context_t c, + iterator_fut &&iter_fut, + F &&f, + mapped_space_visitor_t *visitor=nullptr) { + return std::move( + iter_fut + ).si_then([c, visitor, f=std::forward(f)](auto iter) { + return seastar::do_with( + iter, + std::move(f), + [c, visitor](auto &pos, auto &f) { + return trans_intr::repeat( + [c, visitor, &f, &pos] { + return f( + pos + ).si_then([c, visitor, &pos](auto done) { + if (done == seastar::stop_iteration::yes) { + return iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } else { + ceph_assert(!pos.is_end()); + return pos.next( + c, visitor + ).si_then([&pos](auto next) { + pos = next; + return iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + }); + } + }); + }); + }); + }); + } + + /** + * insert + * + * Inserts val at laddr with iter as a hint. If element at laddr already + * exists returns iterator to that element unchanged and returns false. + * + * Invalidates all outstanding iterators for this tree on this transaction. + * + * @param c [in] op context + * @param iter [in] hint, insertion constant if immediately prior to iter + * @param laddr [in] addr at which to insert + * @param val [in] val to insert + * @return pair where iter points to element at addr, bool true + * iff element at laddr did not exist. + */ + using insert_iertr = base_iertr; + using insert_ret = insert_iertr::future>; + insert_ret insert( + op_context_t c, + iterator iter, + node_key_t laddr, + node_val_t val + ) { + LOG_PREFIX(FixedKVBtree::insert); + SUBDEBUGT( + seastore_lba_details, + "inserting laddr {} at iter {}", + c.trans, + laddr, + iter.is_end() ? 
L_ADDR_MAX : iter.get_key()); + return seastar::do_with( + iter, + [this, c, laddr, val](auto &ret) { + return find_insertion( + c, laddr, ret + ).si_then([this, c, laddr, val, &ret] { + if (!ret.at_boundary() && ret.get_key() == laddr) { + return insert_ret( + interruptible::ready_future_marker{}, + std::make_pair(ret, false)); + } else { + ++(c.trans.get_lba_tree_stats().num_inserts); + return handle_split( + c, ret + ).si_then([c, laddr, val, &ret] { + if (!ret.leaf.node->is_pending()) { + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, ret.leaf.node + ); + ret.leaf.node = mut->cast(); + } + auto iter = typename leaf_node_t::const_iterator( + ret.leaf.node.get(), ret.leaf.pos); + assert(iter == ret.leaf.node->lower_bound(laddr)); + assert(iter == ret.leaf.node->end() || iter->get_key() > laddr); + assert(laddr >= ret.leaf.node->get_meta().begin && + laddr < ret.leaf.node->get_meta().end); + ret.leaf.node->insert(iter, laddr, val); + return insert_ret( + interruptible::ready_future_marker{}, + std::make_pair(ret, true)); + }); + } + }); + }); + } + + insert_ret insert( + op_context_t c, + node_key_t laddr, + node_val_t val) { + return lower_bound( + c, laddr + ).si_then([this, c, laddr, val](auto iter) { + return this->insert(c, iter, laddr, val); + }); + } + + /** + * update + * + * Invalidates all outstanding iterators for this tree on this transaction. + * + * @param c [in] op context + * @param iter [in] iterator to element to update, must not be end + * @param val [in] val with which to update + * @return iterator to newly updated element + */ + using update_iertr = base_iertr; + using update_ret = update_iertr::future; + update_ret update( + op_context_t c, + iterator iter, + node_val_t val) + { + LOG_PREFIX(FixedKVBtree::update); + SUBDEBUGT( + seastore_lba_details, + "update element at {}", + c.trans, + iter.is_end() ? L_ADDR_MAX : iter.get_key()); + if (!iter.leaf.node->is_pending()) { + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, iter.leaf.node + ); + iter.leaf.node = mut->cast(); + } + iter.leaf.node->update( + iter.leaf.node->iter_idx(iter.leaf.pos), + val); + return update_ret( + interruptible::ready_future_marker{}, + iter); + } + + + /** + * remove + * + * Invalidates all outstanding iterators for this tree on this transaction. + * + * @param c [in] op context + * @param iter [in] iterator to element to remove, must not be end + */ + using remove_iertr = base_iertr; + using remove_ret = remove_iertr::future<>; + remove_ret remove( + op_context_t c, + iterator iter) + { + LOG_PREFIX(FixedKVBtree::remove); + SUBDEBUGT( + seastore_lba_details, + "remove element at {}", + c.trans, + iter.is_end() ? L_ADDR_MAX : iter.get_key()); + assert(!iter.is_end()); + ++(c.trans.get_lba_tree_stats().num_erases); + return seastar::do_with( + iter, + [this, c](auto &ret) { + if (!ret.leaf.node->is_pending()) { + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, ret.leaf.node + ); + ret.leaf.node = mut->cast(); + } + ret.leaf.node->remove( + ret.leaf.node->iter_idx(ret.leaf.pos)); + + return handle_merge( + c, ret + ); + }); + } + + /** + * init_cached_extent + * + * Checks whether e is live (reachable from fixed kv tree) and drops or initializes + * accordingly. + * + * Returns if e is live. 
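+ *
+ * Illustrative use (extent and callback names hypothetical):
+ *
+ *   btree.init_cached_extent(c, extent
+ *   ).si_then([](bool is_alive) {
+ *     // false => extent is not reachable from the tree
+ *   });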
+ */ + using init_cached_extent_iertr = base_iertr; + using init_cached_extent_ret = init_cached_extent_iertr::future; + init_cached_extent_ret init_cached_extent( + op_context_t c, + CachedExtentRef e) + { + assert(!e->is_logical()); + LOG_PREFIX(FixedKVTree::init_cached_extent); + SUBDEBUGT(seastore_lba_details, "extent {}", c.trans, *e); + if (e->get_type() == internal_node_t::TYPE) { + auto eint = e->cast(); + return lower_bound( + c, eint->get_node_meta().begin + ).si_then([e, c, eint](auto iter) { + // Note, this check is valid even if iter.is_end() + LOG_PREFIX(FixedKVTree::init_cached_extent); + depth_t cand_depth = eint->get_node_meta().depth; + if (cand_depth <= iter.get_depth() && + &*iter.get_internal(cand_depth).node == &*eint) { + SUBDEBUGT( + seastore_lba_details, + "extent {} is live", + c.trans, + *eint); + return true; + } else { + SUBDEBUGT( + seastore_lba_details, + "extent {} is not live", + c.trans, + *eint); + return false; + } + }); + } else if (e->get_type() == leaf_node_t::TYPE) { + auto eleaf = e->cast(); + return lower_bound( + c, eleaf->get_node_meta().begin + ).si_then([c, e, eleaf](auto iter) { + // Note, this check is valid even if iter.is_end() + LOG_PREFIX(FixedKVTree::init_cached_extent); + if (iter.leaf.node == &*eleaf) { + SUBDEBUGT( + seastore_lba_details, + "extent {} is live", + c.trans, + *eleaf); + return true; + } else { + SUBDEBUGT( + seastore_lba_details, + "extent {} is not live", + c.trans, + *eleaf); + return false; + } + }); + } else { + SUBDEBUGT( + seastore_lba_details, + "found other extent {} type {}", + c.trans, + *e, + e->get_type()); + return init_cached_extent_ret( + interruptible::ready_future_marker{}, + true); + } + } + + /// get_leaf_if_live: get leaf node at laddr/addr if still live + using get_leaf_if_live_iertr = base_iertr; + using get_leaf_if_live_ret = get_leaf_if_live_iertr::future; + get_leaf_if_live_ret get_leaf_if_live( + op_context_t c, + paddr_t addr, + node_key_t laddr, + seastore_off_t len) + { + LOG_PREFIX(FixedKVBtree::get_leaf_if_live); + return lower_bound( + c, laddr + ).si_then([FNAME, c, addr, laddr, len](auto iter) { + if (iter.leaf.node->get_paddr() == addr) { + SUBDEBUGT( + seastore_lba_details, + "extent laddr {} addr {}~{} found: {}", + c.trans, + laddr, + addr, + len, + *iter.leaf.node); + return CachedExtentRef(iter.leaf.node); + } else { + SUBDEBUGT( + seastore_lba_details, + "extent laddr {} addr {}~{} is not live, does not match node {}", + c.trans, + laddr, + addr, + len, + *iter.leaf.node); + return CachedExtentRef(); + } + }); + } + + + /// get_internal_if_live: get internal node at laddr/addr if still live + using get_internal_if_live_iertr = base_iertr; + using get_internal_if_live_ret = get_internal_if_live_iertr::future; + get_internal_if_live_ret get_internal_if_live( + op_context_t c, + paddr_t addr, + node_key_t laddr, + seastore_off_t len) + { + LOG_PREFIX(FixedKVBtree::get_leaf_if_live); + return lower_bound( + c, laddr + ).si_then([FNAME, c, addr, laddr, len](auto iter) { + for (depth_t d = 2; d <= iter.get_depth(); ++d) { + CachedExtent &node = *iter.get_internal(d).node; + auto internal_node = node.cast(); + if (internal_node->get_paddr() == addr) { + SUBDEBUGT( + seastore_lba_details, + "extent laddr {} addr {}~{} found: {}", + c.trans, + laddr, + addr, + len, + *internal_node); + assert(internal_node->get_node_meta().begin == laddr); + return CachedExtentRef(internal_node); + } + } + SUBDEBUGT( + seastore_lba_details, + "extent laddr {} addr {}~{} is not live, no matching internal 
node", + c.trans, + laddr, + addr, + len); + return CachedExtentRef(); + }); + } + + + /** + * rewrite_extent + * + * Rewrites a fresh copy of extent into transaction and updates internal + * references. + */ + using rewrite_extent_iertr = base_iertr; + using rewrite_extent_ret = rewrite_extent_iertr::future<>; + rewrite_extent_ret rewrite_extent( + op_context_t c, + CachedExtentRef e) { + LOG_PREFIX(FixedKVBtree::rewrite_extent); + assert(e->get_type() == extent_types_t::LADDR_INTERNAL || + e->get_type() == extent_types_t::LADDR_LEAF); + + auto do_rewrite = [&](auto &fixed_kv_extent) { + auto n_fixed_kv_extent = c.cache.template alloc_new_extent< + std::remove_reference_t + >( + c.trans, + fixed_kv_extent.get_length()); + fixed_kv_extent.get_bptr().copy_out( + 0, + fixed_kv_extent.get_length(), + n_fixed_kv_extent->get_bptr().c_str()); + n_fixed_kv_extent->pin.set_range(n_fixed_kv_extent->get_node_meta()); + + /* This is a bit underhanded. Any relative addrs here must necessarily + * be record relative as we are rewriting a dirty extent. Thus, we + * are using resolve_relative_addrs with a (likely negative) block + * relative offset to correct them to block-relative offsets adjusted + * for our new transaction location. + * + * Upon commit, these now block relative addresses will be interpretted + * against the real final address. + */ + n_fixed_kv_extent->resolve_relative_addrs( + make_record_relative_paddr(0) - n_fixed_kv_extent->get_paddr()); + + SUBDEBUGT( + seastore_lba_details, + "rewriting {} into {}", + c.trans, + fixed_kv_extent, + *n_fixed_kv_extent); + + return update_internal_mapping( + c, + n_fixed_kv_extent->get_node_meta().depth, + n_fixed_kv_extent->get_node_meta().begin, + e->get_paddr(), + n_fixed_kv_extent->get_paddr() + ).si_then([c, e] { + c.cache.retire_extent(c.trans, e); + }); + }; + + CachedExtentRef n_fixed_kv_extent; + if (e->get_type() == internal_node_t::TYPE) { + auto lint = e->cast(); + return do_rewrite(*lint); + } else { + assert(e->get_type() == leaf_node_t::TYPE); + auto lleaf = e->cast(); + return do_rewrite(*lleaf); + } + } + + using update_internal_mapping_iertr = base_iertr; + using update_internal_mapping_ret = update_internal_mapping_iertr::future<>; + update_internal_mapping_ret update_internal_mapping( + op_context_t c, + depth_t depth, + node_key_t laddr, + paddr_t old_addr, + paddr_t new_addr) + { + LOG_PREFIX(FixedKVBtree::update_internal_mapping); + SUBDEBUGT( + seastore_lba_details, + "updating laddr {} at depth {} from {} to {}", + c.trans, + laddr, + depth, + old_addr, + new_addr); + + return lower_bound( + c, laddr + ).si_then([=](auto iter) { + assert(iter.get_depth() >= depth); + if (depth == iter.get_depth()) { + SUBDEBUGT(seastore_lba_details, "update at root", c.trans); + + if (laddr != 0) { + SUBERRORT( + seastore_lba_details, + "updating root laddr {} at depth {} from {} to {}," + "laddr is not 0", + c.trans, + laddr, + depth, + old_addr, + new_addr, + root.get_location()); + ceph_assert(0 == "impossible"); + } + + if (root.get_location() != old_addr) { + SUBERRORT( + seastore_lba_details, + "updating root laddr {} at depth {} from {} to {}," + "root addr {} does not match", + c.trans, + laddr, + depth, + old_addr, + new_addr, + root.get_location()); + ceph_assert(0 == "impossible"); + } + + root.set_location(new_addr); + root_dirty = true; + } else { + auto &parent = iter.get_internal(depth + 1); + assert(parent.node); + assert(parent.pos < parent.node->get_size()); + auto piter = parent.node->iter_idx(parent.pos); + + if 
(piter->get_key() != laddr) { + SUBERRORT( + seastore_lba_details, + "updating laddr {} at depth {} from {} to {}," + "node {} pos {} val pivot addr {} does not match", + c.trans, + laddr, + depth, + old_addr, + new_addr, + *(parent.node), + parent.pos, + piter->get_key()); + ceph_assert(0 == "impossible"); + } + + + if (piter->get_val() != old_addr) { + SUBERRORT( + seastore_lba_details, + "updating laddr {} at depth {} from {} to {}," + "node {} pos {} val addr {} does not match", + c.trans, + laddr, + depth, + old_addr, + new_addr, + *(parent.node), + parent.pos, + piter->get_val()); + ceph_assert(0 == "impossible"); + } + + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, + parent.node + ); + typename internal_node_t::Ref mparent = mut->cast(); + mparent->update(piter, new_addr); + + /* Note, iter is now invalid as we didn't udpate either the parent + * node reference to the new mutable instance nor did we update the + * child pointer to the new node. Not a problem as we'll now just + * destruct it. + */ + } + return seastar::now(); + }); + } + + +private: + phy_tree_root_t root; + bool root_dirty = false; + + using get_internal_node_iertr = base_iertr; + using get_internal_node_ret = get_internal_node_iertr::future; + static get_internal_node_ret get_internal_node( + op_context_t c, + depth_t depth, + paddr_t offset, + node_key_t begin, + node_key_t end) + { + LOG_PREFIX(FixedKVBtree::get_internal_node); + SUBDEBUGT( + seastore_lba_details, + "reading internal at offset {}, depth {}, begin {}, end {}", + c.trans, + offset, + depth, + begin, + end); + assert(depth > 1); + auto init_internal = [c, depth, begin, end](internal_node_t &node) { + assert(!node.is_pending()); + assert(!node.pin.is_linked()); + node.pin.set_range(fixed_kv_node_meta_t{begin, end, depth}); + if (c.pins) { + c.pins->add_pin(node.pin); + } + }; + return c.cache.template get_extent( + c.trans, + offset, + node_size, + init_internal + ).si_then([FNAME, c, offset, init_internal, depth, begin, end]( + typename internal_node_t::Ref ret) { + SUBDEBUGT( + seastore_lba_details, + "read internal at offset {} {}", + c.trans, + offset, + *ret); + // This can only happen during init_cached_extent + if (c.pins && !ret->is_pending() && !ret->pin.is_linked()) { + assert(ret->is_dirty()); + init_internal(*ret); + } + auto meta = ret->get_meta(); + if (ret->get_size()) { + ceph_assert(meta.begin <= ret->begin()->get_key()); + ceph_assert(meta.end > (ret->end() - 1)->get_key()); + } + ceph_assert(depth == meta.depth); + ceph_assert(begin == meta.begin); + ceph_assert(end == meta.end); + return get_internal_node_ret( + interruptible::ready_future_marker{}, + ret); + }); + } + + + using get_leaf_node_iertr = base_iertr; + using get_leaf_node_ret = get_leaf_node_iertr::future; + static get_leaf_node_ret get_leaf_node( + op_context_t c, + paddr_t offset, + node_key_t begin, + node_key_t end) + { + LOG_PREFIX(FixedKVBtree::get_leaf_node); + SUBDEBUGT( + seastore_lba_details, + "reading leaf at offset {}, begin {}, end {}", + c.trans, + offset, + begin, + end); + auto init_leaf = [c, begin, end](leaf_node_t &node) { + assert(!node.is_pending()); + assert(!node.pin.is_linked()); + node.pin.set_range(fixed_kv_node_meta_t{begin, end, 1}); + if (c.pins) { + c.pins->add_pin(node.pin); + } + }; + return c.cache.template get_extent( + c.trans, + offset, + node_size, + init_leaf + ).si_then([FNAME, c, offset, init_leaf, begin, end] + (typename leaf_node_t::Ref ret) { + SUBDEBUGT( + seastore_lba_details, + "read leaf at offset {} 
{}", + c.trans, + offset, + *ret); + // This can only happen during init_cached_extent + if (c.pins && !ret->is_pending() && !ret->pin.is_linked()) { + assert(ret->is_dirty()); + init_leaf(*ret); + } + auto meta = ret->get_meta(); + if (ret->get_size()) { + ceph_assert(meta.begin <= ret->begin()->get_key()); + ceph_assert(meta.end > (ret->end() - 1)->get_key()); + } + ceph_assert(1 == meta.depth); + ceph_assert(begin == meta.begin); + ceph_assert(end == meta.end); + return get_leaf_node_ret( + interruptible::ready_future_marker{}, + ret); + }); + } + + using lookup_root_iertr = base_iertr; + using lookup_root_ret = lookup_root_iertr::future<>; + lookup_root_ret lookup_root( + op_context_t c, + iterator &iter, + mapped_space_visitor_t *visitor) const { + if (root.get_depth() > 1) { + return get_internal_node( + c, + root.get_depth(), + root.get_location(), + 0, + L_ADDR_MAX + ).si_then([this, visitor, &iter](InternalNodeRef root_node) { + iter.get_internal(root.get_depth()).node = root_node; + if (visitor) (*visitor)(root_node->get_paddr(), root_node->get_length()); + return lookup_root_iertr::now(); + }); + } else { + return get_leaf_node( + c, + root.get_location(), + 0, + L_ADDR_MAX + ).si_then([visitor, &iter](LeafNodeRef root_node) { + iter.leaf.node = root_node; + if (visitor) (*visitor)(root_node->get_paddr(), root_node->get_length()); + return lookup_root_iertr::now(); + }); + } + } + + using lookup_internal_level_iertr = base_iertr; + using lookup_internal_level_ret = lookup_internal_level_iertr::future<>; + template + static lookup_internal_level_ret lookup_internal_level( + op_context_t c, + depth_t depth, + iterator &iter, + F &f, + mapped_space_visitor_t *visitor + ) { + assert(depth > 1); + auto &parent_entry = iter.get_internal(depth + 1); + auto parent = parent_entry.node; + auto node_iter = parent->iter_idx(parent_entry.pos); + auto next_iter = node_iter + 1; + auto begin = node_iter->get_key(); + auto end = next_iter == parent->end() + ? parent->get_node_meta().end + : next_iter->get_key(); + return get_internal_node( + c, + depth, + node_iter->get_val().maybe_relative_to(parent->get_paddr()), + begin, + end + ).si_then([depth, visitor, &iter, &f](InternalNodeRef node) { + auto &entry = iter.get_internal(depth); + entry.node = node; + auto node_iter = f(*node); + assert(node_iter != node->end()); + entry.pos = node_iter->get_offset(); + if (visitor) (*visitor)(node->get_paddr(), node->get_length()); + return seastar::now(); + }); + } + + using lookup_leaf_iertr = base_iertr; + using lookup_leaf_ret = lookup_leaf_iertr::future<>; + template + static lookup_internal_level_ret lookup_leaf( + op_context_t c, + iterator &iter, + F &f, + mapped_space_visitor_t *visitor + ) { + auto &parent_entry = iter.get_internal(2); + auto parent = parent_entry.node; + assert(parent); + auto node_iter = parent->iter_idx(parent_entry.pos); + auto next_iter = node_iter + 1; + auto begin = node_iter->get_key(); + auto end = next_iter == parent->end() + ? parent->get_node_meta().end + : next_iter->get_key(); + + return get_leaf_node( + c, + node_iter->get_val().maybe_relative_to(parent->get_paddr()), + begin, + end + ).si_then([visitor, &iter, &f](LeafNodeRef node) { + iter.leaf.node = node; + auto node_iter = f(*node); + iter.leaf.pos = node_iter->get_offset(); + if (visitor) (*visitor)(node->get_paddr(), node->get_length()); + return seastar::now(); + }); + } + + /** + * lookup_depth_range + * + * Performs node lookups on depths [from, to) using li and ll to + * specific target at each level. 
Note, may leave the iterator + * at_boundary(), call handle_boundary() prior to returning out + * lf FixedKVBtree. + */ + using lookup_depth_range_iertr = base_iertr; + using lookup_depth_range_ret = lookup_depth_range_iertr::future<>; + template + static lookup_depth_range_ret lookup_depth_range( + op_context_t c, ///< [in] context + iterator &iter, ///< [in,out] iterator to populate + depth_t from, ///< [in] from inclusive + depth_t to, ///< [in] to exclusive, (to <= from, to == from is a noop) + LI &li, ///< [in] internal->iterator + LL &ll, ///< [in] leaf->iterator + mapped_space_visitor_t *visitor ///< [in] mapped space visitor + ) { + LOG_PREFIX(FixedKVBtree::lookup_depth_range); + SUBDEBUGT(seastore_lba_details, "{} -> {}", c.trans, from, to); + return seastar::do_with( + from, + [c, to, visitor, &iter, &li, &ll](auto &d) { + return trans_intr::repeat( + [c, to, visitor, &iter, &li, &ll, &d] { + if (d > to) { + return [&] { + if (d > 1) { + return lookup_internal_level( + c, + d, + iter, + li, + visitor); + } else { + assert(d == 1); + return lookup_leaf( + c, + iter, + ll, + visitor); + } + }().si_then([&d] { + --d; + return lookup_depth_range_iertr::make_ready_future< + seastar::stop_iteration + >(seastar::stop_iteration::no); + }); + } else { + return lookup_depth_range_iertr::make_ready_future< + seastar::stop_iteration + >(seastar::stop_iteration::yes); + } + }); + }); + } + + using lookup_iertr = base_iertr; + using lookup_ret = lookup_iertr::future; + template + lookup_ret lookup( + op_context_t c, + LI &&lookup_internal, + LL &&lookup_leaf, + mapped_space_visitor_t *visitor + ) const { + LOG_PREFIX(FixedKVBtree::lookup); + return seastar::do_with( + iterator{root.get_depth()}, + std::forward
  • (lookup_internal), + std::forward(lookup_leaf), + [FNAME, this, visitor, c](auto &iter, auto &li, auto &ll) { + return lookup_root( + c, iter, visitor + ).si_then([FNAME, this, visitor, c, &iter, &li, &ll] { + if (iter.get_depth() > 1) { + auto &root_entry = *(iter.internal.rbegin()); + root_entry.pos = li(*(root_entry.node)).get_offset(); + } else { + auto &root_entry = iter.leaf; + auto riter = ll(*(root_entry.node)); + root_entry.pos = riter->get_offset(); + } + SUBDEBUGT(seastore_lba_details, "got root, depth {}", c.trans, root.get_depth()); + return lookup_depth_range( + c, + iter, + root.get_depth() - 1, + 0, + li, + ll, + visitor + ).si_then([c, visitor, &iter] { + if (iter.at_boundary()) { + return iter.handle_boundary(c, visitor); + } else { + return lookup_iertr::now(); + } + }); + }).si_then([&iter] { + return std::move(iter); + }); + }); + } + + /** + * find_insertion + * + * Prepare iter for insertion. iter should begin pointing at + * the valid insertion point (lower_bound(laddr)). + * + * Upon completion, iter will point at the + * position at which laddr should be inserted. iter may, upon completion, + * point at the end of a leaf other than the end leaf if that's the correct + * insertion point. + */ + using find_insertion_iertr = base_iertr; + using find_insertion_ret = find_insertion_iertr::future<>; + static find_insertion_ret find_insertion( + op_context_t c, + node_key_t laddr, + iterator &iter) + { + assert(iter.is_end() || iter.get_key() >= laddr); + if (!iter.is_end() && iter.get_key() == laddr) { + return seastar::now(); + } else if (iter.leaf.node->get_node_meta().begin <= laddr) { +#ifndef NDEBUG + auto p = iter; + if (p.leaf.pos > 0) { + --p.leaf.pos; + assert(p.get_key() < laddr); + } +#endif + return seastar::now(); + } else { + assert(iter.leaf.pos == 0); + return iter.prev( + c + ).si_then([laddr, &iter](auto p) { + boost::ignore_unused(laddr); // avoid clang warning; + assert(p.leaf.node->get_node_meta().begin <= laddr); + assert(p.get_key() < laddr); + // Note, this is specifically allowed to violate the iterator + // invariant that pos is a valid index for the node in the event + // that the insertion point is at the end of a node. + p.leaf.pos++; + assert(p.at_boundary()); + iter = p; + return seastar::now(); + }); + } + } + + /** + * handle_split + * + * Split nodes in iter as needed for insertion. First, scan iter from leaf + * to find first non-full level. Then, split from there towards leaf. + * + * Upon completion, iter will point at the newly split insertion point. As + * with find_insertion, iter's leaf pointer may be end without iter being + * end. 
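+ *
+ * For example: if the leaf is full but its parent is not,
+ * check_split() returns 1 and only the leaf is split; if every node
+ * on the lookup path is full, check_split() returns get_depth(), a
+ * new root is allocated, and the tree grows by one level before the
+ * split cascades back down toward the leaf.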
+ */ + using handle_split_iertr = base_iertr; + using handle_split_ret = handle_split_iertr::future<>; + handle_split_ret handle_split( + op_context_t c, + iterator &iter) + { + LOG_PREFIX(FixedKVBtree::handle_split); + + depth_t split_from = iter.check_split(); + + SUBDEBUGT(seastore_lba_details, "split_from {}, depth {}", c.trans, split_from, iter.get_depth()); + + if (split_from == iter.get_depth()) { + auto nroot = c.cache.template alloc_new_extent( + c.trans, node_size); + fixed_kv_node_meta_t meta{0, L_ADDR_MAX, iter.get_depth() + 1}; + nroot->set_meta(meta); + nroot->pin.set_range(meta); + nroot->journal_insert( + nroot->begin(), + L_ADDR_MIN, + root.get_location(), + nullptr); + iter.internal.push_back({nroot, 0}); + + root.set_location(nroot->get_paddr()); + root.set_depth(iter.get_depth()); + c.trans.get_lba_tree_stats().depth = iter.get_depth(); + root_dirty = true; + } + + /* pos may be either node_position_t or + * node_position_t */ + auto split_level = [&](auto &parent_pos, auto &pos) { + LOG_PREFIX(FixedKVBtree::handle_split); + auto [left, right, pivot] = pos.node->make_split_children(c); + + auto parent_node = parent_pos.node; + auto parent_iter = parent_pos.get_iter(); + + parent_node->update( + parent_iter, + left->get_paddr()); + parent_node->insert( + parent_iter + 1, + pivot, + right->get_paddr()); + + SUBDEBUGT( + seastore_lba_details, + "splitted {} into left: {}, right: {}", + c.trans, + *pos.node, + *left, + *right); + c.cache.retire_extent(c.trans, pos.node); + + return std::make_pair(left, right); + }; + + for (; split_from > 0; --split_from) { + auto &parent_pos = iter.get_internal(split_from + 1); + if (!parent_pos.node->is_pending()) { + parent_pos.node = c.cache.duplicate_for_write( + c.trans, parent_pos.node + )->template cast(); + } + + if (split_from > 1) { + auto &pos = iter.get_internal(split_from); + SUBDEBUGT( + seastore_lba_details, + "splitting internal {} at depth {}, parent: {} at pos: {}", + c.trans, + *pos.node, + split_from, + *parent_pos.node, + parent_pos.pos); + auto [left, right] = split_level(parent_pos, pos); + + if (pos.pos < left->get_size()) { + pos.node = left; + } else { + pos.node = right; + pos.pos -= left->get_size(); + + parent_pos.pos += 1; + } + } else { + auto &pos = iter.leaf; + SUBDEBUGT( + seastore_lba_details, + "splitting leaf {}, parent: {} at pos: {}", + c.trans, + *pos.node, + *parent_pos.node, + parent_pos.pos); + auto [left, right] = split_level(parent_pos, pos); + + /* right->get_node_meta().begin == pivot == right->begin()->get_key() + * Thus, if pos.pos == left->get_size(), we want iter to point to + * left with pos.pos at the end rather than right with pos.pos = 0 + * since the insertion would be to the left of the first element + * of right and thus necessarily less than right->get_node_meta().begin. 
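+ *
+ * Worked example: if left->get_size() == 4 after the split and the
+ * insertion position was pos.pos == 4, the new key sorts before
+ * right->begin()->get_key(), so iter stays on left with pos.pos == 4
+ * (the one-past-the-end slot) rather than moving to right at pos 0.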
+ */ + if (pos.pos <= left->get_size()) { + pos.node = left; + } else { + pos.node = right; + pos.pos -= left->get_size(); + + parent_pos.pos += 1; + } + } + } + + return seastar::now(); + } + + + using handle_merge_iertr = base_iertr; + using handle_merge_ret = handle_merge_iertr::future<>; + handle_merge_ret handle_merge( + op_context_t c, + iterator &iter) + { + LOG_PREFIX(FixedKVBtree::handle_merge); + if (iter.get_depth() == 1 || + !iter.leaf.node->below_min_capacity()) { + SUBDEBUGT( + seastore_lba_details, + "no need to merge leaf, leaf size {}, depth {}", + c.trans, + iter.leaf.node->get_size(), + iter.get_depth()); + return seastar::now(); + } + + return seastar::do_with( + depth_t{1}, + [FNAME, this, c, &iter](auto &to_merge) { + return trans_intr::repeat( + [FNAME, this, c, &iter, &to_merge] { + SUBDEBUGT( + seastore_lba_details, + "merging depth {}", + c.trans, + to_merge); + auto &parent_pos = iter.get_internal(to_merge + 1); + auto merge_fut = handle_merge_iertr::now(); + if (to_merge > 1) { + auto &pos = iter.get_internal(to_merge); + merge_fut = merge_level(c, to_merge, parent_pos, pos); + } else { + auto &pos = iter.leaf; + merge_fut = merge_level(c, to_merge, parent_pos, pos); + } + + return merge_fut.si_then([FNAME, this, c, &iter, &to_merge] { + ++to_merge; + auto &pos = iter.get_internal(to_merge); + if (to_merge == iter.get_depth()) { + if (pos.node->get_size() == 1) { + SUBDEBUGT(seastore_lba_details, "collapsing root", c.trans); + c.cache.retire_extent(c.trans, pos.node); + assert(pos.pos == 0); + auto node_iter = pos.get_iter(); + root.set_location( + node_iter->get_val().maybe_relative_to(pos.node->get_paddr())); + iter.internal.pop_back(); + root.set_depth(iter.get_depth()); + get_tree_stats(c.trans).depth = iter.get_depth(); + root_dirty = true; + } else { + SUBDEBUGT(seastore_lba_details, "no need to collapse root", c.trans); + } + return seastar::stop_iteration::yes; + } else if (pos.node->below_min_capacity()) { + SUBDEBUGT( + seastore_lba_details, + "continuing, next node {} depth {} at min", + c.trans, + *pos.node, + to_merge); + return seastar::stop_iteration::no; + } else { + SUBDEBUGT( + seastore_lba_details, + "complete, next node {} depth {} not min", + c.trans, + *pos.node, + to_merge); + return seastar::stop_iteration::yes; + } + }); + }); + }); + } + + template + using node_position_t = typename iterator::template node_position_t; + + template , int> = 0> + base_iertr::future get_node( + op_context_t c, + depth_t depth, + paddr_t addr, + laddr_t begin, + laddr_t end) { + assert(depth == 1); + return get_leaf_node(c, addr, begin, end); + } + + template , int> = 0> + base_iertr::future get_node( + op_context_t c, + depth_t depth, + paddr_t addr, + laddr_t begin, + laddr_t end) { + return get_internal_node(c, depth, addr, begin, end); + } + + template + handle_merge_ret merge_level( + op_context_t c, + depth_t depth, + node_position_t &parent_pos, + node_position_t &pos) + { + LOG_PREFIX(FixedKVBtree::merge_level); + if (!parent_pos.node->is_pending()) { + parent_pos.node = c.cache.duplicate_for_write( + c.trans, parent_pos.node + )->template cast(); + } + + auto iter = parent_pos.get_iter(); + assert(iter.get_offset() < parent_pos.node->get_size()); + bool donor_is_left = ((iter.get_offset() + 1) == parent_pos.node->get_size()); + auto donor_iter = donor_is_left ? (iter - 1) : (iter + 1); + auto next_iter = donor_iter + 1; + auto begin = donor_iter->get_key(); + auto end = next_iter == parent_pos.node->end() + ? 
parent_pos.node->get_node_meta().end + : next_iter->get_key(); + + SUBDEBUGT(seastore_lba_details, "parent: {}, node: {}", c.trans, *parent_pos.node, *pos.node); + return get_node( + c, + depth, + donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()), + begin, + end + ).si_then([c, iter, donor_iter, donor_is_left, &parent_pos, &pos]( + typename NodeType::Ref donor) { + LOG_PREFIX(FixedKVBtree::merge_level); + auto [l, r] = donor_is_left ? + std::make_pair(donor, pos.node) : std::make_pair(pos.node, donor); + + auto [liter, riter] = donor_is_left ? + std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter); + + if (donor->at_min_capacity()) { + auto replacement = l->make_full_merge(c, r); + + parent_pos.node->update( + liter, + replacement->get_paddr()); + parent_pos.node->remove(riter); + + pos.node = replacement; + if (donor_is_left) { + pos.pos += r->get_size(); + parent_pos.pos--; + } + + SUBDEBUGT(seastore_lba_details, "l: {}, r: {}, replacement: {}", c.trans, *l, *r, *replacement); + c.cache.retire_extent(c.trans, l); + c.cache.retire_extent(c.trans, r); + } else { + LOG_PREFIX(FixedKVBtree::merge_level); + auto [replacement_l, replacement_r, pivot] = + l->make_balanced( + c, + r, + !donor_is_left); + + parent_pos.node->update( + liter, + replacement_l->get_paddr()); + parent_pos.node->replace( + riter, + pivot, + replacement_r->get_paddr()); + + if (donor_is_left) { + assert(parent_pos.pos > 0); + parent_pos.pos--; + } + + auto orig_position = donor_is_left ? + l->get_size() + pos.pos : + pos.pos; + if (orig_position < replacement_l->get_size()) { + pos.node = replacement_l; + pos.pos = orig_position; + } else { + parent_pos.pos++; + pos.node = replacement_r; + pos.pos = orig_position - replacement_l->get_size(); + } + + SUBDEBUGT( + seastore_lba_details, + "l: {}, r: {}, replacement_l: {}, replacement_r: {}", + c.trans, *l, *r, *replacement_l, *replacement_r); + c.cache.retire_extent(c.trans, l); + c.cache.retire_extent(c.trans, r); + } + + return seastar::now(); + }); + } +}; + +} + diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index 08c94249604..42fbc6c5e93 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -81,7 +81,7 @@ std::ostream &LogicalCachedExtent::print_detail(std::ostream &out) const std::ostream &operator<<(std::ostream &out, const LBAPin &rhs) { - return out << "LBAPin(" << rhs.get_laddr() << "~" << rhs.get_length() + return out << "LBAPin(" << rhs.get_key() << "~" << rhs.get_length() << "->" << rhs.get_paddr(); } diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index f1063c1a02c..a0f8686ee97 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -666,20 +666,30 @@ private: }; class LogicalCachedExtent; -class LBAPin; -using LBAPinRef = std::unique_ptr; -class LBAPin { + +template +class PhysicalNodePin; + +template +using PhysicalNodePinRef = std::unique_ptr>; + +template +class PhysicalNodePin { public: virtual void link_extent(LogicalCachedExtent *ref) = 0; - virtual void take_pin(LBAPin &pin) = 0; + virtual void take_pin(PhysicalNodePin &pin) = 0; virtual extent_len_t get_length() const = 0; virtual paddr_t get_paddr() const = 0; - virtual laddr_t get_laddr() const = 0; - virtual LBAPinRef duplicate() const = 0; + virtual key_t get_key() const = 0; + virtual PhysicalNodePinRef duplicate() const = 0; virtual bool has_been_invalidated() const = 0; 
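+
+  // key_t generalizes the old laddr_t-only pin interface so the same
+  // machinery can serve other fixed-kv trees (e.g. the planned
+  // backref tree this change prepares for).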
- virtual ~LBAPin() {} + virtual ~PhysicalNodePin() {} }; + +using LBAPin = PhysicalNodePin; +using LBAPinRef = PhysicalNodePinRef; + std::ostream &operator<<(std::ostream &out, const LBAPin &rhs); using lba_pin_list_t = std::list; @@ -756,7 +766,7 @@ public: void set_pin(LBAPinRef &&npin) { assert(!pin); pin = std::move(npin); - laddr = pin->get_laddr(); + laddr = pin->get_key(); pin->link_extent(this); } diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index 0c4d77dae62..c43fa4470bd 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -9,7 +9,6 @@ #include "include/buffer.h" #include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h" #include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" -#include "crimson/os/seastore/lba_manager/btree/lba_btree.h" #include "crimson/os/seastore/logging.h" SET_SUBSYS(seastore_lba); @@ -20,6 +19,15 @@ SET_SUBSYS(seastore_lba); * - TRACE: read operations, DEBUG details */ +namespace crimson::os::seastore { + +template<> +Transaction::tree_stats_t& get_tree_stats< + crimson::os::seastore::lba_manager::btree::LBABtree>(Transaction &t) { + return t.get_lba_tree_stats(); +} +} + namespace crimson::os::seastore::lba_manager::btree { BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs( @@ -210,13 +218,13 @@ static bool is_lba_node(const CachedExtent &e) return is_lba_node(e.get_type()); } -btree_range_pin_t &BtreeLBAManager::get_pin(CachedExtent &e) +btree_range_pin_t &BtreeLBAManager::get_pin(CachedExtent &e) { if (is_lba_node(e)) { return e.cast()->pin; } else if (e.is_logical()) { return static_cast( - e.cast()->get_pin()).pin; + e.cast()->get_pin()).get_range_pin(); } else { ceph_abort_msg("impossible"); } @@ -280,23 +288,57 @@ void BtreeLBAManager::complete_transaction( } } +BtreeLBAManager::base_iertr::future<> _init_cached_extent( + op_context_t c, + const CachedExtentRef &e, + LBABtree &btree, + bool &ret) +{ + if (e->is_logical()) { + auto logn = e->cast(); + return btree.lower_bound( + c, + logn->get_laddr() + ).si_then([e, c, logn, &ret](auto iter) { + LOG_PREFIX(BtreeLBAManager::init_cached_extent); + if (!iter.is_end() && + iter.get_key() == logn->get_laddr() && + iter.get_val().paddr == logn->get_paddr()) { + logn->set_pin(iter.get_pin()); + ceph_assert(iter.get_val().len == e->get_length()); + if (c.pins) { + c.pins->add_pin( + static_cast(logn->get_pin()).get_range_pin()); + } + DEBUGT("logical extent {} live", c.trans, *logn); + ret = true; + } else { + DEBUGT("logical extent {} not live", c.trans, *logn); + ret = false; + } + }); + } else { + return btree.init_cached_extent(c, e + ).si_then([&ret](bool is_alive) { + ret = is_alive; + }); + } +} + BtreeLBAManager::init_cached_extent_ret BtreeLBAManager::init_cached_extent( Transaction &t, CachedExtentRef e) { LOG_PREFIX(BtreeLBAManager::init_cached_extent); TRACET("{}", t, *e); - return seastar::do_with(bool(), [this, e, FNAME, &t](bool& ret) { + return seastar::do_with(bool(), [this, e, &t](bool &ret) { auto c = get_context(t); - return with_btree(c, [c, e, &ret](auto &btree) { - return btree.init_cached_extent(c, e - ).si_then([&ret](bool is_alive) { - ret = is_alive; - }); - }).si_then([&ret, e, FNAME, c] { - DEBUGT("is_alive={} -- {}", c.trans, ret, *e); - return ret; - }); + return with_btree(c, [c, e, &ret](auto &btree) + -> base_iertr::future<> { + 
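+        // _init_cached_extent (defined above) handles both logical
+        // extents and btree nodes; it records liveness in ret, which
+        // is captured by reference.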
LOG_PREFIX(BtreeLBAManager::init_cached_extent); + DEBUGT("extent {}", c.trans, *e); + return _init_cached_extent(c, e, btree, ret); + }).si_then([&ret] { return ret; }); }); } @@ -380,7 +422,7 @@ BtreeLBAManager::rewrite_extent_ret BtreeLBAManager::rewrite_extent( return with_btree( c, [c, extent](auto &btree) mutable { - return btree.rewrite_lba_extent(c, extent); + return btree.rewrite_extent(c, extent); }); } else { DEBUGT("skip non lba extent -- {}", t, *extent); diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index b02a84949e0..5cf8c5a05bc 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -15,16 +15,21 @@ #include "common/interval_map.h" #include "crimson/osd/exceptions.h" +#include "crimson/os/seastore/btree/fixed_kv_btree.h" #include "crimson/os/seastore/seastore_types.h" #include "crimson/os/seastore/lba_manager.h" #include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/segment_manager.h" #include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" -#include "crimson/os/seastore/lba_manager/btree/lba_btree.h" +#include "crimson/os/seastore/btree/btree_range_pin.h" namespace crimson::os::seastore::lba_manager::btree { +using LBABtree = FixedKVBtree; + +using BtreeLBAPin = BtreeNodePin; + /** * BtreeLBAManager * @@ -84,6 +89,14 @@ public: void complete_transaction( Transaction &t) final; + /** + * init_cached_extent + * + * Checks whether e is live (reachable from lba tree) and drops or initializes + * accordingly. + * + * Returns if e is live. + */ init_cached_extent_ret init_cached_extent( Transaction &t, CachedExtentRef e) final; @@ -117,8 +130,8 @@ public: void add_pin(LBAPin &pin) final { auto *bpin = reinterpret_cast(&pin); - pin_set.add_pin(bpin->pin); - bpin->parent = nullptr; + pin_set.add_pin(bpin->get_range_pin()); + bpin->set_parent(nullptr); } ~BtreeLBAManager(); @@ -126,24 +139,24 @@ private: SegmentManager &segment_manager; Cache &cache; - btree_pin_set_t pin_set; + btree_pin_set_t pin_set; struct { uint64_t num_alloc_extents = 0; uint64_t num_alloc_extents_iter_nexts = 0; } stats; - op_context_t get_context(Transaction &t) { - return op_context_t{cache, t, &pin_set}; + op_context_t get_context(Transaction &t) { + return op_context_t{cache, t, &pin_set}; } - static btree_range_pin_t &get_pin(CachedExtent &e); + static btree_range_pin_t &get_pin(CachedExtent &e); seastar::metrics::metric_group metrics; void register_metrics(); template auto with_btree( - op_context_t c, + op_context_t c, F &&f) { return cache.get_root( c.trans @@ -168,7 +181,7 @@ private: template auto with_btree_state( - op_context_t c, + op_context_t c, State &&init, F &&f) { return seastar::do_with( @@ -185,14 +198,14 @@ private: template auto with_btree_state( - op_context_t c, + op_context_t c, F &&f) { return with_btree_state(c, State{}, std::forward(f)); } template auto with_btree_ret( - op_context_t c, + op_context_t c, F &&f) { return with_btree_state( c, diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc deleted file mode 100644 index 21c4279edc8..00000000000 --- a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc +++ /dev/null @@ -1,155 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include 
"crimson/os/seastore/lba_manager/btree/btree_range_pin.h" -#include "crimson/os/seastore/logging.h" - -SET_SUBSYS(seastore_lba); - -namespace crimson::os::seastore::lba_manager::btree { - -void btree_range_pin_t::take_pin(btree_range_pin_t &other) -{ - ceph_assert(other.extent); - if (other.pins) { - other.pins->replace_pin(*this, other); - pins = other.pins; - other.pins = nullptr; - - if (other.has_ref()) { - other.drop_ref(); - acquire_ref(); - } - } -} - -btree_range_pin_t::~btree_range_pin_t() -{ - LOG_PREFIX(btree_range_pin_t::~btree_range_pin_t); - ceph_assert(!pins == !is_linked()); - ceph_assert(!ref); - if (pins) { - TRACE("removing {}", *this); - pins->remove_pin(*this, true); - } - extent = nullptr; -} - -void btree_pin_set_t::replace_pin(btree_range_pin_t &to, btree_range_pin_t &from) -{ - pins.replace_node(pins.iterator_to(from), to); -} - -void btree_pin_set_t::remove_pin(btree_range_pin_t &pin, bool do_check_parent) -{ - LOG_PREFIX(btree_pin_set_t::remove_pin); - TRACE("{}", pin); - ceph_assert(pin.is_linked()); - ceph_assert(pin.pins); - ceph_assert(!pin.ref); - - pins.erase(pin); - pin.pins = nullptr; - - if (do_check_parent) { - check_parent(pin); - } -} - -btree_range_pin_t *btree_pin_set_t::maybe_get_parent( - const lba_node_meta_t &meta) -{ - auto cmeta = meta; - cmeta.depth++; - auto iter = pins.upper_bound(cmeta, btree_range_pin_t::meta_cmp_t()); - if (iter == pins.begin()) { - return nullptr; - } else { - --iter; - if (iter->range.is_parent_of(meta)) { - return &*iter; - } else { - return nullptr; - } - } -} - -const btree_range_pin_t *btree_pin_set_t::maybe_get_first_child( - const lba_node_meta_t &meta) const -{ - if (meta.depth == 0) { - return nullptr; - } - - auto cmeta = meta; - cmeta.depth--; - - auto iter = pins.lower_bound(cmeta, btree_range_pin_t::meta_cmp_t()); - if (iter == pins.end()) { - return nullptr; - } else if (meta.is_parent_of(iter->range)) { - return &*iter; - } else { - return nullptr; - } -} - -void btree_pin_set_t::release_if_no_children(btree_range_pin_t &pin) -{ - ceph_assert(pin.is_linked()); - if (maybe_get_first_child(pin.range) == nullptr) { - pin.drop_ref(); - } -} - -void btree_pin_set_t::add_pin(btree_range_pin_t &pin) -{ - LOG_PREFIX(btree_pin_set_t::add_pin); - ceph_assert(!pin.is_linked()); - ceph_assert(!pin.pins); - ceph_assert(!pin.ref); - - auto [prev, inserted] = pins.insert(pin); - if (!inserted) { - ERROR("unable to add {} ({}), found {} ({})", - pin, - *(pin.extent), - *prev, - *(prev->extent)); - ceph_assert(0 == "impossible"); - return; - } - pin.pins = this; - if (!pin.is_root()) { - auto *parent = maybe_get_parent(pin.range); - ceph_assert(parent); - if (!parent->has_ref()) { - TRACE("acquiring parent {}", static_cast(parent)); - parent->acquire_ref(); - } else { - TRACE("parent has ref {}", static_cast(parent)); - } - } - if (maybe_get_first_child(pin.range) != nullptr) { - TRACE("acquiring self {}", pin); - pin.acquire_ref(); - } -} - -void btree_pin_set_t::retire(btree_range_pin_t &pin) -{ - pin.drop_ref(); - remove_pin(pin, false); -} - -void btree_pin_set_t::check_parent(btree_range_pin_t &pin) -{ - LOG_PREFIX(btree_pin_set_t::check_parent); - auto parent = maybe_get_parent(pin.range); - if (parent) { - TRACE("releasing parent {}", *parent); - release_if_no_children(*parent); - } -} - -} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h deleted file mode 100644 index b80e7488c4f..00000000000 --- 
a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h +++ /dev/null @@ -1,292 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#pragma once - -#include - -#include "crimson/os/seastore/cached_extent.h" -#include "crimson/os/seastore/seastore_types.h" - -namespace crimson::os::seastore::lba_manager::btree { - -class LBANode; -using LBANodeRef = TCachedExtentRef; - -struct lba_node_meta_t { - laddr_t begin = 0; - laddr_t end = 0; - depth_t depth = 0; - - bool is_parent_of(const lba_node_meta_t &other) const { - return (depth == other.depth + 1) && - (begin <= other.begin) && - (end > other.begin); - } - - std::pair split_into(laddr_t pivot) const { - return std::make_pair( - lba_node_meta_t{begin, pivot, depth}, - lba_node_meta_t{pivot, end, depth}); - } - - static lba_node_meta_t merge_from( - const lba_node_meta_t &lhs, const lba_node_meta_t &rhs) { - ceph_assert(lhs.depth == rhs.depth); - return lba_node_meta_t{lhs.begin, rhs.end, lhs.depth}; - } - - static std::pair - rebalance(const lba_node_meta_t &lhs, const lba_node_meta_t &rhs, laddr_t pivot) { - ceph_assert(lhs.depth == rhs.depth); - return std::make_pair( - lba_node_meta_t{lhs.begin, pivot, lhs.depth}, - lba_node_meta_t{pivot, rhs.end, lhs.depth}); - } - - bool is_root() const { - return begin == 0 && end == L_ADDR_MAX; - } -}; - -inline std::ostream &operator<<( - std::ostream &lhs, - const lba_node_meta_t &rhs) -{ - return lhs << "btree_node_meta_t(" - << "begin=" << rhs.begin - << ", end=" << rhs.end - << ", depth=" << rhs.depth - << ")"; -} - -/** - * btree_range_pin_t - * - * Element tracked by btree_pin_set_t below. Encapsulates the intrusive_set - * hook, the lba_node_meta_t representing the lba range covered by a node, - * and extent and ref members intended to hold a reference when the extent - * should be pinned. 
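The ordering hidden in get_tuple below deserves a note: pins sort by (-depth, begin), so the set orders parents before the children they cover, and maybe_get_parent/maybe_get_first_child become simple bound queries on one set. A toy illustration of that comparator (toy_pin_t is hypothetical):

  #include <set>
  #include <tuple>
  #include <cstdint>

  struct toy_pin_t {            // illustration only
    uint64_t begin;
    int depth;
    bool operator<(const toy_pin_t &rhs) const {
      // negate depth so parents order before their children
      return std::make_tuple(-depth, begin)
           < std::make_tuple(-rhs.depth, rhs.begin);
    }
  };

  int main() {
    std::set<toy_pin_t> pins{{0, 2}, {0, 1}, {4096, 1}};
    // iterates {0,2} (parent level) before {0,1} and {4096,1}
  }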
- */ -class btree_pin_set_t; -class btree_range_pin_t : public boost::intrusive::set_base_hook<> { - friend class btree_pin_set_t; - lba_node_meta_t range; - - btree_pin_set_t *pins = nullptr; - - // We need to be able to remember extent without holding a reference, - // but we can do it more compactly -- TODO - CachedExtent *extent = nullptr; - CachedExtentRef ref; - - using index_t = boost::intrusive::set; - - static auto get_tuple(const lba_node_meta_t &meta) { - return std::make_tuple(-meta.depth, meta.begin); - } - - void acquire_ref() { - ref = CachedExtentRef(extent); - } - - void drop_ref() { - ref.reset(); - } - -public: - btree_range_pin_t() = default; - btree_range_pin_t(CachedExtent *extent) - : extent(extent) {} - btree_range_pin_t(const btree_range_pin_t &rhs, CachedExtent *extent) - : range(rhs.range), extent(extent) {} - - bool has_ref() const { - return !!ref; - } - - bool is_root() const { - return range.is_root(); - } - - void set_range(const lba_node_meta_t &nrange) { - range = nrange; - } - void set_extent(CachedExtent *nextent) { - ceph_assert(!extent); - extent = nextent; - } - - CachedExtent &get_extent() { - assert(extent); - return *extent; - } - - bool has_ref() { - return !!ref; - } - - void take_pin(btree_range_pin_t &other); - - friend bool operator<( - const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { - return get_tuple(lhs.range) < get_tuple(rhs.range); - } - friend bool operator>( - const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { - return get_tuple(lhs.range) > get_tuple(rhs.range); - } - friend bool operator==( - const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { - return get_tuple(lhs.range) == rhs.get_tuple(rhs.range); - } - - struct meta_cmp_t { - bool operator()( - const btree_range_pin_t &lhs, const lba_node_meta_t &rhs) const { - return get_tuple(lhs.range) < get_tuple(rhs); - } - bool operator()( - const lba_node_meta_t &lhs, const btree_range_pin_t &rhs) const { - return get_tuple(lhs) < get_tuple(rhs.range); - } - }; - - friend std::ostream &operator<<( - std::ostream &lhs, - const btree_range_pin_t &rhs) { - return lhs << "btree_range_pin_t(" - << "begin=" << rhs.range.begin - << ", end=" << rhs.range.end - << ", depth=" << rhs.range.depth - << ", extent=" << rhs.extent - << ")"; - } - - friend class BtreeLBAPin; - ~btree_range_pin_t(); -}; - -/** - * btree_pin_set_t - * - * Ensures that for every cached node, all parent LBANodes required - * to map it are present in cache. Relocating these nodes can - * therefore be done without further reads or cache space. - * - * Contains a btree_range_pin_t for every clean or dirty LBANode - * or LogicalCachedExtent instance in cache at any point in time. - * For any LBANode, the contained btree_range_pin_t will hold - * a reference to that node pinning it in cache as long as that - * node has children in the set. This invariant can be violated - * only by calling retire_extent and is repaired by calling - * check_parent synchronously after adding any new extents. 
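Put differently, the invariant is: a pin holds a strong reference to its extent exactly while at least one child pin sits beneath it, so releasing the last leaf in a subtree lets the whole ancestor chain drain out of cache. A toy model of that bookkeeping (pin_t, add and remove are simplified stand-ins for add_pin, remove_pin and check_parent):

  #include <cassert>

  struct pin_t {
    pin_t *parent = nullptr;
    int children = 0;
    bool has_ref = false;
  };

  // add_pin: the parent gained a child, so it must hold its extent.
  void add(pin_t &p) {
    if (p.parent && p.parent->children++ == 0)
      p.parent->has_ref = true;
  }

  // remove_pin + check_parent: dropping the last child releases the parent.
  void remove(pin_t &p) {
    if (p.parent && --p.parent->children == 0)
      p.parent->has_ref = false;
  }

  int main() {
    pin_t root;
    pin_t leaf{&root};
    add(leaf);
    assert(root.has_ref);
    remove(leaf);
    assert(!root.has_ref);
  }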
- */ -class btree_pin_set_t { - friend class btree_range_pin_t; - using pins_t = btree_range_pin_t::index_t; - pins_t pins; - - /// Removes pin from set optionally checking whether parent has other children - void remove_pin(btree_range_pin_t &pin, bool check_parent); - - void replace_pin(btree_range_pin_t &to, btree_range_pin_t &from); - - /// Returns parent pin if exists - btree_range_pin_t *maybe_get_parent(const lba_node_meta_t &pin); - - /// Returns earliest child pin if exist - const btree_range_pin_t *maybe_get_first_child(const lba_node_meta_t &pin) const; - - /// Releases pin if it has no children - void release_if_no_children(btree_range_pin_t &pin); - -public: - /// Adds pin to set, assumes set is consistent - void add_pin(btree_range_pin_t &pin); - - /** - * retire/check_parent - * - * See BtreeLBAManager::complete_transaction. - * retire removes the specified pin from the set, but does not - * check parents. After any new extents are added to the set, - * the caller is required to call check_parent to restore the - * invariant. - */ - void retire(btree_range_pin_t &pin); - void check_parent(btree_range_pin_t &pin); - - template - void scan(F &&f) { - for (auto &i : pins) { - std::invoke(f, i); - } - } - - ~btree_pin_set_t() { - ceph_assert(pins.empty()); - } -}; - -class BtreeLBAPin : public LBAPin { - friend class BtreeLBAManager; - friend class LBABtree; - - /** - * parent - * - * populated until link_extent is called to ensure cache residence - * until add_pin is called. - */ - CachedExtentRef parent; - - paddr_t paddr; - btree_range_pin_t pin; - -public: - BtreeLBAPin() = default; - - BtreeLBAPin( - CachedExtentRef parent, - paddr_t paddr, - lba_node_meta_t &&meta) - : parent(parent), paddr(paddr) { - pin.set_range(std::move(meta)); - } - - void link_extent(LogicalCachedExtent *ref) final { - pin.set_extent(ref); - } - - extent_len_t get_length() const final { - ceph_assert(pin.range.end > pin.range.begin); - return pin.range.end - pin.range.begin; - } - - paddr_t get_paddr() const final { - return paddr; - } - - laddr_t get_laddr() const final { - return pin.range.begin; - } - - LBAPinRef duplicate() const final { - auto ret = std::unique_ptr(new BtreeLBAPin); - ret->pin.set_range(pin.range); - ret->paddr = paddr; - ret->parent = parent; - return ret; - } - - void take_pin(LBAPin &opin) final { - pin.take_pin(static_cast(opin).pin); - } - - bool has_been_invalidated() const final { - return parent->has_been_invalidated(); - } -}; - -} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree.cc deleted file mode 100644 index f1f1eee373c..00000000000 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree.cc +++ /dev/null @@ -1,1022 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "crimson/os/seastore/lba_manager/btree/lba_btree.h" - -SET_SUBSYS(seastore_lba_details); - -namespace crimson::os::seastore::lba_manager::btree { - -LBABtree::mkfs_ret LBABtree::mkfs(op_context_t c) -{ - auto root_leaf = c.cache.alloc_new_extent( - c.trans, - LBA_BLOCK_SIZE); - root_leaf->set_size(0); - lba_node_meta_t meta{0, L_ADDR_MAX, 1}; - root_leaf->set_meta(meta); - root_leaf->pin.set_range(meta); - c.trans.get_lba_tree_stats().depth = 1u; - return lba_root_t{root_leaf->get_paddr(), 1u}; -} - -LBABtree::iterator::handle_boundary_ret LBABtree::iterator::handle_boundary( - op_context_t c, - mapped_space_visitor_t *visitor) -{ - assert(at_boundary()); - 
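The body that follows is an odometer carry: once the leaf cursor runs off its node, climb until some ancestor still has a following slot, advance that slot, and re-descend along the leftmost path, resetting every level below. The same control flow over plain vectors, purely as illustration:

  #include <vector>
  #include <cstddef>

  // pos[0] is the leaf cursor, pos[i] a cursor into the internal node
  // at depth i + 1; size[i] is that node's entry count. Returns false
  // once every level is exhausted, i.e. the iterator is at end().
  bool carry(std::vector<size_t> &pos, const std::vector<size_t> &size) {
    size_t lvl = 1;
    while (lvl < pos.size() && pos[lvl] + 1 >= size[lvl])
      ++lvl;                       // climb to a level with a next slot
    if (lvl == pos.size())
      return false;                // none: we ran off the root
    ++pos[lvl];
    for (size_t d = 0; d < lvl; ++d)
      pos[d] = 0;                  // re-descend along the leftmost path
    return true;
  }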
depth_t depth_with_space = 2; - for (; depth_with_space <= get_depth(); ++depth_with_space) { - if ((get_internal(depth_with_space).pos + 1) < - get_internal(depth_with_space).node->get_size()) { - break; - } - } - - if (depth_with_space <= get_depth()) { - return seastar::do_with( - [](const LBAInternalNode &internal) { return internal.begin(); }, - [](const LBALeafNode &leaf) { return leaf.begin(); }, - [this, c, depth_with_space, visitor](auto &li, auto &ll) { - for (depth_t depth = 2; depth < depth_with_space; ++depth) { - get_internal(depth).reset(); - } - leaf.reset(); - get_internal(depth_with_space).pos++; - // note, cannot result in at_boundary() by construction - return lookup_depth_range( - c, *this, depth_with_space - 1, 0, li, ll, visitor - ); - }); - } else { - // end - return seastar::now(); - } -} - -LBABtree::iterator_fut LBABtree::iterator::next( - op_context_t c, - mapped_space_visitor_t *visitor) const -{ - assert_valid(); - assert(!is_end()); - - auto ret = *this; - ret.leaf.pos++; - if (ret.at_boundary()) { - return seastar::do_with( - ret, - [c, visitor](auto &ret) mutable { - return ret.handle_boundary( - c, visitor - ).si_then([&ret] { - return std::move(ret); - }); - }); - } else { - return iterator_fut( - interruptible::ready_future_marker{}, - ret); - } - -} - -LBABtree::iterator_fut LBABtree::iterator::prev(op_context_t c) const -{ - assert_valid(); - assert(!is_begin()); - - auto ret = *this; - - if (ret.leaf.pos > 0) { - ret.leaf.pos--; - return iterator_fut( - interruptible::ready_future_marker{}, - ret); - } - - depth_t depth_with_space = 2; - for (; depth_with_space <= get_depth(); ++depth_with_space) { - if (ret.get_internal(depth_with_space).pos > 0) { - break; - } - } - - assert(depth_with_space <= ret.get_depth()); // must not be begin() - return seastar::do_with( - std::move(ret), - [](const LBAInternalNode &internal) { return --internal.end(); }, - [](const LBALeafNode &leaf) { return --leaf.end(); }, - [c, depth_with_space](auto &ret, auto &li, auto &ll) { - for (depth_t depth = 2; depth < depth_with_space; ++depth) { - ret.get_internal(depth).reset(); - } - ret.leaf.reset(); - ret.get_internal(depth_with_space).pos--; - // note, cannot result in at_boundary() by construction - return lookup_depth_range( - c, ret, depth_with_space - 1, 0, li, ll, nullptr - ).si_then([&ret] { - assert(!ret.at_boundary()); - return std::move(ret); - }); - }); -} - -LBABtree::iterator_fut LBABtree::lower_bound( - op_context_t c, - laddr_t addr, - mapped_space_visitor_t *visitor) const -{ - LOG_PREFIX(LBATree::lower_bound); - return lookup( - c, - [addr](const LBAInternalNode &internal) { - assert(internal.get_size() > 0); - auto iter = internal.upper_bound(addr); - assert(iter != internal.begin()); - --iter; - return iter; - }, - [FNAME, c, addr](const LBALeafNode &leaf) { - auto ret = leaf.lower_bound(addr); - DEBUGT( - "leaf addr {}, got ret offset {}, size {}, end {}", - c.trans, - addr, - ret.get_offset(), - leaf.get_size(), - ret == leaf.end()); - return ret; - }, - visitor - ).si_then([FNAME, c](auto &&ret) { - DEBUGT( - "ret.leaf.pos {}", - c.trans, - ret.leaf.pos); - ret.assert_valid(); - return std::move(ret); - }); -} - -LBABtree::insert_ret LBABtree::insert( - op_context_t c, - iterator iter, - laddr_t laddr, - lba_map_val_t val) -{ - LOG_PREFIX(LBATree::insert); - DEBUGT( - "inserting laddr {} at iter {}", - c.trans, - laddr, - iter.is_end() ? 
L_ADDR_MAX : iter.get_key()); - return seastar::do_with( - iter, - [this, c, laddr, val](auto &ret) { - return find_insertion( - c, laddr, ret - ).si_then([this, c, laddr, val, &ret] { - if (!ret.at_boundary() && ret.get_key() == laddr) { - return insert_ret( - interruptible::ready_future_marker{}, - std::make_pair(ret, false)); - } else { - ++(c.trans.get_lba_tree_stats().num_inserts); - return handle_split( - c, ret - ).si_then([c, laddr, val, &ret] { - if (!ret.leaf.node->is_pending()) { - CachedExtentRef mut = c.cache.duplicate_for_write( - c.trans, ret.leaf.node - ); - ret.leaf.node = mut->cast(); - } - auto iter = LBALeafNode::const_iterator( - ret.leaf.node.get(), ret.leaf.pos); - assert(iter == ret.leaf.node->lower_bound(laddr)); - assert(iter == ret.leaf.node->end() || iter->get_key() > laddr); - assert(laddr >= ret.leaf.node->get_meta().begin && - laddr < ret.leaf.node->get_meta().end); - ret.leaf.node->insert(iter, laddr, val); - return insert_ret( - interruptible::ready_future_marker{}, - std::make_pair(ret, true)); - }); - } - }); - }); -} - -LBABtree::update_ret LBABtree::update( - op_context_t c, - iterator iter, - lba_map_val_t val) -{ - LOG_PREFIX(LBATree::update); - DEBUGT( - "update element at {}", - c.trans, - iter.is_end() ? L_ADDR_MAX : iter.get_key()); - if (!iter.leaf.node->is_pending()) { - CachedExtentRef mut = c.cache.duplicate_for_write( - c.trans, iter.leaf.node - ); - iter.leaf.node = mut->cast(); - } - iter.leaf.node->update( - iter.leaf.node->iter_idx(iter.leaf.pos), - val); - return update_ret( - interruptible::ready_future_marker{}, - iter); -} - -LBABtree::remove_ret LBABtree::remove( - op_context_t c, - iterator iter) -{ - LOG_PREFIX(LBATree::remove); - DEBUGT( - "remove element at {}", - c.trans, - iter.is_end() ? 
L_ADDR_MAX : iter.get_key()); - assert(!iter.is_end()); - ++(c.trans.get_lba_tree_stats().num_erases); - return seastar::do_with( - iter, - [this, c](auto &ret) { - if (!ret.leaf.node->is_pending()) { - CachedExtentRef mut = c.cache.duplicate_for_write( - c.trans, ret.leaf.node - ); - ret.leaf.node = mut->cast(); - } - ret.leaf.node->remove( - ret.leaf.node->iter_idx(ret.leaf.pos)); - - return handle_merge( - c, ret - ); - }); -} - -LBABtree::init_cached_extent_ret LBABtree::init_cached_extent( - op_context_t c, - CachedExtentRef e) -{ - LOG_PREFIX(LBATree::init_cached_extent); - DEBUGT("extent {}", c.trans, *e); - if (e->is_logical()) { - auto logn = e->cast(); - return lower_bound( - c, - logn->get_laddr() - ).si_then([FNAME, e, c, logn](auto iter) { - if (!iter.is_end() && - iter.get_key() == logn->get_laddr() && - iter.get_val().paddr == logn->get_paddr()) { - logn->set_pin(iter.get_pin()); - ceph_assert(iter.get_val().len == e->get_length()); - if (c.pins) { - c.pins->add_pin( - static_cast(logn->get_pin()).pin); - } - DEBUGT("logical extent {} live", c.trans, *logn); - return true; - } else { - DEBUGT("logical extent {} not live", c.trans, *logn); - return false; - } - }); - } else if (e->get_type() == extent_types_t::LADDR_INTERNAL) { - auto eint = e->cast(); - return lower_bound( - c, eint->get_node_meta().begin - ).si_then([FNAME, e, c, eint](auto iter) { - // Note, this check is valid even if iter.is_end() - depth_t cand_depth = eint->get_node_meta().depth; - if (cand_depth <= iter.get_depth() && - &*iter.get_internal(cand_depth).node == &*eint) { - DEBUGT("extent {} is live", c.trans, *eint); - return true; - } else { - DEBUGT("extent {} is not live", c.trans, *eint); - return false; - } - }); - } else if (e->get_type() == extent_types_t::LADDR_LEAF) { - auto eleaf = e->cast(); - return lower_bound( - c, eleaf->get_node_meta().begin - ).si_then([FNAME, c, e, eleaf](auto iter) { - // Note, this check is valid even if iter.is_end() - if (iter.leaf.node == &*eleaf) { - DEBUGT("extent {} is live", c.trans, *eleaf); - return true; - } else { - DEBUGT("extent {} is not live", c.trans, *eleaf); - return false; - } - }); - } else { - DEBUGT( - "found other extent {} type {}", - c.trans, - *e, - e->get_type()); - return init_cached_extent_ret( - interruptible::ready_future_marker{}, - true); - } -} - -LBABtree::get_internal_if_live_ret -LBABtree::get_internal_if_live( - op_context_t c, - paddr_t addr, - laddr_t laddr, - seastore_off_t len) -{ - LOG_PREFIX(LBABtree::get_internal_if_live); - return lower_bound( - c, laddr - ).si_then([FNAME, c, addr, laddr, len](auto iter) { - for (depth_t d = 2; d <= iter.get_depth(); ++d) { - CachedExtent &node = *iter.get_internal(d).node; - auto internal_node = node.cast(); - if (internal_node->get_paddr() == addr) { - DEBUGT( - "extent laddr {} addr {}~{} found: {}", - c.trans, - laddr, - addr, - len, - *internal_node); - assert(internal_node->get_node_meta().begin == laddr); - return CachedExtentRef(internal_node); - } - } - DEBUGT( - "extent laddr {} addr {}~{} is not live, no matching internal node", - c.trans, - laddr, - addr, - len); - return CachedExtentRef(); - }); -} - -LBABtree::get_leaf_if_live_ret -LBABtree::get_leaf_if_live( - op_context_t c, - paddr_t addr, - laddr_t laddr, - seastore_off_t len) -{ - LOG_PREFIX(LBABtree::get_leaf_if_live); - return lower_bound( - c, laddr - ).si_then([FNAME, c, addr, laddr, len](auto iter) { - if (iter.leaf.node->get_paddr() == addr) { - DEBUGT( - "extent laddr {} addr {}~{} found: {}", - c.trans, - 
laddr, - addr, - len, - *iter.leaf.node); - return CachedExtentRef(iter.leaf.node); - } else { - DEBUGT( - "extent laddr {} addr {}~{} is not live, does not match node {}", - c.trans, - laddr, - addr, - len, - *iter.leaf.node); - return CachedExtentRef(); - } - }); -} - - -LBABtree::rewrite_lba_extent_ret LBABtree::rewrite_lba_extent( - op_context_t c, - CachedExtentRef e) -{ - LOG_PREFIX(LBABtree::rewrite_lba_extent); - assert(e->get_type() == extent_types_t::LADDR_INTERNAL || - e->get_type() == extent_types_t::LADDR_LEAF); - - auto do_rewrite = [&](auto &lba_extent) { - auto nlba_extent = c.cache.alloc_new_extent< - std::remove_reference_t - >( - c.trans, - lba_extent.get_length()); - lba_extent.get_bptr().copy_out( - 0, - lba_extent.get_length(), - nlba_extent->get_bptr().c_str()); - nlba_extent->pin.set_range(nlba_extent->get_node_meta()); - nlba_extent->set_last_modified(lba_extent.get_last_modified()); - - /* This is a bit underhanded. Any relative addrs here must necessarily - * be record relative as we are rewriting a dirty extent. Thus, we - * are using resolve_relative_addrs with a (likely negative) block - * relative offset to correct them to block-relative offsets adjusted - * for our new transaction location. - * - * Upon commit, these now block relative addresses will be interpreted - * against the real final address. - */ - nlba_extent->resolve_relative_addrs( - make_record_relative_paddr(0) - nlba_extent->get_paddr()); - - DEBUGT( - "rewriting {} into {}", - c.trans, - lba_extent, - *nlba_extent); - - return update_internal_mapping( - c, - nlba_extent->get_node_meta().depth, - nlba_extent->get_node_meta().begin, - e->get_paddr(), - nlba_extent->get_paddr() - ).si_then([c, e] { - c.cache.retire_extent(c.trans, e); - }); - }; - - CachedExtentRef nlba_extent; - if (e->get_type() == extent_types_t::LADDR_INTERNAL) { - auto lint = e->cast(); - return do_rewrite(*lint); - } else { - assert(e->get_type() == extent_types_t::LADDR_LEAF); - auto lleaf = e->cast(); - return do_rewrite(*lleaf); - } -} - -LBABtree::get_internal_node_ret LBABtree::get_internal_node( - op_context_t c, - depth_t depth, - paddr_t offset, - laddr_t begin, - laddr_t end) -{ - LOG_PREFIX(LBATree::get_internal_node); - DEBUGT( - "reading internal at offset {}, depth {}, begin {}, end {}", - c.trans, - offset, - depth, - begin, - end); - assert(depth > 1); - auto init_internal = [c, depth, begin, end](LBAInternalNode &node) { - assert(!node.is_pending()); - assert(!node.pin.is_linked()); - node.pin.set_range(lba_node_meta_t{begin, end, depth}); - if (c.pins) { - c.pins->add_pin(node.pin); - } - }; - return c.cache.get_extent( - c.trans, - offset, - LBA_BLOCK_SIZE, - init_internal - ).si_then([FNAME, c, offset, init_internal, depth, begin, end]( - LBAInternalNodeRef ret) { - DEBUGT( - "read internal at offset {} {}", - c.trans, - offset, - *ret); - // This can only happen during init_cached_extent - if (c.pins && !ret->is_pending() && !ret->pin.is_linked()) { - assert(ret->is_dirty()); - init_internal(*ret); - } - auto meta = ret->get_meta(); - if (ret->get_size()) { - ceph_assert(meta.begin <= ret->begin()->get_key()); - ceph_assert(meta.end > (ret->end() - 1)->get_key()); - } - ceph_assert(depth == meta.depth); - ceph_assert(begin == meta.begin); - ceph_assert(end == meta.end); - return get_internal_node_ret( - interruptible::ready_future_marker{}, - ret); - }); -} - -LBABtree::get_leaf_node_ret LBABtree::get_leaf_node( - op_context_t c, - paddr_t offset, - laddr_t begin, - laddr_t end) -{ - 
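get_internal_node above and get_leaf_node below share one pattern: an init closure handed to the cache stamps the node's (begin, end, depth) range onto its pin and registers it with the pin set, followed by a post-read fixup for the one case where a dirty, already-cached node arrives unpinned. Schematically, with stand-in types rather than the Cache API:

  struct node_t {                 // stand-in, not a seastore type
    bool pending = false;
    bool pin_linked = false;
  };

  void init_node(node_t &n) {     // models pin.set_range + pins->add_pin
    n.pin_linked = true;
  }

  // Models the "can only happen during init_cached_extent" branch: a
  // dirty node found in cache never ran init_node, so run it now.
  void post_read_fixup(node_t &n, bool have_pin_set) {
    if (have_pin_set && !n.pending && !n.pin_linked)
      init_node(n);
  }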
LOG_PREFIX(LBATree::get_leaf_node); - DEBUGT( - "reading leaf at offset {}, begin {}, end {}", - c.trans, - offset, - begin, - end); - auto init_leaf = [c, begin, end](LBALeafNode &node) { - assert(!node.is_pending()); - assert(!node.pin.is_linked()); - node.pin.set_range(lba_node_meta_t{begin, end, 1}); - if (c.pins) { - c.pins->add_pin(node.pin); - } - }; - return c.cache.get_extent( - c.trans, - offset, - LBA_BLOCK_SIZE, - init_leaf - ).si_then([FNAME, c, offset, init_leaf, begin, end](LBALeafNodeRef ret) { - DEBUGT( - "read leaf at offset {} {}", - c.trans, - offset, - *ret); - // This can only happen during init_cached_extent - if (c.pins && !ret->is_pending() && !ret->pin.is_linked()) { - assert(ret->is_dirty()); - init_leaf(*ret); - } - auto meta = ret->get_meta(); - if (ret->get_size()) { - ceph_assert(meta.begin <= ret->begin()->get_key()); - ceph_assert(meta.end > (ret->end() - 1)->get_key()); - } - ceph_assert(1 == meta.depth); - ceph_assert(begin == meta.begin); - ceph_assert(end == meta.end); - return get_leaf_node_ret( - interruptible::ready_future_marker{}, - ret); - }); -} - -LBABtree::find_insertion_ret LBABtree::find_insertion( - op_context_t c, - laddr_t laddr, - iterator &iter) -{ - assert(iter.is_end() || iter.get_key() >= laddr); - if (!iter.is_end() && iter.get_key() == laddr) { - return seastar::now(); - } else if (iter.leaf.node->get_node_meta().begin <= laddr) { -#ifndef NDEBUG - auto p = iter; - if (p.leaf.pos > 0) { - --p.leaf.pos; - assert(p.get_key() < laddr); - } -#endif - return seastar::now(); - } else { - assert(iter.leaf.pos == 0); - return iter.prev( - c - ).si_then([laddr, &iter](auto p) { - boost::ignore_unused(laddr); // avoid clang warning; - assert(p.leaf.node->get_node_meta().begin <= laddr); - assert(p.get_key() < laddr); - // Note, this is specifically allowed to violate the iterator - // invariant that pos is a valid index for the node in the event - // that the insertion point is at the end of a node. 
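That sanctioned breach of the invariant is narrow: prev() lands on the last element of the preceding leaf, and the pos++ below parks the cursor one past that leaf's end, which is exactly where the new key belongs even though no such slot exists yet. In vector terms (illustrative only):

  #include <vector>
  #include <cstdint>
  #include <cstddef>

  // pos == keys.size() is a valid *insertion* point but not a valid
  // *read* position; insert() resolves it before any dereference.
  size_t insertion_pos(const std::vector<uint64_t> &keys, uint64_t laddr) {
    size_t pos = 0;
    while (pos < keys.size() && keys[pos] < laddr)
      ++pos;
    return pos;  // may equal keys.size(): the at_boundary() case
  }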
- p.leaf.pos++; - assert(p.at_boundary()); - iter = p; - return seastar::now(); - }); - } -} - -LBABtree::handle_split_ret LBABtree::handle_split( - op_context_t c, - iterator &iter) -{ - LOG_PREFIX(LBATree::handle_split); - - depth_t split_from = iter.check_split(); - - DEBUGT("split_from {}, depth {}", c.trans, split_from, iter.get_depth()); - - if (split_from == iter.get_depth()) { - auto nroot = c.cache.alloc_new_extent( - c.trans, LBA_BLOCK_SIZE); - lba_node_meta_t meta{0, L_ADDR_MAX, iter.get_depth() + 1}; - nroot->set_meta(meta); - nroot->pin.set_range(meta); - nroot->journal_insert( - std::cbegin(*nroot), - L_ADDR_MIN, - root.get_location(), - nullptr); - iter.internal.push_back({nroot, 0}); - - root.set_location(nroot->get_paddr()); - root.set_depth(iter.get_depth()); - c.trans.get_lba_tree_stats().depth = iter.get_depth(); - root_dirty = true; - } - - /* pos may be either node_position_t or - * node_position_t */ - auto split_level = [&, FNAME](auto &parent_pos, auto &pos) { - auto [left, right, pivot] = pos.node->make_split_children(c); - - auto parent_node = parent_pos.node; - auto parent_iter = parent_pos.get_iter(); - - parent_node->update( - parent_iter, - left->get_paddr()); - parent_node->insert( - parent_iter + 1, - pivot, - right->get_paddr()); - - DEBUGT("split {} into left: {}, right: {}", - c.trans, - *pos.node, - *left, - *right); - c.cache.retire_extent(c.trans, pos.node); - - return std::make_pair(left, right); - }; - - for (; split_from > 0; --split_from) { - auto &parent_pos = iter.get_internal(split_from + 1); - if (!parent_pos.node->is_pending()) { - parent_pos.node = c.cache.duplicate_for_write( - c.trans, parent_pos.node - )->cast(); - } - - if (split_from > 1) { - auto &pos = iter.get_internal(split_from); - DEBUGT("splitting internal {} at depth {}, parent: {} at pos: {}", - c.trans, - *pos.node, - split_from, - *parent_pos.node, - parent_pos.pos); - auto [left, right] = split_level(parent_pos, pos); - - if (pos.pos < left->get_size()) { - pos.node = left; - } else { - pos.node = right; - pos.pos -= left->get_size(); - - parent_pos.pos += 1; - } - } else { - auto &pos = iter.leaf; - DEBUGT("splitting leaf {}, parent: {} at pos: {}", - c.trans, - *pos.node, - *parent_pos.node, - parent_pos.pos); - auto [left, right] = split_level(parent_pos, pos); - - /* right->get_node_meta().begin == pivot == right->begin()->get_key() - * Thus, if pos.pos == left->get_size(), we want iter to point to - * left with pos.pos at the end rather than right with pos.pos = 0 - * since the insertion would be to the left of the first element - * of right and thus necessarily less than right->get_node_meta().begin.
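A concrete instance of the rule above: splitting a leaf holding {10, 20, 30, 40} at pivot 30 yields left = {10, 20} and right = {30, 40}; an insertion cursor at pos 2 (say, for key 25) must stay at the end of left, since 25 < 30 == right.begin. The two placement rules side by side (place is a hypothetical helper):

  #include <cassert>
  #include <cstddef>

  // Internal-node repositioning uses pos < left_size (ties go right);
  // leaf insertion uses pos <= left_size (ties stay left, as argued
  // in the comment above).
  size_t place(size_t pos, size_t left_size, bool inserting, bool &right) {
    right = inserting ? pos > left_size : pos >= left_size;
    return right ? pos - left_size : pos;
  }

  int main() {
    bool right = false;
    assert(place(2, 2, true, right) == 2 && !right);   // leaf: stay left
    assert(place(2, 2, false, right) == 0 && right);   // internal: go right
  }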
- */ - if (pos.pos <= left->get_size()) { - pos.node = left; - } else { - pos.node = right; - pos.pos -= left->get_size(); - - parent_pos.pos += 1; - } - } - } - - return seastar::now(); -} - -template -LBABtree::base_iertr::future get_node( - op_context_t c, - depth_t depth, - paddr_t addr, - laddr_t begin, - laddr_t end); - -template <> -LBABtree::base_iertr::future get_node( - op_context_t c, - depth_t depth, - paddr_t addr, - laddr_t begin, - laddr_t end) { - assert(depth == 1); - return LBABtree::get_leaf_node(c, addr, begin, end); -} - -template <> -LBABtree::base_iertr::future get_node( - op_context_t c, - depth_t depth, - paddr_t addr, - laddr_t begin, - laddr_t end) { - return LBABtree::get_internal_node(c, depth, addr, begin, end); -} - -template -LBABtree::handle_merge_ret merge_level( - op_context_t c, - depth_t depth, - LBABtree::node_position_t &parent_pos, - LBABtree::node_position_t &pos) -{ - LOG_PREFIX(LBABtree::merge_level); - if (!parent_pos.node->is_pending()) { - parent_pos.node = c.cache.duplicate_for_write( - c.trans, parent_pos.node - )->cast(); - } - - auto iter = parent_pos.get_iter(); - assert(iter.get_offset() < parent_pos.node->get_size()); - bool donor_is_left = ((iter.get_offset() + 1) == parent_pos.node->get_size()); - auto donor_iter = donor_is_left ? (iter - 1) : (iter + 1); - auto next_iter = donor_iter + 1; - auto begin = donor_iter->get_key(); - auto end = next_iter == parent_pos.node->end() - ? parent_pos.node->get_node_meta().end - : next_iter->get_key(); - - DEBUGT("parent: {}, node: {}", c.trans, *parent_pos.node, *pos.node); - return get_node( - c, - depth, - donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()), - begin, - end - ).si_then([FNAME, c, iter, donor_iter, donor_is_left, &parent_pos, &pos]( - typename NodeType::Ref donor) { - auto [l, r] = donor_is_left ? - std::make_pair(donor, pos.node) : std::make_pair(pos.node, donor); - - auto [liter, riter] = donor_is_left ? - std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter); - - if (donor->at_min_capacity()) { - auto replacement = l->make_full_merge(c, r); - - parent_pos.node->update( - liter, - replacement->get_paddr()); - parent_pos.node->remove(riter); - - pos.node = replacement; - if (donor_is_left) { - pos.pos += r->get_size(); - parent_pos.pos--; - } - - DEBUGT("l: {}, r: {}, replacement: {}", c.trans, *l, *r, *replacement); - c.cache.retire_extent(c.trans, l); - c.cache.retire_extent(c.trans, r); - } else { - auto [replacement_l, replacement_r, pivot] = - l->make_balanced( - c, - r, - !donor_is_left); - - parent_pos.node->update( - liter, - replacement_l->get_paddr()); - parent_pos.node->replace( - riter, - pivot, - replacement_r->get_paddr()); - - if (donor_is_left) { - assert(parent_pos.pos > 0); - parent_pos.pos--; - } - - auto orig_position = donor_is_left ? 
- l->get_size() + pos.pos : - pos.pos; - if (orig_position < replacement_l->get_size()) { - pos.node = replacement_l; - pos.pos = orig_position; - } else { - parent_pos.pos++; - pos.node = replacement_r; - pos.pos = orig_position - replacement_l->get_size(); - } - - DEBUGT("l: {}, r: {}, replacement_l: {}, replacement_r: {}", - c.trans, *l, *r, *replacement_l, *replacement_r); - c.cache.retire_extent(c.trans, l); - c.cache.retire_extent(c.trans, r); - } - - return seastar::now(); - }); -} - -LBABtree::handle_merge_ret LBABtree::handle_merge( - op_context_t c, - iterator &iter) -{ - LOG_PREFIX(LBATree::handle_merge); - if (iter.get_depth() == 1 || - !iter.leaf.node->below_min_capacity()) { - DEBUGT( - "no need to merge leaf, leaf size {}, depth {}", - c.trans, - iter.leaf.node->get_size(), - iter.get_depth()); - return seastar::now(); - } - - return seastar::do_with( - depth_t{1}, - [FNAME, this, c, &iter](auto &to_merge) { - return trans_intr::repeat( - [FNAME, this, c, &iter, &to_merge] { - DEBUGT( - "merging depth {}", - c.trans, - to_merge); - auto &parent_pos = iter.get_internal(to_merge + 1); - auto merge_fut = handle_merge_iertr::now(); - if (to_merge > 1) { - auto &pos = iter.get_internal(to_merge); - merge_fut = merge_level(c, to_merge, parent_pos, pos); - } else { - auto &pos = iter.leaf; - merge_fut = merge_level(c, to_merge, parent_pos, pos); - } - - return merge_fut.si_then([FNAME, this, c, &iter, &to_merge] { - ++to_merge; - auto &pos = iter.get_internal(to_merge); - if (to_merge == iter.get_depth()) { - if (pos.node->get_size() == 1) { - DEBUGT("collapsing root", c.trans); - c.cache.retire_extent(c.trans, pos.node); - assert(pos.pos == 0); - auto node_iter = pos.get_iter(); - root.set_location( - node_iter->get_val().maybe_relative_to(pos.node->get_paddr())); - iter.internal.pop_back(); - root.set_depth(iter.get_depth()); - c.trans.get_lba_tree_stats().depth = iter.get_depth(); - root_dirty = true; - } else { - DEBUGT("no need to collapse root", c.trans); - } - return seastar::stop_iteration::yes; - } else if (pos.node->below_min_capacity()) { - DEBUGT( - "continuing, next node {} depth {} at min", - c.trans, - *pos.node, - to_merge); - return seastar::stop_iteration::no; - } else { - DEBUGT( - "complete, next node {} depth {} not min", - c.trans, - *pos.node, - to_merge); - return seastar::stop_iteration::yes; - } - }); - }); - }); -} - -LBABtree::update_internal_mapping_ret LBABtree::update_internal_mapping( - op_context_t c, - depth_t depth, - laddr_t laddr, - paddr_t old_addr, - paddr_t new_addr) -{ - LOG_PREFIX(LBATree::update_internal_mapping); - DEBUGT( - "updating laddr {} at depth {} from {} to {}", - c.trans, - laddr, - depth, - old_addr, - new_addr); - - return lower_bound( - c, laddr - ).si_then([=](auto iter) { - assert(iter.get_depth() >= depth); - if (depth == iter.get_depth()) { - DEBUGT("update at root", c.trans); - - if (laddr != 0) { - ERRORT( - "updating root laddr {} at depth {} from {} to {}," - "laddr is not 0", - c.trans, - laddr, - depth, - old_addr, - new_addr, - root.get_location()); - ceph_assert(0 == "impossible"); - } - - if (root.get_location() != old_addr) { - ERRORT( - "updating root laddr {} at depth {} from {} to {}," - "root addr {} does not match", - c.trans, - laddr, - depth, - old_addr, - new_addr, - root.get_location()); - ceph_assert(0 == "impossible"); - } - - root.set_location(new_addr); - root_dirty = true; - } else { - auto &parent = iter.get_internal(depth + 1); - assert(parent.node); - assert(parent.pos < 
parent.node->get_size()); - auto piter = parent.node->iter_idx(parent.pos); - - if (piter->get_key() != laddr) { - ERRORT( - "updating laddr {} at depth {} from {} to {}," - "node {} pos {} val pivot addr {} does not match", - c.trans, - laddr, - depth, - old_addr, - new_addr, - *(parent.node), - parent.pos, - piter->get_key()); - ceph_assert(0 == "impossible"); - } - - - if (piter->get_val() != old_addr) { - ERRORT( - "updating laddr {} at depth {} from {} to {}," - "node {} pos {} val addr {} does not match", - c.trans, - laddr, - depth, - old_addr, - new_addr, - *(parent.node), - parent.pos, - piter->get_val()); - ceph_assert(0 == "impossible"); - } - - CachedExtentRef mut = c.cache.duplicate_for_write( - c.trans, - parent.node - ); - LBAInternalNodeRef mparent = mut->cast(); - mparent->update(piter, new_addr); - - /* Note, iter is now invalid as we didn't update either the parent - * node reference to the new mutable instance nor did we update the - * child pointer to the new node. Not a problem as we'll now just - * destruct it. - */ - } - return seastar::now(); - }); -} -} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree.h deleted file mode 100644 index 0ba459202ef..00000000000 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree.h +++ /dev/null @@ -1,702 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#pragma once - -#include -#include -#include -#include - -#include "crimson/os/seastore/lba_manager.h" -#include "crimson/os/seastore/logging.h" -#include "crimson/os/seastore/seastore_types.h" -#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" - -namespace crimson::os::seastore::lba_manager::btree { - - -class LBABtree { - static constexpr size_t MAX_DEPTH = 16; -public: - using base_iertr = LBAManager::base_iertr; - - class iterator; - using iterator_fut = base_iertr::future; - - using mapped_space_visitor_t = LBAManager::scan_mapped_space_func_t; - - class iterator { - public: - iterator(const iterator &rhs) noexcept : - internal(rhs.internal), leaf(rhs.leaf) {} - iterator(iterator &&rhs) noexcept : - internal(std::move(rhs.internal)), leaf(std::move(rhs.leaf)) {} - - iterator &operator=(const iterator &) = default; - iterator &operator=(iterator &&) = default; - - iterator_fut next( - op_context_t c, - mapped_space_visitor_t *visit=nullptr) const; - - iterator_fut prev(op_context_t c) const; - - void assert_valid() const { - assert(leaf.node); - assert(leaf.pos <= leaf.node->get_size()); - - for (auto &i: internal) { - (void)i; - assert(i.node); - assert(i.pos < i.node->get_size()); - } - } - - depth_t get_depth() const { - return internal.size() + 1; - } - - auto &get_internal(depth_t depth) { - assert(depth > 1); - assert((depth - 2) < internal.size()); - return internal[depth - 2]; - } - - const auto &get_internal(depth_t depth) const { - assert(depth > 1); - assert((depth - 2) < internal.size()); - return internal[depth - 2]; - } - - laddr_t get_key() const { - assert(!is_end()); - return leaf.node->iter_idx(leaf.pos).get_key(); - } - lba_map_val_t get_val() const { - assert(!is_end()); - auto ret = leaf.node->iter_idx(leaf.pos).get_val(); - ret.paddr = ret.paddr.maybe_relative_to(leaf.node->get_paddr()); - return ret; - } - - bool is_end() const { - // external methods may only resolve at a boundary if at end - return at_boundary(); - } - - bool is_begin() const { - for (auto &i: internal) { - if (i.pos != 0) - return 
false; - } - return leaf.pos == 0; - } - - LBAPinRef get_pin() const { - assert(!is_end()); - auto val = get_val(); - auto key = get_key(); - return std::make_unique( - leaf.node, - val.paddr, - lba_node_meta_t{ key, key + val.len, 0 }); - } - - private: - iterator() noexcept {} - iterator(depth_t depth) noexcept : internal(depth - 1) {} - - friend class LBABtree; - static constexpr uint16_t INVALID = std::numeric_limits::max(); - template - struct node_position_t { - typename NodeType::Ref node; - uint16_t pos = INVALID; - - void reset() { - *this = node_position_t{}; - } - - auto get_iter() { - assert(pos != INVALID); - assert(pos < node->get_size()); - return node->iter_idx(pos); - } - }; - boost::container::static_vector< - node_position_t, MAX_DEPTH> internal; - node_position_t leaf; - - bool at_boundary() const { - assert(leaf.pos <= leaf.node->get_size()); - return leaf.pos == leaf.node->get_size(); - } - - using handle_boundary_ertr = base_iertr; - using handle_boundary_ret = handle_boundary_ertr::future<>; - handle_boundary_ret handle_boundary( - op_context_t c, - mapped_space_visitor_t *visitor); - - depth_t check_split() const { - if (!leaf.node->at_max_capacity()) { - return 0; - } - for (depth_t split_from = 1; split_from < get_depth(); ++split_from) { - if (!get_internal(split_from + 1).node->at_max_capacity()) - return split_from; - } - return get_depth(); - } - - depth_t check_merge() const { - if (!leaf.node->below_min_capacity()) { - return 0; - } - for (depth_t merge_from = 1; merge_from < get_depth(); ++merge_from) { - if (!get_internal(merge_from + 1).node->below_min_capacity()) - return merge_from; - } - return get_depth(); - } - }; - - LBABtree(lba_root_t root) : root(root) {} - - bool is_root_dirty() const { - return root_dirty; - } - lba_root_t get_root_undirty() { - ceph_assert(root_dirty); - root_dirty = false; - return root; - } - - /// mkfs - using mkfs_ret = lba_root_t; - static mkfs_ret mkfs(op_context_t c); - - /** - * lower_bound - * - * @param c [in] context - * @param addr [in] addr - * @return least iterator >= key - */ - iterator_fut lower_bound( - op_context_t c, - laddr_t addr, - mapped_space_visitor_t *visit=nullptr) const; - - /** - * upper_bound - * - * @param c [in] context - * @param addr [in] addr - * @return least iterator > key - */ - iterator_fut upper_bound( - op_context_t c, - laddr_t addr - ) const { - return lower_bound( - c, addr - ).si_then([c, addr](auto iter) { - if (!iter.is_end() && iter.get_key() == addr) { - return iter.next(c); - } else { - return iterator_fut( - interruptible::ready_future_marker{}, - iter); - } - }); - } - - /** - * upper_bound_right - * - * @param c [in] context - * @param addr [in] addr - * @return least iterator i s.t. 
i.get_key() + i.get_val().len > key - */ - iterator_fut upper_bound_right( - op_context_t c, - laddr_t addr) const - { - return lower_bound( - c, addr - ).si_then([c, addr](auto iter) { - if (iter.is_begin()) { - return iterator_fut( - interruptible::ready_future_marker{}, - iter); - } else { - return iter.prev( - c - ).si_then([iter, addr](auto prev) { - if ((prev.get_key() + prev.get_val().len) > addr) { - return iterator_fut( - interruptible::ready_future_marker{}, - prev); - } else { - return iterator_fut( - interruptible::ready_future_marker{}, - iter); - } - }); - } - }); - } - - iterator_fut begin(op_context_t c) const { - return lower_bound(c, 0); - } - iterator_fut end(op_context_t c) const { - return upper_bound(c, L_ADDR_MAX); - } - - using iterate_repeat_ret_inner = base_iertr::future< - seastar::stop_iteration>; - template - static base_iertr::future<> iterate_repeat( - op_context_t c, - iterator_fut &&iter_fut, - F &&f, - mapped_space_visitor_t *visitor=nullptr) { - return std::move( - iter_fut - ).si_then([c, visitor, f=std::forward(f)](auto iter) { - return seastar::do_with( - iter, - std::move(f), - [c, visitor](auto &pos, auto &f) { - return trans_intr::repeat( - [c, visitor, &f, &pos] { - return f( - pos - ).si_then([c, visitor, &pos](auto done) { - if (done == seastar::stop_iteration::yes) { - return iterate_repeat_ret_inner( - interruptible::ready_future_marker{}, - seastar::stop_iteration::yes); - } else { - ceph_assert(!pos.is_end()); - return pos.next( - c, visitor - ).si_then([&pos](auto next) { - pos = next; - return iterate_repeat_ret_inner( - interruptible::ready_future_marker{}, - seastar::stop_iteration::no); - }); - } - }); - }); - }); - }); - } - - /** - * insert - * - * Inserts val at laddr with iter as a hint. If element at laddr already - * exists returns iterator to that element unchanged and returns false. - * - * Invalidates all outstanding iterators for this tree on this transaction. - * - * @param c [in] op context - * @param iter [in] hint, insertion constant if immediately prior to iter - * @param laddr [in] addr at which to insert - * @param val [in] val to insert - * @return pair where iter points to element at addr, bool true - * iff element at laddr did not exist. - */ - using insert_iertr = base_iertr; - using insert_ret = insert_iertr::future>; - insert_ret insert( - op_context_t c, - iterator iter, - laddr_t laddr, - lba_map_val_t val - ); - insert_ret insert( - op_context_t c, - laddr_t laddr, - lba_map_val_t val) { - return lower_bound( - c, laddr - ).si_then([this, c, laddr, val](auto iter) { - return insert(c, iter, laddr, val); - }); - } - - /** - * update - * - * Invalidates all outstanding iterators for this tree on this transaction. - * - * @param c [in] op context - * @param iter [in] iterator to element to update, must not be end - * @param val [in] val with which to update - * @return iterator to newly updated element - */ - using update_iertr = base_iertr; - using update_ret = update_iertr::future; - update_ret update( - op_context_t c, - iterator iter, - lba_map_val_t val); - - /** - * remove - * - * Invalidates all outstanding iterators for this tree on this transaction. 
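This invalidation contract, stated for insert, update and remove alike, dictates caller structure: keep only the iterator a mutator hands back, or re-run lower_bound after every mutation; never advance a pre-mutation iterator. Shape only, since the real calls are futurized and Tree here is hypothetical:

  // Erase every entry in [first, last) by re-seeking after each remove
  // instead of stepping a now-invalid iterator.
  template <typename Tree, typename Key>
  void erase_range(Tree &t, Key first, Key last) {
    for (auto it = t.lower_bound(first);
         !it.is_end() && it.get_key() < last;
         it = t.lower_bound(first)) {
      first = it.get_key();  // resume point: lower_bound skips past it
      t.remove(it);
    }
  }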
- * - * @param c [in] op context - * @param iter [in] iterator to element to remove, must not be end - */ - using remove_iertr = base_iertr; - using remove_ret = remove_iertr::future<>; - remove_ret remove( - op_context_t c, - iterator iter); - - /** - * init_cached_extent - * - * Checks whether e is live (reachable from lba tree) and drops or initializes - * accordingly. - * - * Returns if e is live. - */ - using init_cached_extent_iertr = base_iertr; - using init_cached_extent_ret = init_cached_extent_iertr::future; - init_cached_extent_ret init_cached_extent(op_context_t c, CachedExtentRef e); - - /// get_leaf_if_live: get leaf node at laddr/addr if still live - using get_leaf_if_live_iertr = base_iertr; - using get_leaf_if_live_ret = get_leaf_if_live_iertr::future; - get_leaf_if_live_ret get_leaf_if_live( - op_context_t c, - paddr_t addr, - laddr_t laddr, - seastore_off_t len); - - /// get_internal_if_live: get internal node at laddr/addr if still live - using get_internal_if_live_iertr = base_iertr; - using get_internal_if_live_ret = get_internal_if_live_iertr::future; - get_internal_if_live_ret get_internal_if_live( - op_context_t c, - paddr_t addr, - laddr_t laddr, - seastore_off_t len); - - /** - * rewrite_lba_extent - * - * Rewrites a fresh copy of extent into transaction and updates internal - * references. - */ - using rewrite_lba_extent_iertr = base_iertr; - using rewrite_lba_extent_ret = rewrite_lba_extent_iertr::future<>; - rewrite_lba_extent_ret rewrite_lba_extent(op_context_t c, CachedExtentRef e); - -private: - lba_root_t root; - bool root_dirty = false; - - using get_internal_node_iertr = base_iertr; - using get_internal_node_ret = get_internal_node_iertr::future; - static get_internal_node_ret get_internal_node( - op_context_t c, - depth_t depth, - paddr_t offset, - laddr_t begin, - laddr_t end); - - using get_leaf_node_iertr = base_iertr; - using get_leaf_node_ret = get_leaf_node_iertr::future; - static get_leaf_node_ret get_leaf_node( - op_context_t c, - paddr_t offset, - laddr_t begin, - laddr_t end); - - using lookup_root_iertr = base_iertr; - using lookup_root_ret = lookup_root_iertr::future<>; - lookup_root_ret lookup_root( - op_context_t c, - iterator &iter, - mapped_space_visitor_t *visitor) const { - if (root.get_depth() > 1) { - return get_internal_node( - c, - root.get_depth(), - root.get_location(), - 0, - L_ADDR_MAX - ).si_then([this, visitor, &iter](LBAInternalNodeRef root_node) { - iter.get_internal(root.get_depth()).node = root_node; - if (visitor) (*visitor)(root_node->get_paddr(), root_node->get_length()); - return lookup_root_iertr::now(); - }); - } else { - return get_leaf_node( - c, - root.get_location(), - 0, - L_ADDR_MAX - ).si_then([visitor, &iter](LBALeafNodeRef root_node) { - iter.leaf.node = root_node; - if (visitor) (*visitor)(root_node->get_paddr(), root_node->get_length()); - return lookup_root_iertr::now(); - }); - } - } - - using lookup_internal_level_iertr = base_iertr; - using lookup_internal_level_ret = lookup_internal_level_iertr::future<>; - template - static lookup_internal_level_ret lookup_internal_level( - op_context_t c, - depth_t depth, - iterator &iter, - F &f, - mapped_space_visitor_t *visitor - ) { - assert(depth > 1); - auto &parent_entry = iter.get_internal(depth + 1); - auto parent = parent_entry.node; - auto node_iter = parent->iter_idx(parent_entry.pos); - auto next_iter = node_iter + 1; - auto begin = node_iter->get_key(); - auto end = next_iter == parent->end() - ? 
parent->get_node_meta().end - : next_iter->get_key(); - return get_internal_node( - c, - depth, - node_iter->get_val().maybe_relative_to(parent->get_paddr()), - begin, - end - ).si_then([depth, visitor, &iter, &f](LBAInternalNodeRef node) { - auto &entry = iter.get_internal(depth); - entry.node = node; - auto node_iter = f(*node); - assert(node_iter != node->end()); - entry.pos = node_iter->get_offset(); - if (visitor) (*visitor)(node->get_paddr(), node->get_length()); - return seastar::now(); - }); - } - - using lookup_leaf_iertr = base_iertr; - using lookup_leaf_ret = lookup_leaf_iertr::future<>; - template - static lookup_internal_level_ret lookup_leaf( - op_context_t c, - iterator &iter, - F &f, - mapped_space_visitor_t *visitor - ) { - auto &parent_entry = iter.get_internal(2); - auto parent = parent_entry.node; - assert(parent); - auto node_iter = parent->iter_idx(parent_entry.pos); - auto next_iter = node_iter + 1; - auto begin = node_iter->get_key(); - auto end = next_iter == parent->end() - ? parent->get_node_meta().end - : next_iter->get_key(); - - return get_leaf_node( - c, - node_iter->get_val().maybe_relative_to(parent->get_paddr()), - begin, - end - ).si_then([visitor, &iter, &f](LBALeafNodeRef node) { - iter.leaf.node = node; - auto node_iter = f(*node); - iter.leaf.pos = node_iter->get_offset(); - if (visitor) (*visitor)(node->get_paddr(), node->get_length()); - return seastar::now(); - }); - } - - /** - * lookup_depth_range - * - * Performs node lookups on depths [from, to) using li and ll to - * specify the target at each level. Note, may leave the iterator - * at_boundary(), call handle_boundary() prior to returning out - * of LBABtree. - */ - using lookup_depth_range_iertr = base_iertr; - using lookup_depth_range_ret = lookup_depth_range_iertr::future<>; - template - static lookup_depth_range_ret lookup_depth_range( - op_context_t c, ///< [in] context - iterator &iter, ///< [in,out] iterator to populate - depth_t from, ///< [in] from inclusive - depth_t to, ///< [in] to exclusive, (to <= from, to == from is a noop) - LI &li, ///< [in] internal->iterator - LL &ll, ///< [in] leaf->iterator - mapped_space_visitor_t *visitor ///< [in] mapped space visitor - ) { - LOG_PREFIX(LBATree::lookup_depth_range); - SUBDEBUGT(seastore_lba_details, "{} -> {}", c.trans, from, to); - return seastar::do_with( - from, - [c, to, visitor, &iter, &li, &ll](auto &d) { - return trans_intr::repeat( - [c, to, visitor, &iter, &li, &ll, &d] { - if (d > to) { - return [&] { - if (d > 1) { - return lookup_internal_level( - c, - d, - iter, - li, - visitor); - } else { - assert(d == 1); - return lookup_leaf( - c, - iter, - ll, - visitor); - } - }().si_then([&d] { - --d; - return lookup_depth_range_iertr::make_ready_future< - seastar::stop_iteration - >(seastar::stop_iteration::no); - }); - } else { - return lookup_depth_range_iertr::make_ready_future< - seastar::stop_iteration - >(seastar::stop_iteration::yes); - } - }); - }); - } - - using lookup_iertr = base_iertr; - using lookup_ret = lookup_iertr::future; - template - lookup_ret lookup( - op_context_t c, - LI &&lookup_internal, - LL &&lookup_leaf, - mapped_space_visitor_t *visitor - ) const { - LOG_PREFIX(LBATree::lookup); - return seastar::do_with( - iterator{root.get_depth()}, - std::forward
<LI>(lookup_internal), - std::forward<LL>(lookup_leaf), - [FNAME, this, visitor, c](auto &iter, auto &li, auto &ll) { - return lookup_root( - c, iter, visitor - ).si_then([FNAME, this, visitor, c, &iter, &li, &ll] { - if (iter.get_depth() > 1) { - auto &root_entry = *(iter.internal.rbegin()); - root_entry.pos = li(*(root_entry.node)).get_offset(); - } else { - auto &root_entry = iter.leaf; - auto riter = ll(*(root_entry.node)); - root_entry.pos = riter->get_offset(); - } - SUBDEBUGT(seastore_lba_details, "got root, depth {}", c.trans, root.get_depth()); - return lookup_depth_range( - c, - iter, - root.get_depth() - 1, - 0, - li, - ll, - visitor - ).si_then([c, visitor, &iter] { - if (iter.at_boundary()) { - return iter.handle_boundary(c, visitor); - } else { - return lookup_iertr::now(); - } - }); - }).si_then([&iter] { - return std::move(iter); - }); - }); - } - - /** - * find_insertion - * - * Prepare iter for insertion. iter should begin pointing at - * the valid insertion point (lower_bound(laddr)). - * - * Upon completion, iter will point at the - * position at which laddr should be inserted. iter may, upon completion, - * point at the end of a leaf other than the end leaf if that's the correct - * insertion point. - */ - using find_insertion_iertr = base_iertr; - using find_insertion_ret = find_insertion_iertr::future<>; - static find_insertion_ret find_insertion( - op_context_t c, - laddr_t laddr, - iterator &iter); - - /** - * handle_split - * - * Split nodes in iter as needed for insertion. First, scan iter from leaf - * to find first non-full level. Then, split from there towards leaf. - * - * Upon completion, iter will point at the newly split insertion point. As - * with find_insertion, iter's leaf pointer may be end without iter being - * end. 
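check_split earlier in this header and the handle_split described here pair up as scan-then-split: find the shallowest ancestor with spare capacity, and if even the root is full, grow the tree by one level before splitting back down toward the leaf. The scan in isolation (illustrative):

  #include <vector>
  #include <cstddef>

  // full[0] is the leaf, full[i] the internal node at depth i + 1 on
  // the search path. 0 means nothing splits; a result equal to the
  // tree depth means the root itself splits and the tree grows.
  size_t levels_to_split(const std::vector<bool> &full) {
    size_t n = 0;
    while (n < full.size() && full[n])
      ++n;
    return n;
  }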
-
-  /**
-   * handle_split
-   *
-   * Split nodes in iter as needed for insertion.  First, scan iter from leaf
-   * to find the first non-full level.  Then, split from there towards the leaf.
-   *
-   * Upon completion, iter will point at the newly split insertion point.  As
-   * with find_insertion, iter's leaf pointer may be end without iter being
-   * end.
-   */
-  using handle_split_iertr = base_iertr;
-  using handle_split_ret = handle_split_iertr::future<>;
-  handle_split_ret handle_split(
-    op_context_t c,
-    iterator &iter);
-
-  using handle_merge_iertr = base_iertr;
-  using handle_merge_ret = handle_merge_iertr::future<>;
-  handle_merge_ret handle_merge(
-    op_context_t c,
-    iterator &iter);
-
-  using update_internal_mapping_iertr = base_iertr;
-  using update_internal_mapping_ret = update_internal_mapping_iertr::future<>;
-  update_internal_mapping_ret update_internal_mapping(
-    op_context_t c,
-    depth_t depth,
-    laddr_t laddr,
-    paddr_t old_addr,
-    paddr_t new_addr);
-
-  template <typename T>
-  using node_position_t = iterator::node_position_t<T>;
-
-  template <typename NodeType>
-  friend base_iertr::future<typename NodeType::Ref> get_node(
-    op_context_t c,
-    depth_t depth,
-    paddr_t addr,
-    laddr_t begin,
-    laddr_t end);
-
-  template <typename NodeType>
-  friend handle_merge_ret merge_level(
-    op_context_t c,
-    depth_t depth,
-    node_position_t<NodeType> &parent_pos,
-    node_position_t<NodeType> &pos);
-};
-
-}
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
index 683efbed46a..8b2530e7c91 100644
--- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
@@ -16,19 +16,14 @@
 #include "crimson/os/seastore/seastore_types.h"
 #include "crimson/os/seastore/cache.h"
 #include "crimson/os/seastore/cached_extent.h"
-#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
-#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
+
+#include "crimson/os/seastore/btree/btree_range_pin.h"
+#include "crimson/os/seastore/btree/fixed_kv_btree.h"
 
 namespace crimson::os::seastore::lba_manager::btree {
 
 using base_iertr = LBAManager::base_iertr;
 
-struct op_context_t {
-  Cache &cache;
-  Transaction &trans;
-  btree_pin_set_t *pins = nullptr;
-};
-
 /**
  * lba_map_val_t
  *
@@ -57,15 +52,12 @@ WRITE_EQ_OPERATORS_4(
 
 std::ostream& operator<<(std::ostream& out, const lba_map_val_t&);
 
-class BtreeLBAPin;
-using BtreeLBAPinRef = std::unique_ptr<BtreeLBAPin>;
-
 constexpr size_t LBA_BLOCK_SIZE = 4096;
 
 /**
  * lba_node_meta_le_t
  *
- * On disk layout for lba_node_meta_t
+ * On disk layout for fixed_kv_node_meta_t
 */
 struct lba_node_meta_le_t {
   laddr_le_t begin = laddr_le_t(0);
@@ -74,13 +66,13 @@ struct lba_node_meta_le_t {
   lba_node_meta_le_t() = default;
   lba_node_meta_le_t(const lba_node_meta_le_t &) = default;
-  explicit lba_node_meta_le_t(const lba_node_meta_t &val)
+  explicit lba_node_meta_le_t(const fixed_kv_node_meta_t<laddr_t> &val)
     : begin(ceph_le64(val.begin)),
       end(ceph_le64(val.end)),
       depth(init_depth_le(val.depth)) {}
 
-  operator lba_node_meta_t() const {
-    return lba_node_meta_t{ begin, end, depth };
+  operator fixed_kv_node_meta_t<laddr_t>() const {
+    return fixed_kv_node_meta_t<laddr_t>{ begin, end, depth };
   }
 };
 
@@ -92,13 +84,13 @@ struct lba_node_meta_le_t {
 struct LBANode : CachedExtent {
   using LBANodeRef = TCachedExtentRef<LBANode>;
 
-  btree_range_pin_t pin;
+  btree_range_pin_t<laddr_t> pin;
 
   LBANode(ceph::bufferptr &&ptr) : CachedExtent(std::move(ptr)), pin(this) {}
   LBANode(const LBANode &rhs) : CachedExtent(rhs), pin(rhs.pin, this) {}
 
-  virtual lba_node_meta_t get_node_meta() const = 0;
+  virtual fixed_kv_node_meta_t<laddr_t> get_node_meta() const = 0;
 
   virtual ~LBANode() = default;
 
@@ -145,7 +137,7 @@ struct LBAInternalNode
   : LBANode,
     common::FixedKVNodeLayout<
       INTERNAL_NODE_CAPACITY,
-      lba_node_meta_t, lba_node_meta_le_t,
+      fixed_kv_node_meta_t<laddr_t>, lba_node_meta_le_t,
       laddr_t, laddr_le_t,
       paddr_t, paddr_le_t> {
   using Ref = TCachedExtentRef<LBAInternalNode>;
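
[lba_node_meta_le_t above pairs an in-memory meta struct with a packed little-endian on-disk twin, converting via a constructor and a conversion operator. A minimal standalone sketch of that pattern; ceph_le64/laddr_le_t/depth_le_t are reduced to plain integers here, which loses the endianness guarantee the real types provide.]

  #include <cstdint>

  template <typename bound_t>
  struct meta_t {            // stand-in for fixed_kv_node_meta_t<bound_t>
    bound_t begin = 0;
    bound_t end = 0;
    std::uint32_t depth = 0;
  };

  struct meta_le_t {         // stand-in for lba_node_meta_le_t
    // The real struct stores explicit little-endian types so the on-disk
    // layout is identical on every architecture.
    std::uint64_t begin = 0;
    std::uint64_t end = 0;
    std::uint32_t depth = 0;

    meta_le_t() = default;
    explicit meta_le_t(const meta_t<std::uint64_t> &val)
      : begin(val.begin), end(val.end), depth(val.depth) {}

    operator meta_t<std::uint64_t>() const {
      return meta_t<std::uint64_t>{begin, end, depth};
    }
  };
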
@@ -157,7 +149,7 @@ struct LBAInternalNode
 
   static constexpr extent_types_t TYPE = extent_types_t::LADDR_INTERNAL;
 
-  lba_node_meta_t get_node_meta() const { return get_meta(); }
+  fixed_kv_node_meta_t<laddr_t> get_node_meta() const { return get_meta(); }
 
   CachedExtentRef duplicate_for_write() final {
     assert(delta_buffer.empty());
@@ -207,7 +199,7 @@ struct LBAInternalNode
   }
 
   std::tuple<Ref, Ref, laddr_t>
-  make_split_children(op_context_t c) {
+  make_split_children(op_context_t<laddr_t> c) {
     auto left = c.cache.alloc_new_extent<LBAInternalNode>(
       c.trans, LBA_BLOCK_SIZE);
     auto right = c.cache.alloc_new_extent<LBAInternalNode>(
@@ -222,7 +214,7 @@ struct LBAInternalNode
   }
 
   Ref make_full_merge(
-    op_context_t c,
+    op_context_t<laddr_t> c,
     Ref &right) {
     auto replacement = c.cache.alloc_new_extent<LBAInternalNode>(
       c.trans, LBA_BLOCK_SIZE);
@@ -233,7 +225,7 @@ struct LBAInternalNode
 
   std::tuple<Ref, Ref, laddr_t> make_balanced(
-    op_context_t c,
+    op_context_t<laddr_t> c,
     Ref &_right,
     bool prefer_left) {
     ceph_assert(_right->get_type() == get_type());
@@ -383,7 +375,7 @@ struct LBALeafNode
   : LBANode,
     common::FixedKVNodeLayout<
       LEAF_NODE_CAPACITY,
-      lba_node_meta_t, lba_node_meta_le_t,
+      fixed_kv_node_meta_t<laddr_t>, lba_node_meta_le_t,
       laddr_t, laddr_le_t,
       lba_map_val_t, lba_map_val_le_t> {
   using Ref = TCachedExtentRef<LBALeafNode>;
@@ -395,7 +387,7 @@ struct LBALeafNode
 
   static constexpr extent_types_t TYPE = extent_types_t::LADDR_LEAF;
 
-  lba_node_meta_t get_node_meta() const { return get_meta(); }
+  fixed_kv_node_meta_t<laddr_t> get_node_meta() const { return get_meta(); }
 
   CachedExtentRef duplicate_for_write() final {
     assert(delta_buffer.empty());
@@ -438,7 +430,7 @@ struct LBALeafNode
 
   std::tuple<Ref, Ref, laddr_t>
-  make_split_children(op_context_t c) {
+  make_split_children(op_context_t<laddr_t> c) {
     auto left = c.cache.alloc_new_extent<LBALeafNode>(
       c.trans, LBA_BLOCK_SIZE);
     auto right = c.cache.alloc_new_extent<LBALeafNode>(
@@ -453,7 +445,7 @@ struct LBALeafNode
   }
 
   Ref make_full_merge(
-    op_context_t c,
+    op_context_t<laddr_t> c,
     Ref &right) {
     auto replacement = c.cache.alloc_new_extent<LBALeafNode>(
       c.trans, LBA_BLOCK_SIZE);
@@ -464,7 +456,7 @@ struct LBALeafNode
 
   std::tuple<Ref, Ref, laddr_t> make_balanced(
-    op_context_t c,
+    op_context_t<laddr_t> c,
     Ref &_right,
     bool prefer_left) {
     ceph_assert(_right->get_type() == get_type());
diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc
index 62422997915..e59ad3dee7e 100644
--- a/src/crimson/os/seastore/object_data_handler.cc
+++ b/src/crimson/os/seastore/object_data_handler.cc
@@ -69,10 +69,10 @@ ObjectDataHandler::write_ret do_removals(
       LOG_PREFIX(object_data_handler.cc::do_removals);
       DEBUGT("decreasing ref: {}",
             ctx.t,
-            pin->get_laddr());
+            pin->get_key());
       return ctx.tm.dec_ref(
         ctx.t,
-        pin->get_laddr()
+        pin->get_key()
       ).si_then(
         [](auto){},
         ObjectDataHandler::write_iertr::pass_further{},
@@ -129,14 +129,14 @@ ObjectDataHandler::write_ret do_insertions(
          region.len
        ).si_then([FNAME, ctx, &region](auto pin) {
          ceph_assert(pin->get_length() == region.len);
-         if (pin->get_laddr() != region.addr) {
+         if (pin->get_key() != region.addr) {
            ERRORT(
              "inconsistent laddr: pin: {} region {}",
              ctx.t,
-             pin->get_laddr(),
+             pin->get_key(),
              region.addr);
          }
-         ceph_assert(pin->get_laddr() == region.addr);
+         ceph_assert(pin->get_key() == region.addr);
          return ObjectDataHandler::write_iertr::now();
        });
      }
@@ -156,7 +156,7 @@ using split_ret_bare = std::pair<
 using split_ret = get_iertr::future<split_ret_bare>;
 split_ret split_pin_left(context_t ctx, LBAPinRef &pin, laddr_t offset)
 {
-  const auto pin_offset = pin->get_laddr();
+  const auto pin_offset = pin->get_key();
   assert_aligned(pin_offset);
   ceph_assert(offset >= pin_offset);
   if (offset == pin_offset) {
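
[The mechanical get_laddr() -> get_key() rename running through this file reflects the pin interface becoming key-generic so the same machinery can later serve a backref tree keyed by paddr. A tiny sketch of the renamed accessor surface; the struct and its end() helper are hypothetical stand-ins, not the LBAPin interface.]

  #include <cstdint>

  using laddr = std::uint64_t;

  struct MappingPin {
    laddr key;          // start of the mapped range
    std::uint32_t len;  // length of the mapping

    laddr get_key() const { return key; }             // was get_laddr()
    std::uint32_t get_length() const { return len; }
    laddr end() const { return key + len; }           // hypothetical helper
  };
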
@@ -181,7 +181,7 @@ split_ret split_pin_left(context_t ctx, LBAPinRef &pin, laddr_t offset)
     );
   } else {
     // Data, return up to offset to prepend
-    auto to_prepend = offset - pin->get_laddr();
+    auto to_prepend = offset - pin->get_key();
     return read_pin(ctx, pin->duplicate()
     ).si_then([to_prepend](auto extent) {
       return get_iertr::make_ready_future(
@@ -194,8 +194,8 @@ split_ret split_pin_left(context_t ctx, LBAPinRef &pin, laddr_t offset)
 
 /// Reverse of split_pin_left
 split_ret split_pin_right(context_t ctx, LBAPinRef &pin, laddr_t end)
 {
-  const auto pin_begin = pin->get_laddr();
-  const auto pin_end = pin->get_laddr() + pin->get_length();
+  const auto pin_begin = pin->get_key();
+  const auto pin_end = pin->get_key() + pin->get_length();
   assert_aligned(pin_end);
   ceph_assert(pin_end >= end);
   if (end == pin_end) {
@@ -273,7 +273,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation(
   ).si_then([max_object_size=max_object_size, &object_data](auto pin) {
     ceph_assert(pin->get_length() == max_object_size);
     object_data.update_reserved(
-      pin->get_laddr(),
+      pin->get_key(),
       pin->get_length());
     return write_iertr::now();
   });
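
[split_pin_left/split_pin_right above reduce to simple interval arithmetic on the pin's [get_key(), get_key() + get_length()) range. A standalone sketch of that arithmetic, with plain integers standing in for laddr_t and the pin type.]

  #include <cassert>
  #include <cstdint>

  using laddr = std::uint64_t;

  struct Range { laddr begin; laddr end; };  // [begin, end)

  // Bytes of the pin preceding `offset` (what split_pin_left prepends).
  laddr left_part(Range pin, laddr offset) {
    assert(offset >= pin.begin && offset <= pin.end);
    return offset - pin.begin;
  }

  // Bytes of the pin at or after `end` (what split_pin_right appends).
  laddr right_part(Range pin, laddr end) {
    assert(end >= pin.begin && end <= pin.end);
    return pin.end - end;
  }
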
@@ -302,17 +302,17 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
       _pins.swap(pins);
       ceph_assert(pins.size());
       auto &pin = *pins.front();
-      ceph_assert(pin.get_laddr() >= object_data.get_reserved_data_base());
+      ceph_assert(pin.get_key() >= object_data.get_reserved_data_base());
       ceph_assert(
-        pin.get_laddr() <= object_data.get_reserved_data_base() + size);
-      auto pin_offset = pin.get_laddr() -
+        pin.get_key() <= object_data.get_reserved_data_base() + size);
+      auto pin_offset = pin.get_key() -
         object_data.get_reserved_data_base();
-      if ((pin.get_laddr() == (object_data.get_reserved_data_base() + size)) ||
+      if ((pin.get_key() == (object_data.get_reserved_data_base() + size)) ||
          (pin.get_paddr().is_zero())) {
        /* First pin is exactly at the boundary or is a zero pin.  Either way,
         * remove all pins and add a single zero pin to the end.
         */
        to_write.emplace_back(
-         pin.get_laddr(),
+         pin.get_key(),
          object_data.get_reserved_data_len() - pin_offset);
        return clear_iertr::now();
       } else {
@@ -332,7 +332,7 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
        ));
        bl.append_zero(p2roundup(size, ctx.tm.get_block_size()) - size);
        to_write.emplace_back(
-         pin.get_laddr(),
+         pin.get_key(),
          bl);
        to_write.emplace_back(
          object_data.get_reserved_data_base() +
@@ -387,9 +387,9 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
     offset,
     bl.length());
   ceph_assert(pins.size() >= 1);
-  auto pin_begin = pins.front()->get_laddr();
+  auto pin_begin = pins.front()->get_key();
   ceph_assert(pin_begin <= offset);
-  auto pin_end = pins.back()->get_laddr() + pins.back()->get_length();
+  auto pin_end = pins.back()->get_key() + pins.back()->get_length();
   ceph_assert(pin_end >= (offset + bl.length()));
 
   return split_pin_left(
@@ -500,7 +500,7 @@ ObjectDataHandler::read_ret ObjectDataHandler::read(
   ).si_then([ctx, loffset, len, &ret](auto _pins) {
     // offset~len falls within reserved region and len > 0
     ceph_assert(_pins.size() >= 1);
-    ceph_assert((*_pins.begin())->get_laddr() <= loffset);
+    ceph_assert((*_pins.begin())->get_key() <= loffset);
     return seastar::do_with(
       std::move(_pins),
       loffset,
@@ -511,9 +511,9 @@ ObjectDataHandler::read_ret ObjectDataHandler::read(
        -> read_iertr::future<> {
        ceph_assert(current <= (loffset + len));
        ceph_assert(
-         (loffset + len) > pin->get_laddr());
+         (loffset + len) > pin->get_key());
        laddr_t end = std::min(
-         pin->get_laddr() + pin->get_length(),
+         pin->get_key() + pin->get_length(),
          loffset + len);
        if (pin->get_paddr().is_zero()) {
          ceph_assert(end > current); // See LBAManager::get_mappings
@@ -583,12 +583,12 @@ ObjectDataHandler::fiemap_ret ObjectDataHandler::fiemap(
      len
    ).si_then([loffset, len, &object_data, &ret](auto &&pins) {
      ceph_assert(pins.size() >= 1);
-     ceph_assert((*pins.begin())->get_laddr() <= loffset);
+     ceph_assert((*pins.begin())->get_key() <= loffset);
      for (auto &&i: pins) {
        if (!(i->get_paddr().is_zero())) {
-         auto ret_left = std::max(i->get_laddr(), loffset);
+         auto ret_left = std::max(i->get_key(), loffset);
          auto ret_right = std::min(
-           i->get_laddr() + i->get_length(),
+           i->get_key() + i->get_length(),
            loffset + len);
          assert(ret_right > ret_left);
          ret.emplace(
diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h
index e6ddbb22150..646f78b76af 100644
--- a/src/crimson/os/seastore/seastore_types.h
+++ b/src/crimson/os/seastore/seastore_types.h
@@ -1081,22 +1081,22 @@ public:
 };
 
 /**
- * lba_root_t
+ * phy_tree_root_t
 */
-class __attribute__((packed)) lba_root_t {
+class __attribute__((packed)) phy_tree_root_t {
   paddr_le_t root_addr;
   depth_le_t depth = init_extent_len_le(0);
 
 public:
-  lba_root_t() = default;
+  phy_tree_root_t() = default;
 
-  lba_root_t(paddr_t addr, depth_t depth)
+  phy_tree_root_t(paddr_t addr, depth_t depth)
     : root_addr(addr), depth(init_depth_le(depth)) {}
 
-  lba_root_t(const lba_root_t &o) = default;
-  lba_root_t(lba_root_t &&o) = default;
-  lba_root_t &operator=(const lba_root_t &o) = default;
-  lba_root_t &operator=(lba_root_t &&o) = default;
+  phy_tree_root_t(const phy_tree_root_t &o) = default;
+  phy_tree_root_t(phy_tree_root_t &&o) = default;
+  phy_tree_root_t &operator=(const phy_tree_root_t &o) = default;
+  phy_tree_root_t &operator=(phy_tree_root_t &&o) = default;
 
   paddr_t get_location() const {
     return root_addr;
@@ -1188,6 +1188,7 @@ public:
   }
 };
 
+using lba_root_t = phy_tree_root_t;
 
 /**
  * root_t
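
[The seastore_types.h hunk above is a rename-plus-alias: the root type gets a tree-agnostic name while the old spelling survives as a using-alias, so LBA callers keep compiling and backref code can reuse the same root. A minimal sketch of the pattern with trimmed, illustrative members (not the packed on-disk struct).]

  #include <cstdint>

  class phy_tree_root {           // stand-in for phy_tree_root_t
    std::uint64_t root_addr = 0;  // paddr_le_t in the real struct
    std::uint8_t depth = 0;       // depth_le_t in the real struct
  public:
    phy_tree_root() = default;
    phy_tree_root(std::uint64_t addr, std::uint8_t d)
      : root_addr(addr), depth(d) {}
    std::uint64_t get_location() const { return root_addr; }
  };

  // Old name kept alive; existing callers need no source changes.
  using lba_root = phy_tree_root;
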
diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc
index e38c1ee9e05..61170ac8b62 100644
--- a/src/crimson/os/seastore/transaction_manager.cc
+++ b/src/crimson/os/seastore/transaction_manager.cc
@@ -481,14 +481,14 @@ TransactionManager::get_extent_if_live_ret TransactionManager::get_extent_if_liv
   return lba_manager->get_mapping(
     t, laddr
   ).si_then([=, &t] (LBAPinRef pin) -> inner_ret {
-    ceph_assert(pin->get_laddr() == laddr);
+    ceph_assert(pin->get_key() == laddr);
     if (pin->get_paddr() == addr) {
       if (pin->get_length() != (extent_len_t)len) {
        ERRORT(
          "Invalid pin {}~{} {} found for "
          "extent {} {}~{} {}",
          t,
-         pin->get_laddr(),
+         pin->get_key(),
          pin->get_length(),
          pin->get_paddr(),
          type,
diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc
index 20342c29c78..a8275715474 100644
--- a/src/test/crimson/seastore/test_btree_lba_manager.cc
+++ b/src/test/crimson/seastore/test_btree_lba_manager.cc
@@ -141,7 +141,7 @@ struct lba_btree_test : btree_test_base {
   std::map<laddr_t, lba_map_val_t> check;
 
   auto get_op_context(Transaction &t) {
-    return op_context_t{*cache, t};
+    return op_context_t<laddr_t>{*cache, t};
   }
 
   LBAManager::mkfs_ret test_structure_setup(Transaction &t) final {
@@ -376,11 +376,11 @@ struct btree_lba_manager_test : btree_test_base {
     }).unsafe_get0();
     logger().debug("alloc'd: {}", *ret);
     EXPECT_EQ(len, ret->get_length());
-    auto [b, e] = get_overlap(t, ret->get_laddr(), len);
+    auto [b, e] = get_overlap(t, ret->get_key(), len);
     EXPECT_EQ(b, e);
     t.mappings.emplace(
       std::make_pair(
-       ret->get_laddr(),
+       ret->get_key(),
        test_extent_t{
          ret->get_paddr(),
          ret->get_length(),
@@ -474,7 +474,7 @@ struct btree_lba_manager_test : btree_test_base {
       EXPECT_EQ(ret_list.size(), 1);
       auto &ret = *ret_list.begin();
       EXPECT_EQ(i.second.addr, ret->get_paddr());
-      EXPECT_EQ(laddr, ret->get_laddr());
+      EXPECT_EQ(laddr, ret->get_key());
       EXPECT_EQ(len, ret->get_length());
 
       auto ret_pin = with_trans_intr(
@@ -484,7 +484,7 @@ struct btree_lba_manager_test : btree_test_base {
          t, laddr);
       }).unsafe_get0();
       EXPECT_EQ(i.second.addr, ret_pin->get_paddr());
-      EXPECT_EQ(laddr, ret_pin->get_laddr());
+      EXPECT_EQ(laddr, ret_pin->get_key());
       EXPECT_EQ(len, ret_pin->get_length());
     }
     with_trans_intr(
@@ -554,8 +554,8 @@ TEST_F(btree_lba_manager_test, force_split_merge)
          check_mappings(t);
          check_mappings();
        }
-       incref_mapping(t, ret->get_laddr());
-       decref_mapping(t, ret->get_laddr());
+       incref_mapping(t, ret->get_key());
+       decref_mapping(t, ret->get_key());
      }
    logger().debug("submitting transaction");
    submit_test_transaction(std::move(t));
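
[The test hunk above shows op_context_t becoming op_context_t<laddr_t>: the context is now parameterized on the btree's bound (key) type so the same fixed-kv machinery can later serve a paddr_t-keyed backref tree. A standalone sketch of that parameterization; Cache, Transaction, and the pin set are reduced to empty stand-ins, not Crimson's classes.]

  #include <cstdint>

  struct Cache {};
  struct Transaction {};
  template <typename bound_t> struct btree_pin_set { /* elided */ };

  template <typename bound_t>
  struct op_context {  // stand-in for op_context_t<bound_t>
    Cache &cache;
    Transaction &trans;
    btree_pin_set<bound_t> *pins = nullptr;
  };

  int main() {
    Cache cache;
    Transaction t;
    using laddr = std::uint64_t;
    // Mirrors the test's op_context_t<laddr_t>{*cache, t} construction.
    auto c = op_context<laddr>{cache, t};
    (void)c;
    return 0;
  }
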