From dc4fe22f6b9bb692712104fa841731af0ef20090 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 13 Jul 2021 02:02:18 +0000 Subject: [PATCH] crimson/os/seastore/lba_manager: replace btree implementation Replace previous implementation with one based around an internal iterator interface. Besides simplifying the implementation and removing duplicate lookups in the allocation pathway, this implementation should correct a design problem in the prior implementation wherein LBALeafNode::find_hole couldn't see the first element of the subsequent node and therefore assumed that there was one at get_meta().end. This patch removes the btree logic from lba_btree_node_impl.* leaving the LBAInternalNode and LBALeafNode layout in lba_btree_node.*. lba_btree.h/cc now have the main btree update/query logic. Signed-off-by: Samuel Just --- src/crimson/os/seastore/CMakeLists.txt | 3 +- src/crimson/os/seastore/cache.cc | 2 +- src/crimson/os/seastore/lba_manager.h | 12 - .../lba_manager/btree/btree_lba_manager.cc | 521 ++++------- .../lba_manager/btree/btree_lba_manager.h | 100 +- .../lba_manager/btree/btree_range_pin.h | 1 + .../seastore/lba_manager/btree/lba_btree.cc | 863 ++++++++++++++++++ .../os/seastore/lba_manager/btree/lba_btree.h | 608 ++++++++++++ .../lba_manager/btree/lba_btree_node.cc | 64 ++ .../lba_manager/btree/lba_btree_node.h | 640 +++++++++---- .../lba_manager/btree/lba_btree_node_impl.cc | 792 ---------------- .../lba_manager/btree/lba_btree_node_impl.h | 557 ----------- .../seastore/test_btree_lba_manager.cc | 28 - 13 files changed, 2252 insertions(+), 1939 deletions(-) create mode 100644 src/crimson/os/seastore/lba_manager/btree/lba_btree.cc create mode 100644 src/crimson/os/seastore/lba_manager/btree/lba_btree.h create mode 100644 src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc delete mode 100644 src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc delete mode 100644 src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt index 71050d099e0e1..ccd67298328b8 100644 --- a/src/crimson/os/seastore/CMakeLists.txt +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -10,8 +10,9 @@ add_library(crimson-seastore STATIC lba_manager.cc segment_cleaner.cc lba_manager/btree/btree_lba_manager.cc - lba_manager/btree/lba_btree_node_impl.cc lba_manager/btree/btree_range_pin.cc + lba_manager/btree/lba_btree.cc + lba_manager/btree/lba_btree_node.cc omap_manager.cc omap_manager/btree/btree_omap_manager.cc omap_manager/btree/omap_btree_node_impl.cc diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 98e76346d21fa..a61e9bebdebf3 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -9,7 +9,7 @@ // included for get_extent_by_type #include "crimson/os/seastore/collection_manager/collection_flat_node.h" -#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" #include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h" #include "crimson/os/seastore/object_data_handler.h" #include "crimson/os/seastore/collection_manager/collection_flat_node.h" diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h index 1e19ac6ba9e0d..735c4d69be458 100644 --- a/src/crimson/os/seastore/lba_manager.h +++ b/src/crimson/os/seastore/lba_manager.h @@ -83,18 +83,6 @@ public: extent_len_t len, paddr_t addr) = 0; - /** - * Creates a new absolute mapping. - * - * off~len must be unreferenced - */ - using set_extent_iertr = base_iertr::extend< - crimson::ct_error::invarg>; - using set_extent_ret = set_extent_iertr::future; - virtual set_extent_ret set_extent( - Transaction &t, - laddr_t off, extent_len_t len, paddr_t addr) = 0; - struct ref_update_result_t { unsigned refcount = 0; paddr_t addr; diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index 278c0958fa990..361b0bb40532c 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -5,10 +5,12 @@ #include #include "crimson/common/log.h" +#include "crimson/os/seastore/logging.h" #include "include/buffer.h" #include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h" -#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree.h" namespace { @@ -22,18 +24,8 @@ namespace crimson::os::seastore::lba_manager::btree { BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs( Transaction &t) { - logger().debug("BtreeLBAManager::mkfs"); return cache.get_root(t).si_then([this, &t](auto croot) { - auto root_leaf = cache.alloc_new_extent( - t, - LBA_BLOCK_SIZE); - root_leaf->set_size(0); - lba_node_meta_t meta{0, L_ADDR_MAX, 1}; - root_leaf->set_meta(meta); - root_leaf->pin.set_range(meta); - croot->get_root().lba_root = - lba_root_t{root_leaf->get_paddr(), 1u}; - t.get_lba_tree_stats().depth = 1u; + croot->get_root().lba_root = LBABtree::mkfs(get_context(t)); return mkfs_iertr::now(); }).handle_error_interruptible( mkfs_iertr::pass_further{}, @@ -43,39 +35,33 @@ BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs( ); } -BtreeLBAManager::get_root_ret -BtreeLBAManager::get_root(Transaction &t) -{ - return cache.get_root(t).si_then([this, &t](auto croot) { - logger().debug( - "BtreeLBAManager::get_root: reading root at {} depth {}", - paddr_t{croot->get_root().lba_root.get_location()}, - croot->get_root().lba_root.get_depth()); - return get_lba_btree_extent( - get_context(t), - croot, - croot->get_root().lba_root.get_depth(), - croot->get_root().lba_root.get_location(), - paddr_t()); - }); -} - BtreeLBAManager::get_mappings_ret BtreeLBAManager::get_mappings( Transaction &t, laddr_t offset, extent_len_t length) { - logger().debug("BtreeLBAManager::get_mappings: {}, {}", offset, length); - return get_root( - t).si_then([this, &t, offset, length](auto extent) { - return extent->lookup_range( - get_context(t), - offset, length - ).si_then([](auto &&e) { - logger().debug("BtreeLBAManager::get_mappings: got mappings {}", e); - return std::move(e); - }); - }); + LOG_PREFIX(BtreeLBAManager::get_mappings); + DEBUGT("offset: {}, length{}", t, offset, length); + auto c = get_context(t); + return with_btree_state( + c, + [c, offset, length](auto &btree, auto &ret) { + return LBABtree::iterate_repeat( + c, + btree.upper_bound_right(c, offset), + [&ret, offset, length](auto &pos) { + if (pos.is_end() || pos.get_key() >= (offset + length)) { + return LBABtree::iterate_repeat_ret( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } + ceph_assert((pos.get_key() + pos.get_val().len) > offset); + ret.push_back(pos.get_pin()); + return LBABtree::iterate_repeat_ret( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + }); + }); } @@ -84,7 +70,8 @@ BtreeLBAManager::get_mappings( Transaction &t, laddr_list_t &&list) { - logger().debug("BtreeLBAManager::get_mappings: {}", list); + LOG_PREFIX(BtreeLBAManager::get_mappings); + DEBUGT("{}", t, list); auto l = std::make_unique(std::move(list)); auto retptr = std::make_unique(); auto &ret = *retptr; @@ -107,14 +94,26 @@ BtreeLBAManager::get_mapping( Transaction &t, laddr_t offset) { - logger().debug("BtreeLBAManager::get_mapping: {}", offset); - return get_root(t - ).si_then([this, &t, offset] (auto extent) { - return extent->lookup_pin(get_context(t), offset); - }).si_then([] (auto &&e) { - logger().debug("BtreeLBAManager::get_mapping: got mapping {}", *e); - return std::move(e); - }); + LOG_PREFIX(BtreeLBAManager::get_mapping); + DEBUGT(": {}", t, offset); + auto c = get_context(t); + return with_btree_ret( + c, + [FNAME, c, offset](auto &btree) { + return btree.lower_bound( + c, offset + ).si_then([FNAME, offset, c](auto iter) -> get_mapping_ret { + if (iter.is_end() || iter.get_key() != offset) { + return crimson::ct_error::enoent::make(); + } else { + auto e = iter.get_pin(); + DEBUGT("got mapping {}", c.trans, *e); + return get_mapping_ret( + interruptible::ready_future_marker{}, + std::move(e)); + } + }); + }); } BtreeLBAManager::alloc_extent_ret @@ -124,54 +123,52 @@ BtreeLBAManager::alloc_extent( extent_len_t len, paddr_t addr) { - // TODO: we can certainly combine the lookup and the insert. - return get_root( - t).si_then([this, &t, hint, len](auto extent) { - logger().debug( - "BtreeLBAManager::alloc_extent: beginning search at {}", - *extent); - return extent->find_hole( - get_context(t), - hint, - L_ADDR_MAX, - len).si_then([extent](auto ret) { - return std::make_pair(ret, extent); + struct state_t { + laddr_t last_end; + + std::optional insert_iter; + std::optional ret; + + state_t(laddr_t hint) : last_end(hint) {} + }; + + LOG_PREFIX(BtreeLBAManager::alloc_extent); + DEBUGT("hint: {}, length: {}", t, hint, len); + auto c = get_context(t); + return with_btree_state( + c, + hint, + [FNAME, c, hint, len, addr](auto &btree, auto &state) { + return LBABtree::iterate_repeat( + c, + btree.upper_bound_right(c, hint), + [&state, len](auto &pos) { + if (pos.is_end() || pos.get_key() >= (state.last_end + len)) { + state.insert_iter = pos; + return LBABtree::iterate_repeat_ret( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } else { + state.last_end = pos.get_key() + pos.get_val().len; + return LBABtree::iterate_repeat_ret( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + } + }).si_then([FNAME, c, addr, len, &btree, &state] { + DEBUGT("about to insert at addr {}~{}", c.trans, state.last_end, len); + return btree.insert( + c, + *state.insert_iter, + state.last_end, + lba_map_val_t{len, addr, 1, 0} + ).si_then([&state](auto &&p) { + auto [iter, inserted] = std::move(p); + ceph_assert(inserted); + state.ret = iter; + }); }); - }).si_then([this, &t, len, addr](auto allocation_pair) { - auto &[laddr, extent] = allocation_pair; - ceph_assert(laddr != L_ADDR_MAX); - return insert_mapping( - t, - extent, - laddr, - { len, addr, 1, 0 } - ).si_then([laddr=laddr, addr, len](auto pin) { - logger().debug( - "BtreeLBAManager::alloc_extent: alloc {}~{} for {}", - laddr, - len, - addr); - return LBAPinRef(pin.release()); - }); - }); -} - -BtreeLBAManager::set_extent_ret -BtreeLBAManager::set_extent( - Transaction &t, - laddr_t off, extent_len_t len, paddr_t addr) -{ - return get_root( - t).si_then([this, &t, off, len, addr](auto root) { - return insert_mapping( - t, - root, - off, - { len, addr, 1, 0 }); - }).si_then([](auto ret) { - return set_extent_ret( - interruptible::ready_future_marker{}, - LBAPinRef(ret.release())); + }).si_then([](auto &&state) { + return state.ret->get_pin(); }); } @@ -257,58 +254,15 @@ BtreeLBAManager::init_cached_extent_ret BtreeLBAManager::init_cached_extent( Transaction &t, CachedExtentRef e) { - logger().debug("{}: {}", __func__, *e); - return get_root(t).si_then( - [this, &t, e=std::move(e)](LBANodeRef root) mutable { - if (is_lba_node(*e)) { - auto lban = e->cast(); - logger().debug("init_cached_extent: lba node, getting root"); - return root->lookup( - op_context_t{cache, pin_set, t}, - lban->get_node_meta().begin, - lban->get_node_meta().depth - ).si_then([this, e=std::move(e)](LBANodeRef c) { - if (c->get_paddr() == e->get_paddr()) { - assert(&*c == &*e); - logger().debug("init_cached_extent: {} initialized", *e); - } else { - // e is obsolete - logger().debug("init_cached_extent: {} obsolete", *e); - cache.drop_from_cache(e); - } - return init_cached_extent_iertr::now(); - }); - } else if (e->is_logical()) { - auto logn = e->cast(); - return root->lookup_range( - op_context_t{cache, pin_set, t}, - logn->get_laddr(), - logn->get_length()).si_then( - [this, logn=std::move(logn)](auto pins) { - if (pins.size() == 1) { - auto pin = std::move(pins.front()); - pins.pop_front(); - if (pin->get_paddr() == logn->get_paddr()) { - logn->set_pin(std::move(pin)); - pin_set.add_pin( - static_cast(logn->get_pin()).pin); - logger().debug("init_cached_extent: {} initialized", *logn); - } else { - // paddr doesn't match, remapped, obsolete - logger().debug("init_cached_extent: {} obsolete", *logn); - cache.drop_from_cache(logn); - } - } else { - // set of extents changed, obsolete - logger().debug("init_cached_extent: {} obsolete", *logn); - cache.drop_from_cache(logn); - } - return init_cached_extent_iertr::now(); - }); - } else { - logger().debug("init_cached_extent: {} skipped", *e); - return init_cached_extent_iertr::now(); - } + LOG_PREFIX(BtreeLBAManager::init_cached_extent); + DEBUGT(": extent {}", t, *e); + auto c = get_context(t); + return with_btree( + c, + [c, e](auto &btree) { + return btree.init_cached_extent( + c, e + ).si_then([](auto) {}); }); } @@ -318,18 +272,27 @@ BtreeLBAManager::scan_mappings_ret BtreeLBAManager::scan_mappings( laddr_t end, scan_mappings_func_t &&f) { - return seastar::do_with( - std::move(f), - LBANodeRef(), - [=, &t](auto &f, auto &lbarootref) { - return get_root(t).si_then( - [=, &t, &f](LBANodeRef lbaroot) mutable { - lbarootref = lbaroot; - return lbaroot->scan_mappings( - get_context(t), - begin, - end, - f); + LOG_PREFIX(BtreeLBAManager::scan_mappings); + DEBUGT("begin: {}, end: {}", t, begin, end); + + auto c = get_context(t); + return with_btree( + c, + [c, f=std::move(f), begin, end](auto &btree) mutable { + return LBABtree::iterate_repeat( + c, + btree.upper_bound_right(c, begin), + [f=std::move(f), begin, end](auto &pos) { + if (pos.is_end() || pos.get_key() >= end) { + return LBABtree::iterate_repeat_ret( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } + ceph_assert((pos.get_key() + pos.get_val().len) > begin); + f(pos.get_key(), pos.get_val().paddr, pos.get_val().len); + return LBABtree::iterate_repeat_ret( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); }); }); } @@ -338,16 +301,30 @@ BtreeLBAManager::scan_mapped_space_ret BtreeLBAManager::scan_mapped_space( Transaction &t, scan_mapped_space_func_t &&f) { + LOG_PREFIX(BtreeLBAManager::scan_mapped_space); + DEBUGT("", t); + auto c = get_context(t); return seastar::do_with( std::move(f), - LBANodeRef(), - [=, &t](auto &f, auto &lbarootref) { - return get_root(t).si_then( - [=, &t, &f](LBANodeRef lbaroot) mutable { - lbarootref = lbaroot; - return lbaroot->scan_mapped_space( - get_context(t), - f); + [this, c](auto &visitor) { + return with_btree( + c, + [c, &visitor](auto &btree) { + return LBABtree::iterate_repeat( + c, + btree.lower_bound(c, 0, &visitor), + [&visitor](auto &pos) { + if (pos.is_end()) { + return LBABtree::iterate_repeat_ret( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } + visitor(pos.get_val().paddr, pos.get_val().len); + return LBABtree::iterate_repeat_ret( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + }, + &visitor); }); }); } @@ -356,6 +333,10 @@ BtreeLBAManager::rewrite_extent_ret BtreeLBAManager::rewrite_extent( Transaction &t, CachedExtentRef extent) { + LOG_PREFIX(BtreeLBAManager::rewrite_extent); + if (extent->has_been_invalidated()) { + ERRORT("{} has been invalidated", t, *extent); + } assert(!extent->has_been_invalidated()); logger().debug( @@ -401,46 +382,12 @@ BtreeLBAManager::rewrite_extent_ret BtreeLBAManager::rewrite_extent( } ); } else if (is_lba_node(*extent)) { - auto lba_extent = extent->cast(); - cache.retire_extent(t, extent); - auto nlba_extent = cache.alloc_new_extent_by_type( - t, - lba_extent->get_type(), - lba_extent->get_length())->cast(); - lba_extent->get_bptr().copy_out( - 0, - lba_extent->get_length(), - nlba_extent->get_bptr().c_str()); - nlba_extent->pin.set_range(nlba_extent->get_node_meta()); - - /* This is a bit underhanded. Any relative addrs here must necessarily - * be record relative as we are rewriting a dirty extent. Thus, we - * are using resolve_relative_addrs with a (likely negative) block - * relative offset to correct them to block-relative offsets adjusted - * for our new transaction location. - * - * Upon commit, these now block relative addresses will be interpretted - * against the real final address. - */ - nlba_extent->resolve_relative_addrs( - make_record_relative_paddr(0) - nlba_extent->get_paddr()); - - logger().debug( - "{}: rewriting {} into {}", - __func__, - *lba_extent, - *nlba_extent); - - return update_internal_mapping( - t, - nlba_extent->get_node_meta().depth, - nlba_extent->get_node_meta().begin, - nlba_extent->get_paddr()).si_then( - [](auto) {}, - rewrite_extent_iertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error in BtreeLBAManager::rewrite_extent update_internal_mapping" - }); + auto c = get_context(t); + return with_btree( + c, + [c, extent](auto &btree) mutable { + return btree.rewrite_lba_extent(c, extent); + }); } else { return rewrite_extent_iertr::now(); } @@ -462,24 +409,14 @@ BtreeLBAManager::get_physical_extent_if_live( laddr, len ).si_then([=, &t](CachedExtentRef extent) { - return get_root(t).si_then([=, &t](LBANodeRef root) { - auto lba_node = extent->cast(); - return root->lookup( - op_context_t{cache, pin_set, t}, - lba_node->get_node_meta().begin, - lba_node->get_node_meta().depth).si_then([=](LBANodeRef c) { - if (c->get_paddr() == lba_node->get_paddr()) { - return get_physical_extent_if_live_ret( - interruptible::ready_future_marker{}, - lba_node); - } else { - cache.drop_from_cache(lba_node); - return get_physical_extent_if_live_ret( - interruptible::ready_future_marker{}, - CachedExtentRef()); - } - }); - }); + auto c = get_context(t); + return with_btree_ret( + c, + [c, extent](auto &btree) { + return btree.init_cached_extent( + c, + extent); + }); }); } @@ -489,58 +426,13 @@ BtreeLBAManager::BtreeLBAManager( : segment_manager(segment_manager), cache(cache) {} -BtreeLBAManager::insert_mapping_ret BtreeLBAManager::insert_mapping( - Transaction &t, - LBANodeRef root, - laddr_t laddr, - lba_map_val_t val) -{ - ++(t.get_lba_tree_stats().num_inserts); - auto split = insert_mapping_iertr::future( - interruptible::ready_future_marker{}, - root); - if (root->at_max_capacity()) { - split = cache.get_root(t).si_then( - [this, root, laddr, &t](RootBlockRef croot) { - logger().debug( - "BtreeLBAManager::insert_mapping: splitting root {}", - *croot); - { - auto mut_croot = cache.duplicate_for_write(t, croot); - croot = mut_croot->cast(); - } - auto nroot = cache.alloc_new_extent(t, LBA_BLOCK_SIZE); - lba_node_meta_t meta{0, L_ADDR_MAX, root->get_node_meta().depth + 1}; - nroot->set_meta(meta); - nroot->pin.set_range(meta); - nroot->journal_insert( - nroot->begin(), - L_ADDR_MIN, - root->get_paddr(), - nullptr); - auto new_depth = root->get_node_meta().depth + 1; - croot->get_root().lba_root = lba_root_t{ - nroot->get_paddr(), - new_depth - }; - t.get_lba_tree_stats().depth = new_depth; - return nroot->split_entry( - get_context(t), - laddr, nroot->begin(), root); - }); - } - return split.si_then([this, &t, laddr, val](LBANodeRef node) { - return node->insert( - get_context(t), - laddr, val); - }); -} - BtreeLBAManager::update_refcount_ret BtreeLBAManager::update_refcount( Transaction &t, laddr_t addr, int delta) { + LOG_PREFIX(BtreeLBAManager::update_refcount); + DEBUGT("addr {}, delta {}", t, addr, delta); return update_mapping( t, addr, @@ -563,58 +455,39 @@ BtreeLBAManager::update_mapping_ret BtreeLBAManager::update_mapping( laddr_t addr, update_func_t &&f) { - return get_root(t - ).si_then([this, f=std::move(f), &t, addr](LBANodeRef root) mutable { - return root->mutate_mapping( - get_context(t), - addr, - std::move(f)); - }); -} + LOG_PREFIX(BtreeLBAManager::update_mapping); + DEBUGT("addr {}", t, addr); + auto c = get_context(t); + return with_btree_ret( + c, + [f=std::move(f), c, addr](auto &btree) mutable { + return btree.lower_bound( + c, addr + ).si_then([&btree, f=std::move(f), c, addr](auto iter) + -> update_mapping_ret { + if (iter.is_end() || iter.get_key() != addr) { + return crimson::ct_error::enoent::make(); + } -BtreeLBAManager::update_internal_mapping_ret -BtreeLBAManager::update_internal_mapping( - Transaction &t, - depth_t depth, - laddr_t laddr, - paddr_t paddr) -{ - return cache.get_root(t).si_then([=, &t](RootBlockRef croot) { - if (depth == croot->get_root().lba_root.get_depth()) { - logger().debug( - "update_internal_mapping: updating lba root to: {}->{}", - laddr, - paddr); - { - auto mut_croot = cache.duplicate_for_write(t, croot); - croot = mut_croot->cast(); - } - ceph_assert(laddr == 0); - auto old_paddr = croot->get_root().lba_root.get_location(); - croot->get_root().lba_root.set_location(paddr); - return update_internal_mapping_ret( - interruptible::ready_future_marker{}, - old_paddr); - } else { - logger().debug( - "update_internal_mapping: updating lba node at depth {} to: {}->{}", - depth, - laddr, - paddr); - return get_lba_btree_extent( - get_context(t), - croot, - croot->get_root().lba_root.get_depth(), - croot->get_root().lba_root.get_location(), - paddr_t()).si_then([=, &t](LBANodeRef broot) { - return broot->mutate_internal_address( - get_context(t), - depth, - laddr, - paddr); - }); - } - }); + auto ret = f(iter.get_val()); + if (ret.refcount == 0) { + return btree.remove( + c, + iter + ).si_then([ret] { + return ret; + }); + } else { + return btree.update( + c, + iter, + ret + ).si_then([ret](auto) { + return ret; + }); + } + }); + }); } BtreeLBAManager::~BtreeLBAManager() diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index 0c16efaca74ba..a0834b159c24c 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -15,12 +15,14 @@ #include "common/interval_map.h" #include "crimson/osd/exceptions.h" +#include "crimson/os/seastore/logging.h" #include "crimson/os/seastore/seastore_types.h" #include "crimson/os/seastore/lba_manager.h" #include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/segment_manager.h" #include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree.h" namespace crimson::os::seastore::lba_manager::btree { @@ -68,10 +70,6 @@ public: extent_len_t len, paddr_t addr) final; - set_extent_ret set_extent( - Transaction &t, - laddr_t off, extent_len_t len, paddr_t addr) final; - ref_ret decref_extent( Transaction &t, laddr_t addr) final { @@ -131,29 +129,69 @@ private: static btree_range_pin_t &get_pin(CachedExtent &e); + template + auto with_btree( + op_context_t c, + F &&f) { + return cache.get_root( + c.trans + ).si_then([this, c, f=std::forward(f)](RootBlockRef croot) mutable { + return seastar::do_with( + LBABtree(croot->get_root().lba_root), + [this, c, croot, f=std::move(f)](auto &btree) mutable { + return f( + btree + ).si_then([this, c, croot, &btree] { + if (btree.is_root_dirty()) { + auto mut_croot = cache.duplicate_for_write( + c.trans, croot + )->cast(); + mut_croot->get_root().lba_root = btree.get_root_undirty(); + } + return base_iertr::now(); + }); + }); + }); + } - /** - * get_root - * - * Get a reference to the root LBANode. - */ - using get_root_iertr = base_iertr; - using get_root_ret = get_root_iertr::future; - get_root_ret get_root(Transaction &); + template + auto with_btree_state( + op_context_t c, + State &&init, + F &&f) { + return seastar::do_with( + std::forward(init), + [this, c, f=std::forward(f)](auto &state) mutable { + (void)this; // silence incorrect clang warning about capture + return with_btree(c, [&state, f=std::move(f)](auto &btree) mutable { + return f(btree, state); + }).si_then([&state] { + return seastar::make_ready_future(std::move(state)); + }); + }); + } - /** - * insert_mapping - * - * Insert a lba mapping into the tree - */ - using insert_mapping_iertr = base_iertr; - using insert_mapping_ret = insert_mapping_iertr::future; - insert_mapping_ret insert_mapping( - Transaction &t, ///< [in,out] transaction - LBANodeRef root, ///< [in] root node - laddr_t laddr, ///< [in] logical addr to insert - lba_map_val_t val ///< [in] mapping to insert - ); + template + auto with_btree_state( + op_context_t c, + F &&f) { + return with_btree_state(c, State{}, std::forward(f)); + } + + template + auto with_btree_ret( + op_context_t c, + F &&f) { + return with_btree_state( + c, + [f=std::forward(f)](auto &btree, auto &ret) mutable { + return f( + btree + ).si_then([&ret](auto &&_ret) { + ret = std::move(_ret); + }); + }); + } /** * update_refcount @@ -173,19 +211,13 @@ private: */ using update_mapping_iertr = ref_iertr; using update_mapping_ret = ref_iertr::future; - using update_func_t = LBANode::mutate_func_t; + using update_func_t = std::function< + lba_map_val_t(const lba_map_val_t &v) + >; update_mapping_ret update_mapping( Transaction &t, laddr_t addr, update_func_t &&f); - - using update_internal_mapping_iertr = LBANode::mutate_internal_address_iertr; - using update_internal_mapping_ret = LBANode::mutate_internal_address_ret; - update_internal_mapping_ret update_internal_mapping( - Transaction &t, - depth_t depth, - laddr_t laddr, - paddr_t paddr); }; using BtreeLBAManagerRef = std::unique_ptr; diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h index 08e3979c020ec..bdc76c441dbe6 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h @@ -235,6 +235,7 @@ public: class BtreeLBAPin : public LBAPin { friend class BtreeLBAManager; + friend class LBABtree; /** * parent diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree.cc new file mode 100644 index 0000000000000..e350f437e7ab5 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree.cc @@ -0,0 +1,863 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/common/log.h" +#include "crimson/os/seastore/logging.h" + +#include "crimson/os/seastore/lba_manager/btree/lba_btree.h" + +namespace crimson::os::seastore::lba_manager::btree { + +LBABtree::mkfs_ret LBABtree::mkfs(op_context_t c) +{ + auto root_leaf = c.cache.alloc_new_extent( + c.trans, + LBA_BLOCK_SIZE); + root_leaf->set_size(0); + lba_node_meta_t meta{0, L_ADDR_MAX, 1}; + root_leaf->set_meta(meta); + root_leaf->pin.set_range(meta); + c.trans.get_lba_tree_stats().depth = 1u; + return lba_root_t{root_leaf->get_paddr(), 1u}; +} + +LBABtree::iterator_fut LBABtree::iterator::next( + op_context_t c, + mapped_space_visitor_t *visitor) const +{ + assert_valid(); + assert(!is_end()); + + if ((leaf.pos + 1) < leaf.node->get_size()) { + auto ret = *this; + ret.leaf.pos++; + return iterator_fut( + interruptible::ready_future_marker{}, + ret); + } + + depth_t depth_with_space = 2; + for (; depth_with_space <= get_depth(); ++depth_with_space) { + if ((get_internal(depth_with_space).pos + 1) < + get_internal(depth_with_space).node->get_size()) { + break; + } + } + + if (depth_with_space <= get_depth()) { + return seastar::do_with( + *this, + [](const LBAInternalNode &internal) { return internal.begin(); }, + [](const LBALeafNode &leaf) { return leaf.begin(); }, + [c, depth_with_space, visitor](auto &ret, auto &li, auto &ll) { + for (depth_t depth = 2; depth < depth_with_space; ++depth) { + ret.get_internal(depth).reset(); + } + ret.leaf.reset(); + ret.get_internal(depth_with_space).pos++; + return lookup_depth_range( + c, ret, depth_with_space - 1, 0, li, ll, visitor + ).si_then([&ret] { + return std::move(ret); + }); + }); + } else { + // end + auto ret = *this; + ret.leaf.pos = ret.leaf.node->get_size(); + return iterator_fut( + interruptible::ready_future_marker{}, + ret); + } +} + +LBABtree::iterator_fut LBABtree::iterator::prev(op_context_t c) const +{ + assert_valid(); + assert(!is_begin()); + + auto ret = *this; + + if (is_end()) { + ret.leaf.pos = ret.leaf.node->get_size(); + } + + if (ret.leaf.pos > 0) { + ret.leaf.pos--; + return iterator_fut( + interruptible::ready_future_marker{}, + ret); + } + + depth_t depth_with_space = 2; + for (; depth_with_space <= get_depth(); ++depth_with_space) { + if (ret.get_internal(depth_with_space).pos > 0) { + break; + } + } + + assert(depth_with_space <= ret.get_depth()); // must not be begin() + return seastar::do_with( + std::move(ret), + [](const LBAInternalNode &internal) { return --internal.end(); }, + [](const LBALeafNode &leaf) { return --leaf.end(); }, + [c, depth_with_space](auto &ret, auto &li, auto &ll) { + for (depth_t depth = 2; depth < depth_with_space; ++depth) { + ret.get_internal(depth).reset(); + } + ret.leaf.reset(); + ret.get_internal(depth_with_space).pos--; + return lookup_depth_range( + c, ret, depth_with_space - 1, 0, li, ll, nullptr + ).si_then([&ret] { + return std::move(ret); + }); + }); +} + +LBABtree::iterator_fut LBABtree::lower_bound( + op_context_t c, + laddr_t addr, + mapped_space_visitor_t *visitor) const +{ + LOG_PREFIX(LBATree::lower_bound); + return lookup( + c, + [addr](const LBAInternalNode &internal) { + assert(internal.get_size() > 0); + auto iter = internal.upper_bound(addr); + assert(iter != internal.begin()); + --iter; + return iter; + }, + [FNAME, c, addr](const LBALeafNode &leaf) { + auto ret = leaf.lower_bound(addr); + DEBUGT( + "leaf addr {}, got ret offset {}, size {}, end {}", + c.trans, + addr, + ret.get_offset(), + leaf.get_size(), + ret == leaf.end()); + return ret; + }, + visitor + ).si_then([FNAME, c](auto &&ret) { + DEBUGT( + "ret.leaf.pos {}", + c.trans, + ret.leaf.pos); + ret.assert_valid(); + return std::move(ret); + }); +} + +LBABtree::insert_ret LBABtree::insert( + op_context_t c, + iterator iter, + laddr_t laddr, + lba_map_val_t val) +{ + LOG_PREFIX(LBATree::insert); + DEBUGT( + "inserting laddr {} at iter {}", + c.trans, + laddr, + iter.is_end() ? L_ADDR_MAX : iter.get_key()); + return seastar::do_with( + iter, + [this, c, laddr, val](auto &ret) { + return find_insertion( + c, laddr, ret + ).si_then([this, c, laddr, val, &ret] { + if (!ret.is_end() && ret.get_key() == laddr) { + return insert_ret( + interruptible::ready_future_marker{}, + std::make_pair(ret, false)); + } else { + return handle_split( + c, ret + ).si_then([c, laddr, val, &ret] { + if (!ret.leaf.node->is_pending()) { + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, ret.leaf.node + ); + ret.leaf.node = mut->cast(); + } + auto iter = ret.leaf.node->lower_bound(laddr); + if (iter != ret.leaf.node->end() && iter->get_key() == laddr) { + return insert_ret( + interruptible::ready_future_marker{}, + std::make_pair(ret, false)); + } else { + ret.leaf.pos = iter->get_offset(); + assert(laddr >= ret.leaf.node->get_meta().begin && + laddr < ret.leaf.node->get_meta().end); + ret.leaf.node->insert(iter, laddr, val); + return insert_ret( + interruptible::ready_future_marker{}, + std::make_pair(ret, true)); + } + }); + } + }); + }); +} + +LBABtree::update_ret LBABtree::update( + op_context_t c, + iterator iter, + lba_map_val_t val) +{ + LOG_PREFIX(LBATree::update); + DEBUGT( + "update element at {}", + c.trans, + iter.is_end() ? L_ADDR_MAX : iter.get_key()); + if (!iter.leaf.node->is_pending()) { + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, iter.leaf.node + ); + iter.leaf.node = mut->cast(); + } + iter.leaf.node->update( + iter.leaf.node->iter_idx(iter.leaf.pos), + val); + return update_ret( + interruptible::ready_future_marker{}, + iter); +} + +LBABtree::remove_ret LBABtree::remove( + op_context_t c, + iterator iter) +{ + LOG_PREFIX(LBATree::remove); + DEBUGT( + "remove element at {}", + c.trans, + iter.is_end() ? L_ADDR_MAX : iter.get_key()); + assert(!iter.is_end()); + return seastar::do_with( + iter, + [this, c](auto &ret) { + if (!ret.leaf.node->is_pending()) { + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, ret.leaf.node + ); + ret.leaf.node = mut->cast(); + } + ret.leaf.node->remove( + ret.leaf.node->iter_idx(ret.leaf.pos)); + + return handle_merge( + c, ret + ); + }); +} + +LBABtree::init_cached_extent_ret LBABtree::init_cached_extent( + op_context_t c, + CachedExtentRef e) +{ + LOG_PREFIX(LBATree::init_cached_extent); + DEBUGT(": extent {}", c.trans, *e); + if (e->is_logical()) { + auto logn = e->cast(); + return lower_bound( + c, + logn->get_laddr() + ).si_then([FNAME, e, c, logn](auto iter) { + if (!iter.is_end() && + iter.get_key() == logn->get_laddr() && + iter.get_val().paddr == logn->get_paddr()) { + logn->set_pin(iter.get_pin()); + ceph_assert(iter.get_val().len == e->get_length()); + c.pins.add_pin( + static_cast(logn->get_pin()).pin); + DEBUGT(": logical extent {} live, initialized", c.trans, *logn); + return e; + } else { + DEBUGT(": logical extent {} not live, dropping", c.trans, *logn); + c.cache.drop_from_cache(logn); + return CachedExtentRef(); + } + }); + } else if (e->get_type() == extent_types_t::LADDR_INTERNAL) { + auto eint = e->cast(); + return lower_bound( + c, eint->get_node_meta().begin + ).si_then([FNAME, e, c, eint](auto iter) { + // Note, this check is valid even if iter.is_end() + depth_t cand_depth = eint->get_node_meta().depth; + if (cand_depth <= iter.get_depth() && + &*iter.get_internal(cand_depth).node == &*eint) { + DEBUGT(": extent {} is live", c.trans, *eint); + return e; + } else { + DEBUGT(": extent {} is not live", c.trans, *eint); + c.cache.drop_from_cache(eint); + return CachedExtentRef(); + } + }); + } else if (e->get_type() == extent_types_t::LADDR_LEAF) { + auto eleaf = e->cast(); + return lower_bound( + c, eleaf->get_node_meta().begin + ).si_then([FNAME, c, e, eleaf](auto iter) { + // Note, this check is valid even if iter.is_end() + if (iter.leaf.node == &*eleaf) { + DEBUGT(": extent {} is live", c.trans, *eleaf); + return e; + } else { + DEBUGT(": extent {} is not live", c.trans, *eleaf); + c.cache.drop_from_cache(eleaf); + return CachedExtentRef(); + } + }); + } else { + DEBUGT( + ": found other extent {} type {}", + c.trans, + *e, + e->get_type()); + return init_cached_extent_ret( + interruptible::ready_future_marker{}, + e); + } +} + +LBABtree::rewrite_lba_extent_ret LBABtree::rewrite_lba_extent( + op_context_t c, + CachedExtentRef e) +{ + LOG_PREFIX(LBABtree::rewrite_lba_extent); + assert(e->get_type() == extent_types_t::LADDR_INTERNAL || + e->get_type() == extent_types_t::LADDR_LEAF); + + auto do_rewrite = [&](auto &lba_extent) { + auto nlba_extent = c.cache.alloc_new_extent< + std::remove_reference_t + >( + c.trans, + lba_extent.get_length()); + lba_extent.get_bptr().copy_out( + 0, + lba_extent.get_length(), + nlba_extent->get_bptr().c_str()); + nlba_extent->pin.set_range(nlba_extent->get_node_meta()); + + /* This is a bit underhanded. Any relative addrs here must necessarily + * be record relative as we are rewriting a dirty extent. Thus, we + * are using resolve_relative_addrs with a (likely negative) block + * relative offset to correct them to block-relative offsets adjusted + * for our new transaction location. + * + * Upon commit, these now block relative addresses will be interpretted + * against the real final address. + */ + nlba_extent->resolve_relative_addrs( + make_record_relative_paddr(0) - nlba_extent->get_paddr()); + + DEBUGT( + "rewriting {} into {}", + c.trans, + lba_extent, + *nlba_extent); + + return update_internal_mapping( + c, + nlba_extent->get_node_meta().depth, + nlba_extent->get_node_meta().begin, + e->get_paddr(), + nlba_extent->get_paddr() + ).si_then([c, e] { + c.cache.retire_extent(c.trans, e); + }); + }; + + CachedExtentRef nlba_extent; + if (e->get_type() == extent_types_t::LADDR_INTERNAL) { + auto lint = e->cast(); + return do_rewrite(*lint); + } else { + assert(e->get_type() == extent_types_t::LADDR_LEAF); + auto lleaf = e->cast(); + return do_rewrite(*lleaf); + } +} + +LBABtree::get_internal_node_ret LBABtree::get_internal_node( + op_context_t c, + depth_t depth, + paddr_t offset) +{ + LOG_PREFIX(LBATree::get_internal_node); + DEBUGT( + "reading internal at offset {}, depth {}", + c.trans, + offset, + depth); + return c.cache.get_extent( + c.trans, + offset, + LBA_BLOCK_SIZE + ).si_then([FNAME, c, offset](LBAInternalNodeRef ret) { + DEBUGT( + "read internal at offset {} {}", + c.trans, + offset, + *ret); + auto meta = ret->get_meta(); + if (ret->get_size()) { + ceph_assert(meta.begin <= ret->begin()->get_key()); + ceph_assert(meta.end > (ret->end() - 1)->get_key()); + } + if (!ret->is_pending() && !ret->pin.is_linked()) { + ret->pin.set_range(meta); + c.pins.add_pin(ret->pin); + } + return get_internal_node_ret( + interruptible::ready_future_marker{}, + ret); + }); +} + +LBABtree::get_leaf_node_ret LBABtree::get_leaf_node( + op_context_t c, + paddr_t offset) +{ + LOG_PREFIX(LBATree::get_leaf_node); + DEBUGT( + "reading leaf at offset {}", + c.trans, + offset); + return c.cache.get_extent( + c.trans, + offset, + LBA_BLOCK_SIZE + ).si_then([FNAME, c, offset](LBALeafNodeRef ret) { + DEBUGT( + "read leaf at offset {} {}", + c.trans, + offset, + *ret); + auto meta = ret->get_meta(); + if (ret->get_size()) { + ceph_assert(meta.begin <= ret->begin()->get_key()); + ceph_assert(meta.end > (ret->end() - 1)->get_key()); + } + if (!ret->is_pending() && !ret->pin.is_linked()) { + ret->pin.set_range(meta); + c.pins.add_pin(ret->pin); + } + return get_leaf_node_ret( + interruptible::ready_future_marker{}, + ret); + }); +} + +LBABtree::find_insertion_ret LBABtree::find_insertion( + op_context_t c, + laddr_t laddr, + iterator &iter) +{ + assert(iter.is_end() || iter.get_key() >= laddr); + if (!iter.is_end() && iter.get_key() == laddr) { + return seastar::now(); + } else if (iter.leaf.node->get_node_meta().begin <= laddr) { + auto p = iter; + if (p.leaf.pos > 0) { + --p.leaf.pos; + assert(p.get_key() < laddr); + } + return seastar::now(); + } else { + assert(iter.leaf.pos == 0); + return iter.prev( + c + ).si_then([laddr, &iter](auto p) { + assert(p.leaf.node->get_node_meta().begin <= laddr); + assert(p.get_key() < laddr); + // Note, this is specifically allowed to violate the iterator + // invariant that pos is a valid index for the node in the event + // that the insertion point is at the end of a node. + p.leaf.pos++; + iter = p; + return seastar::now(); + }); + } +} + +LBABtree::handle_split_ret LBABtree::handle_split( + op_context_t c, + iterator &iter) +{ + LOG_PREFIX(LBATree::insert); + + depth_t split_from = iter.check_split(); + + DEBUGT("split_from {}, depth {}", c.trans, split_from, iter.get_depth()); + + if (split_from == iter.get_depth()) { + auto nroot = c.cache.alloc_new_extent( + c.trans, LBA_BLOCK_SIZE); + lba_node_meta_t meta{0, L_ADDR_MAX, iter.get_depth() + 1}; + nroot->set_meta(meta); + nroot->pin.set_range(meta); + nroot->journal_insert( + nroot->begin(), + L_ADDR_MIN, + root.get_location(), + nullptr); + iter.internal.push_back({nroot, 0}); + + root.set_location(nroot->get_paddr()); + root.set_depth(iter.get_depth()); + c.trans.get_lba_tree_stats().depth = iter.get_depth(); + root_dirty = true; + } + + /* pos may be either node_position_t or + * node_position_t */ + auto split_level = [&](auto &parent_pos, auto &pos) { + auto [left, right, pivot] = pos.node->make_split_children(c); + + auto parent_node = parent_pos.node; + auto parent_iter = parent_pos.get_iter(); + + parent_node->update( + parent_iter, + left->get_paddr()); + parent_node->insert( + parent_iter + 1, + pivot, + right->get_paddr()); + + c.cache.retire_extent(c.trans, pos.node); + + /* right->get_node_meta().begin == pivot == right->begin()->get_key() + * Thus, if pos.pos == left->get_size(), we want iter to point to + * left with pos.pos at the end rather than right with pos.pos = 0 + * since the insertion would be to the left of the first element + * of right and thus necessarily less than right->get_node_meta().begin. + */ + if (pos.pos <= left->get_size()) { + pos.node = left; + } else { + pos.node = right; + pos.pos -= left->get_size(); + + parent_pos.pos += 1; + } + }; + + for (; split_from > 0; --split_from) { + auto &parent_pos = iter.get_internal(split_from + 1); + if (!parent_pos.node->is_pending()) { + parent_pos.node = c.cache.duplicate_for_write( + c.trans, parent_pos.node + )->cast(); + } + + if (split_from > 1) { + auto &pos = iter.get_internal(split_from); + DEBUGT("splitting parent {} depth {}", c.trans, split_from, *pos.node); + split_level(parent_pos, pos); + } else { + auto &pos = iter.leaf; + DEBUGT("splitting child {}", c.trans, *pos.node); + split_level(parent_pos, pos); + } + } + + return seastar::now(); +} + +template +LBABtree::base_iertr::future get_node( + op_context_t c, + depth_t depth, + paddr_t addr); + +template <> +LBABtree::base_iertr::future get_node( + op_context_t c, + depth_t depth, + paddr_t addr) { + assert(depth == 1); + return LBABtree::get_leaf_node(c, addr); +} + +template <> +LBABtree::base_iertr::future get_node( + op_context_t c, + depth_t depth, + paddr_t addr) { + return LBABtree::get_internal_node(c, depth, addr); +} + +template +LBABtree::handle_merge_ret merge_level( + op_context_t c, + depth_t depth, + LBABtree::node_position_t &parent_pos, + LBABtree::node_position_t &pos) +{ + if (!parent_pos.node->is_pending()) { + parent_pos.node = c.cache.duplicate_for_write( + c.trans, parent_pos.node + )->cast(); + } + + auto iter = parent_pos.get_iter(); + assert(iter.get_offset() < parent_pos.node->get_size()); + bool donor_is_left = ((iter.get_offset() + 1) == parent_pos.node->get_size()); + auto donor_iter = donor_is_left ? (iter - 1) : (iter + 1); + + return get_node( + c, + depth, + donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()) + ).si_then([c, iter, donor_iter, donor_is_left, &parent_pos, &pos]( + typename NodeType::Ref donor) { + auto [l, r] = donor_is_left ? + std::make_pair(donor, pos.node) : std::make_pair(pos.node, donor); + + auto [liter, riter] = donor_is_left ? + std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter); + + if (donor->at_min_capacity()) { + auto replacement = l->make_full_merge(c, r); + + parent_pos.node->update( + liter, + replacement->get_paddr()); + parent_pos.node->remove(riter); + + pos.node = replacement; + if (donor_is_left) { + pos.pos += r->get_size(); + parent_pos.pos--; + } + + c.cache.retire_extent(c.trans, l); + c.cache.retire_extent(c.trans, r); + } else { + auto [replacement_l, replacement_r, pivot] = + l->make_balanced( + c, + r, + !donor_is_left); + + parent_pos.node->update( + liter, + replacement_l->get_paddr()); + parent_pos.node->replace( + riter, + pivot, + replacement_r->get_paddr()); + + if (donor_is_left) { + assert(parent_pos.pos > 0); + parent_pos.pos--; + } + + auto orig_position = donor_is_left ? + l->get_size() + pos.pos : + pos.pos; + if (orig_position < replacement_l->get_size()) { + pos.node = replacement_l; + pos.pos = orig_position; + } else { + parent_pos.pos++; + pos.node = replacement_r; + pos.pos = orig_position - replacement_l->get_size(); + } + + c.cache.retire_extent(c.trans, l); + c.cache.retire_extent(c.trans, r); + } + + return seastar::now(); + }); +} + +LBABtree::handle_merge_ret LBABtree::handle_merge( + op_context_t c, + iterator &iter) +{ + LOG_PREFIX(LBATree::handle_merge); + if (!iter.leaf.node->at_min_capacity() || + iter.get_depth() == 1) { + DEBUGT( + "no need to merge leaf, leaf size {}, depth {}", + c.trans, + iter.leaf.node->get_size(), + iter.get_depth()); + return seastar::now(); + } + + return seastar::do_with( + depth_t{1}, + [FNAME, this, c, &iter](auto &to_merge) { + return trans_intr::repeat( + [FNAME, this, c, &iter, &to_merge] { + DEBUGT( + "merging depth {}", + c.trans, + to_merge); + auto &parent_pos = iter.get_internal(to_merge + 1); + auto merge_fut = handle_merge_iertr::now(); + if (to_merge > 1) { + auto &pos = iter.get_internal(to_merge); + merge_fut = merge_level(c, to_merge, parent_pos, pos); + } else { + auto &pos = iter.leaf; + merge_fut = merge_level(c, to_merge, parent_pos, pos); + } + + return merge_fut.si_then([FNAME, this, c, &iter, &to_merge] { + ++to_merge; + auto &pos = iter.get_internal(to_merge); + if (to_merge == iter.get_depth()) { + if (pos.node->get_size() == 1) { + DEBUGT("collapsing root", c.trans); + c.cache.retire_extent(c.trans, pos.node); + assert(pos.pos == 0); + auto node_iter = pos.get_iter(); + root.set_location( + node_iter->get_val().maybe_relative_to(pos.node->get_paddr())); + iter.internal.pop_back(); + root.set_depth(iter.get_depth()); + c.trans.get_lba_tree_stats().depth = iter.get_depth(); + root_dirty = true; + } else { + DEBUGT("no need to collapse root", c.trans); + } + return seastar::stop_iteration::yes; + } else if (pos.node->at_min_capacity()) { + DEBUGT( + "continuing, next node {} depth {} at min", + c.trans, + *pos.node, + to_merge); + return seastar::stop_iteration::no; + } else { + DEBUGT( + "complete, next node {} depth {} not min", + c.trans, + *pos.node, + to_merge); + return seastar::stop_iteration::yes; + } + }); + }); + }); +} + +LBABtree::update_internal_mapping_ret LBABtree::update_internal_mapping( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t old_addr, + paddr_t new_addr) +{ + LOG_PREFIX(LBATree::update_internal_mapping); + DEBUGT( + "updating laddr {} at depth {} from {} to {}", + c.trans, + laddr, + depth, + old_addr, + new_addr); + + return lower_bound( + c, laddr + ).si_then([=](auto iter) { + assert(iter.get_depth() >= depth); + if (depth == iter.get_depth()) { + DEBUGT("update at root", c.trans); + + if (laddr != 0) { + ERRORT( + "updating root laddr {} at depth {} from {} to {}," + "laddr is not 0", + c.trans, + laddr, + depth, + old_addr, + new_addr, + root.get_location()); + ceph_assert(0 == "impossible"); + } + + if (root.get_location() != old_addr) { + ERRORT( + "updating root laddr {} at depth {} from {} to {}," + "root addr {} does not match", + c.trans, + laddr, + depth, + old_addr, + new_addr, + root.get_location()); + ceph_assert(0 == "impossible"); + } + + root.set_location(new_addr); + root_dirty = true; + } else { + auto &parent = iter.get_internal(depth + 1); + assert(parent.node); + assert(parent.pos < parent.node->get_size()); + auto piter = parent.node->iter_idx(parent.pos); + + if (piter->get_key() != laddr) { + ERRORT( + "updating laddr {} at depth {} from {} to {}," + "node {} pos {} val pivot addr {} does not match", + c.trans, + laddr, + depth, + old_addr, + new_addr, + *(parent.node), + parent.pos, + piter->get_key()); + ceph_assert(0 == "impossible"); + } + + + if (piter->get_val() != old_addr) { + ERRORT( + "updating laddr {} at depth {} from {} to {}," + "node {} pos {} val addr {} does not match", + c.trans, + laddr, + depth, + old_addr, + new_addr, + *(parent.node), + parent.pos, + piter->get_val()); + ceph_assert(0 == "impossible"); + } + + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, + parent.node + ); + LBAInternalNodeRef mparent = mut->cast(); + mparent->update(piter, new_addr); + + /* Note, iter is now invalid as we didn't udpate either the parent + * node reference to the new mutable instance nor did we update the + * child pointer to the new node. Not a problem as we'll now just + * destruct it. + */ + } + return seastar::now(); + }); +} +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree.h new file mode 100644 index 0000000000000..887d58a8c566b --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree.h @@ -0,0 +1,608 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include + +#include "crimson/common/log.h" + +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" + +namespace crimson::os::seastore::lba_manager::btree { + + +class LBABtree { + static constexpr size_t MAX_DEPTH = 16; +public: + using base_iertr = LBAManager::base_iertr; + + class iterator; + using iterator_fut = base_iertr::future; + + using mapped_space_visitor_t = LBAManager::scan_mapped_space_func_t; + + class iterator { + public: + iterator(const iterator &) noexcept = default; + iterator(iterator &&) noexcept = default; + iterator &operator=(const iterator &) = default; + iterator &operator=(iterator &&) = default; + + iterator(depth_t depth) : internal(depth - 1) {} + + iterator_fut next( + op_context_t c, + mapped_space_visitor_t *visit=nullptr) const; + + iterator_fut prev(op_context_t c) const; + + void assert_valid() const { + assert(leaf.node); + assert(leaf.pos <= leaf.node->get_size()); + + for (auto &i: internal) { + (void)i; + assert(i.node); + assert(i.pos < i.node->get_size()); + } + } + + depth_t get_depth() const { + return internal.size() + 1; + } + + auto &get_internal(depth_t depth) { + assert(depth > 1); + assert((depth - 2) < internal.size()); + return internal[depth - 2]; + } + + const auto &get_internal(depth_t depth) const { + assert(depth > 1); + assert((depth - 2) < internal.size()); + return internal[depth - 2]; + } + + laddr_t get_key() const { + assert(!is_end()); + return leaf.node->iter_idx(leaf.pos).get_key(); + } + lba_map_val_t get_val() const { + assert(!is_end()); + return leaf.node->iter_idx(leaf.pos).get_val(); + } + + bool is_end() const { + assert(leaf.pos <= leaf.node->get_size()); + return leaf.pos == leaf.node->get_size(); + } + + bool is_begin() const { + for (auto &i: internal) { + if (i.pos != 0) + return false; + } + return leaf.pos == 0; + } + + LBAPinRef get_pin() const { + assert(!is_end()); + return std::make_unique( + leaf.node, + get_val().paddr.maybe_relative_to(leaf.node->get_paddr()), + lba_node_meta_t{ get_key(), get_key() + get_val().len, 0 }); + } + + private: + iterator() = default; + + friend class LBABtree; + static constexpr uint16_t INVALID = std::numeric_limits::max(); + template + struct node_position_t { + typename NodeType::Ref node; + uint16_t pos = INVALID; + + void reset() { + *this = node_position_t{}; + } + + auto get_iter() { + assert(pos != INVALID); + assert(pos < node->get_size()); + return node->iter_idx(pos); + } + }; + boost::container::static_vector< + node_position_t, MAX_DEPTH> internal; + node_position_t leaf; + + depth_t check_split() const { + if (!leaf.node->at_max_capacity()) { + return 0; + } + for (depth_t split_from = 1; split_from < get_depth(); ++split_from) { + if (!get_internal(split_from + 1).node->at_max_capacity()) + return split_from; + } + return get_depth(); + } + + depth_t check_merge() const { + if (!leaf.node->at_min_capacity()) { + return 0; + } + for (depth_t merge_from = 1; merge_from < get_depth(); ++merge_from) { + if (!get_internal(merge_from + 1).node->at_min_capacity()) + return merge_from; + } + return get_depth(); + } + }; + + LBABtree(lba_root_t root) : root(root) {} + + bool is_root_dirty() const { + return root_dirty; + } + lba_root_t get_root_undirty() { + ceph_assert(root_dirty); + root_dirty = false; + return root; + } + + /// mkfs + using mkfs_ret = lba_root_t; + static mkfs_ret mkfs(op_context_t c); + + /** + * lower_bound + * + * @param c [in] context + * @param addr [in] ddr + * @return least iterator >= key + */ + iterator_fut lower_bound( + op_context_t c, + laddr_t addr, + mapped_space_visitor_t *visit=nullptr) const; + + /** + * upper_bound + * + * @param c [in] context + * @param addr [in] ddr + * @return least iterator > key + */ + iterator_fut upper_bound( + op_context_t c, + laddr_t addr + ) const { + return lower_bound( + c, addr + ).si_then([c, addr](auto iter) { + if (!iter.is_end() && iter.get_key() == addr) { + return iter.next(c); + } else { + return iterator_fut( + interruptible::ready_future_marker{}, + iter); + } + }); + } + + /** + * upper_bound_right + * + * @param c [in] context + * @param addr [in] addr + * @return least iterator i s.t. i.get_key() + i.get_val().len > key + */ + iterator_fut upper_bound_right( + op_context_t c, + laddr_t addr) const + { + return lower_bound( + c, addr + ).si_then([c, addr](auto iter) { + if (iter.is_begin()) { + return iterator_fut( + interruptible::ready_future_marker{}, + iter); + } else { + return iter.prev( + c + ).si_then([iter, addr](auto prev) { + if ((prev.get_key() + prev.get_val().len) > addr) { + return iterator_fut( + interruptible::ready_future_marker{}, + prev); + } else { + return iterator_fut( + interruptible::ready_future_marker{}, + iter); + } + }); + } + }); + } + + iterator_fut begin(op_context_t c) const { + return lower_bound(c, 0); + } + iterator_fut end(op_context_t c) const { + return upper_bound(c, L_ADDR_MAX); + } + + using iterate_repeat_ret = base_iertr::future< + seastar::stop_iteration>; + template + static auto iterate_repeat( + op_context_t c, + iterator_fut &&iter_fut, + F &&f, + mapped_space_visitor_t *visitor=nullptr) { + return std::move( + iter_fut + ).si_then([c, visitor, f=std::forward(f)](auto iter) { + return seastar::do_with( + iter, + std::move(f), + [c, visitor](auto &pos, auto &f) { + return trans_intr::repeat( + [c, visitor, &f, &pos] { + return f( + pos + ).si_then([c, visitor, &pos](auto done) { + if (done == seastar::stop_iteration::yes) { + return iterate_repeat_ret( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } else { + ceph_assert(!pos.is_end()); + return pos.next( + c, visitor + ).si_then([&pos](auto next) { + pos = next; + return iterate_repeat_ret( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + }); + } + }); + }); + }); + }); + } + + /** + * insert + * + * Inserts val at laddr with iter as a hint. If element at laddr already + * exists returns iterator to that element unchanged and returns false. + * + * Invalidates all outstanding iterators for this tree on this transaction. + * + * @param c [in] op context + * @param iter [in] hint, insertion constant if immediately prior to iter + * @param laddr [in] addr at which to insert + * @param val [in] val to insert + * @return pair where iter points to element at addr, bool true + * iff element at laddr did not exist. + */ + using insert_iertr = base_iertr; + using insert_ret = insert_iertr::future>; + insert_ret insert( + op_context_t c, + iterator iter, + laddr_t laddr, + lba_map_val_t val + ); + insert_ret insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) { + return lower_bound( + c, laddr + ).si_then([this, c, laddr, val](auto iter) { + return insert(c, iter, laddr, val); + }); + } + + /** + * update + * + * Invalidates all outstanding iterators for this tree on this transaction. + * + * @param c [in] op context + * @param iter [in] iterator to element to update, must not be end + * @param val [in] val with which to update + * @return iterator to newly updated element + */ + using update_iertr = base_iertr; + using update_ret = update_iertr::future; + update_ret update( + op_context_t c, + iterator iter, + lba_map_val_t val); + + /** + * remove + * + * Invalidates all outstanding iterators for this tree on this transaction. + * + * @param c [in] op context + * @param iter [in] iterator to element to remove, must not be end + */ + using remove_iertr = base_iertr; + using remove_ret = remove_iertr::future<>; + remove_ret remove( + op_context_t c, + iterator iter); + + /** + * init_cached_extent + * + * Checks whether e is live (reachable from lba tree) and drops or initializes + * accordingly. + * + * Returns e if live and a null CachedExtentRef otherwise. + */ + using init_cached_extent_iertr = base_iertr; + using init_cached_extent_ret = init_cached_extent_iertr::future; + init_cached_extent_ret init_cached_extent(op_context_t c, CachedExtentRef e); + + /** + * rewrite_lba_extent + * + * Rewrites a fresh copy of extent into transaction and updates internal + * references. + */ + using rewrite_lba_extent_iertr = base_iertr; + using rewrite_lba_extent_ret = rewrite_lba_extent_iertr::future<>; + rewrite_lba_extent_ret rewrite_lba_extent(op_context_t c, CachedExtentRef e); + +private: + lba_root_t root; + bool root_dirty = false; + + using get_internal_node_iertr = base_iertr; + using get_internal_node_ret = get_internal_node_iertr::future; + static get_internal_node_ret get_internal_node( + op_context_t c, + depth_t depth, + paddr_t offset); + + using get_leaf_node_iertr = base_iertr; + using get_leaf_node_ret = get_leaf_node_iertr::future; + static get_leaf_node_ret get_leaf_node( + op_context_t c, + paddr_t offset); + + using lookup_root_iertr = base_iertr; + using lookup_root_ret = lookup_root_iertr::future<>; + lookup_root_ret lookup_root( + op_context_t c, + iterator &iter, + mapped_space_visitor_t *visitor) const { + if (root.get_depth() > 1) { + return get_internal_node( + c, + root.get_depth(), + root.get_location() + ).si_then([this, visitor, &iter](LBAInternalNodeRef root_node) { + iter.get_internal(root.get_depth()).node = root_node; + if (visitor) (*visitor)(root_node->get_paddr(), root_node->get_length()); + return lookup_root_iertr::now(); + }); + } else { + return get_leaf_node( + c, + root.get_location() + ).si_then([visitor, &iter](LBALeafNodeRef root_node) { + iter.leaf.node = root_node; + if (visitor) (*visitor)(root_node->get_paddr(), root_node->get_length()); + return lookup_root_iertr::now(); + }); + } + } + + using lookup_internal_level_iertr = base_iertr; + using lookup_internal_level_ret = lookup_internal_level_iertr::future<>; + template + static lookup_internal_level_ret lookup_internal_level( + op_context_t c, + depth_t depth, + iterator &iter, + F &f, + mapped_space_visitor_t *visitor + ) { + assert(depth > 1); + auto &parent_entry = iter.get_internal(depth + 1); + auto parent = parent_entry.node; + auto node_iter = parent->iter_idx(parent_entry.pos); + return get_internal_node( + c, + depth, + node_iter->get_val().maybe_relative_to(parent->get_paddr()) + ).si_then([depth, visitor, &iter, &f](LBAInternalNodeRef node) { + auto &entry = iter.get_internal(depth); + entry.node = node; + auto node_iter = f(*node); + assert(node_iter != node->end()); + entry.pos = node_iter->get_offset(); + if (visitor) (*visitor)(node->get_paddr(), node->get_length()); + return seastar::now(); + }); + } + + using lookup_leaf_iertr = base_iertr; + using lookup_leaf_ret = lookup_leaf_iertr::future<>; + template + static lookup_internal_level_ret lookup_leaf( + op_context_t c, + iterator &iter, + F &f, + mapped_space_visitor_t *visitor + ) { + auto &parent_entry = iter.get_internal(2); + auto parent = parent_entry.node; + assert(parent); + auto node_iter = parent->iter_idx(parent_entry.pos); + + return get_leaf_node( + c, + node_iter->get_val().maybe_relative_to(parent->get_paddr()) + ).si_then([visitor, &iter, &f](LBALeafNodeRef node) { + iter.leaf.node = node; + auto node_iter = f(*node); + iter.leaf.pos = node_iter->get_offset(); + if (visitor) (*visitor)(node->get_paddr(), node->get_length()); + return seastar::now(); + }); + } + + using lookup_depth_range_iertr = base_iertr; + using lookup_depth_range_ret = lookup_depth_range_iertr::future<>; + template + static lookup_depth_range_ret lookup_depth_range( + op_context_t c, ///< [in] context + iterator &iter, ///< [in,out] iterator to populate + depth_t from, ///< [in] from inclusive + depth_t to, ///< [in] to exclusive, (to <= from, to == from is a noop) + LI &li, ///< [in] internal->iterator + LL &ll, ///< [in] leaf->iterator + mapped_space_visitor_t *visitor ///< [in] mapped space visitor + ) { + LOG_PREFIX(LBATree::lookup_depth_range); + DEBUGT("{} -> {}", c.trans, from, to); + return seastar::do_with( + from, + [c, to, visitor, &iter, &li, &ll](auto &d) { + return trans_intr::repeat( + [c, to, visitor, &iter, &li, &ll, &d] { + if (d > to) { + return [&] { + if (d > 1) { + return lookup_internal_level( + c, + d, + iter, + li, + visitor); + } else { + assert(d == 1); + return lookup_leaf( + c, + iter, + ll, + visitor); + } + }().si_then([&d] { + --d; + return lookup_depth_range_iertr::make_ready_future< + seastar::stop_iteration + >(seastar::stop_iteration::no); + }); + } else { + return lookup_depth_range_iertr::make_ready_future< + seastar::stop_iteration + >(seastar::stop_iteration::yes); + } + }); + }); + } + + using lookup_iertr = base_iertr; + using lookup_ret = lookup_iertr::future; + template + lookup_ret lookup( + op_context_t c, + LI &&lookup_internal, + LL &&lookup_leaf, + mapped_space_visitor_t *visitor + ) const { + LOG_PREFIX(LBATree::lookup); + return seastar::do_with( + iterator{root.get_depth()}, + std::forward
  • (lookup_internal), + std::forward(lookup_leaf), + [FNAME, this, visitor, c](auto &iter, auto &li, auto &ll) { + return lookup_root( + c, iter, visitor + ).si_then([FNAME, this, visitor, c, &iter, &li, &ll] { + if (iter.get_depth() > 1) { + auto &root_entry = *(iter.internal.rbegin()); + root_entry.pos = li(*(root_entry.node)).get_offset(); + } else { + auto &root_entry = iter.leaf; + auto riter = ll(*(root_entry.node)); + root_entry.pos = riter->get_offset(); + } + DEBUGT("got root, depth {}", c.trans, root.get_depth()); + return lookup_depth_range( + c, + iter, + root.get_depth() - 1, + 0, + li, + ll, + visitor); + }).si_then([&iter] { + return std::move(iter); + }); + }); + } + + using find_insertion_iertr = base_iertr; + using find_insertion_ret = find_insertion_iertr::future<>; + static find_insertion_ret find_insertion( + op_context_t c, + laddr_t laddr, + iterator &iter); + + using handle_split_iertr = base_iertr; + using handle_split_ret = handle_split_iertr::future<>; + handle_split_ret handle_split( + op_context_t c, + iterator &iter); + + using handle_merge_iertr = base_iertr; + using handle_merge_ret = handle_merge_iertr::future<>; + handle_merge_ret handle_merge( + op_context_t c, + iterator &iter); + + using update_internal_mapping_iertr = base_iertr; + using update_internal_mapping_ret = update_internal_mapping_iertr::future<>; + update_internal_mapping_ret update_internal_mapping( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t old_addr, + paddr_t new_addr); + + template + using node_position_t = iterator::node_position_t; + + template + friend base_iertr::future get_node( + op_context_t c, + depth_t depth, + paddr_t addr); + + template + friend handle_merge_ret merge_level( + op_context_t c, + depth_t depth, + node_position_t &parent_pos, + node_position_t &pos); +}; + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc new file mode 100644 index 0000000000000..8fadc00d8d87d --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include +#include + +#include "include/buffer.h" +#include "include/byteorder.h" + +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_seastore); + } +} + +namespace crimson::os::seastore::lba_manager::btree { + +std::ostream &LBAInternalNode::print_detail(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", meta=" << get_meta(); +} + +void LBAInternalNode::resolve_relative_addrs(paddr_t base) +{ + for (auto i: *this) { + if (i->get_val().is_relative()) { + auto updated = base.add_relative(i->get_val()); + logger().debug( + "LBAInternalNode::resolve_relative_addrs {} -> {}", + i->get_val(), + updated); + i->set_val(updated); + } + } +} + +std::ostream &LBALeafNode::print_detail(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", meta=" << get_meta(); +} + +void LBALeafNode::resolve_relative_addrs(paddr_t base) +{ + for (auto i: *this) { + if (i->get_val().paddr.is_relative()) { + auto val = i->get_val(); + val.paddr = base.add_relative(val.paddr); + logger().debug( + "LBALeafNode::resolve_relative_addrs {} -> {}", + i->get_val().paddr, + val.paddr); + i->set_val(val); + } + } +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h index 053bf27b2f6c9..66eca56a223df 100644 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h @@ -7,9 +7,17 @@ #include #include -#include "crimson/common/log.h" -#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h" + +#include "include/buffer.h" + +#include "crimson/common/fixed_kv_node_layout.h" +#include "crimson/common/errorator.h" #include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h" namespace crimson::os::seastore::lba_manager::btree { @@ -32,6 +40,7 @@ struct lba_map_val_t { uint32_t refcount = 0; ///< refcount uint32_t checksum = 0; ///< checksum of original block written at paddr (TODO) + lba_map_val_t() = default; lba_map_val_t( extent_len_t len, paddr_t paddr, @@ -43,6 +52,30 @@ struct lba_map_val_t { class BtreeLBAPin; using BtreeLBAPinRef = std::unique_ptr; +constexpr size_t LBA_BLOCK_SIZE = 4096; + +/** + * lba_node_meta_le_t + * + * On disk layout for lba_node_meta_t + */ +struct lba_node_meta_le_t { + laddr_le_t begin = laddr_le_t(0); + laddr_le_t end = laddr_le_t(0); + depth_le_t depth = init_depth_le(0); + + lba_node_meta_le_t() = default; + lba_node_meta_le_t(const lba_node_meta_le_t &) = default; + explicit lba_node_meta_le_t(const lba_node_meta_t &val) + : begin(ceph_le64(val.begin)), + end(ceph_le64(val.end)), + depth(init_depth_le(val.depth)) {} + + operator lba_node_meta_t() const { + return lba_node_meta_t{ begin, end, depth }; + } +}; + /** * LBANode * @@ -59,221 +92,448 @@ struct LBANode : CachedExtent { virtual lba_node_meta_t get_node_meta() const = 0; - /** - * lookup - * - * Returns the node at the specified depth responsible - * for laddr - */ - using lookup_iertr = base_iertr; - using lookup_ret = lookup_iertr::future; - virtual lookup_ret lookup( - op_context_t c, - laddr_t addr, - depth_t depth) = 0; + virtual ~LBANode() = default; - /** - * lookup_range - * - * Returns mappings within range [addr, addr+len) - */ - using lookup_range_iertr = LBAManager::get_mappings_iertr; - using lookup_range_ret = LBAManager::get_mappings_ret; - virtual lookup_range_ret lookup_range( - op_context_t c, - laddr_t addr, - extent_len_t len) = 0; + void on_delta_write(paddr_t record_block_offset) final { + // All in-memory relative addrs are necessarily record-relative + assert(get_prior_instance()); + pin.take_pin(get_prior_instance()->cast()->pin); + resolve_relative_addrs(record_block_offset); + } - /** - * lookup_pin - * - * Returns the mapping at addr - */ - using lookup_pin_iertr = LBAManager::get_mapping_iertr; - using lookup_pin_ret = LBAManager::get_mapping_ret; - virtual lookup_pin_ret lookup_pin( - op_context_t c, - laddr_t addr) = 0; + void on_initial_write() final { + // All in-memory relative addrs are necessarily block-relative + resolve_relative_addrs(get_paddr()); + } - /** - * insert - * - * Recursively inserts into subtree rooted at *this. Caller - * must already have handled splitting if at_max_capacity(). - * - * Precondition: !at_max_capacity() - */ - using insert_iertr = base_iertr; - using insert_ret = insert_iertr::future; - virtual insert_ret insert( - op_context_t c, - laddr_t laddr, - lba_map_val_t val) = 0; + void on_clean_read() final { + // From initial write of block, relative addrs are necessarily block-relative + resolve_relative_addrs(get_paddr()); + } - /** - * find_hole - * - * Finds minimum hole of size len in [min, max) - * - * @return addr of hole, L_ADDR_NULL if unfound - */ - using find_hole_iertr = base_iertr; - using find_hole_ret = find_hole_iertr::future; - virtual find_hole_ret find_hole( - op_context_t c, - laddr_t min, - laddr_t max, - extent_len_t len) = 0; + virtual void resolve_relative_addrs(paddr_t base) = 0; +}; +using LBANodeRef = LBANode::LBANodeRef; - /** - * scan_mappings - * - * Call f for all mappings in [begin, end) - */ - using scan_mappings_iertr = LBAManager::scan_mappings_iertr; - using scan_mappings_ret = LBAManager::scan_mappings_ret; - using scan_mappings_func_t = LBAManager::scan_mappings_func_t; - virtual scan_mappings_ret scan_mappings( - op_context_t c, - laddr_t begin, - laddr_t end, - scan_mappings_func_t &f) = 0; - - using scan_mapped_space_iertr = LBAManager::scan_mapped_space_iertr; - using scan_mapped_space_ret = LBAManager::scan_mapped_space_ret; - using scan_mapped_space_func_t = LBAManager::scan_mapped_space_func_t; - virtual scan_mapped_space_ret scan_mapped_space( - op_context_t c, - scan_mapped_space_func_t &f) = 0; +/** + * LBAInternalNode + * + * Abstracts operations on and layout of internal nodes for the + * LBA Tree. + * + * Layout (4k): + * size : uint32_t[1] 4b + * (padding) : 4b + * meta : lba_node_meta_le_t[3] (1*24)b + * keys : laddr_t[255] (254*8)b + * values : paddr_t[255] (254*8)b + * = 4096 + + * TODO: make the above capacity calculation part of FixedKVNodeLayout + * TODO: the above alignment probably isn't portable without further work + */ +constexpr size_t INTERNAL_NODE_CAPACITY = 254; +struct LBAInternalNode + : LBANode, + common::FixedKVNodeLayout< + INTERNAL_NODE_CAPACITY, + lba_node_meta_t, lba_node_meta_le_t, + laddr_t, laddr_le_t, + paddr_t, paddr_le_t> { + using Ref = TCachedExtentRef; + using internal_iterator_t = const_iterator; + template + LBAInternalNode(T&&... t) : + LBANode(std::forward(t)...), + FixedKVNodeLayout(get_bptr().c_str()) {} + + static constexpr extent_types_t TYPE = extent_types_t::LADDR_INTERNAL; + + lba_node_meta_t get_node_meta() const { return get_meta(); } + + CachedExtentRef duplicate_for_write() final { + assert(delta_buffer.empty()); + return CachedExtentRef(new LBAInternalNode(*this)); + }; + + delta_buffer_t delta_buffer; + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? &delta_buffer : nullptr; + } - /** - * mutate_mapping - * - * Lookups up laddr, calls f on value. If f returns a value, inserts it. - * If it returns nullopt, removes the value. - * Caller must already have merged if at_min_capacity(). - * - * Recursive calls use mutate_mapping_internal. - * - * Precondition: !at_min_capacity() - */ - using mutate_mapping_iertr = base_iertr::extend< - crimson::ct_error::enoent ///< mapping does not exist - >; - using mutate_mapping_ret = mutate_mapping_iertr::future< - lba_map_val_t>; - using mutate_func_t = std::function< - lba_map_val_t(const lba_map_val_t &v) - >; - virtual mutate_mapping_ret mutate_mapping( - op_context_t c, - laddr_t laddr, - mutate_func_t &&f) = 0; - virtual mutate_mapping_ret mutate_mapping_internal( - op_context_t c, - laddr_t laddr, - bool is_root, - mutate_func_t &&f) = 0; + void update( + const_iterator iter, + paddr_t addr) { + return journal_update( + iter, + maybe_generate_relative(addr), + maybe_get_delta_buffer()); + } - /** - * mutate_internal_address - * - * Looks up internal node mapping at laddr, depth and - * updates the mapping to paddr. Returns previous paddr - * (for debugging purposes). - */ - using mutate_internal_address_iertr = base_iertr::extend< - crimson::ct_error::enoent ///< mapping does not exist - >; - using mutate_internal_address_ret = mutate_internal_address_iertr::future< - paddr_t>; - virtual mutate_internal_address_ret mutate_internal_address( - op_context_t c, - depth_t depth, - laddr_t laddr, - paddr_t paddr) = 0; + void insert( + const_iterator iter, + laddr_t pivot, + paddr_t addr) { + return journal_insert( + iter, + pivot, + maybe_generate_relative(addr), + maybe_get_delta_buffer()); + } - /** - * make_split_children - * - * Generates appropriately typed left and right nodes formed from the - * contents of *this. - * - * Returns where pivot is the first value of right. - */ - virtual std::tuple< - LBANodeRef, - LBANodeRef, - laddr_t> - make_split_children( - op_context_t c) = 0; + void remove(const_iterator iter) { + return journal_remove( + iter, + maybe_get_delta_buffer()); + } - /** - * make_full_merge - * - * Returns a single node formed from merging *this and right. - * Precondition: at_min_capacity() && right.at_min_capacity() - */ - virtual LBANodeRef make_full_merge( + void replace( + const_iterator iter, + laddr_t pivot, + paddr_t addr) { + return journal_replace( + iter, + pivot, + maybe_generate_relative(addr), + maybe_get_delta_buffer()); + } + + std::tuple + make_split_children(op_context_t c) { + auto left = c.cache.alloc_new_extent( + c.trans, LBA_BLOCK_SIZE); + auto right = c.cache.alloc_new_extent( + c.trans, LBA_BLOCK_SIZE); + auto pivot = split_into(*left, *right); + left->pin.set_range(left->get_meta()); + right->pin.set_range(right->get_meta()); + return std::make_tuple( + left, + right, + pivot); + } + + Ref make_full_merge( op_context_t c, - LBANodeRef &right) = 0; + Ref &right) { + auto replacement = c.cache.alloc_new_extent( + c.trans, LBA_BLOCK_SIZE); + replacement->merge_from(*this, *right->cast()); + replacement->pin.set_range(replacement->get_meta()); + return replacement; + } + + std::tuple + make_balanced( + op_context_t c, + Ref &_right, + bool prefer_left) { + ceph_assert(_right->get_type() == get_type()); + auto &right = *_right->cast(); + auto replacement_left = c.cache.alloc_new_extent( + c.trans, LBA_BLOCK_SIZE); + auto replacement_right = c.cache.alloc_new_extent( + c.trans, LBA_BLOCK_SIZE); + + auto pivot = balance_into_new_nodes( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + + replacement_left->pin.set_range(replacement_left->get_meta()); + replacement_right->pin.set_range(replacement_right->get_meta()); + return std::make_tuple( + replacement_left, + replacement_right, + pivot); + } /** - * make_balanced + * Internal relative addresses on read or in memory prior to commit + * are either record or block relative depending on whether this + * physical node is is_initial_pending() or just is_pending(). * - * Returns nodes formed by balancing the contents of *this and right. - * - * Returns where pivot is the first value of right. + * User passes appropriate base depending on lifecycle and + * resolve_relative_addrs fixes up relative internal references + * based on base. */ - virtual std::tuple< - LBANodeRef, - LBANodeRef, - laddr_t> - make_balanced( - op_context_t c, - LBANodeRef &right, - bool prefer_left) = 0; + void resolve_relative_addrs(paddr_t base); + void node_resolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + if (i->get_val().is_relative()) { + assert(i->get_val().is_block_relative()); + i->set_val(get_paddr().add_relative(i->get_val())); + } + } + } + } + void node_unresolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + if (i->get_val().is_relative()) { + assert(i->get_val().is_record_relative()); + i->set_val(i->get_val() - get_paddr()); + } + } + } + } - virtual bool at_max_capacity() const = 0; - virtual bool at_min_capacity() const = 0; + extent_types_t get_type() const final { + return TYPE; + } - virtual ~LBANode() = default; + std::ostream &print_detail(std::ostream &out) const final; - void on_delta_write(paddr_t record_block_offset) final { - // All in-memory relative addrs are necessarily record-relative - assert(get_prior_instance()); - pin.take_pin(get_prior_instance()->cast()->pin); - resolve_relative_addrs(record_block_offset); + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; } - void on_initial_write() final { - // All in-memory relative addrs are necessarily block-relative - resolve_relative_addrs(get_paddr()); + void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + set_last_committed_crc(get_crc32c()); + resolve_relative_addrs(base); } - void on_clean_read() final { - // From initial write of block, relative addrs are necessarily block-relative - resolve_relative_addrs(get_paddr()); + bool at_max_capacity() const { + return get_size() == get_capacity(); } - virtual void resolve_relative_addrs(paddr_t base) = 0; + bool at_min_capacity() const { + return get_size() == (get_capacity() / 2); + } }; -using LBANodeRef = LBANode::LBANodeRef; +using LBAInternalNodeRef = LBAInternalNode::Ref; + +/** + * LBALeafNode + * + * Abstracts operations on and layout of leaf nodes for the + * LBA Tree. + * + * Layout (4k): + * size : uint32_t[1] 4b + * (padding) : 4b + * meta : lba_node_meta_le_t[3] (1*24)b + * keys : laddr_t[170] (145*8)b + * values : lba_map_val_t[170] (145*20)b + * = 4092 + * + * TODO: update FixedKVNodeLayout to handle the above calculation + * TODO: the above alignment probably isn't portable without further work + */ +constexpr size_t LEAF_NODE_CAPACITY = 145; /** - * get_lba_btree_extent + * lba_map_val_le_t * - * Fetches node at depth of the appropriate type. + * On disk layout for lba_map_val_t. */ -using get_lba_node_iertr = base_iertr; -using get_lba_node_ret = get_lba_node_iertr::future; -get_lba_node_ret get_lba_btree_extent( - op_context_t c, ///< [in] context structure - CachedExtentRef parent, ///< [in] paddr ref source - depth_t depth, ///< [in] depth of node to fetch - paddr_t offset, ///< [in] physical addr of node - paddr_t base ///< [in] depending on user, block addr or record addr - /// in case offset is relative -); +struct lba_map_val_le_t { + extent_len_le_t len = init_extent_len_le(0); + paddr_le_t paddr; + ceph_le32 refcount{0}; + ceph_le32 checksum{0}; + + lba_map_val_le_t() = default; + lba_map_val_le_t(const lba_map_val_le_t &) = default; + explicit lba_map_val_le_t(const lba_map_val_t &val) + : len(init_extent_len_le(val.len)), + paddr(paddr_le_t(val.paddr)), + refcount(val.refcount), + checksum(val.checksum) {} + + operator lba_map_val_t() const { + return lba_map_val_t{ len, paddr, refcount, checksum }; + } +}; + +struct LBALeafNode + : LBANode, + common::FixedKVNodeLayout< + LEAF_NODE_CAPACITY, + lba_node_meta_t, lba_node_meta_le_t, + laddr_t, laddr_le_t, + lba_map_val_t, lba_map_val_le_t> { + using Ref = TCachedExtentRef; + using internal_iterator_t = const_iterator; + template + LBALeafNode(T&&... t) : + LBANode(std::forward(t)...), + FixedKVNodeLayout(get_bptr().c_str()) {} + + static constexpr extent_types_t TYPE = extent_types_t::LADDR_LEAF; + + lba_node_meta_t get_node_meta() const { return get_meta(); } + + CachedExtentRef duplicate_for_write() final { + assert(delta_buffer.empty()); + return CachedExtentRef(new LBALeafNode(*this)); + }; + + delta_buffer_t delta_buffer; + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? &delta_buffer : nullptr; + } + + void update( + const_iterator iter, + lba_map_val_t val) { + val.paddr = maybe_generate_relative(val.paddr); + return journal_update( + iter, + val, + maybe_get_delta_buffer()); + } + + auto insert( + const_iterator iter, + laddr_t addr, + lba_map_val_t val) { + val.paddr = maybe_generate_relative(val.paddr); + journal_insert( + iter, + addr, + val, + maybe_get_delta_buffer()); + return iter; + } + + void remove(const_iterator iter) { + return journal_remove( + iter, + maybe_get_delta_buffer()); + } + + + std::tuple + make_split_children(op_context_t c) { + auto left = c.cache.alloc_new_extent( + c.trans, LBA_BLOCK_SIZE); + auto right = c.cache.alloc_new_extent( + c.trans, LBA_BLOCK_SIZE); + auto pivot = split_into(*left, *right); + left->pin.set_range(left->get_meta()); + right->pin.set_range(right->get_meta()); + return std::make_tuple( + left, + right, + pivot); + } + + Ref make_full_merge( + op_context_t c, + Ref &right) { + auto replacement = c.cache.alloc_new_extent( + c.trans, LBA_BLOCK_SIZE); + replacement->merge_from(*this, *right->cast()); + replacement->pin.set_range(replacement->get_meta()); + return replacement; + } + + std::tuple + make_balanced( + op_context_t c, + Ref &_right, + bool prefer_left) { + ceph_assert(_right->get_type() == get_type()); + auto &right = *_right->cast(); + auto replacement_left = c.cache.alloc_new_extent( + c.trans, LBA_BLOCK_SIZE); + auto replacement_right = c.cache.alloc_new_extent( + c.trans, LBA_BLOCK_SIZE); + + auto pivot = balance_into_new_nodes( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + + replacement_left->pin.set_range(replacement_left->get_meta()); + replacement_right->pin.set_range(replacement_right->get_meta()); + return std::make_tuple( + replacement_left, + replacement_right, + pivot); + } + + // See LBAInternalNode, same concept + void resolve_relative_addrs(paddr_t base); + void node_resolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + auto val = i->get_val(); + if (val.paddr.is_relative()) { + assert(val.paddr.is_block_relative()); + val.paddr = get_paddr().add_relative(val.paddr); + i->set_val(val); + } + } + } + } + void node_unresolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + auto val = i->get_val(); + if (val.paddr.is_relative()) { + auto val = i->get_val(); + assert(val.paddr.is_record_relative()); + val.paddr = val.paddr - get_paddr(); + i->set_val(val); + } + } + } + } + + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; + } + + void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + set_last_committed_crc(get_crc32c()); + resolve_relative_addrs(base); + } + + extent_types_t get_type() const final { + return TYPE; + } + + std::ostream &print_detail(std::ostream &out) const final; + + bool at_max_capacity() const { + return get_size() == get_capacity(); + } + + bool at_min_capacity() const { + return get_size() == (get_capacity() / 2); + } +}; +using LBALeafNodeRef = TCachedExtentRef; } diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc deleted file mode 100644 index d11f27a57ad46..0000000000000 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc +++ /dev/null @@ -1,792 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include -#include - -#include -#include - -#include "include/buffer.h" -#include "include/byteorder.h" - -#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h" - -namespace { - seastar::logger& logger() { - return crimson::get_logger(ceph_subsys_seastore); - } -} - -namespace crimson::os::seastore::lba_manager::btree { - -std::ostream &LBAInternalNode::print_detail(std::ostream &out) const -{ - return out << ", size=" << get_size() - << ", meta=" << get_meta(); -} - -LBAInternalNode::lookup_ret LBAInternalNode::lookup( - op_context_t c, - laddr_t addr, - depth_t depth) -{ - auto meta = get_meta(); - if (depth == get_meta().depth) { - return lookup_ret( - interruptible::ready_future_marker{}, - this); - } - assert(meta.begin <= addr); - assert(meta.end > addr); - - [[maybe_unused]] auto [iter, biter] = bound(addr, addr + 1); - assert(iter != biter); - assert(iter + 1 == biter); - return get_lba_btree_extent( - c, - this, - meta.depth - 1, - iter->get_val(), - get_paddr()).si_then([c, addr, depth](auto child) { - return child->lookup(c, addr, depth); - }).finally([ref=LBANodeRef(this)] {}); -} - -LBAInternalNode::lookup_range_ret LBAInternalNode::lookup_range( - op_context_t c, - laddr_t addr, - extent_len_t len) -{ - auto [begin, end] = bound(addr, addr + len); - auto result_up = std::make_unique(); - auto &result = *result_up; - return trans_intr::do_for_each( - std::move(begin), - std::move(end), - [this, c, &result, addr, len](const auto &val) { - return get_lba_btree_extent( - c, - this, - get_meta().depth - 1, - val.get_val(), - get_paddr()).si_then( - [c, &result, addr, len](auto extent) { - return extent->lookup_range( - c, - addr, - len).si_then( - [&result](auto pin_list) { - result.splice(result.end(), pin_list, - pin_list.begin(), pin_list.end()); - }); - }); - }).si_then([result=std::move(result_up), ref=LBANodeRef(this)] { - return lookup_range_iertr::make_ready_future( - std::move(*result)); - }); -} - -LBAInternalNode::lookup_pin_ret LBAInternalNode::lookup_pin( - op_context_t c, - laddr_t addr) -{ - auto iter = get_containing_child(addr); - return get_lba_btree_extent( - c, - this, - get_meta().depth - 1, - iter->get_val(), - get_paddr() - ).si_then([c, addr](LBANodeRef extent) { - return extent->lookup_pin(c, addr); - }).finally([ref=LBANodeRef(this)] {}); -} - -LBAInternalNode::insert_ret LBAInternalNode::insert( - op_context_t c, - laddr_t laddr, - lba_map_val_t val) -{ - auto insertion_pt = get_containing_child(laddr); - return get_lba_btree_extent( - c, - this, - get_meta().depth - 1, - insertion_pt->get_val(), - get_paddr()).si_then( - [this, insertion_pt, c, laddr, val=std::move(val)]( - auto extent) mutable { - return extent->at_max_capacity() ? - split_entry(c, laddr, insertion_pt, extent) : - insert_iertr::make_ready_future(std::move(extent)); - }).si_then([c, laddr, val=std::move(val)]( - LBANodeRef extent) mutable { - return extent->insert(c, laddr, val); - }); -} - -LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping( - op_context_t c, - laddr_t laddr, - mutate_func_t &&f) -{ - return mutate_mapping_internal(c, laddr, true, std::move(f)); -} - -LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping_internal( - op_context_t c, - laddr_t laddr, - bool is_root, - mutate_func_t &&f) -{ - auto mutation_pt = get_containing_child(laddr); - if (mutation_pt == end()) { - assert(0 == "impossible"); - return crimson::ct_error::enoent::make(); - } - return get_lba_btree_extent( - c, - this, - get_meta().depth - 1, - mutation_pt->get_val(), - get_paddr() - ).si_then([=](LBANodeRef extent) { - if (extent->at_min_capacity() && get_size() > 1) { - return merge_entry( - c, - laddr, - mutation_pt, - extent, - is_root); - } else { - return merge_iertr::make_ready_future( - std::move(extent)); - } - }).si_then([c, laddr, f=std::move(f)](LBANodeRef extent) mutable { - return extent->mutate_mapping_internal(c, laddr, false, std::move(f)); - }); -} - -LBAInternalNode::mutate_internal_address_ret LBAInternalNode::mutate_internal_address( - op_context_t c, - depth_t depth, - laddr_t laddr, - paddr_t paddr) -{ - if (get_meta().depth == (depth + 1)) { - if (!is_pending()) { - return c.cache.duplicate_for_write(c.trans, this)->cast( - )->mutate_internal_address( - c, - depth, - laddr, - paddr); - } - auto iter = get_containing_child(laddr); - if (iter->get_key() != laddr) { - logger().debug( - "LBAInternalNode::mutate_internal_address laddr {} " - "not found in extent {}", - laddr, - *this); - return crimson::ct_error::enoent::make(); - } - - auto old_paddr = iter->get_val(); - - journal_update( - iter, - maybe_generate_relative(paddr), - maybe_get_delta_buffer()); - - return mutate_internal_address_ret( - interruptible::ready_future_marker{}, - old_paddr - ); - } else { - auto iter = get_containing_child(laddr); - return get_lba_btree_extent( - c, - this, - get_meta().depth - 1, - iter->get_val(), - get_paddr() - ).si_then([=](auto node) { - return node->mutate_internal_address( - c, - depth, - laddr, - paddr); - }); - } -} - -LBAInternalNode::find_hole_ret LBAInternalNode::find_hole( - op_context_t c, - laddr_t min_addr, - laddr_t max_addr, - extent_len_t len) -{ - logger().debug( - "LBAInternalNode::find_hole min={}, max={}, len={}, *this={}", - min_addr, max_addr, len, *this); - auto [begin, end] = bound(min_addr, max_addr); - return seastar::do_with( - begin, - L_ADDR_NULL, - [this, c, min_addr, len, end=end](auto &i, auto &ret) { - return trans_intr::repeat([=, &i, &ret]() - -> find_hole_iertr::future { - if (i == end) { - return seastar::make_ready_future( - seastar::stop_iteration::yes); - } - return get_lba_btree_extent( - c, - this, - get_meta().depth - 1, - i->get_val(), - get_paddr() - ).si_then([=, &i](auto extent) mutable { - auto lb = std::max(min_addr, i->get_key()); - auto ub = i->get_next_key_or_max(); - logger().debug("LBAInternalNode::find_hole extent {} lb {} ub {}", - *extent, lb, ub); - return extent->find_hole(c, lb, ub, len); - }).si_then([&i, &ret](auto addr) { - if (addr == L_ADDR_NULL) { - ++i; - return seastar::make_ready_future( - seastar::stop_iteration::no); - } else { - ret = addr; - return seastar::make_ready_future( - seastar::stop_iteration::yes); - } - }); - }).si_then([&ret, ref=LBANodeRef(this)]() { - return ret; - }); - }); -} - -LBAInternalNode::scan_mappings_ret LBAInternalNode::scan_mappings( - op_context_t c, - laddr_t begin, - laddr_t end, - scan_mappings_func_t &f) -{ - auto [biter, eiter] = bound(begin, end); - return trans_intr::do_for_each( - std::move(biter), - std::move(eiter), - [=, &f](auto &viter) { - return get_lba_btree_extent( - c, - this, - get_meta().depth - 1, - viter->get_val(), - get_paddr()).si_then([=, &f](auto child) { - return child->scan_mappings(c, begin, end, f); - }); - }).si_then([ref=LBANodeRef(this)] {}); -} - -LBAInternalNode::scan_mapped_space_ret LBAInternalNode::scan_mapped_space( - op_context_t c, - scan_mapped_space_func_t &f) -{ - f(get_paddr(), get_length()); - return trans_intr::do_for_each( - begin(), end(), - [=, &f](auto &viter) { - return get_lba_btree_extent( - c, - this, - get_meta().depth - 1, - viter->get_val(), - get_paddr()).si_then([=, &f](auto child) { - return child->scan_mapped_space(c, f); - }); - }).si_then([ref=LBANodeRef(this)]{}); -} - - -void LBAInternalNode::resolve_relative_addrs(paddr_t base) -{ - for (auto i: *this) { - if (i->get_val().is_relative()) { - auto updated = base.add_relative(i->get_val()); - logger().debug( - "LBAInternalNode::resolve_relative_addrs {} -> {}", - i->get_val(), - updated); - i->set_val(updated); - } - } -} - - -LBAInternalNode::split_ret -LBAInternalNode::split_entry( - op_context_t c, - laddr_t addr, - internal_iterator_t iter, LBANodeRef entry) -{ - if (!is_pending()) { - auto mut = c.cache.duplicate_for_write( - c.trans, this)->cast(); - auto mut_iter = mut->iter_idx(iter->get_offset()); - return mut->split_entry(c, addr, mut_iter, entry); - } - - ceph_assert(!at_max_capacity()); - auto [left, right, pivot] = entry->make_split_children(c); - - journal_update( - iter, - maybe_generate_relative(left->get_paddr()), - maybe_get_delta_buffer()); - journal_insert( - iter + 1, - pivot, - maybe_generate_relative(right->get_paddr()), - maybe_get_delta_buffer()); - - c.cache.retire_extent(c.trans, entry); - - logger().debug( - "LBAInternalNode::split_entry *this {} entry {} into left {} right {}", - *this, - *entry, - *left, - *right); - - return split_iertr::make_ready_future( - pivot > addr ? left : right - ); -} - -LBAInternalNode::merge_ret -LBAInternalNode::merge_entry( - op_context_t c, - laddr_t addr, - internal_iterator_t iter, - LBANodeRef entry, - bool is_root) -{ - if (!is_pending()) { - auto mut = c.cache.duplicate_for_write(c.trans, this)->cast(); - auto mut_iter = mut->iter_idx(iter->get_offset()); - return mut->merge_entry(c, addr, mut_iter, entry, is_root); - } - - logger().debug( - "LBAInternalNode: merge_entry: {}, {}", - *this, - *entry); - auto donor_is_left = (iter + 1) == end(); - auto donor_iter = donor_is_left ? iter - 1 : iter + 1; - return get_lba_btree_extent( - c, - this, - get_meta().depth - 1, - donor_iter->get_val(), - get_paddr() - ).si_then([=](auto donor) mutable { - auto [l, r] = donor_is_left ? - std::make_pair(donor, entry) : std::make_pair(entry, donor); - auto [liter, riter] = donor_is_left ? - std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter); - if (donor->at_min_capacity()) { - auto replacement = l->make_full_merge( - c, - r); - - journal_update( - liter, - maybe_generate_relative(replacement->get_paddr()), - maybe_get_delta_buffer()); - journal_remove(riter, maybe_get_delta_buffer()); - - c.cache.retire_extent(c.trans, l); - c.cache.retire_extent(c.trans, r); - - if (is_root && get_size() == 1) { - return c.cache.get_root(c.trans).si_then([=](RootBlockRef croot) { - { - auto mut_croot = c.cache.duplicate_for_write(c.trans, croot); - croot = mut_croot->cast(); - } - auto new_root_addr = begin()->get_val().maybe_relative_to(get_paddr()); - auto new_depth = get_meta().depth - 1; - croot->get_root().lba_root = lba_root_t{ - new_root_addr, - new_depth}; - c.trans.get_lba_tree_stats().depth = new_depth; - logger().debug( - "LBAInternalNode::merge_entry: collapsing root {} to addr {}", - *this, - new_root_addr); - c.cache.retire_extent(c.trans, this); - return merge_iertr::make_ready_future(replacement); - }); - } else { - return merge_iertr::make_ready_future(replacement); - } - } else { - logger().debug( - "LBAInternalEntry::merge_entry balanced l {} r {}", - *l, - *r); - auto [replacement_l, replacement_r, pivot] = - l->make_balanced( - c, - r, - !donor_is_left); - - journal_update( - liter, - maybe_generate_relative(replacement_l->get_paddr()), - maybe_get_delta_buffer()); - journal_replace( - riter, - pivot, - maybe_generate_relative(replacement_r->get_paddr()), - maybe_get_delta_buffer()); - - c.cache.retire_extent(c.trans, l); - c.cache.retire_extent(c.trans, r); - return merge_iertr::make_ready_future( - addr >= pivot ? replacement_r : replacement_l - ); - } - }); -} - - -LBAInternalNode::internal_iterator_t -LBAInternalNode::get_containing_child(laddr_t laddr) -{ - // TODO: binary search - for (auto i = begin(); i != end(); ++i) { - if (i.contains(laddr)) - return i; - } - ceph_assert(0 == "invalid"); - return end(); -} - -std::ostream &LBALeafNode::print_detail(std::ostream &out) const -{ - return out << ", size=" << get_size() - << ", meta=" << get_meta(); -} - -LBALeafNode::lookup_range_ret LBALeafNode::lookup_range( - op_context_t c, - laddr_t addr, - extent_len_t len) -{ - logger().debug( - "LBALeafNode::lookup_range {}~{}", - addr, - len); - auto ret = lba_pin_list_t(); - auto [i, end] = get_leaf_entries(addr, len); - for (; i != end; ++i) { - auto val = i->get_val(); - auto begin = i->get_key(); - ret.emplace_back( - std::make_unique( - this, - val.paddr.maybe_relative_to(get_paddr()), - lba_node_meta_t{ begin, begin + val.len, 0})); - } - return lookup_range_iertr::make_ready_future( - std::move(ret)); -} - -LBALeafNode::lookup_pin_ret LBALeafNode::lookup_pin( - op_context_t c, - laddr_t addr) -{ - logger().debug("LBALeafNode::lookup_pin {}", addr); - auto iter = find(addr); - if (iter == end()) { - return crimson::ct_error::enoent::make(); - } - auto val = iter->get_val(); - auto begin = iter->get_key(); - return lookup_pin_ret( - interruptible::ready_future_marker{}, - std::make_unique( - this, - val.paddr.maybe_relative_to(get_paddr()), - lba_node_meta_t{ begin, begin + val.len, 0})); -} - -LBALeafNode::insert_ret LBALeafNode::insert( - op_context_t c, - laddr_t laddr, - lba_map_val_t val) -{ - ceph_assert(!at_max_capacity()); - - if (!is_pending()) { - return c.cache.duplicate_for_write(c.trans, this - )->cast()->insert(c, laddr, val); - } - - val.paddr = maybe_generate_relative(val.paddr); - logger().debug( - "LBALeafNode::insert: inserting {}~{} -> {}", - laddr, - val.len, - val.paddr); - - auto insert_pt = lower_bound(laddr); - journal_insert(insert_pt, laddr, val, maybe_get_delta_buffer()); - - logger().debug( - "LBALeafNode::insert: inserted {}~{} -> {}", - insert_pt.get_key(), - insert_pt.get_val().len, - insert_pt.get_val().paddr); - auto begin = insert_pt.get_key(); - return insert_ret( - interruptible::ready_future_marker{}, - std::make_unique( - this, - val.paddr.maybe_relative_to(get_paddr()), - lba_node_meta_t{ begin, begin + val.len, 0})); -} - -LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping( - op_context_t c, - laddr_t laddr, - mutate_func_t &&f) -{ - return mutate_mapping_internal(c, laddr, true, std::move(f)); -} - -LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping_internal( - op_context_t c, - laddr_t laddr, - bool is_root, - mutate_func_t &&f) -{ - auto mutation_pt = find(laddr); - if (mutation_pt == end()) { - return crimson::ct_error::enoent::make(); - } - - if (!is_pending()) { - return c.cache.duplicate_for_write(c.trans, this)->cast( - )->mutate_mapping_internal( - c, - laddr, - is_root, - std::move(f)); - } - - auto cur = mutation_pt.get_val(); - auto mutated = f(cur); - - mutated.paddr = maybe_generate_relative(mutated.paddr); - - logger().debug( - "{}: mutate addr {}: {} -> {}", - __func__, - laddr, - cur.paddr, - mutated.paddr); - - if (mutated.refcount > 0) { - journal_update(mutation_pt, mutated, maybe_get_delta_buffer()); - return mutate_mapping_ret( - interruptible::ready_future_marker{}, - mutated); - } else { - ++(c.trans.get_lba_tree_stats().num_erases); - journal_remove(mutation_pt, maybe_get_delta_buffer()); - return mutate_mapping_ret( - interruptible::ready_future_marker{}, - mutated); - } -} - -LBALeafNode::mutate_internal_address_ret LBALeafNode::mutate_internal_address( - op_context_t c, - depth_t depth, - laddr_t laddr, - paddr_t paddr) -{ - ceph_assert(0 == "Impossible"); - return mutate_internal_address_ret( - interruptible::ready_future_marker{}, - paddr); -} - -LBALeafNode::find_hole_ret LBALeafNode::find_hole( - op_context_t c, - laddr_t min, - laddr_t max, - extent_len_t len) -{ - logger().debug( - "LBALeafNode::find_hole min={} max={}, len={}, *this={}", - min, max, len, *this); - auto [liter, uiter] = bound(min, max); - for (auto i = liter; i != uiter; ++i) { - auto ub = i->get_key(); - if (min + len <= ub) { - return find_hole_ret( - interruptible::ready_future_marker{}, - min); - } else { - min = i->get_key() + i->get_val().len; - } - } - if (min + len <= max) { - return find_hole_ret( - interruptible::ready_future_marker{}, - min); - } else { - return find_hole_ret( - interruptible::ready_future_marker{}, - L_ADDR_MAX); - } -} - -LBALeafNode::scan_mappings_ret LBALeafNode::scan_mappings( - op_context_t c, - laddr_t begin, - laddr_t end, - scan_mappings_func_t &f) -{ - auto [biter, eiter] = bound(begin, end); - for (auto i = biter; i != eiter; ++i) { - auto val = i->get_val(); - f(i->get_key(), val.paddr, val.len); - } - return scan_mappings_iertr::now(); -} - -LBALeafNode::scan_mapped_space_ret LBALeafNode::scan_mapped_space( - op_context_t c, - scan_mapped_space_func_t &f) -{ - f(get_paddr(), get_length()); - for (auto i = begin(); i != end(); ++i) { - auto val = i->get_val(); - f(val.paddr, val.len); - } - return scan_mappings_iertr::now(); -} - - -void LBALeafNode::resolve_relative_addrs(paddr_t base) -{ - for (auto i: *this) { - if (i->get_val().paddr.is_relative()) { - auto val = i->get_val(); - val.paddr = base.add_relative(val.paddr); - logger().debug( - "LBALeafNode::resolve_relative_addrs {} -> {}", - i->get_val().paddr, - val.paddr); - i->set_val(val); - } - } -} - -std::pair -LBALeafNode::get_leaf_entries(laddr_t addr, extent_len_t len) -{ - return bound(addr, addr + len); -} - -get_lba_node_ret get_lba_btree_extent( - op_context_t c, - CachedExtentRef parent, - depth_t depth, - paddr_t offset, - paddr_t base) -{ - offset = offset.maybe_relative_to(base); - ceph_assert(depth > 0); - if (depth > 1) { - logger().debug( - "get_lba_btree_extent: reading internal at offset {}, depth {}", - offset, - depth); - return c.cache.get_extent( - c.trans, - offset, - LBA_BLOCK_SIZE).si_then([c, parent](auto ret) - -> get_lba_node_ret { - auto meta = ret->get_meta(); - if (ret->get_size()) { - ceph_assert(meta.begin <= ret->begin()->get_key()); - ceph_assert(meta.end > (ret->end() - 1)->get_key()); - } - if (parent->has_been_invalidated() || ret->has_been_invalidated()) { - logger().debug( - "get_lba_btree_extent: parent {} or ret {} is invalid, transaction {} is conflicted: {}", - *parent, - *ret, - (void*)&c.trans, - c.trans.is_conflicted()); - assert(!(parent->has_been_invalidated() || ret->has_been_invalidated())); - } - if (!ret->is_pending() && !ret->pin.is_linked()) { - ret->pin.set_range(meta); - c.pins.add_pin(ret->pin); - } - return get_lba_node_ret( - interruptible::ready_future_marker{}, - LBANodeRef(ret.detach(), /* add_ref = */ false)); - }); - } else { - logger().debug( - "get_lba_btree_extent: reading leaf at offset {}, depth {}", - offset, - depth); - return c.cache.get_extent( - c.trans, - offset, - LBA_BLOCK_SIZE).si_then([offset, c, parent](auto ret) - -> get_lba_node_ret { - logger().debug( - "get_lba_btree_extent: read leaf at offset {} {}, parent {}", - offset, - *ret, - *parent); - auto meta = ret->get_meta(); - if (ret->get_size()) { - ceph_assert(meta.begin <= ret->begin()->get_key()); - ceph_assert(meta.end > (ret->end() - 1)->get_key()); - } - if (parent->has_been_invalidated() || ret->has_been_invalidated()) { - logger().debug( - "get_lba_btree_extent: parent {} or ret {} is invalid, transaction {} is conflicted: {}", - *parent, - *ret, - (void*)&c.trans, - c.trans.is_conflicted()); - assert(!(parent->has_been_invalidated() || ret->has_been_invalidated())); - } - if (!ret->is_pending() && !ret->pin.is_linked()) { - ret->pin.set_range(meta); - c.pins.add_pin(ret->pin); - } - return get_lba_node_ret( - interruptible::ready_future_marker{}, - LBANodeRef(ret.detach(), /* add_ref = */ false)); - }); - } -} - -} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h deleted file mode 100644 index fc9fd0634b895..0000000000000 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h +++ /dev/null @@ -1,557 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#pragma once - -#include -#include - -#include -#include - -#include "include/buffer.h" - -#include "crimson/common/fixed_kv_node_layout.h" -#include "crimson/common/errorator.h" -#include "crimson/os/seastore/lba_manager.h" -#include "crimson/os/seastore/seastore_types.h" -#include "crimson/os/seastore/cache.h" -#include "crimson/os/seastore/cached_extent.h" -#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" -#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h" - -namespace crimson::os::seastore::lba_manager::btree { - -constexpr size_t LBA_BLOCK_SIZE = 4096; - -/** - * lba_node_meta_le_t - * - * On disk layout for lba_node_meta_t - */ -struct lba_node_meta_le_t { - laddr_le_t begin = laddr_le_t(0); - laddr_le_t end = laddr_le_t(0); - depth_le_t depth = init_depth_le(0); - - lba_node_meta_le_t() = default; - lba_node_meta_le_t(const lba_node_meta_le_t &) = default; - explicit lba_node_meta_le_t(const lba_node_meta_t &val) - : begin(ceph_le64(val.begin)), - end(ceph_le64(val.end)), - depth(init_depth_le(val.depth)) {} - - operator lba_node_meta_t() const { - return lba_node_meta_t{ begin, end, depth }; - } -}; - - -/** - * LBAInternalNode - * - * Abstracts operations on and layout of internal nodes for the - * LBA Tree. - * - * Layout (4k): - * size : uint32_t[1] 4b - * (padding) : 4b - * meta : lba_node_meta_le_t[3] (1*24)b - * keys : laddr_t[255] (254*8)b - * values : paddr_t[255] (254*8)b - * = 4096 - - * TODO: make the above capacity calculation part of FixedKVNodeLayout - * TODO: the above alignment probably isn't portable without further work - */ -constexpr size_t INTERNAL_NODE_CAPACITY = 254; -struct LBAInternalNode - : LBANode, - common::FixedKVNodeLayout< - INTERNAL_NODE_CAPACITY, - lba_node_meta_t, lba_node_meta_le_t, - laddr_t, laddr_le_t, - paddr_t, paddr_le_t> { - using internal_iterator_t = const_iterator; - template - LBAInternalNode(T&&... t) : - LBANode(std::forward(t)...), - FixedKVNodeLayout(get_bptr().c_str()) {} - - static constexpr extent_types_t TYPE = extent_types_t::LADDR_INTERNAL; - - lba_node_meta_t get_node_meta() const final { return get_meta(); } - - CachedExtentRef duplicate_for_write() final { - assert(delta_buffer.empty()); - return CachedExtentRef(new LBAInternalNode(*this)); - }; - - delta_buffer_t delta_buffer; - delta_buffer_t *maybe_get_delta_buffer() { - return is_mutation_pending() ? &delta_buffer : nullptr; - } - - lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final; - - lookup_range_ret lookup_range( - op_context_t c, - laddr_t addr, - extent_len_t len) final; - - lookup_pin_ret lookup_pin( - op_context_t c, - laddr_t addr) final; - - insert_ret insert( - op_context_t c, - laddr_t laddr, - lba_map_val_t val) final; - - mutate_mapping_ret mutate_mapping( - op_context_t c, - laddr_t laddr, - mutate_func_t &&f) final; - mutate_mapping_ret mutate_mapping_internal( - op_context_t c, - laddr_t laddr, - bool is_root, - mutate_func_t &&f) final; - - mutate_internal_address_ret mutate_internal_address( - op_context_t c, - depth_t depth, - laddr_t laddr, - paddr_t paddr) final; - - find_hole_ret find_hole( - op_context_t c, - laddr_t min, - laddr_t max, - extent_len_t len) final; - - scan_mappings_ret scan_mappings( - op_context_t c, - laddr_t begin, - laddr_t end, - scan_mappings_func_t &f) final; - - scan_mapped_space_ret scan_mapped_space( - op_context_t c, - scan_mapped_space_func_t &f) final; - - std::tuple - make_split_children(op_context_t c) final { - auto left = c.cache.alloc_new_extent( - c.trans, LBA_BLOCK_SIZE); - auto right = c.cache.alloc_new_extent( - c.trans, LBA_BLOCK_SIZE); - auto pivot = split_into(*left, *right); - left->pin.set_range(left->get_meta()); - right->pin.set_range(right->get_meta()); - return std::make_tuple( - left, - right, - pivot); - } - - LBANodeRef make_full_merge( - op_context_t c, - LBANodeRef &right) final { - auto replacement = c.cache.alloc_new_extent( - c.trans, LBA_BLOCK_SIZE); - replacement->merge_from(*this, *right->cast()); - replacement->pin.set_range(replacement->get_meta()); - return replacement; - } - - std::tuple - make_balanced( - op_context_t c, - LBANodeRef &_right, - bool prefer_left) final { - ceph_assert(_right->get_type() == TYPE); - auto &right = *_right->cast(); - auto replacement_left = c.cache.alloc_new_extent( - c.trans, LBA_BLOCK_SIZE); - auto replacement_right = c.cache.alloc_new_extent( - c.trans, LBA_BLOCK_SIZE); - - auto pivot = balance_into_new_nodes( - *this, - right, - prefer_left, - *replacement_left, - *replacement_right); - - replacement_left->pin.set_range(replacement_left->get_meta()); - replacement_right->pin.set_range(replacement_right->get_meta()); - return std::make_tuple( - replacement_left, - replacement_right, - pivot); - } - - /** - * Internal relative addresses on read or in memory prior to commit - * are either record or block relative depending on whether this - * physical node is is_initial_pending() or just is_pending(). - * - * User passes appropriate base depending on lifecycle and - * resolve_relative_addrs fixes up relative internal references - * based on base. - */ - void resolve_relative_addrs(paddr_t base) final; - void node_resolve_vals(iterator from, iterator to) const final { - if (is_initial_pending()) { - for (auto i = from; i != to; ++i) { - if (i->get_val().is_relative()) { - assert(i->get_val().is_block_relative()); - i->set_val(get_paddr().add_relative(i->get_val())); - } - } - } - } - void node_unresolve_vals(iterator from, iterator to) const final { - if (is_initial_pending()) { - for (auto i = from; i != to; ++i) { - if (i->get_val().is_relative()) { - assert(i->get_val().is_record_relative()); - i->set_val(i->get_val() - get_paddr()); - } - } - } - } - - extent_types_t get_type() const final { - return TYPE; - } - - std::ostream &print_detail(std::ostream &out) const final; - - ceph::bufferlist get_delta() final { - ceph::buffer::ptr bptr(delta_buffer.get_bytes()); - delta_buffer.copy_out(bptr.c_str(), bptr.length()); - ceph::bufferlist bl; - bl.push_back(bptr); - return bl; - } - - void apply_delta_and_adjust_crc( - paddr_t base, const ceph::bufferlist &_bl) final { - assert(_bl.length()); - ceph::bufferlist bl = _bl; - bl.rebuild(); - delta_buffer_t buffer; - buffer.copy_in(bl.front().c_str(), bl.front().length()); - buffer.replay(*this); - set_last_committed_crc(get_crc32c()); - resolve_relative_addrs(base); - } - - bool at_max_capacity() const final { - return get_size() == get_capacity(); - } - - bool at_min_capacity() const { - return get_size() == (get_capacity() / 2); - } - - /// returns iterators containing [l, r) - std::pair bound( - laddr_t l, laddr_t r) { - // TODO: inefficient - auto retl = begin(); - for (; retl != end(); ++retl) { - if (retl->get_next_key_or_max() > l) - break; - } - auto retr = retl; - for (; retr != end(); ++retr) { - if (retr->get_key() >= r) - break; - } - return std::make_pair(retl, retr); - } - - using split_iertr = base_iertr; - using split_ret = split_iertr::future; - split_ret split_entry( - op_context_t c, - laddr_t addr, - internal_iterator_t, - LBANodeRef entry); - - using merge_iertr = base_iertr; - using merge_ret = merge_iertr::future; - merge_ret merge_entry( - op_context_t c, - laddr_t addr, - internal_iterator_t, - LBANodeRef entry, - bool is_root); - - /// returns iterator for subtree containing laddr - internal_iterator_t get_containing_child(laddr_t laddr); -}; - -/** - * LBALeafNode - * - * Abstracts operations on and layout of leaf nodes for the - * LBA Tree. - * - * Layout (4k): - * size : uint32_t[1] 4b - * (padding) : 4b - * meta : lba_node_meta_le_t[3] (1*24)b - * keys : laddr_t[170] (145*8)b - * values : lba_map_val_t[170] (145*20)b - * = 4092 - * - * TODO: update FixedKVNodeLayout to handle the above calculation - * TODO: the above alignment probably isn't portable without further work - */ -constexpr size_t LEAF_NODE_CAPACITY = 145; - -/** - * lba_map_val_le_t - * - * On disk layout for lba_map_val_t. - */ -struct lba_map_val_le_t { - extent_len_le_t len = init_extent_len_le(0); - paddr_le_t paddr; - ceph_le32 refcount{0}; - ceph_le32 checksum{0}; - - lba_map_val_le_t() = default; - lba_map_val_le_t(const lba_map_val_le_t &) = default; - explicit lba_map_val_le_t(const lba_map_val_t &val) - : len(init_extent_len_le(val.len)), - paddr(paddr_le_t(val.paddr)), - refcount(val.refcount), - checksum(val.checksum) {} - - operator lba_map_val_t() const { - return lba_map_val_t{ len, paddr, refcount, checksum }; - } -}; - -struct LBALeafNode - : LBANode, - common::FixedKVNodeLayout< - LEAF_NODE_CAPACITY, - lba_node_meta_t, lba_node_meta_le_t, - laddr_t, laddr_le_t, - lba_map_val_t, lba_map_val_le_t> { - using internal_iterator_t = const_iterator; - template - LBALeafNode(T&&... t) : - LBANode(std::forward(t)...), - FixedKVNodeLayout(get_bptr().c_str()) {} - - static constexpr extent_types_t TYPE = extent_types_t::LADDR_LEAF; - - lba_node_meta_t get_node_meta() const final { return get_meta(); } - - CachedExtentRef duplicate_for_write() final { - assert(delta_buffer.empty()); - return CachedExtentRef(new LBALeafNode(*this)); - }; - - delta_buffer_t delta_buffer; - delta_buffer_t *maybe_get_delta_buffer() { - return is_mutation_pending() ? &delta_buffer : nullptr; - } - - lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final - { - return lookup_ret( - interruptible::ready_future_marker{}, - this); - } - - lookup_range_ret lookup_range( - op_context_t c, - laddr_t addr, - extent_len_t len) final; - - lookup_pin_ret lookup_pin( - op_context_t c, - laddr_t addr) final; - - insert_ret insert( - op_context_t c, - laddr_t laddr, - lba_map_val_t val) final; - - mutate_mapping_ret mutate_mapping( - op_context_t c, - laddr_t laddr, - mutate_func_t &&f) final; - mutate_mapping_ret mutate_mapping_internal( - op_context_t c, - laddr_t laddr, - bool is_root, - mutate_func_t &&f) final; - - mutate_internal_address_ret mutate_internal_address( - op_context_t c, - depth_t depth, - laddr_t laddr, - paddr_t paddr) final; - - find_hole_ret find_hole( - op_context_t c, - laddr_t min, - laddr_t max, - extent_len_t len) final; - - scan_mappings_ret scan_mappings( - op_context_t c, - laddr_t begin, - laddr_t end, - scan_mappings_func_t &f) final; - - scan_mapped_space_ret scan_mapped_space( - op_context_t c, - scan_mapped_space_func_t &f) final; - - std::tuple - make_split_children(op_context_t c) final { - auto left = c.cache.alloc_new_extent( - c.trans, LBA_BLOCK_SIZE); - auto right = c.cache.alloc_new_extent( - c.trans, LBA_BLOCK_SIZE); - auto pivot = split_into(*left, *right); - left->pin.set_range(left->get_meta()); - right->pin.set_range(right->get_meta()); - return std::make_tuple( - left, - right, - pivot); - } - - LBANodeRef make_full_merge( - op_context_t c, - LBANodeRef &right) final { - auto replacement = c.cache.alloc_new_extent( - c.trans, LBA_BLOCK_SIZE); - replacement->merge_from(*this, *right->cast()); - replacement->pin.set_range(replacement->get_meta()); - return replacement; - } - - std::tuple - make_balanced( - op_context_t c, - LBANodeRef &_right, - bool prefer_left) final { - ceph_assert(_right->get_type() == TYPE); - auto &right = *_right->cast(); - auto replacement_left = c.cache.alloc_new_extent( - c.trans, LBA_BLOCK_SIZE); - auto replacement_right = c.cache.alloc_new_extent( - c.trans, LBA_BLOCK_SIZE); - - auto pivot = balance_into_new_nodes( - *this, - right, - prefer_left, - *replacement_left, - *replacement_right); - - replacement_left->pin.set_range(replacement_left->get_meta()); - replacement_right->pin.set_range(replacement_right->get_meta()); - return std::make_tuple( - replacement_left, - replacement_right, - pivot); - } - - // See LBAInternalNode, same concept - void resolve_relative_addrs(paddr_t base) final; - void node_resolve_vals(iterator from, iterator to) const final { - if (is_initial_pending()) { - for (auto i = from; i != to; ++i) { - auto val = i->get_val(); - if (val.paddr.is_relative()) { - assert(val.paddr.is_block_relative()); - val.paddr = get_paddr().add_relative(val.paddr); - i->set_val(val); - } - } - } - } - void node_unresolve_vals(iterator from, iterator to) const final { - if (is_initial_pending()) { - for (auto i = from; i != to; ++i) { - auto val = i->get_val(); - if (val.paddr.is_relative()) { - auto val = i->get_val(); - assert(val.paddr.is_record_relative()); - val.paddr = val.paddr - get_paddr(); - i->set_val(val); - } - } - } - } - - ceph::bufferlist get_delta() final { - ceph::buffer::ptr bptr(delta_buffer.get_bytes()); - delta_buffer.copy_out(bptr.c_str(), bptr.length()); - ceph::bufferlist bl; - bl.push_back(bptr); - return bl; - } - - void apply_delta_and_adjust_crc( - paddr_t base, const ceph::bufferlist &_bl) final { - assert(_bl.length()); - ceph::bufferlist bl = _bl; - bl.rebuild(); - delta_buffer_t buffer; - buffer.copy_in(bl.front().c_str(), bl.front().length()); - buffer.replay(*this); - set_last_committed_crc(get_crc32c()); - resolve_relative_addrs(base); - } - - extent_types_t get_type() const final { - return TYPE; - } - - std::ostream &print_detail(std::ostream &out) const final; - - bool at_max_capacity() const final { - return get_size() == get_capacity(); - } - - bool at_min_capacity() const final { - return get_size() == (get_capacity() / 2); - } - - /// returns iterators containing addresses [l, r) - std::pair bound( - laddr_t l, laddr_t r) { - // TODO: inefficient - auto retl = begin(); - for (; retl != end(); ++retl) { - if (retl->get_key() >= l || (retl->get_key() + retl->get_val().len) > l) - break; - } - auto retr = retl; - for (; retr != end(); ++retr) { - if (retr->get_key() >= r) - break; - } - return std::make_pair(retl, retr); - } - - std::pair - get_leaf_entries(laddr_t addr, extent_len_t len); -}; -using LBALeafNodeRef = TCachedExtentRef; - -} diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index 3ea9abf02f032..5ecd1640e0f8f 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -187,34 +187,6 @@ struct btree_lba_manager_test : return ret; } - auto set_mapping( - test_transaction_t &t, - laddr_t addr, - size_t len, - paddr_t paddr) { - auto [b, e] = get_overlap(t, addr, len); - EXPECT_EQ(b, e); - - auto ret = with_trans_intr( - *t.t, - [=](auto &t) { - return lba_manager->set_extent(t, addr, len, paddr); - }).unsafe_get0(); - EXPECT_EQ(addr, ret->get_laddr()); - EXPECT_EQ(len, ret->get_length()); - EXPECT_EQ(paddr, ret->get_paddr()); - t.mappings.emplace( - std::make_pair( - ret->get_laddr(), - test_extent_t{ - ret->get_paddr(), - ret->get_length(), - 1 - } - )); - return ret; - } - auto decref_mapping( test_transaction_t &t, laddr_t addr) { -- 2.39.5