lba_manager.cc
segment_cleaner.cc
lba_manager/btree/btree_lba_manager.cc
- lba_manager/btree/lba_btree_node_impl.cc
lba_manager/btree/btree_range_pin.cc
+ lba_manager/btree/lba_btree.cc
+ lba_manager/btree/lba_btree_node.cc
omap_manager.cc
omap_manager/btree/btree_omap_manager.cc
omap_manager/btree/omap_btree_node_impl.cc
// included for get_extent_by_type
#include "crimson/os/seastore/collection_manager/collection_flat_node.h"
-#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
#include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h"
#include "crimson/os/seastore/object_data_handler.h"
#include "crimson/os/seastore/collection_manager/collection_flat_node.h"
extent_len_t len,
paddr_t addr) = 0;
- /**
- * Creates a new absolute mapping.
- *
- * off~len must be unreferenced
- */
- using set_extent_iertr = base_iertr::extend<
- crimson::ct_error::invarg>;
- using set_extent_ret = set_extent_iertr::future<LBAPinRef>;
- virtual set_extent_ret set_extent(
- Transaction &t,
- laddr_t off, extent_len_t len, paddr_t addr) = 0;
-
struct ref_update_result_t {
unsigned refcount = 0;
paddr_t addr;
#include <string.h>
#include "crimson/common/log.h"
+#include "crimson/os/seastore/logging.h"
#include "include/buffer.h"
#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h"
-#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree.h"
namespace {
+// mkfs: initialize the LBA manager for a fresh filesystem by delegating
+// root-leaf creation to LBABtree::mkfs and recording the resulting
+// lba_root_t in the cached root block.
BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs(
Transaction &t)
{
- logger().debug("BtreeLBAManager::mkfs");
return cache.get_root(t).si_then([this, &t](auto croot) {
- auto root_leaf = cache.alloc_new_extent<LBALeafNode>(
- t,
- LBA_BLOCK_SIZE);
- root_leaf->set_size(0);
- lba_node_meta_t meta{0, L_ADDR_MAX, 1};
- root_leaf->set_meta(meta);
- root_leaf->pin.set_range(meta);
- croot->get_root().lba_root =
- lba_root_t{root_leaf->get_paddr(), 1u};
- t.get_lba_tree_stats().depth = 1u;
+ croot->get_root().lba_root = LBABtree::mkfs(get_context(t));
return mkfs_iertr::now();
}).handle_error_interruptible(
mkfs_iertr::pass_further{},
+// NOTE(review): trailing comma with no following error handler — a
+// `crimson::ct_error::assert_all{...}` line appears to be missing from
+// this hunk; confirm against the original patch.
);
}
-BtreeLBAManager::get_root_ret
-BtreeLBAManager::get_root(Transaction &t)
-{
- return cache.get_root(t).si_then([this, &t](auto croot) {
- logger().debug(
- "BtreeLBAManager::get_root: reading root at {} depth {}",
- paddr_t{croot->get_root().lba_root.get_location()},
- croot->get_root().lba_root.get_depth());
- return get_lba_btree_extent(
- get_context(t),
- croot,
- croot->get_root().lba_root.get_depth(),
- croot->get_root().lba_root.get_location(),
- paddr_t());
- });
-}
-
+// get_mappings: collect all LBA pins overlapping [offset, offset+length)
+// by iterating the new LBABtree from upper_bound_right(offset) until the
+// key leaves the queried range.
BtreeLBAManager::get_mappings_ret
BtreeLBAManager::get_mappings(
Transaction &t,
laddr_t offset, extent_len_t length)
{
- logger().debug("BtreeLBAManager::get_mappings: {}, {}", offset, length);
- return get_root(
- t).si_then([this, &t, offset, length](auto extent) {
- return extent->lookup_range(
- get_context(t),
- offset, length
- ).si_then([](auto &&e) {
- logger().debug("BtreeLBAManager::get_mappings: got mappings {}", e);
- return std::move(e);
- });
- });
+ LOG_PREFIX(BtreeLBAManager::get_mappings);
+// NOTE(review): format string reads "length{}" — missing ": " separator;
+// harmless but should be "length: {}".
+ DEBUGT("offset: {}, length{}", t, offset, length);
+ auto c = get_context(t);
+ return with_btree_state<lba_pin_list_t>(
+ c,
+ [c, offset, length](auto &btree, auto &ret) {
+ return LBABtree::iterate_repeat(
+ c,
+ btree.upper_bound_right(c, offset),
+ [&ret, offset, length](auto &pos) {
+// Stop once past the end of the queried range (or at tree end).
+ if (pos.is_end() || pos.get_key() >= (offset + length)) {
+ return LBABtree::iterate_repeat_ret(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ }
+// Every visited mapping must actually overlap the queried range.
+ ceph_assert((pos.get_key() + pos.get_val().len) > offset);
+ ret.push_back(pos.get_pin());
+ return LBABtree::iterate_repeat_ret(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::no);
+ });
+ });
}
Transaction &t,
laddr_list_t &&list)
{
- logger().debug("BtreeLBAManager::get_mappings: {}", list);
+ LOG_PREFIX(BtreeLBAManager::get_mappings);
+ DEBUGT("{}", t, list);
auto l = std::make_unique<laddr_list_t>(std::move(list));
auto retptr = std::make_unique<lba_pin_list_t>();
auto &ret = *retptr;
Transaction &t,
laddr_t offset)
{
- logger().debug("BtreeLBAManager::get_mapping: {}", offset);
- return get_root(t
- ).si_then([this, &t, offset] (auto extent) {
- return extent->lookup_pin(get_context(t), offset);
- }).si_then([] (auto &&e) {
- logger().debug("BtreeLBAManager::get_mapping: got mapping {}", *e);
- return std::move(e);
- });
+ LOG_PREFIX(BtreeLBAManager::get_mapping);
+ DEBUGT(": {}", t, offset);
+ auto c = get_context(t);
+ return with_btree_ret<LBAPinRef>(
+ c,
+ [FNAME, c, offset](auto &btree) {
+ return btree.lower_bound(
+ c, offset
+ ).si_then([FNAME, offset, c](auto iter) -> get_mapping_ret {
+ if (iter.is_end() || iter.get_key() != offset) {
+ return crimson::ct_error::enoent::make();
+ } else {
+ auto e = iter.get_pin();
+ DEBUGT("got mapping {}", c.trans, *e);
+ return get_mapping_ret(
+ interruptible::ready_future_marker{},
+ std::move(e));
+ }
+ });
+ });
}
BtreeLBAManager::alloc_extent_ret
extent_len_t len,
paddr_t addr)
{
- // TODO: we can certainly combine the lookup and the insert.
- return get_root(
- t).si_then([this, &t, hint, len](auto extent) {
- logger().debug(
- "BtreeLBAManager::alloc_extent: beginning search at {}",
- *extent);
- return extent->find_hole(
- get_context(t),
- hint,
- L_ADDR_MAX,
- len).si_then([extent](auto ret) {
- return std::make_pair(ret, extent);
+ struct state_t {
+ laddr_t last_end;
+
+ std::optional<LBABtree::iterator> insert_iter;
+ std::optional<LBABtree::iterator> ret;
+
+ state_t(laddr_t hint) : last_end(hint) {}
+ };
+
+ LOG_PREFIX(BtreeLBAManager::alloc_extent);
+ DEBUGT("hint: {}, length: {}", t, hint, len);
+ auto c = get_context(t);
+ return with_btree_state<state_t>(
+ c,
+ hint,
+ [FNAME, c, hint, len, addr](auto &btree, auto &state) {
+ return LBABtree::iterate_repeat(
+ c,
+ btree.upper_bound_right(c, hint),
+ [&state, len](auto &pos) {
+ if (pos.is_end() || pos.get_key() >= (state.last_end + len)) {
+ state.insert_iter = pos;
+ return LBABtree::iterate_repeat_ret(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ } else {
+ state.last_end = pos.get_key() + pos.get_val().len;
+ return LBABtree::iterate_repeat_ret(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::no);
+ }
+ }).si_then([FNAME, c, addr, len, &btree, &state] {
+ DEBUGT("about to insert at addr {}~{}", c.trans, state.last_end, len);
+ return btree.insert(
+ c,
+ *state.insert_iter,
+ state.last_end,
+ lba_map_val_t{len, addr, 1, 0}
+ ).si_then([&state](auto &&p) {
+ auto [iter, inserted] = std::move(p);
+ ceph_assert(inserted);
+ state.ret = iter;
+ });
});
- }).si_then([this, &t, len, addr](auto allocation_pair) {
- auto &[laddr, extent] = allocation_pair;
- ceph_assert(laddr != L_ADDR_MAX);
- return insert_mapping(
- t,
- extent,
- laddr,
- { len, addr, 1, 0 }
- ).si_then([laddr=laddr, addr, len](auto pin) {
- logger().debug(
- "BtreeLBAManager::alloc_extent: alloc {}~{} for {}",
- laddr,
- len,
- addr);
- return LBAPinRef(pin.release());
- });
- });
-}
-
-BtreeLBAManager::set_extent_ret
-BtreeLBAManager::set_extent(
- Transaction &t,
- laddr_t off, extent_len_t len, paddr_t addr)
-{
- return get_root(
- t).si_then([this, &t, off, len, addr](auto root) {
- return insert_mapping(
- t,
- root,
- off,
- { len, addr, 1, 0 });
- }).si_then([](auto ret) {
- return set_extent_ret(
- interruptible::ready_future_marker{},
- LBAPinRef(ret.release()));
+ }).si_then([](auto &&state) {
+ return state.ret->get_pin();
});
}
Transaction &t,
CachedExtentRef e)
{
- logger().debug("{}: {}", __func__, *e);
- return get_root(t).si_then(
- [this, &t, e=std::move(e)](LBANodeRef root) mutable {
- if (is_lba_node(*e)) {
- auto lban = e->cast<LBANode>();
- logger().debug("init_cached_extent: lba node, getting root");
- return root->lookup(
- op_context_t{cache, pin_set, t},
- lban->get_node_meta().begin,
- lban->get_node_meta().depth
- ).si_then([this, e=std::move(e)](LBANodeRef c) {
- if (c->get_paddr() == e->get_paddr()) {
- assert(&*c == &*e);
- logger().debug("init_cached_extent: {} initialized", *e);
- } else {
- // e is obsolete
- logger().debug("init_cached_extent: {} obsolete", *e);
- cache.drop_from_cache(e);
- }
- return init_cached_extent_iertr::now();
- });
- } else if (e->is_logical()) {
- auto logn = e->cast<LogicalCachedExtent>();
- return root->lookup_range(
- op_context_t{cache, pin_set, t},
- logn->get_laddr(),
- logn->get_length()).si_then(
- [this, logn=std::move(logn)](auto pins) {
- if (pins.size() == 1) {
- auto pin = std::move(pins.front());
- pins.pop_front();
- if (pin->get_paddr() == logn->get_paddr()) {
- logn->set_pin(std::move(pin));
- pin_set.add_pin(
- static_cast<BtreeLBAPin&>(logn->get_pin()).pin);
- logger().debug("init_cached_extent: {} initialized", *logn);
- } else {
- // paddr doesn't match, remapped, obsolete
- logger().debug("init_cached_extent: {} obsolete", *logn);
- cache.drop_from_cache(logn);
- }
- } else {
- // set of extents changed, obsolete
- logger().debug("init_cached_extent: {} obsolete", *logn);
- cache.drop_from_cache(logn);
- }
- return init_cached_extent_iertr::now();
- });
- } else {
- logger().debug("init_cached_extent: {} skipped", *e);
- return init_cached_extent_iertr::now();
- }
+ LOG_PREFIX(BtreeLBAManager::init_cached_extent);
+ DEBUGT(": extent {}", t, *e);
+ auto c = get_context(t);
+ return with_btree(
+ c,
+ [c, e](auto &btree) {
+ return btree.init_cached_extent(
+ c, e
+ ).si_then([](auto) {});
});
}
laddr_t end,
scan_mappings_func_t &&f)
{
- return seastar::do_with(
- std::move(f),
- LBANodeRef(),
- [=, &t](auto &f, auto &lbarootref) {
- return get_root(t).si_then(
- [=, &t, &f](LBANodeRef lbaroot) mutable {
- lbarootref = lbaroot;
- return lbaroot->scan_mappings(
- get_context(t),
- begin,
- end,
- f);
+ LOG_PREFIX(BtreeLBAManager::scan_mappings);
+ DEBUGT("begin: {}, end: {}", t, begin, end);
+
+ auto c = get_context(t);
+ return with_btree(
+ c,
+ [c, f=std::move(f), begin, end](auto &btree) mutable {
+ return LBABtree::iterate_repeat(
+ c,
+ btree.upper_bound_right(c, begin),
+ [f=std::move(f), begin, end](auto &pos) {
+ if (pos.is_end() || pos.get_key() >= end) {
+ return LBABtree::iterate_repeat_ret(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ }
+ ceph_assert((pos.get_key() + pos.get_val().len) > begin);
+ f(pos.get_key(), pos.get_val().paddr, pos.get_val().len);
+ return LBABtree::iterate_repeat_ret(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::no);
});
});
}
Transaction &t,
scan_mapped_space_func_t &&f)
{
+ LOG_PREFIX(BtreeLBAManager::scan_mapped_space);
+ DEBUGT("", t);
+ auto c = get_context(t);
return seastar::do_with(
std::move(f),
- LBANodeRef(),
- [=, &t](auto &f, auto &lbarootref) {
- return get_root(t).si_then(
- [=, &t, &f](LBANodeRef lbaroot) mutable {
- lbarootref = lbaroot;
- return lbaroot->scan_mapped_space(
- get_context(t),
- f);
+ [this, c](auto &visitor) {
+ return with_btree(
+ c,
+ [c, &visitor](auto &btree) {
+ return LBABtree::iterate_repeat(
+ c,
+ btree.lower_bound(c, 0, &visitor),
+ [&visitor](auto &pos) {
+ if (pos.is_end()) {
+ return LBABtree::iterate_repeat_ret(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ }
+ visitor(pos.get_val().paddr, pos.get_val().len);
+ return LBABtree::iterate_repeat_ret(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::no);
+ },
+ &visitor);
});
});
}
Transaction &t,
CachedExtentRef extent)
{
+ LOG_PREFIX(BtreeLBAManager::rewrite_extent);
+ if (extent->has_been_invalidated()) {
+ ERRORT("{} has been invalidated", t, *extent);
+ }
assert(!extent->has_been_invalidated());
logger().debug(
}
);
} else if (is_lba_node(*extent)) {
- auto lba_extent = extent->cast<LBANode>();
- cache.retire_extent(t, extent);
- auto nlba_extent = cache.alloc_new_extent_by_type(
- t,
- lba_extent->get_type(),
- lba_extent->get_length())->cast<LBANode>();
- lba_extent->get_bptr().copy_out(
- 0,
- lba_extent->get_length(),
- nlba_extent->get_bptr().c_str());
- nlba_extent->pin.set_range(nlba_extent->get_node_meta());
-
- /* This is a bit underhanded. Any relative addrs here must necessarily
- * be record relative as we are rewriting a dirty extent. Thus, we
- * are using resolve_relative_addrs with a (likely negative) block
- * relative offset to correct them to block-relative offsets adjusted
- * for our new transaction location.
- *
- * Upon commit, these now block relative addresses will be interpretted
- * against the real final address.
- */
- nlba_extent->resolve_relative_addrs(
- make_record_relative_paddr(0) - nlba_extent->get_paddr());
-
- logger().debug(
- "{}: rewriting {} into {}",
- __func__,
- *lba_extent,
- *nlba_extent);
-
- return update_internal_mapping(
- t,
- nlba_extent->get_node_meta().depth,
- nlba_extent->get_node_meta().begin,
- nlba_extent->get_paddr()).si_then(
- [](auto) {},
- rewrite_extent_iertr::pass_further{},
- crimson::ct_error::assert_all{
- "Invalid error in BtreeLBAManager::rewrite_extent update_internal_mapping"
- });
+ auto c = get_context(t);
+ return with_btree(
+ c,
+ [c, extent](auto &btree) mutable {
+ return btree.rewrite_lba_extent(c, extent);
+ });
} else {
return rewrite_extent_iertr::now();
}
laddr,
len
).si_then([=, &t](CachedExtentRef extent) {
- return get_root(t).si_then([=, &t](LBANodeRef root) {
- auto lba_node = extent->cast<LBANode>();
- return root->lookup(
- op_context_t{cache, pin_set, t},
- lba_node->get_node_meta().begin,
- lba_node->get_node_meta().depth).si_then([=](LBANodeRef c) {
- if (c->get_paddr() == lba_node->get_paddr()) {
- return get_physical_extent_if_live_ret(
- interruptible::ready_future_marker{},
- lba_node);
- } else {
- cache.drop_from_cache(lba_node);
- return get_physical_extent_if_live_ret(
- interruptible::ready_future_marker{},
- CachedExtentRef());
- }
- });
- });
+ auto c = get_context(t);
+ return with_btree_ret<CachedExtentRef>(
+ c,
+ [c, extent](auto &btree) {
+ return btree.init_cached_extent(
+ c,
+ extent);
+ });
});
}
: segment_manager(segment_manager),
cache(cache) {}
-BtreeLBAManager::insert_mapping_ret BtreeLBAManager::insert_mapping(
- Transaction &t,
- LBANodeRef root,
- laddr_t laddr,
- lba_map_val_t val)
-{
- ++(t.get_lba_tree_stats().num_inserts);
- auto split = insert_mapping_iertr::future<LBANodeRef>(
- interruptible::ready_future_marker{},
- root);
- if (root->at_max_capacity()) {
- split = cache.get_root(t).si_then(
- [this, root, laddr, &t](RootBlockRef croot) {
- logger().debug(
- "BtreeLBAManager::insert_mapping: splitting root {}",
- *croot);
- {
- auto mut_croot = cache.duplicate_for_write(t, croot);
- croot = mut_croot->cast<RootBlock>();
- }
- auto nroot = cache.alloc_new_extent<LBAInternalNode>(t, LBA_BLOCK_SIZE);
- lba_node_meta_t meta{0, L_ADDR_MAX, root->get_node_meta().depth + 1};
- nroot->set_meta(meta);
- nroot->pin.set_range(meta);
- nroot->journal_insert(
- nroot->begin(),
- L_ADDR_MIN,
- root->get_paddr(),
- nullptr);
- auto new_depth = root->get_node_meta().depth + 1;
- croot->get_root().lba_root = lba_root_t{
- nroot->get_paddr(),
- new_depth
- };
- t.get_lba_tree_stats().depth = new_depth;
- return nroot->split_entry(
- get_context(t),
- laddr, nroot->begin(), root);
- });
- }
- return split.si_then([this, &t, laddr, val](LBANodeRef node) {
- return node->insert(
- get_context(t),
- laddr, val);
- });
-}
-
BtreeLBAManager::update_refcount_ret BtreeLBAManager::update_refcount(
Transaction &t,
laddr_t addr,
int delta)
{
+ LOG_PREFIX(BtreeLBAManager::update_refcount);
+ DEBUGT("addr {}, delta {}", t, addr, delta);
return update_mapping(
t,
addr,
laddr_t addr,
update_func_t &&f)
{
- return get_root(t
- ).si_then([this, f=std::move(f), &t, addr](LBANodeRef root) mutable {
- return root->mutate_mapping(
- get_context(t),
- addr,
- std::move(f));
- });
-}
+ LOG_PREFIX(BtreeLBAManager::update_mapping);
+ DEBUGT("addr {}", t, addr);
+ auto c = get_context(t);
+ return with_btree_ret<lba_map_val_t>(
+ c,
+ [f=std::move(f), c, addr](auto &btree) mutable {
+ return btree.lower_bound(
+ c, addr
+ ).si_then([&btree, f=std::move(f), c, addr](auto iter)
+ -> update_mapping_ret {
+ if (iter.is_end() || iter.get_key() != addr) {
+ return crimson::ct_error::enoent::make();
+ }
-BtreeLBAManager::update_internal_mapping_ret
-BtreeLBAManager::update_internal_mapping(
- Transaction &t,
- depth_t depth,
- laddr_t laddr,
- paddr_t paddr)
-{
- return cache.get_root(t).si_then([=, &t](RootBlockRef croot) {
- if (depth == croot->get_root().lba_root.get_depth()) {
- logger().debug(
- "update_internal_mapping: updating lba root to: {}->{}",
- laddr,
- paddr);
- {
- auto mut_croot = cache.duplicate_for_write(t, croot);
- croot = mut_croot->cast<RootBlock>();
- }
- ceph_assert(laddr == 0);
- auto old_paddr = croot->get_root().lba_root.get_location();
- croot->get_root().lba_root.set_location(paddr);
- return update_internal_mapping_ret(
- interruptible::ready_future_marker{},
- old_paddr);
- } else {
- logger().debug(
- "update_internal_mapping: updating lba node at depth {} to: {}->{}",
- depth,
- laddr,
- paddr);
- return get_lba_btree_extent(
- get_context(t),
- croot,
- croot->get_root().lba_root.get_depth(),
- croot->get_root().lba_root.get_location(),
- paddr_t()).si_then([=, &t](LBANodeRef broot) {
- return broot->mutate_internal_address(
- get_context(t),
- depth,
- laddr,
- paddr);
- });
- }
- });
+ auto ret = f(iter.get_val());
+ if (ret.refcount == 0) {
+ return btree.remove(
+ c,
+ iter
+ ).si_then([ret] {
+ return ret;
+ });
+ } else {
+ return btree.update(
+ c,
+ iter,
+ ret
+ ).si_then([ret](auto) {
+ return ret;
+ });
+ }
+ });
+ });
}
BtreeLBAManager::~BtreeLBAManager()
#include "common/interval_map.h"
#include "crimson/osd/exceptions.h"
+#include "crimson/os/seastore/logging.h"
#include "crimson/os/seastore/seastore_types.h"
#include "crimson/os/seastore/lba_manager.h"
#include "crimson/os/seastore/cache.h"
#include "crimson/os/seastore/segment_manager.h"
#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree.h"
namespace crimson::os::seastore::lba_manager::btree {
extent_len_t len,
paddr_t addr) final;
- set_extent_ret set_extent(
- Transaction &t,
- laddr_t off, extent_len_t len, paddr_t addr) final;
-
ref_ret decref_extent(
Transaction &t,
laddr_t addr) final {
static btree_range_pin_t &get_pin(CachedExtent &e);
+ // with_btree: materialize an LBABtree from the cached root block, run
+ // the caller-provided continuation f(btree), and — if f dirtied the
+ // btree root — write the updated lba_root_t back into a mutable copy
+ // of the root block before completing.
+ // NOTE(review): the `Args...` template parameter pack is unused here;
+ // likely a leftover — confirm against later revisions.
+ template <typename F, typename... Args>
+ auto with_btree(
+ op_context_t c,
+ F &&f) {
+ return cache.get_root(
+ c.trans
+ ).si_then([this, c, f=std::forward<F>(f)](RootBlockRef croot) mutable {
+ return seastar::do_with(
+ LBABtree(croot->get_root().lba_root),
+ [this, c, croot, f=std::move(f)](auto &btree) mutable {
+ return f(
+ btree
+ ).si_then([this, c, croot, &btree] {
+ if (btree.is_root_dirty()) {
+ auto mut_croot = cache.duplicate_for_write(
+ c.trans, croot
+ )->cast<RootBlock>();
+ mut_croot->get_root().lba_root = btree.get_root_undirty();
+ }
+ return base_iertr::now();
+ });
+ });
+ });
+ }
- /**
- * get_root
- *
- * Get a reference to the root LBANode.
- */
- using get_root_iertr = base_iertr;
- using get_root_ret = get_root_iertr::future<LBANodeRef>;
- get_root_ret get_root(Transaction &);
+ template <typename State, typename F>
+ auto with_btree_state(
+ op_context_t c,
+ State &&init,
+ F &&f) {
+ return seastar::do_with(
+ std::forward<State>(init),
+ [this, c, f=std::forward<F>(f)](auto &state) mutable {
+ (void)this; // silence incorrect clang warning about capture
+ return with_btree(c, [&state, f=std::move(f)](auto &btree) mutable {
+ return f(btree, state);
+ }).si_then([&state] {
+ return seastar::make_ready_future<State>(std::move(state));
+ });
+ });
+ }
- /**
- * insert_mapping
- *
- * Insert a lba mapping into the tree
- */
- using insert_mapping_iertr = base_iertr;
- using insert_mapping_ret = insert_mapping_iertr::future<LBAPinRef>;
- insert_mapping_ret insert_mapping(
- Transaction &t, ///< [in,out] transaction
- LBANodeRef root, ///< [in] root node
- laddr_t laddr, ///< [in] logical addr to insert
- lba_map_val_t val ///< [in] mapping to insert
- );
+ template <typename State, typename F>
+ auto with_btree_state(
+ op_context_t c,
+ F &&f) {
+ return with_btree_state<State, F>(c, State{}, std::forward<F>(f));
+ }
+
+ template <typename Ret, typename F>
+ auto with_btree_ret(
+ op_context_t c,
+ F &&f) {
+ return with_btree_state<Ret>(
+ c,
+ [f=std::forward<F>(f)](auto &btree, auto &ret) mutable {
+ return f(
+ btree
+ ).si_then([&ret](auto &&_ret) {
+ ret = std::move(_ret);
+ });
+ });
+ }
/**
* update_refcount
*/
using update_mapping_iertr = ref_iertr;
using update_mapping_ret = ref_iertr::future<lba_map_val_t>;
- using update_func_t = LBANode::mutate_func_t;
+ using update_func_t = std::function<
+ lba_map_val_t(const lba_map_val_t &v)
+ >;
update_mapping_ret update_mapping(
Transaction &t,
laddr_t addr,
update_func_t &&f);
-
- using update_internal_mapping_iertr = LBANode::mutate_internal_address_iertr;
- using update_internal_mapping_ret = LBANode::mutate_internal_address_ret;
- update_internal_mapping_ret update_internal_mapping(
- Transaction &t,
- depth_t depth,
- laddr_t laddr,
- paddr_t paddr);
};
using BtreeLBAManagerRef = std::unique_ptr<BtreeLBAManager>;
class BtreeLBAPin : public LBAPin {
friend class BtreeLBAManager;
+ friend class LBABtree;
/**
* parent
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/logging.h"
+
+#include "crimson/os/seastore/lba_manager/btree/lba_btree.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+// mkfs: create the initial (empty) LBA btree: a single leaf node of
+// LBA_BLOCK_SIZE covering laddr range [0, L_ADDR_MAX) at depth 1.
+// Records depth=1 in the transaction's tree stats and returns the
+// lba_root_t pointing at the freshly allocated leaf.
+LBABtree::mkfs_ret LBABtree::mkfs(op_context_t c)
+{
+ auto root_leaf = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans,
+ LBA_BLOCK_SIZE);
+ root_leaf->set_size(0);
+ lba_node_meta_t meta{0, L_ADDR_MAX, 1};
+ root_leaf->set_meta(meta);
+ root_leaf->pin.set_range(meta);
+ c.trans.get_lba_tree_stats().depth = 1u;
+ return lba_root_t{root_leaf->get_paddr(), 1u};
+}
+
+// iterator::next: advance to the next leaf entry, or to end().
+// Precondition: !is_end().  Fast path just bumps leaf.pos; otherwise
+// find the shallowest internal level with a right sibling, step into
+// it, and re-descend to the leftmost entry of that subtree.
+LBABtree::iterator_fut LBABtree::iterator::next(
+ op_context_t c,
+ mapped_space_visitor_t *visitor) const
+{
+ assert_valid();
+ assert(!is_end());
+
+// Fast path: another entry remains in the current leaf.
+ if ((leaf.pos + 1) < leaf.node->get_size()) {
+ auto ret = *this;
+ ret.leaf.pos++;
+ return iterator_fut(
+ interruptible::ready_future_marker{},
+ ret);
+ }
+
+// Walk upward (depth 2 == lowest internal level) looking for the first
+// level whose position can still be advanced.
+ depth_t depth_with_space = 2;
+ for (; depth_with_space <= get_depth(); ++depth_with_space) {
+ if ((get_internal(depth_with_space).pos + 1) <
+ get_internal(depth_with_space).node->get_size()) {
+ break;
+ }
+ }
+
+ if (depth_with_space <= get_depth()) {
+// Copy the iterator, invalidate all levels below depth_with_space,
+// advance that level, and re-descend taking the leftmost child/entry
+// at each level (begin() mappers).  visitor, if any, observes the
+// extents touched during the re-descent.
+ return seastar::do_with(
+ *this,
+ [](const LBAInternalNode &internal) { return internal.begin(); },
+ [](const LBALeafNode &leaf) { return leaf.begin(); },
+ [c, depth_with_space, visitor](auto &ret, auto &li, auto &ll) {
+ for (depth_t depth = 2; depth < depth_with_space; ++depth) {
+ ret.get_internal(depth).reset();
+ }
+ ret.leaf.reset();
+ ret.get_internal(depth_with_space).pos++;
+ return lookup_depth_range(
+ c, ret, depth_with_space - 1, 0, li, ll, visitor
+ ).si_then([&ret] {
+ return std::move(ret);
+ });
+ });
+ } else {
+ // end
+ auto ret = *this;
+ ret.leaf.pos = ret.leaf.node->get_size();
+ return iterator_fut(
+ interruptible::ready_future_marker{},
+ ret);
+ }
+}
+
+// iterator::prev: step back to the previous leaf entry.
+// Precondition: !is_begin().  Mirror image of next(): decrement leaf.pos
+// if possible, otherwise find the shallowest internal level that can move
+// left and re-descend taking the rightmost entry at each level.
+LBABtree::iterator_fut LBABtree::iterator::prev(op_context_t c) const
+{
+ assert_valid();
+ assert(!is_begin());
+
+ auto ret = *this;
+
+// Normalize the end() representation so the decrement below lands on the
+// last leaf entry (NOTE(review): leaf.pos should already equal size at
+// end() — looks redundant; confirm).
+ if (is_end()) {
+ ret.leaf.pos = ret.leaf.node->get_size();
+ }
+
+ if (ret.leaf.pos > 0) {
+ ret.leaf.pos--;
+ return iterator_fut(
+ interruptible::ready_future_marker{},
+ ret);
+ }
+
+// Find the shallowest internal level with room to move left.
+ depth_t depth_with_space = 2;
+ for (; depth_with_space <= get_depth(); ++depth_with_space) {
+ if (ret.get_internal(depth_with_space).pos > 0) {
+ break;
+ }
+ }
+
+ assert(depth_with_space <= ret.get_depth()); // must not be begin()
+// Re-descend choosing the rightmost entry at every level (--end()).
+ return seastar::do_with(
+ std::move(ret),
+ [](const LBAInternalNode &internal) { return --internal.end(); },
+ [](const LBALeafNode &leaf) { return --leaf.end(); },
+ [c, depth_with_space](auto &ret, auto &li, auto &ll) {
+ for (depth_t depth = 2; depth < depth_with_space; ++depth) {
+ ret.get_internal(depth).reset();
+ }
+ ret.leaf.reset();
+ ret.get_internal(depth_with_space).pos--;
+ return lookup_depth_range(
+ c, ret, depth_with_space - 1, 0, li, ll, nullptr
+ ).si_then([&ret] {
+ return std::move(ret);
+ });
+ });
+}
+
+// lower_bound: return an iterator to the first entry with key >= addr
+// (or end()).  Internal levels pick the child whose range contains addr
+// (upper_bound then step back one); the leaf level uses leaf.lower_bound.
+// NOTE(review): LOG_PREFIX says "LBATree" while the class is "LBABtree" —
+// cosmetic inconsistency repeated in several functions below.
+LBABtree::iterator_fut LBABtree::lower_bound(
+ op_context_t c,
+ laddr_t addr,
+ mapped_space_visitor_t *visitor) const
+{
+ LOG_PREFIX(LBATree::lower_bound);
+ return lookup(
+ c,
+ [addr](const LBAInternalNode &internal) {
+ assert(internal.get_size() > 0);
+ auto iter = internal.upper_bound(addr);
+ assert(iter != internal.begin());
+ --iter;
+ return iter;
+ },
+ [FNAME, c, addr](const LBALeafNode &leaf) {
+ auto ret = leaf.lower_bound(addr);
+ DEBUGT(
+ "leaf addr {}, got ret offset {}, size {}, end {}",
+ c.trans,
+ addr,
+ ret.get_offset(),
+ leaf.get_size(),
+ ret == leaf.end());
+ return ret;
+ },
+ visitor
+ ).si_then([FNAME, c](auto &&ret) {
+ DEBUGT(
+ "ret.leaf.pos {}",
+ c.trans,
+ ret.leaf.pos);
+ ret.assert_valid();
+ return std::move(ret);
+ });
+}
+
+// insert: insert (laddr -> val), using iter as a position hint.
+// Returns (iterator, inserted): inserted == false when laddr is already
+// mapped.  find_insertion re-seats the hint; handle_split guarantees the
+// target leaf has room; the leaf is duplicated for write if still clean.
+LBABtree::insert_ret LBABtree::insert(
+ op_context_t c,
+ iterator iter,
+ laddr_t laddr,
+ lba_map_val_t val)
+{
+ LOG_PREFIX(LBATree::insert);
+ DEBUGT(
+ "inserting laddr {} at iter {}",
+ c.trans,
+ laddr,
+ iter.is_end() ? L_ADDR_MAX : iter.get_key());
+ return seastar::do_with(
+ iter,
+ [this, c, laddr, val](auto &ret) {
+ return find_insertion(
+ c, laddr, ret
+ ).si_then([this, c, laddr, val, &ret] {
+// Key already present — report no insertion.
+ if (!ret.is_end() && ret.get_key() == laddr) {
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ std::make_pair(ret, false));
+ } else {
+ return handle_split(
+ c, ret
+ ).si_then([c, laddr, val, &ret] {
+// Clean extents must be duplicated before mutation.
+ if (!ret.leaf.node->is_pending()) {
+ CachedExtentRef mut = c.cache.duplicate_for_write(
+ c.trans, ret.leaf.node
+ );
+ ret.leaf.node = mut->cast<LBALeafNode>();
+ }
+// Re-check after the (possibly node-replacing) split.
+ auto iter = ret.leaf.node->lower_bound(laddr);
+ if (iter != ret.leaf.node->end() && iter->get_key() == laddr) {
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ std::make_pair(ret, false));
+ } else {
+ ret.leaf.pos = iter->get_offset();
+ assert(laddr >= ret.leaf.node->get_meta().begin &&
+ laddr < ret.leaf.node->get_meta().end);
+ ret.leaf.node->insert(iter, laddr, val);
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ std::make_pair(ret, true));
+ }
+ });
+ }
+ });
+ });
+}
+
+// update: overwrite the mapping value at iter's position in place,
+// duplicating the leaf for write first if it is not already pending.
+// Returns the (possibly node-updated) iterator.
+LBABtree::update_ret LBABtree::update(
+ op_context_t c,
+ iterator iter,
+ lba_map_val_t val)
+{
+ LOG_PREFIX(LBATree::update);
+ DEBUGT(
+ "update element at {}",
+ c.trans,
+ iter.is_end() ? L_ADDR_MAX : iter.get_key());
+ if (!iter.leaf.node->is_pending()) {
+ CachedExtentRef mut = c.cache.duplicate_for_write(
+ c.trans, iter.leaf.node
+ );
+ iter.leaf.node = mut->cast<LBALeafNode>();
+ }
+ iter.leaf.node->update(
+ iter.leaf.node->iter_idx(iter.leaf.pos),
+ val);
+ return update_ret(
+ interruptible::ready_future_marker{},
+ iter);
+}
+
+// remove: erase the entry at iter's position (precondition: !is_end()),
+// duplicating the leaf for write if clean, then let handle_merge rebalance
+// or merge underfull nodes.
+LBABtree::remove_ret LBABtree::remove(
+ op_context_t c,
+ iterator iter)
+{
+ LOG_PREFIX(LBATree::remove);
+ DEBUGT(
+ "remove element at {}",
+ c.trans,
+ iter.is_end() ? L_ADDR_MAX : iter.get_key());
+ assert(!iter.is_end());
+ return seastar::do_with(
+ iter,
+ [this, c](auto &ret) {
+ if (!ret.leaf.node->is_pending()) {
+ CachedExtentRef mut = c.cache.duplicate_for_write(
+ c.trans, ret.leaf.node
+ );
+ ret.leaf.node = mut->cast<LBALeafNode>();
+ }
+ ret.leaf.node->remove(
+ ret.leaf.node->iter_idx(ret.leaf.pos));
+
+ return handle_merge(
+ c, ret
+ );
+ });
+}
+
+// init_cached_extent: decide whether a cached extent is still live in the
+// tree.  Live logical extents get their pin initialized and registered;
+// live LBA nodes are confirmed against the lookup path.  Dead extents are
+// dropped from the cache and a null ref is returned; unrelated extent
+// types are passed through unchanged.
+LBABtree::init_cached_extent_ret LBABtree::init_cached_extent(
+ op_context_t c,
+ CachedExtentRef e)
+{
+ LOG_PREFIX(LBATree::init_cached_extent);
+ DEBUGT(": extent {}", c.trans, *e);
+ if (e->is_logical()) {
+ auto logn = e->cast<LogicalCachedExtent>();
+ return lower_bound(
+ c,
+ logn->get_laddr()
+ ).si_then([FNAME, e, c, logn](auto iter) {
+// Live iff the tree maps this laddr to this exact paddr.
+ if (!iter.is_end() &&
+ iter.get_key() == logn->get_laddr() &&
+ iter.get_val().paddr == logn->get_paddr()) {
+ logn->set_pin(iter.get_pin());
+ ceph_assert(iter.get_val().len == e->get_length());
+ c.pins.add_pin(
+ static_cast<BtreeLBAPin&>(logn->get_pin()).pin);
+ DEBUGT(": logical extent {} live, initialized", c.trans, *logn);
+ return e;
+ } else {
+ DEBUGT(": logical extent {} not live, dropping", c.trans, *logn);
+ c.cache.drop_from_cache(logn);
+ return CachedExtentRef();
+ }
+ });
+ } else if (e->get_type() == extent_types_t::LADDR_INTERNAL) {
+ auto eint = e->cast<LBAInternalNode>();
+ return lower_bound(
+ c, eint->get_node_meta().begin
+ ).si_then([FNAME, e, c, eint](auto iter) {
+ // Note, this check is valid even if iter.is_end()
+// Live iff the node at its depth on the lookup path is this object.
+ depth_t cand_depth = eint->get_node_meta().depth;
+ if (cand_depth <= iter.get_depth() &&
+ &*iter.get_internal(cand_depth).node == &*eint) {
+ DEBUGT(": extent {} is live", c.trans, *eint);
+ return e;
+ } else {
+ DEBUGT(": extent {} is not live", c.trans, *eint);
+ c.cache.drop_from_cache(eint);
+ return CachedExtentRef();
+ }
+ });
+ } else if (e->get_type() == extent_types_t::LADDR_LEAF) {
+ auto eleaf = e->cast<LBALeafNode>();
+ return lower_bound(
+ c, eleaf->get_node_meta().begin
+ ).si_then([FNAME, c, e, eleaf](auto iter) {
+ // Note, this check is valid even if iter.is_end()
+ if (iter.leaf.node == &*eleaf) {
+ DEBUGT(": extent {} is live", c.trans, *eleaf);
+ return e;
+ } else {
+ DEBUGT(": extent {} is not live", c.trans, *eleaf);
+ c.cache.drop_from_cache(eleaf);
+ return CachedExtentRef();
+ }
+ });
+ } else {
+// Not an LBA-managed extent type; nothing to validate here.
+ DEBUGT(
+ ": found other extent {} type {}",
+ c.trans,
+ *e,
+ e->get_type());
+ return init_cached_extent_ret(
+ interruptible::ready_future_marker{},
+ e);
+ }
+}
+
+// rewrite_lba_extent: relocate a dirty LBA node (internal or leaf) by
+// allocating a new extent of the same concrete type, copying the raw
+// buffer, fixing up relative addrs, updating the parent's mapping from
+// the old paddr to the new one, and finally retiring the old extent.
+LBABtree::rewrite_lba_extent_ret LBABtree::rewrite_lba_extent(
+ op_context_t c,
+ CachedExtentRef e)
+{
+ LOG_PREFIX(LBABtree::rewrite_lba_extent);
+ assert(e->get_type() == extent_types_t::LADDR_INTERNAL ||
+ e->get_type() == extent_types_t::LADDR_LEAF);
+
+// Generic over LBAInternalNode/LBALeafNode via decltype(lba_extent).
+ auto do_rewrite = [&](auto &lba_extent) {
+ auto nlba_extent = c.cache.alloc_new_extent<
+ std::remove_reference_t<decltype(lba_extent)>
+ >(
+ c.trans,
+ lba_extent.get_length());
+ lba_extent.get_bptr().copy_out(
+ 0,
+ lba_extent.get_length(),
+ nlba_extent->get_bptr().c_str());
+ nlba_extent->pin.set_range(nlba_extent->get_node_meta());
+
+ /* This is a bit underhanded. Any relative addrs here must necessarily
+ * be record relative as we are rewriting a dirty extent. Thus, we
+ * are using resolve_relative_addrs with a (likely negative) block
+ * relative offset to correct them to block-relative offsets adjusted
+ * for our new transaction location.
+ *
+ * Upon commit, these now block relative addresses will be interpretted
+ * against the real final address.
+ */
+ nlba_extent->resolve_relative_addrs(
+ make_record_relative_paddr(0) - nlba_extent->get_paddr());
+
+ DEBUGT(
+ "rewriting {} into {}",
+ c.trans,
+ lba_extent,
+ *nlba_extent);
+
+ return update_internal_mapping(
+ c,
+ nlba_extent->get_node_meta().depth,
+ nlba_extent->get_node_meta().begin,
+ e->get_paddr(),
+ nlba_extent->get_paddr()
+ ).si_then([c, e] {
+ c.cache.retire_extent(c.trans, e);
+ });
+ };
+
+// NOTE(review): this local is never used (do_rewrite declares its own
+// nlba_extent); dead leftover that could be removed.
+ CachedExtentRef nlba_extent;
+ if (e->get_type() == extent_types_t::LADDR_INTERNAL) {
+ auto lint = e->cast<LBAInternalNode>();
+ return do_rewrite(*lint);
+ } else {
+ assert(e->get_type() == extent_types_t::LADDR_LEAF);
+ auto lleaf = e->cast<LBALeafNode>();
+ return do_rewrite(*lleaf);
+ }
+}
+
+// get_internal_node: read an internal node at (depth, offset) through the
+// cache, sanity-check its key range against its metadata, and register a
+// range pin for clean, not-yet-linked extents.
+LBABtree::get_internal_node_ret LBABtree::get_internal_node(
+ op_context_t c,
+ depth_t depth,
+ paddr_t offset)
+{
+ LOG_PREFIX(LBATree::get_internal_node);
+ DEBUGT(
+ "reading internal at offset {}, depth {}",
+ c.trans,
+ offset,
+ depth);
+ return c.cache.get_extent<LBAInternalNode>(
+ c.trans,
+ offset,
+ LBA_BLOCK_SIZE
+ ).si_then([FNAME, c, offset](LBAInternalNodeRef ret) {
+ DEBUGT(
+ "read internal at offset {} {}",
+ c.trans,
+ offset,
+ *ret);
+// All keys must fall within the node's advertised [begin, end) range.
+ auto meta = ret->get_meta();
+ if (ret->get_size()) {
+ ceph_assert(meta.begin <= ret->begin()->get_key());
+ ceph_assert(meta.end > (ret->end() - 1)->get_key());
+ }
+// Clean extents read for the first time need their pin registered.
+ if (!ret->is_pending() && !ret->pin.is_linked()) {
+ ret->pin.set_range(meta);
+ c.pins.add_pin(ret->pin);
+ }
+ return get_internal_node_ret(
+ interruptible::ready_future_marker{},
+ ret);
+ });
+}
+
+/* get_leaf_node
+ *
+ * Reads the leaf node at paddr `offset` through the cache.  After the
+ * read, asserts that the node's metadata brackets its key range and, if
+ * the extent is clean and not yet pinned, installs its range pin into
+ * c.pins.
+ */
+LBABtree::get_leaf_node_ret LBABtree::get_leaf_node(
+  op_context_t c,
+  paddr_t offset)
+{
+  LOG_PREFIX(LBABtree::get_leaf_node);
+  DEBUGT(
+    "reading leaf at offset {}",
+    c.trans,
+    offset);
+  return c.cache.get_extent<LBALeafNode>(
+    c.trans,
+    offset,
+    LBA_BLOCK_SIZE
+  ).si_then([FNAME, c, offset](LBALeafNodeRef ret) {
+    DEBUGT(
+      "read leaf at offset {} {}",
+      c.trans,
+      offset,
+      *ret);
+    auto meta = ret->get_meta();
+    if (ret->get_size()) {
+      // meta must cover [first key, last key]
+      ceph_assert(meta.begin <= ret->begin()->get_key());
+      ceph_assert(meta.end > (ret->end() - 1)->get_key());
+    }
+    // Only clean, not-yet-linked extents need their range pin established.
+    if (!ret->is_pending() && !ret->pin.is_linked()) {
+      ret->pin.set_range(meta);
+      c.pins.add_pin(ret->pin);
+    }
+    return get_leaf_node_ret(
+      interruptible::ready_future_marker{},
+      ret);
+  });
+}
+
+/* find_insertion
+ *
+ * Adjusts iter (a lower_bound result for laddr) to the position at which
+ * laddr should be inserted.  May deliberately leave iter.leaf.pos equal to
+ * the leaf's size (one past the last element) -- see note below.
+ */
+LBABtree::find_insertion_ret LBABtree::find_insertion(
+  op_context_t c,
+  laddr_t laddr,
+  iterator &iter)
+{
+  assert(iter.is_end() || iter.get_key() >= laddr);
+  if (!iter.is_end() && iter.get_key() == laddr) {
+    // exact match -- iter already points at laddr
+    return seastar::now();
+  } else if (iter.leaf.node->get_node_meta().begin <= laddr) {
+    // laddr falls within this leaf's range; sanity-check that the
+    // preceding element (if any) is strictly below laddr
+    auto p = iter;
+    if (p.leaf.pos > 0) {
+      --p.leaf.pos;
+      assert(p.get_key() < laddr);
+    }
+    return seastar::now();
+  } else {
+    // laddr precedes this leaf's range; the insertion point is at the
+    // end of the previous leaf
+    assert(iter.leaf.pos == 0);
+    return iter.prev(
+      c
+    ).si_then([laddr, &iter](auto p) {
+      assert(p.leaf.node->get_node_meta().begin <= laddr);
+      assert(p.get_key() < laddr);
+      // Note, this is specifically allowed to violate the iterator
+      // invariant that pos is a valid index for the node in the event
+      // that the insertion point is at the end of a node.
+      p.leaf.pos++;
+      iter = p;
+      return seastar::now();
+    });
+  }
+}
+
+/* handle_split
+ *
+ * Splits full nodes along iter's path, starting from the depth returned
+ * by iter.check_split().  If every level up to and including the root is
+ * full, a new root is allocated first (growing the tree by one level).
+ * iter is adjusted throughout to keep pointing at the same logical
+ * position.
+ */
+LBABtree::handle_split_ret LBABtree::handle_split(
+  op_context_t c,
+  iterator &iter)
+{
+  LOG_PREFIX(LBABtree::handle_split);
+
+  depth_t split_from = iter.check_split();
+
+  DEBUGT("split_from {}, depth {}", c.trans, split_from, iter.get_depth());
+
+  if (split_from == iter.get_depth()) {
+    // The whole path is full: allocate a new root above the current one.
+    auto nroot = c.cache.alloc_new_extent<LBAInternalNode>(
+      c.trans, LBA_BLOCK_SIZE);
+    lba_node_meta_t meta{0, L_ADDR_MAX, iter.get_depth() + 1};
+    nroot->set_meta(meta);
+    nroot->pin.set_range(meta);
+    nroot->journal_insert(
+      nroot->begin(),
+      L_ADDR_MIN,
+      root.get_location(),
+      nullptr);
+    iter.internal.push_back({nroot, 0});
+
+    root.set_location(nroot->get_paddr());
+    root.set_depth(iter.get_depth());
+    c.trans.get_lba_tree_stats().depth = iter.get_depth();
+    root_dirty = true;
+  }
+
+  /* pos may be either node_position_t<LBALeafNode> or
+   * node_position_t<LBAInternalNode> */
+  auto split_level = [&](auto &parent_pos, auto &pos) {
+    auto [left, right, pivot] = pos.node->make_split_children(c);
+
+    auto parent_node = parent_pos.node;
+    auto parent_iter = parent_pos.get_iter();
+
+    parent_node->update(
+      parent_iter,
+      left->get_paddr());
+    parent_node->insert(
+      parent_iter + 1,
+      pivot,
+      right->get_paddr());
+
+    c.cache.retire_extent(c.trans, pos.node);
+
+    /* right->get_node_meta().begin == pivot == right->begin()->get_key()
+     * Thus, if pos.pos == left->get_size(), we want iter to point to
+     * left with pos.pos at the end rather than right with pos.pos = 0
+     * since the insertion would be to the left of the first element
+     * of right and thus necessarily less than right->get_node_meta().begin.
+     */
+    if (pos.pos <= left->get_size()) {
+      pos.node = left;
+    } else {
+      pos.node = right;
+      pos.pos -= left->get_size();
+
+      parent_pos.pos += 1;
+    }
+  };
+
+  for (; split_from > 0; --split_from) {
+    auto &parent_pos = iter.get_internal(split_from + 1);
+    if (!parent_pos.node->is_pending()) {
+      parent_pos.node = c.cache.duplicate_for_write(
+        c.trans, parent_pos.node
+      )->cast<LBAInternalNode>();
+    }
+
+    if (split_from > 1) {
+      auto &pos = iter.get_internal(split_from);
+      // args were previously swapped relative to the format string
+      DEBUGT("splitting parent {} depth {}", c.trans, *pos.node, split_from);
+      split_level(parent_pos, pos);
+    } else {
+      auto &pos = iter.leaf;
+      DEBUGT("splitting child {}", c.trans, *pos.node);
+      split_level(parent_pos, pos);
+    }
+  }
+
+  return seastar::now();
+}
+
+/* get_node
+ *
+ * Depth-dispatched node fetch used by merge_level: the LBALeafNode
+ * specialization requires depth == 1 and fetches a leaf; the
+ * LBAInternalNode specialization fetches an internal node at depth.
+ */
+template <typename NodeType>
+LBABtree::base_iertr::future<typename NodeType::Ref> get_node(
+  op_context_t c,
+  depth_t depth,
+  paddr_t addr);
+
+template <>
+LBABtree::base_iertr::future<LBALeafNodeRef> get_node<LBALeafNode>(
+  op_context_t c,
+  depth_t depth,
+  paddr_t addr) {
+  assert(depth == 1);
+  return LBABtree::get_leaf_node(c, addr);
+}
+
+template <>
+LBABtree::base_iertr::future<LBAInternalNodeRef> get_node<LBAInternalNode>(
+  op_context_t c,
+  depth_t depth,
+  paddr_t addr) {
+  return LBABtree::get_internal_node(c, depth, addr);
+}
+
+/* merge_level
+ *
+ * Merges or rebalances pos.node (at `depth`) with a sibling chosen via
+ * its parent.  If both nodes are at minimum capacity they are merged into
+ * a single replacement; otherwise entries are redistributed between two
+ * replacements.  parent_pos and pos are updated to keep tracking the same
+ * logical position.
+ */
+template <typename NodeType>
+LBABtree::handle_merge_ret merge_level(
+  op_context_t c,
+  depth_t depth,
+  LBABtree::node_position_t<LBAInternalNode> &parent_pos,
+  LBABtree::node_position_t<NodeType> &pos)
+{
+  // The parent will be mutated below; ensure we hold a mutable copy.
+  if (!parent_pos.node->is_pending()) {
+    parent_pos.node = c.cache.duplicate_for_write(
+      c.trans, parent_pos.node
+    )->cast<LBAInternalNode>();
+  }
+
+  auto iter = parent_pos.get_iter();
+  assert(iter.get_offset() < parent_pos.node->get_size());
+  // Use the left sibling as donor only when pos is the parent's last
+  // child; otherwise donate from the right sibling.
+  bool donor_is_left = ((iter.get_offset() + 1) == parent_pos.node->get_size());
+  auto donor_iter = donor_is_left ? (iter - 1) : (iter + 1);
+
+  return get_node<NodeType>(
+    c,
+    depth,
+    donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr())
+  ).si_then([c, iter, donor_iter, donor_is_left, &parent_pos, &pos](
+              typename NodeType::Ref donor) {
+    // Order the pair (and the matching parent iterators) left-to-right.
+    auto [l, r] = donor_is_left ?
+      std::make_pair(donor, pos.node) : std::make_pair(pos.node, donor);
+
+    auto [liter, riter] = donor_is_left ?
+      std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);
+
+    if (donor->at_min_capacity()) {
+      // Donor has nothing to spare: merge both nodes into one.
+      auto replacement = l->make_full_merge(c, r);
+
+      parent_pos.node->update(
+        liter,
+        replacement->get_paddr());
+      parent_pos.node->remove(riter);
+
+      pos.node = replacement;
+      if (donor_is_left) {
+        // Our entries now sit after the donor's in the merged node.
+        pos.pos += r->get_size();
+        parent_pos.pos--;
+      }
+
+      c.cache.retire_extent(c.trans, l);
+      c.cache.retire_extent(c.trans, r);
+    } else {
+      // Donor has spare entries: rebalance across two replacements.
+      auto [replacement_l, replacement_r, pivot] =
+        l->make_balanced(
+          c,
+          r,
+          !donor_is_left);
+
+      parent_pos.node->update(
+        liter,
+        replacement_l->get_paddr());
+      parent_pos.node->replace(
+        riter,
+        pivot,
+        replacement_r->get_paddr());
+
+      if (donor_is_left) {
+        assert(parent_pos.pos > 0);
+        parent_pos.pos--;
+      }
+
+      // Work out which replacement now holds our original position.
+      auto orig_position = donor_is_left ?
+        l->get_size() + pos.pos :
+        pos.pos;
+      if (orig_position < replacement_l->get_size()) {
+        pos.node = replacement_l;
+        pos.pos = orig_position;
+      } else {
+        parent_pos.pos++;
+        pos.node = replacement_r;
+        pos.pos = orig_position - replacement_l->get_size();
+      }
+
+      c.cache.retire_extent(c.trans, l);
+      c.cache.retire_extent(c.trans, r);
+    }
+
+    return seastar::now();
+  });
+}
+
+/* handle_merge
+ *
+ * Walks upward from the leaf along iter's path, merging/rebalancing any
+ * node found at minimum capacity, and collapses the root if it ends up
+ * with a single child.
+ */
+LBABtree::handle_merge_ret LBABtree::handle_merge(
+  op_context_t c,
+  iterator &iter)
+{
+  LOG_PREFIX(LBABtree::handle_merge);
+  if (!iter.leaf.node->at_min_capacity() ||
+      iter.get_depth() == 1) {
+    DEBUGT(
+      "no need to merge leaf, leaf size {}, depth {}",
+      c.trans,
+      iter.leaf.node->get_size(),
+      iter.get_depth());
+    return seastar::now();
+  }
+
+  return seastar::do_with(
+    depth_t{1},
+    [FNAME, this, c, &iter](auto &to_merge) {
+      return trans_intr::repeat(
+        [FNAME, this, c, &iter, &to_merge] {
+          DEBUGT(
+            "merging depth {}",
+            c.trans,
+            to_merge);
+          auto &parent_pos = iter.get_internal(to_merge + 1);
+          auto merge_fut = handle_merge_iertr::now();
+          if (to_merge > 1) {
+            auto &pos = iter.get_internal(to_merge);
+            merge_fut = merge_level(c, to_merge, parent_pos, pos);
+          } else {
+            auto &pos = iter.leaf;
+            merge_fut = merge_level(c, to_merge, parent_pos, pos);
+          }
+
+          return merge_fut.si_then([FNAME, this, c, &iter, &to_merge] {
+            ++to_merge;
+            auto &pos = iter.get_internal(to_merge);
+            if (to_merge == iter.get_depth()) {
+              if (pos.node->get_size() == 1) {
+                // Root has a single child: drop a level from the tree.
+                DEBUGT("collapsing root", c.trans);
+                c.cache.retire_extent(c.trans, pos.node);
+                assert(pos.pos == 0);
+                auto node_iter = pos.get_iter();
+                root.set_location(
+                  node_iter->get_val().maybe_relative_to(pos.node->get_paddr()));
+                iter.internal.pop_back();
+                root.set_depth(iter.get_depth());
+                c.trans.get_lba_tree_stats().depth = iter.get_depth();
+                root_dirty = true;
+              } else {
+                DEBUGT("no need to collapse root", c.trans);
+              }
+              return seastar::stop_iteration::yes;
+            } else if (pos.node->at_min_capacity()) {
+              DEBUGT(
+                "continuing, next node {} depth {} at min",
+                c.trans,
+                *pos.node,
+                to_merge);
+              return seastar::stop_iteration::no;
+            } else {
+              DEBUGT(
+                "complete, next node {} depth {} not min",
+                c.trans,
+                *pos.node,
+                to_merge);
+              return seastar::stop_iteration::yes;
+            }
+          });
+        });
+    });
+}
+
+/* update_internal_mapping
+ *
+ * Repoints the parent reference (or the root location, when depth equals
+ * the tree depth) for the node rooted at (depth, laddr) from old_addr to
+ * new_addr.  Used after a node has been rewritten to a new location.
+ */
+LBABtree::update_internal_mapping_ret LBABtree::update_internal_mapping(
+  op_context_t c,
+  depth_t depth,
+  laddr_t laddr,
+  paddr_t old_addr,
+  paddr_t new_addr)
+{
+  LOG_PREFIX(LBABtree::update_internal_mapping);
+  DEBUGT(
+    "updating laddr {} at depth {} from {} to {}",
+    c.trans,
+    laddr,
+    depth,
+    old_addr,
+    new_addr);
+
+  return lower_bound(
+    c, laddr
+  ).si_then([=](auto iter) {
+    assert(iter.get_depth() >= depth);
+    if (depth == iter.get_depth()) {
+      DEBUGT("update at root", c.trans);
+
+      if (laddr != 0) {
+        ERRORT(
+          "updating root laddr {} at depth {} from {} to {},"
+          "laddr is not 0",
+          c.trans,
+          laddr,
+          depth,
+          old_addr,
+          new_addr);
+        ceph_assert(0 == "impossible");
+      }
+
+      if (root.get_location() != old_addr) {
+        ERRORT(
+          "updating root laddr {} at depth {} from {} to {},"
+          "root addr {} does not match",
+          c.trans,
+          laddr,
+          depth,
+          old_addr,
+          new_addr,
+          root.get_location());
+        ceph_assert(0 == "impossible");
+      }
+
+      root.set_location(new_addr);
+      root_dirty = true;
+    } else {
+      auto &parent = iter.get_internal(depth + 1);
+      assert(parent.node);
+      assert(parent.pos < parent.node->get_size());
+      auto piter = parent.node->iter_idx(parent.pos);
+
+      if (piter->get_key() != laddr) {
+        ERRORT(
+          "updating laddr {} at depth {} from {} to {},"
+          "node {} pos {} val pivot addr {} does not match",
+          c.trans,
+          laddr,
+          depth,
+          old_addr,
+          new_addr,
+          *(parent.node),
+          parent.pos,
+          piter->get_key());
+        ceph_assert(0 == "impossible");
+      }
+
+
+      if (piter->get_val() != old_addr) {
+        ERRORT(
+          "updating laddr {} at depth {} from {} to {},"
+          "node {} pos {} val addr {} does not match",
+          c.trans,
+          laddr,
+          depth,
+          old_addr,
+          new_addr,
+          *(parent.node),
+          parent.pos,
+          piter->get_val());
+        ceph_assert(0 == "impossible");
+      }
+
+      CachedExtentRef mut = c.cache.duplicate_for_write(
+        c.trans,
+        parent.node
+      );
+      LBAInternalNodeRef mparent = mut->cast<LBAInternalNode>();
+      mparent->update(piter, new_addr);
+
+      /* Note, iter is now invalid as we didn't update either the parent
+       * node reference to the new mutable instance nor did we update the
+       * child pointer to the new node. Not a problem as we'll now just
+       * destruct it.
+       */
+    }
+    return seastar::now();
+  });
+}
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/container/static_vector.hpp>
+#include <sys/mman.h>
+#include <memory>
+#include <string.h>
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+
+/**
+ * LBABtree
+ *
+ * Wraps the lba btree root (lba_root_t) and provides iterator-based
+ * lookup plus insert/update/remove and node-maintenance operations over
+ * the tree.
+ */
+class LBABtree {
+  static constexpr size_t MAX_DEPTH = 16;
+public:
+  using base_iertr = LBAManager::base_iertr;
+
+  class iterator;
+  using iterator_fut = base_iertr::future<iterator>;
+
+  using mapped_space_visitor_t = LBAManager::scan_mapped_space_func_t;
+
+  /**
+   * iterator
+   *
+   * Caches the path from the root down to a leaf position.  internal[i]
+   * holds the node/offset at depth i + 2; leaf holds the leaf position.
+   */
+  class iterator {
+  public:
+    iterator(const iterator &) noexcept = default;
+    iterator(iterator &&) noexcept = default;
+    iterator &operator=(const iterator &) = default;
+    iterator &operator=(iterator &&) = default;
+
+    iterator(depth_t depth) : internal(depth - 1) {}
+
+    iterator_fut next(
+      op_context_t c,
+      mapped_space_visitor_t *visit=nullptr) const;
+
+    iterator_fut prev(op_context_t c) const;
+
+    void assert_valid() const {
+      assert(leaf.node);
+      assert(leaf.pos <= leaf.node->get_size());
+
+      for (auto &i: internal) {
+        (void)i;
+        assert(i.node);
+        assert(i.pos < i.node->get_size());
+      }
+    }
+
+    depth_t get_depth() const {
+      return internal.size() + 1;
+    }
+
+    auto &get_internal(depth_t depth) {
+      assert(depth > 1);
+      assert((depth - 2) < internal.size());
+      return internal[depth - 2];
+    }
+
+    const auto &get_internal(depth_t depth) const {
+      assert(depth > 1);
+      assert((depth - 2) < internal.size());
+      return internal[depth - 2];
+    }
+
+    laddr_t get_key() const {
+      assert(!is_end());
+      return leaf.node->iter_idx(leaf.pos).get_key();
+    }
+    lba_map_val_t get_val() const {
+      assert(!is_end());
+      return leaf.node->iter_idx(leaf.pos).get_val();
+    }
+
+    bool is_end() const {
+      assert(leaf.pos <= leaf.node->get_size());
+      return leaf.pos == leaf.node->get_size();
+    }
+
+    bool is_begin() const {
+      for (auto &i: internal) {
+        if (i.pos != 0)
+          return false;
+      }
+      return leaf.pos == 0;
+    }
+
+    LBAPinRef get_pin() const {
+      assert(!is_end());
+      return std::make_unique<BtreeLBAPin>(
+        leaf.node,
+        get_val().paddr.maybe_relative_to(leaf.node->get_paddr()),
+        lba_node_meta_t{ get_key(), get_key() + get_val().len, 0 });
+    }
+
+  private:
+    iterator() = default;
+
+    friend class LBABtree;
+    static constexpr uint16_t INVALID = std::numeric_limits<uint16_t>::max();
+    template <typename NodeType>
+    struct node_position_t {
+      typename NodeType::Ref node;
+      uint16_t pos = INVALID;
+
+      void reset() {
+        *this = node_position_t{};
+      }
+
+      auto get_iter() {
+        assert(pos != INVALID);
+        assert(pos < node->get_size());
+        return node->iter_idx(pos);
+      }
+    };
+    boost::container::static_vector<
+      node_position_t<LBAInternalNode>, MAX_DEPTH> internal;
+    node_position_t<LBALeafNode> leaf;
+
+    // Returns 0 if no split is needed, get_depth() if the root itself
+    // must split, otherwise the lowest depth whose parent has room.
+    depth_t check_split() const {
+      if (!leaf.node->at_max_capacity()) {
+        return 0;
+      }
+      for (depth_t split_from = 1; split_from < get_depth(); ++split_from) {
+        if (!get_internal(split_from + 1).node->at_max_capacity())
+          return split_from;
+      }
+      return get_depth();
+    }
+
+    // Returns 0 if no merge is needed, get_depth() if every level is at
+    // min capacity, otherwise the lowest depth whose parent is above min.
+    depth_t check_merge() const {
+      if (!leaf.node->at_min_capacity()) {
+        return 0;
+      }
+      for (depth_t merge_from = 1; merge_from < get_depth(); ++merge_from) {
+        if (!get_internal(merge_from + 1).node->at_min_capacity())
+          return merge_from;
+      }
+      return get_depth();
+    }
+  };
+
+  LBABtree(lba_root_t root) : root(root) {}
+
+  bool is_root_dirty() const {
+    return root_dirty;
+  }
+  lba_root_t get_root_undirty() {
+    ceph_assert(root_dirty);
+    root_dirty = false;
+    return root;
+  }
+
+  /// mkfs
+  using mkfs_ret = lba_root_t;
+  static mkfs_ret mkfs(op_context_t c);
+
+  /**
+   * lower_bound
+   *
+   * @param c [in] context
+   * @param addr [in] addr
+   * @return least iterator >= key
+   */
+  iterator_fut lower_bound(
+    op_context_t c,
+    laddr_t addr,
+    mapped_space_visitor_t *visit=nullptr) const;
+
+  /**
+   * upper_bound
+   *
+   * @param c [in] context
+   * @param addr [in] addr
+   * @return least iterator > key
+   */
+  iterator_fut upper_bound(
+    op_context_t c,
+    laddr_t addr
+  ) const {
+    return lower_bound(
+      c, addr
+    ).si_then([c, addr](auto iter) {
+      if (!iter.is_end() && iter.get_key() == addr) {
+        return iter.next(c);
+      } else {
+        return iterator_fut(
+          interruptible::ready_future_marker{},
+          iter);
+      }
+    });
+  }
+
+  /**
+   * upper_bound_right
+   *
+   * @param c [in] context
+   * @param addr [in] addr
+   * @return least iterator i s.t. i.get_key() + i.get_val().len > key
+   */
+  iterator_fut upper_bound_right(
+    op_context_t c,
+    laddr_t addr) const
+  {
+    return lower_bound(
+      c, addr
+    ).si_then([c, addr](auto iter) {
+      if (iter.is_begin()) {
+        return iterator_fut(
+          interruptible::ready_future_marker{},
+          iter);
+      } else {
+        return iter.prev(
+          c
+        ).si_then([iter, addr](auto prev) {
+          if ((prev.get_key() + prev.get_val().len) > addr) {
+            return iterator_fut(
+              interruptible::ready_future_marker{},
+              prev);
+          } else {
+            return iterator_fut(
+              interruptible::ready_future_marker{},
+              iter);
+          }
+        });
+      }
+    });
+  }
+
+  iterator_fut begin(op_context_t c) const {
+    return lower_bound(c, 0);
+  }
+  iterator_fut end(op_context_t c) const {
+    return upper_bound(c, L_ADDR_MAX);
+  }
+
+  using iterate_repeat_ret = base_iertr::future<
+    seastar::stop_iteration>;
+  /**
+   * iterate_repeat
+   *
+   * Invokes f on successive iterator positions starting from iter_fut
+   * until f returns stop_iteration::yes, advancing via next() (and
+   * reporting mapped extents to visitor, if provided) after each step.
+   */
+  template <typename F>
+  static auto iterate_repeat(
+    op_context_t c,
+    iterator_fut &&iter_fut,
+    F &&f,
+    mapped_space_visitor_t *visitor=nullptr) {
+    return std::move(
+      iter_fut
+    ).si_then([c, visitor, f=std::forward<F>(f)](auto iter) {
+      return seastar::do_with(
+        iter,
+        std::move(f),
+        [c, visitor](auto &pos, auto &f) {
+          return trans_intr::repeat(
+            [c, visitor, &f, &pos] {
+              return f(
+                pos
+              ).si_then([c, visitor, &pos](auto done) {
+                if (done == seastar::stop_iteration::yes) {
+                  return iterate_repeat_ret(
+                    interruptible::ready_future_marker{},
+                    seastar::stop_iteration::yes);
+                } else {
+                  ceph_assert(!pos.is_end());
+                  return pos.next(
+                    c, visitor
+                  ).si_then([&pos](auto next) {
+                    pos = next;
+                    return iterate_repeat_ret(
+                      interruptible::ready_future_marker{},
+                      seastar::stop_iteration::no);
+                  });
+                }
+              });
+            });
+        });
+    });
+  }
+
+  /**
+   * insert
+   *
+   * Inserts val at laddr with iter as a hint.  If element at laddr already
+   * exists returns iterator to that element unchanged and returns false.
+   *
+   * Invalidates all outstanding iterators for this tree on this transaction.
+   *
+   * @param c [in] op context
+   * @param iter [in] hint; insertion is constant-time if laddr is
+   *        immediately prior to iter
+   * @param laddr [in] addr at which to insert
+   * @param val [in] val to insert
+   * @return pair<iter, bool> where iter points to element at addr, bool true
+   *         iff element at laddr did not exist.
+   */
+  using insert_iertr = base_iertr;
+  using insert_ret = insert_iertr::future<std::pair<iterator, bool>>;
+  insert_ret insert(
+    op_context_t c,
+    iterator iter,
+    laddr_t laddr,
+    lba_map_val_t val
+  );
+  insert_ret insert(
+    op_context_t c,
+    laddr_t laddr,
+    lba_map_val_t val) {
+    return lower_bound(
+      c, laddr
+    ).si_then([this, c, laddr, val](auto iter) {
+      return insert(c, iter, laddr, val);
+    });
+  }
+
+  /**
+   * update
+   *
+   * Invalidates all outstanding iterators for this tree on this transaction.
+   *
+   * @param c [in] op context
+   * @param iter [in] iterator to element to update, must not be end
+   * @param val [in] val with which to update
+   * @return iterator to newly updated element
+   */
+  using update_iertr = base_iertr;
+  using update_ret = update_iertr::future<iterator>;
+  update_ret update(
+    op_context_t c,
+    iterator iter,
+    lba_map_val_t val);
+
+  /**
+   * remove
+   *
+   * Invalidates all outstanding iterators for this tree on this transaction.
+   *
+   * @param c [in] op context
+   * @param iter [in] iterator to element to remove, must not be end
+   */
+  using remove_iertr = base_iertr;
+  using remove_ret = remove_iertr::future<>;
+  remove_ret remove(
+    op_context_t c,
+    iterator iter);
+
+  /**
+   * init_cached_extent
+   *
+   * Checks whether e is live (reachable from lba tree) and drops or initializes
+   * accordingly.
+   *
+   * Returns e if live and a null CachedExtentRef otherwise.
+   */
+  using init_cached_extent_iertr = base_iertr;
+  using init_cached_extent_ret = init_cached_extent_iertr::future<CachedExtentRef>;
+  init_cached_extent_ret init_cached_extent(op_context_t c, CachedExtentRef e);
+
+  /**
+   * rewrite_lba_extent
+   *
+   * Rewrites a fresh copy of extent into transaction and updates internal
+   * references.
+   */
+  using rewrite_lba_extent_iertr = base_iertr;
+  using rewrite_lba_extent_ret = rewrite_lba_extent_iertr::future<>;
+  rewrite_lba_extent_ret rewrite_lba_extent(op_context_t c, CachedExtentRef e);
+
+private:
+  lba_root_t root;
+  // Set when root is modified in memory; cleared by get_root_undirty().
+  bool root_dirty = false;
+
+  using get_internal_node_iertr = base_iertr;
+  using get_internal_node_ret = get_internal_node_iertr::future<LBAInternalNodeRef>;
+  static get_internal_node_ret get_internal_node(
+    op_context_t c,
+    depth_t depth,
+    paddr_t offset);
+
+  using get_leaf_node_iertr = base_iertr;
+  using get_leaf_node_ret = get_leaf_node_iertr::future<LBALeafNodeRef>;
+  static get_leaf_node_ret get_leaf_node(
+    op_context_t c,
+    paddr_t offset);
+
+  using lookup_root_iertr = base_iertr;
+  using lookup_root_ret = lookup_root_iertr::future<>;
+  // Populates the root level of iter (internal node or leaf depending on
+  // tree depth).
+  lookup_root_ret lookup_root(
+    op_context_t c,
+    iterator &iter,
+    mapped_space_visitor_t *visitor) const {
+    if (root.get_depth() > 1) {
+      return get_internal_node(
+        c,
+        root.get_depth(),
+        root.get_location()
+      ).si_then([this, visitor, &iter](LBAInternalNodeRef root_node) {
+        iter.get_internal(root.get_depth()).node = root_node;
+        if (visitor) (*visitor)(root_node->get_paddr(), root_node->get_length());
+        return lookup_root_iertr::now();
+      });
+    } else {
+      return get_leaf_node(
+        c,
+        root.get_location()
+      ).si_then([visitor, &iter](LBALeafNodeRef root_node) {
+        iter.leaf.node = root_node;
+        if (visitor) (*visitor)(root_node->get_paddr(), root_node->get_length());
+        return lookup_root_iertr::now();
+      });
+    }
+  }
+
+  using lookup_internal_level_iertr = base_iertr;
+  using lookup_internal_level_ret = lookup_internal_level_iertr::future<>;
+  // Descends one internal level: reads the child of iter's entry at
+  // depth + 1 and positions iter at depth using f.
+  template <typename F>
+  static lookup_internal_level_ret lookup_internal_level(
+    op_context_t c,
+    depth_t depth,
+    iterator &iter,
+    F &f,
+    mapped_space_visitor_t *visitor
+  ) {
+    assert(depth > 1);
+    auto &parent_entry = iter.get_internal(depth + 1);
+    auto parent = parent_entry.node;
+    auto node_iter = parent->iter_idx(parent_entry.pos);
+    return get_internal_node(
+      c,
+      depth,
+      node_iter->get_val().maybe_relative_to(parent->get_paddr())
+    ).si_then([depth, visitor, &iter, &f](LBAInternalNodeRef node) {
+      auto &entry = iter.get_internal(depth);
+      entry.node = node;
+      auto node_iter = f(*node);
+      assert(node_iter != node->end());
+      entry.pos = node_iter->get_offset();
+      if (visitor) (*visitor)(node->get_paddr(), node->get_length());
+      return seastar::now();
+    });
+  }
+
+  using lookup_leaf_iertr = base_iertr;
+  using lookup_leaf_ret = lookup_leaf_iertr::future<>;
+  // Final descent step: reads the leaf referenced at depth 2 and
+  // positions iter.leaf using f.
+  template <typename F>
+  static lookup_internal_level_ret lookup_leaf(
+    op_context_t c,
+    iterator &iter,
+    F &f,
+    mapped_space_visitor_t *visitor
+  ) {
+    auto &parent_entry = iter.get_internal(2);
+    auto parent = parent_entry.node;
+    assert(parent);
+    auto node_iter = parent->iter_idx(parent_entry.pos);
+
+    return get_leaf_node(
+      c,
+      node_iter->get_val().maybe_relative_to(parent->get_paddr())
+    ).si_then([visitor, &iter, &f](LBALeafNodeRef node) {
+      iter.leaf.node = node;
+      auto node_iter = f(*node);
+      iter.leaf.pos = node_iter->get_offset();
+      if (visitor) (*visitor)(node->get_paddr(), node->get_length());
+      return seastar::now();
+    });
+  }
+
+  using lookup_depth_range_iertr = base_iertr;
+  using lookup_depth_range_ret = lookup_depth_range_iertr::future<>;
+  template <typename LI, typename LL>
+  static lookup_depth_range_ret lookup_depth_range(
+    op_context_t c, ///< [in] context
+    iterator &iter, ///< [in,out] iterator to populate
+    depth_t from, ///< [in] from inclusive
+    depth_t to, ///< [in] to exclusive, (to <= from, to == from is a noop)
+    LI &li, ///< [in] internal->iterator
+    LL &ll, ///< [in] leaf->iterator
+    mapped_space_visitor_t *visitor ///< [in] mapped space visitor
+  ) {
+    LOG_PREFIX(LBABtree::lookup_depth_range);
+    DEBUGT("{} -> {}", c.trans, from, to);
+    return seastar::do_with(
+      from,
+      [c, to, visitor, &iter, &li, &ll](auto &d) {
+        return trans_intr::repeat(
+          [c, to, visitor, &iter, &li, &ll, &d] {
+            if (d > to) {
+              return [&] {
+                if (d > 1) {
+                  return lookup_internal_level(
+                    c,
+                    d,
+                    iter,
+                    li,
+                    visitor);
+                } else {
+                  assert(d == 1);
+                  return lookup_leaf(
+                    c,
+                    iter,
+                    ll,
+                    visitor);
+                }
+              }().si_then([&d] {
+                --d;
+                return lookup_depth_range_iertr::make_ready_future<
+                  seastar::stop_iteration
+                >(seastar::stop_iteration::no);
+              });
+            } else {
+              return lookup_depth_range_iertr::make_ready_future<
+                seastar::stop_iteration
+              >(seastar::stop_iteration::yes);
+            }
+          });
+      });
+  }
+
+  using lookup_iertr = base_iertr;
+  using lookup_ret = lookup_iertr::future<iterator>;
+  // Full root-to-leaf descent driven by the lookup_internal/lookup_leaf
+  // position functors.
+  template <typename LI, typename LL>
+  lookup_ret lookup(
+    op_context_t c,
+    LI &&lookup_internal,
+    LL &&lookup_leaf,
+    mapped_space_visitor_t *visitor
+  ) const {
+    LOG_PREFIX(LBABtree::lookup);
+    return seastar::do_with(
+      iterator{root.get_depth()},
+      std::forward<LI>(lookup_internal),
+      std::forward<LL>(lookup_leaf),
+      [FNAME, this, visitor, c](auto &iter, auto &li, auto &ll) {
+        return lookup_root(
+          c, iter, visitor
+        ).si_then([FNAME, this, visitor, c, &iter, &li, &ll] {
+          if (iter.get_depth() > 1) {
+            auto &root_entry = *(iter.internal.rbegin());
+            root_entry.pos = li(*(root_entry.node)).get_offset();
+          } else {
+            auto &root_entry = iter.leaf;
+            auto riter = ll(*(root_entry.node));
+            root_entry.pos = riter->get_offset();
+          }
+          DEBUGT("got root, depth {}", c.trans, root.get_depth());
+          return lookup_depth_range(
+            c,
+            iter,
+            root.get_depth() - 1,
+            0,
+            li,
+            ll,
+            visitor);
+        }).si_then([&iter] {
+          return std::move(iter);
+        });
+      });
+  }
+
+  using find_insertion_iertr = base_iertr;
+  using find_insertion_ret = find_insertion_iertr::future<>;
+  static find_insertion_ret find_insertion(
+    op_context_t c,
+    laddr_t laddr,
+    iterator &iter);
+
+  using handle_split_iertr = base_iertr;
+  using handle_split_ret = handle_split_iertr::future<>;
+  handle_split_ret handle_split(
+    op_context_t c,
+    iterator &iter);
+
+  using handle_merge_iertr = base_iertr;
+  using handle_merge_ret = handle_merge_iertr::future<>;
+  handle_merge_ret handle_merge(
+    op_context_t c,
+    iterator &iter);
+
+  using update_internal_mapping_iertr = base_iertr;
+  using update_internal_mapping_ret = update_internal_mapping_iertr::future<>;
+  update_internal_mapping_ret update_internal_mapping(
+    op_context_t c,
+    depth_t depth,
+    laddr_t laddr,
+    paddr_t old_addr,
+    paddr_t new_addr);
+
+  template <typename T>
+  using node_position_t = iterator::node_position_t<T>;
+
+  template <typename NodeType>
+  friend base_iertr::future<typename NodeType::Ref> get_node(
+    op_context_t c,
+    depth_t depth,
+    paddr_t addr);
+
+  template <typename NodeType>
+  friend handle_merge_ret merge_level(
+    op_context_t c,
+    depth_t depth,
+    node_position_t<LBAInternalNode> &parent_pos,
+    node_position_t<NodeType> &pos);
+};
+
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <memory>
+#include <string.h>
+
+#include "include/buffer.h"
+#include "include/byteorder.h"
+
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+
+namespace {
+  // File-local shortcut to the seastore subsystem logger.
+  seastar::logger& logger() {
+    return crimson::get_logger(ceph_subsys_seastore);
+  }
+}
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+// Appends internal-node-specific fields (size, meta) to an extent dump.
+std::ostream &LBAInternalNode::print_detail(std::ostream &out) const
+{
+  out << ", size=" << get_size();
+  out << ", meta=" << get_meta();
+  return out;
+}
+
+// Rewrites each relative child paddr in this node to be relative to
+// (or absolute against) base via paddr_t::add_relative.
+void LBAInternalNode::resolve_relative_addrs(paddr_t base)
+{
+  for (auto i: *this) {
+    if (i->get_val().is_relative()) {
+      auto updated = base.add_relative(i->get_val());
+      logger().debug(
+	"LBAInternalNode::resolve_relative_addrs {} -> {}",
+	i->get_val(),
+	updated);
+      i->set_val(updated);
+    }
+  }
+}
+
+// Appends leaf-node-specific fields (size, meta) to an extent dump.
+std::ostream &LBALeafNode::print_detail(std::ostream &out) const
+{
+  out << ", size=" << get_size();
+  out << ", meta=" << get_meta();
+  return out;
+}
+
+// Rewrites the paddr of each relative mapping value in this leaf via
+// paddr_t::add_relative against base.
+void LBALeafNode::resolve_relative_addrs(paddr_t base)
+{
+  for (auto i: *this) {
+    if (i->get_val().paddr.is_relative()) {
+      auto val = i->get_val();
+      val.paddr = base.add_relative(val.paddr);
+      logger().debug(
+	"LBALeafNode::resolve_relative_addrs {} -> {}",
+	i->get_val().paddr,
+	val.paddr);
+      i->set_val(val);
+    }
+  }
+}
+
+}
#include <memory>
#include <string.h>
-#include "crimson/common/log.h"
-#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
+
+#include "include/buffer.h"
+
+#include "crimson/common/fixed_kv_node_layout.h"
+#include "crimson/common/errorator.h"
#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
namespace crimson::os::seastore::lba_manager::btree {
uint32_t refcount = 0; ///< refcount
uint32_t checksum = 0; ///< checksum of original block written at paddr (TODO)
+ lba_map_val_t() = default;
lba_map_val_t(
extent_len_t len,
paddr_t paddr,
class BtreeLBAPin;
using BtreeLBAPinRef = std::unique_ptr<BtreeLBAPin>;
+constexpr size_t LBA_BLOCK_SIZE = 4096;
+
+/**
+ * lba_node_meta_le_t
+ *
+ * On disk layout for lba_node_meta_t
+ */
+struct lba_node_meta_le_t {
+ laddr_le_t begin = laddr_le_t(0);
+ laddr_le_t end = laddr_le_t(0);
+ depth_le_t depth = init_depth_le(0);
+
+ lba_node_meta_le_t() = default;
+ lba_node_meta_le_t(const lba_node_meta_le_t &) = default;
+ explicit lba_node_meta_le_t(const lba_node_meta_t &val)
+ : begin(ceph_le64(val.begin)),
+ end(ceph_le64(val.end)),
+ depth(init_depth_le(val.depth)) {}
+
+ operator lba_node_meta_t() const {
+ return lba_node_meta_t{ begin, end, depth };
+ }
+};
+
/**
* LBANode
*
virtual lba_node_meta_t get_node_meta() const = 0;
- /**
- * lookup
- *
- * Returns the node at the specified depth responsible
- * for laddr
- */
- using lookup_iertr = base_iertr;
- using lookup_ret = lookup_iertr::future<LBANodeRef>;
- virtual lookup_ret lookup(
- op_context_t c,
- laddr_t addr,
- depth_t depth) = 0;
+ virtual ~LBANode() = default;
- /**
- * lookup_range
- *
- * Returns mappings within range [addr, addr+len)
- */
- using lookup_range_iertr = LBAManager::get_mappings_iertr;
- using lookup_range_ret = LBAManager::get_mappings_ret;
- virtual lookup_range_ret lookup_range(
- op_context_t c,
- laddr_t addr,
- extent_len_t len) = 0;
+ void on_delta_write(paddr_t record_block_offset) final {
+ // All in-memory relative addrs are necessarily record-relative
+ assert(get_prior_instance());
+ pin.take_pin(get_prior_instance()->cast<LBANode>()->pin);
+ resolve_relative_addrs(record_block_offset);
+ }
- /**
- * lookup_pin
- *
- * Returns the mapping at addr
- */
- using lookup_pin_iertr = LBAManager::get_mapping_iertr;
- using lookup_pin_ret = LBAManager::get_mapping_ret;
- virtual lookup_pin_ret lookup_pin(
- op_context_t c,
- laddr_t addr) = 0;
+ void on_initial_write() final {
+ // All in-memory relative addrs are necessarily block-relative
+ resolve_relative_addrs(get_paddr());
+ }
- /**
- * insert
- *
- * Recursively inserts into subtree rooted at *this. Caller
- * must already have handled splitting if at_max_capacity().
- *
- * Precondition: !at_max_capacity()
- */
- using insert_iertr = base_iertr;
- using insert_ret = insert_iertr::future<LBAPinRef>;
- virtual insert_ret insert(
- op_context_t c,
- laddr_t laddr,
- lba_map_val_t val) = 0;
+ void on_clean_read() final {
+ // From initial write of block, relative addrs are necessarily block-relative
+ resolve_relative_addrs(get_paddr());
+ }
- /**
- * find_hole
- *
- * Finds minimum hole of size len in [min, max)
- *
- * @return addr of hole, L_ADDR_NULL if unfound
- */
- using find_hole_iertr = base_iertr;
- using find_hole_ret = find_hole_iertr::future<laddr_t>;
- virtual find_hole_ret find_hole(
- op_context_t c,
- laddr_t min,
- laddr_t max,
- extent_len_t len) = 0;
+ virtual void resolve_relative_addrs(paddr_t base) = 0;
+};
+using LBANodeRef = LBANode::LBANodeRef;
- /**
- * scan_mappings
- *
- * Call f for all mappings in [begin, end)
- */
- using scan_mappings_iertr = LBAManager::scan_mappings_iertr;
- using scan_mappings_ret = LBAManager::scan_mappings_ret;
- using scan_mappings_func_t = LBAManager::scan_mappings_func_t;
- virtual scan_mappings_ret scan_mappings(
- op_context_t c,
- laddr_t begin,
- laddr_t end,
- scan_mappings_func_t &f) = 0;
-
- using scan_mapped_space_iertr = LBAManager::scan_mapped_space_iertr;
- using scan_mapped_space_ret = LBAManager::scan_mapped_space_ret;
- using scan_mapped_space_func_t = LBAManager::scan_mapped_space_func_t;
- virtual scan_mapped_space_ret scan_mapped_space(
- op_context_t c,
- scan_mapped_space_func_t &f) = 0;
+/**
+ * LBAInternalNode
+ *
+ * Abstracts operations on and layout of internal nodes for the
+ * LBA Tree.
+ *
+ * Layout (4k):
+ * size : uint32_t[1] 4b
+ * (padding) : 4b
+ * meta : lba_node_meta_le_t[3] (1*24)b
+ *   keys      : laddr_t[254]          (254*8)b
+ *   values    : paddr_t[254]          (254*8)b
+ * = 4096
+
+ * TODO: make the above capacity calculation part of FixedKVNodeLayout
+ * TODO: the above alignment probably isn't portable without further work
+ */
+constexpr size_t INTERNAL_NODE_CAPACITY = 254;
+struct LBAInternalNode
+ : LBANode,
+ common::FixedKVNodeLayout<
+ INTERNAL_NODE_CAPACITY,
+ lba_node_meta_t, lba_node_meta_le_t,
+ laddr_t, laddr_le_t,
+ paddr_t, paddr_le_t> {
+ using Ref = TCachedExtentRef<LBAInternalNode>;
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ LBAInternalNode(T&&... t) :
+ LBANode(std::forward<T>(t)...),
+ FixedKVNodeLayout(get_bptr().c_str()) {}
+
+ static constexpr extent_types_t TYPE = extent_types_t::LADDR_INTERNAL;
+
+ lba_node_meta_t get_node_meta() const { return get_meta(); }
+
+ CachedExtentRef duplicate_for_write() final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new LBAInternalNode(*this));
+ };
+
+ delta_buffer_t delta_buffer;
+ delta_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
- /**
- * mutate_mapping
- *
- * Lookups up laddr, calls f on value. If f returns a value, inserts it.
- * If it returns nullopt, removes the value.
- * Caller must already have merged if at_min_capacity().
- *
- * Recursive calls use mutate_mapping_internal.
- *
- * Precondition: !at_min_capacity()
- */
- using mutate_mapping_iertr = base_iertr::extend<
- crimson::ct_error::enoent ///< mapping does not exist
- >;
- using mutate_mapping_ret = mutate_mapping_iertr::future<
- lba_map_val_t>;
- using mutate_func_t = std::function<
- lba_map_val_t(const lba_map_val_t &v)
- >;
- virtual mutate_mapping_ret mutate_mapping(
- op_context_t c,
- laddr_t laddr,
- mutate_func_t &&f) = 0;
- virtual mutate_mapping_ret mutate_mapping_internal(
- op_context_t c,
- laddr_t laddr,
- bool is_root,
- mutate_func_t &&f) = 0;
+ void update(
+ const_iterator iter,
+ paddr_t addr) {
+ return journal_update(
+ iter,
+ maybe_generate_relative(addr),
+ maybe_get_delta_buffer());
+ }
- /**
- * mutate_internal_address
- *
- * Looks up internal node mapping at laddr, depth and
- * updates the mapping to paddr. Returns previous paddr
- * (for debugging purposes).
- */
- using mutate_internal_address_iertr = base_iertr::extend<
- crimson::ct_error::enoent ///< mapping does not exist
- >;
- using mutate_internal_address_ret = mutate_internal_address_iertr::future<
- paddr_t>;
- virtual mutate_internal_address_ret mutate_internal_address(
- op_context_t c,
- depth_t depth,
- laddr_t laddr,
- paddr_t paddr) = 0;
+ void insert(
+ const_iterator iter,
+ laddr_t pivot,
+ paddr_t addr) {
+ return journal_insert(
+ iter,
+ pivot,
+ maybe_generate_relative(addr),
+ maybe_get_delta_buffer());
+ }
- /**
- * make_split_children
- *
- * Generates appropriately typed left and right nodes formed from the
- * contents of *this.
- *
- * Returns <left, right, pivot> where pivot is the first value of right.
- */
- virtual std::tuple<
- LBANodeRef,
- LBANodeRef,
- laddr_t>
- make_split_children(
- op_context_t c) = 0;
+ void remove(const_iterator iter) {
+ return journal_remove(
+ iter,
+ maybe_get_delta_buffer());
+ }
- /**
- * make_full_merge
- *
- * Returns a single node formed from merging *this and right.
- * Precondition: at_min_capacity() && right.at_min_capacity()
- */
- virtual LBANodeRef make_full_merge(
+ void replace(
+ const_iterator iter,
+ laddr_t pivot,
+ paddr_t addr) {
+ return journal_replace(
+ iter,
+ pivot,
+ maybe_generate_relative(addr),
+ maybe_get_delta_buffer());
+ }
+
+ std::tuple<Ref, Ref, laddr_t>
+ make_split_children(op_context_t c) {
+ auto left = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto right = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto pivot = split_into(*left, *right);
+ left->pin.set_range(left->get_meta());
+ right->pin.set_range(right->get_meta());
+ return std::make_tuple(
+ left,
+ right,
+ pivot);
+ }
+
+ Ref make_full_merge(
op_context_t c,
- LBANodeRef &right) = 0;
+ Ref &right) {
+ auto replacement = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ replacement->merge_from(*this, *right->cast<LBAInternalNode>());
+ replacement->pin.set_range(replacement->get_meta());
+ return replacement;
+ }
+
+ std::tuple<Ref, Ref, laddr_t>
+ make_balanced(
+ op_context_t c,
+ Ref &_right,
+ bool prefer_left) {
+ ceph_assert(_right->get_type() == get_type());
+ auto &right = *_right->cast<LBAInternalNode>();
+ auto replacement_left = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto replacement_right = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+
+ auto pivot = balance_into_new_nodes(
+ *this,
+ right,
+ prefer_left,
+ *replacement_left,
+ *replacement_right);
+
+ replacement_left->pin.set_range(replacement_left->get_meta());
+ replacement_right->pin.set_range(replacement_right->get_meta());
+ return std::make_tuple(
+ replacement_left,
+ replacement_right,
+ pivot);
+ }
/**
- * make_balanced
+ * Internal relative addresses on read or in memory prior to commit
+ * are either record or block relative depending on whether this
+ * physical node is is_initial_pending() or just is_pending().
*
- * Returns nodes formed by balancing the contents of *this and right.
- *
- * Returns <left, right, pivot> where pivot is the first value of right.
+ * User passes appropriate base depending on lifecycle and
+ * resolve_relative_addrs fixes up relative internal references
+ * based on base.
*/
- virtual std::tuple<
- LBANodeRef,
- LBANodeRef,
- laddr_t>
- make_balanced(
- op_context_t c,
- LBANodeRef &right,
- bool prefer_left) = 0;
+ void resolve_relative_addrs(paddr_t base);
+ void node_resolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ if (i->get_val().is_relative()) {
+ assert(i->get_val().is_block_relative());
+ i->set_val(get_paddr().add_relative(i->get_val()));
+ }
+ }
+ }
+ }
+ void node_unresolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ if (i->get_val().is_relative()) {
+ assert(i->get_val().is_record_relative());
+ i->set_val(i->get_val() - get_paddr());
+ }
+ }
+ }
+ }
- virtual bool at_max_capacity() const = 0;
- virtual bool at_min_capacity() const = 0;
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
- virtual ~LBANode() = default;
+ std::ostream &print_detail(std::ostream &out) const final;
- void on_delta_write(paddr_t record_block_offset) final {
- // All in-memory relative addrs are necessarily record-relative
- assert(get_prior_instance());
- pin.take_pin(get_prior_instance()->cast<LBANode>()->pin);
- resolve_relative_addrs(record_block_offset);
+ ceph::bufferlist get_delta() final {
+ assert(!delta_buffer.empty());
+ ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+ delta_buffer.copy_out(bptr.c_str(), bptr.length());
+ ceph::bufferlist bl;
+ bl.push_back(bptr);
+ return bl;
}
- void on_initial_write() final {
- // All in-memory relative addrs are necessarily block-relative
- resolve_relative_addrs(get_paddr());
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &_bl) final {
+ assert(_bl.length());
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ delta_buffer_t buffer;
+ buffer.copy_in(bl.front().c_str(), bl.front().length());
+ buffer.replay(*this);
+ set_last_committed_crc(get_crc32c());
+ resolve_relative_addrs(base);
}
- void on_clean_read() final {
- // From initial write of block, relative addrs are necessarily block-relative
- resolve_relative_addrs(get_paddr());
+ bool at_max_capacity() const {
+ return get_size() == get_capacity();
}
- virtual void resolve_relative_addrs(paddr_t base) = 0;
+ bool at_min_capacity() const {
+ return get_size() == (get_capacity() / 2);
+ }
};
-using LBANodeRef = LBANode::LBANodeRef;
+using LBAInternalNodeRef = LBAInternalNode::Ref;
+
+/**
+ * LBALeafNode
+ *
+ * Abstracts operations on and layout of leaf nodes for the
+ * LBA Tree.
+ *
+ * Layout (4k):
+ * size : uint32_t[1] 4b
+ * (padding) : 4b
+ * meta : lba_node_meta_le_t[3] (1*24)b
+ *   keys      : laddr_t[145]         (145*8)b
+ *   values    : lba_map_val_t[145]   (145*20)b
+ * = 4092
+ *
+ * TODO: update FixedKVNodeLayout to handle the above calculation
+ * TODO: the above alignment probably isn't portable without further work
+ */
+constexpr size_t LEAF_NODE_CAPACITY = 145;
/**
- * get_lba_btree_extent
+ * lba_map_val_le_t
*
- * Fetches node at depth of the appropriate type.
+ * On disk layout for lba_map_val_t.
*/
-using get_lba_node_iertr = base_iertr;
-using get_lba_node_ret = get_lba_node_iertr::future<LBANodeRef>;
-get_lba_node_ret get_lba_btree_extent(
- op_context_t c, ///< [in] context structure
- CachedExtentRef parent, ///< [in] paddr ref source
- depth_t depth, ///< [in] depth of node to fetch
- paddr_t offset, ///< [in] physical addr of node
- paddr_t base ///< [in] depending on user, block addr or record addr
- /// in case offset is relative
-);
+struct lba_map_val_le_t {
+ extent_len_le_t len = init_extent_len_le(0);
+ paddr_le_t paddr;
+ ceph_le32 refcount{0};
+ ceph_le32 checksum{0};
+
+ lba_map_val_le_t() = default;
+ lba_map_val_le_t(const lba_map_val_le_t &) = default;
+ explicit lba_map_val_le_t(const lba_map_val_t &val)
+ : len(init_extent_len_le(val.len)),
+ paddr(paddr_le_t(val.paddr)),
+ refcount(val.refcount),
+ checksum(val.checksum) {}
+
+ operator lba_map_val_t() const {
+ return lba_map_val_t{ len, paddr, refcount, checksum };
+ }
+};
+
+struct LBALeafNode
+ : LBANode,
+ common::FixedKVNodeLayout<
+ LEAF_NODE_CAPACITY,
+ lba_node_meta_t, lba_node_meta_le_t,
+ laddr_t, laddr_le_t,
+ lba_map_val_t, lba_map_val_le_t> {
+ using Ref = TCachedExtentRef<LBALeafNode>;
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ LBALeafNode(T&&... t) :
+ LBANode(std::forward<T>(t)...),
+ FixedKVNodeLayout(get_bptr().c_str()) {}
+
+ static constexpr extent_types_t TYPE = extent_types_t::LADDR_LEAF;
+
+ lba_node_meta_t get_node_meta() const { return get_meta(); }
+
+ CachedExtentRef duplicate_for_write() final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new LBALeafNode(*this));
+ };
+
+ delta_buffer_t delta_buffer;
+ delta_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ void update(
+ const_iterator iter,
+ lba_map_val_t val) {
+ val.paddr = maybe_generate_relative(val.paddr);
+ return journal_update(
+ iter,
+ val,
+ maybe_get_delta_buffer());
+ }
+
+ auto insert(
+ const_iterator iter,
+ laddr_t addr,
+ lba_map_val_t val) {
+ val.paddr = maybe_generate_relative(val.paddr);
+ journal_insert(
+ iter,
+ addr,
+ val,
+ maybe_get_delta_buffer());
+ return iter;
+ }
+
+ void remove(const_iterator iter) {
+ return journal_remove(
+ iter,
+ maybe_get_delta_buffer());
+ }
+
+
+ std::tuple<Ref, Ref, laddr_t>
+ make_split_children(op_context_t c) {
+ auto left = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto right = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto pivot = split_into(*left, *right);
+ left->pin.set_range(left->get_meta());
+ right->pin.set_range(right->get_meta());
+ return std::make_tuple(
+ left,
+ right,
+ pivot);
+ }
+
+ Ref make_full_merge(
+ op_context_t c,
+ Ref &right) {
+ auto replacement = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ replacement->merge_from(*this, *right->cast<LBALeafNode>());
+ replacement->pin.set_range(replacement->get_meta());
+ return replacement;
+ }
+
+ std::tuple<Ref, Ref, laddr_t>
+ make_balanced(
+ op_context_t c,
+ Ref &_right,
+ bool prefer_left) {
+ ceph_assert(_right->get_type() == get_type());
+ auto &right = *_right->cast<LBALeafNode>();
+ auto replacement_left = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto replacement_right = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+
+ auto pivot = balance_into_new_nodes(
+ *this,
+ right,
+ prefer_left,
+ *replacement_left,
+ *replacement_right);
+
+ replacement_left->pin.set_range(replacement_left->get_meta());
+ replacement_right->pin.set_range(replacement_right->get_meta());
+ return std::make_tuple(
+ replacement_left,
+ replacement_right,
+ pivot);
+ }
+
+ // See LBAInternalNode, same concept
+ void resolve_relative_addrs(paddr_t base);
+ void node_resolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ auto val = i->get_val();
+ if (val.paddr.is_relative()) {
+ assert(val.paddr.is_block_relative());
+ val.paddr = get_paddr().add_relative(val.paddr);
+ i->set_val(val);
+ }
+ }
+ }
+ }
+  // Inverse of node_resolve_vals: on initial-pending extents, convert the
+  // record-relative paddrs back into relative deltas prior to write.
+  void node_unresolve_vals(iterator from, iterator to) const final {
+    if (is_initial_pending()) {
+      for (auto i = from; i != to; ++i) {
+	auto val = i->get_val();
+	if (val.paddr.is_relative()) {
+	  assert(val.paddr.is_record_relative());
+	  val.paddr = val.paddr - get_paddr();
+	  i->set_val(val);
+	}
+      }
+    }
+  }
+
+ ceph::bufferlist get_delta() final {
+ assert(!delta_buffer.empty());
+ ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+ delta_buffer.copy_out(bptr.c_str(), bptr.length());
+ ceph::bufferlist bl;
+ bl.push_back(bptr);
+ return bl;
+ }
+
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &_bl) final {
+ assert(_bl.length());
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ delta_buffer_t buffer;
+ buffer.copy_in(bl.front().c_str(), bl.front().length());
+ buffer.replay(*this);
+ set_last_committed_crc(get_crc32c());
+ resolve_relative_addrs(base);
+ }
+
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ std::ostream &print_detail(std::ostream &out) const final;
+
+ bool at_max_capacity() const {
+ return get_size() == get_capacity();
+ }
+
+ bool at_min_capacity() const {
+ return get_size() == (get_capacity() / 2);
+ }
+};
+using LBALeafNodeRef = TCachedExtentRef<LBALeafNode>;
}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include <sys/mman.h>
-#include <string.h>
-
-#include <memory>
-#include <string.h>
-
-#include "include/buffer.h"
-#include "include/byteorder.h"
-
-#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
-
-namespace {
- seastar::logger& logger() {
- return crimson::get_logger(ceph_subsys_seastore);
- }
-}
-
-namespace crimson::os::seastore::lba_manager::btree {
-
-std::ostream &LBAInternalNode::print_detail(std::ostream &out) const
-{
- return out << ", size=" << get_size()
- << ", meta=" << get_meta();
-}
-
-LBAInternalNode::lookup_ret LBAInternalNode::lookup(
- op_context_t c,
- laddr_t addr,
- depth_t depth)
-{
- auto meta = get_meta();
- if (depth == get_meta().depth) {
- return lookup_ret(
- interruptible::ready_future_marker{},
- this);
- }
- assert(meta.begin <= addr);
- assert(meta.end > addr);
-
- [[maybe_unused]] auto [iter, biter] = bound(addr, addr + 1);
- assert(iter != biter);
- assert(iter + 1 == biter);
- return get_lba_btree_extent(
- c,
- this,
- meta.depth - 1,
- iter->get_val(),
- get_paddr()).si_then([c, addr, depth](auto child) {
- return child->lookup(c, addr, depth);
- }).finally([ref=LBANodeRef(this)] {});
-}
-
-LBAInternalNode::lookup_range_ret LBAInternalNode::lookup_range(
- op_context_t c,
- laddr_t addr,
- extent_len_t len)
-{
- auto [begin, end] = bound(addr, addr + len);
- auto result_up = std::make_unique<lba_pin_list_t>();
- auto &result = *result_up;
- return trans_intr::do_for_each(
- std::move(begin),
- std::move(end),
- [this, c, &result, addr, len](const auto &val) {
- return get_lba_btree_extent(
- c,
- this,
- get_meta().depth - 1,
- val.get_val(),
- get_paddr()).si_then(
- [c, &result, addr, len](auto extent) {
- return extent->lookup_range(
- c,
- addr,
- len).si_then(
- [&result](auto pin_list) {
- result.splice(result.end(), pin_list,
- pin_list.begin(), pin_list.end());
- });
- });
- }).si_then([result=std::move(result_up), ref=LBANodeRef(this)] {
- return lookup_range_iertr::make_ready_future<lba_pin_list_t>(
- std::move(*result));
- });
-}
-
-LBAInternalNode::lookup_pin_ret LBAInternalNode::lookup_pin(
- op_context_t c,
- laddr_t addr)
-{
- auto iter = get_containing_child(addr);
- return get_lba_btree_extent(
- c,
- this,
- get_meta().depth - 1,
- iter->get_val(),
- get_paddr()
- ).si_then([c, addr](LBANodeRef extent) {
- return extent->lookup_pin(c, addr);
- }).finally([ref=LBANodeRef(this)] {});
-}
-
-LBAInternalNode::insert_ret LBAInternalNode::insert(
- op_context_t c,
- laddr_t laddr,
- lba_map_val_t val)
-{
- auto insertion_pt = get_containing_child(laddr);
- return get_lba_btree_extent(
- c,
- this,
- get_meta().depth - 1,
- insertion_pt->get_val(),
- get_paddr()).si_then(
- [this, insertion_pt, c, laddr, val=std::move(val)](
- auto extent) mutable {
- return extent->at_max_capacity() ?
- split_entry(c, laddr, insertion_pt, extent) :
- insert_iertr::make_ready_future<LBANodeRef>(std::move(extent));
- }).si_then([c, laddr, val=std::move(val)](
- LBANodeRef extent) mutable {
- return extent->insert(c, laddr, val);
- });
-}
-
-LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping(
- op_context_t c,
- laddr_t laddr,
- mutate_func_t &&f)
-{
- return mutate_mapping_internal(c, laddr, true, std::move(f));
-}
-
-LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping_internal(
- op_context_t c,
- laddr_t laddr,
- bool is_root,
- mutate_func_t &&f)
-{
- auto mutation_pt = get_containing_child(laddr);
- if (mutation_pt == end()) {
- assert(0 == "impossible");
- return crimson::ct_error::enoent::make();
- }
- return get_lba_btree_extent(
- c,
- this,
- get_meta().depth - 1,
- mutation_pt->get_val(),
- get_paddr()
- ).si_then([=](LBANodeRef extent) {
- if (extent->at_min_capacity() && get_size() > 1) {
- return merge_entry(
- c,
- laddr,
- mutation_pt,
- extent,
- is_root);
- } else {
- return merge_iertr::make_ready_future<LBANodeRef>(
- std::move(extent));
- }
- }).si_then([c, laddr, f=std::move(f)](LBANodeRef extent) mutable {
- return extent->mutate_mapping_internal(c, laddr, false, std::move(f));
- });
-}
-
-LBAInternalNode::mutate_internal_address_ret LBAInternalNode::mutate_internal_address(
- op_context_t c,
- depth_t depth,
- laddr_t laddr,
- paddr_t paddr)
-{
- if (get_meta().depth == (depth + 1)) {
- if (!is_pending()) {
- return c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>(
- )->mutate_internal_address(
- c,
- depth,
- laddr,
- paddr);
- }
- auto iter = get_containing_child(laddr);
- if (iter->get_key() != laddr) {
- logger().debug(
- "LBAInternalNode::mutate_internal_address laddr {} "
- "not found in extent {}",
- laddr,
- *this);
- return crimson::ct_error::enoent::make();
- }
-
- auto old_paddr = iter->get_val();
-
- journal_update(
- iter,
- maybe_generate_relative(paddr),
- maybe_get_delta_buffer());
-
- return mutate_internal_address_ret(
- interruptible::ready_future_marker{},
- old_paddr
- );
- } else {
- auto iter = get_containing_child(laddr);
- return get_lba_btree_extent(
- c,
- this,
- get_meta().depth - 1,
- iter->get_val(),
- get_paddr()
- ).si_then([=](auto node) {
- return node->mutate_internal_address(
- c,
- depth,
- laddr,
- paddr);
- });
- }
-}
-
-LBAInternalNode::find_hole_ret LBAInternalNode::find_hole(
- op_context_t c,
- laddr_t min_addr,
- laddr_t max_addr,
- extent_len_t len)
-{
- logger().debug(
- "LBAInternalNode::find_hole min={}, max={}, len={}, *this={}",
- min_addr, max_addr, len, *this);
- auto [begin, end] = bound(min_addr, max_addr);
- return seastar::do_with(
- begin,
- L_ADDR_NULL,
- [this, c, min_addr, len, end=end](auto &i, auto &ret) {
- return trans_intr::repeat([=, &i, &ret]()
- -> find_hole_iertr::future<seastar::stop_iteration> {
- if (i == end) {
- return seastar::make_ready_future<seastar::stop_iteration>(
- seastar::stop_iteration::yes);
- }
- return get_lba_btree_extent(
- c,
- this,
- get_meta().depth - 1,
- i->get_val(),
- get_paddr()
- ).si_then([=, &i](auto extent) mutable {
- auto lb = std::max(min_addr, i->get_key());
- auto ub = i->get_next_key_or_max();
- logger().debug("LBAInternalNode::find_hole extent {} lb {} ub {}",
- *extent, lb, ub);
- return extent->find_hole(c, lb, ub, len);
- }).si_then([&i, &ret](auto addr) {
- if (addr == L_ADDR_NULL) {
- ++i;
- return seastar::make_ready_future<seastar::stop_iteration>(
- seastar::stop_iteration::no);
- } else {
- ret = addr;
- return seastar::make_ready_future<seastar::stop_iteration>(
- seastar::stop_iteration::yes);
- }
- });
- }).si_then([&ret, ref=LBANodeRef(this)]() {
- return ret;
- });
- });
-}
-
-LBAInternalNode::scan_mappings_ret LBAInternalNode::scan_mappings(
- op_context_t c,
- laddr_t begin,
- laddr_t end,
- scan_mappings_func_t &f)
-{
- auto [biter, eiter] = bound(begin, end);
- return trans_intr::do_for_each(
- std::move(biter),
- std::move(eiter),
- [=, &f](auto &viter) {
- return get_lba_btree_extent(
- c,
- this,
- get_meta().depth - 1,
- viter->get_val(),
- get_paddr()).si_then([=, &f](auto child) {
- return child->scan_mappings(c, begin, end, f);
- });
- }).si_then([ref=LBANodeRef(this)] {});
-}
-
-LBAInternalNode::scan_mapped_space_ret LBAInternalNode::scan_mapped_space(
- op_context_t c,
- scan_mapped_space_func_t &f)
-{
- f(get_paddr(), get_length());
- return trans_intr::do_for_each(
- begin(), end(),
- [=, &f](auto &viter) {
- return get_lba_btree_extent(
- c,
- this,
- get_meta().depth - 1,
- viter->get_val(),
- get_paddr()).si_then([=, &f](auto child) {
- return child->scan_mapped_space(c, f);
- });
- }).si_then([ref=LBANodeRef(this)]{});
-}
-
-
-void LBAInternalNode::resolve_relative_addrs(paddr_t base)
-{
- for (auto i: *this) {
- if (i->get_val().is_relative()) {
- auto updated = base.add_relative(i->get_val());
- logger().debug(
- "LBAInternalNode::resolve_relative_addrs {} -> {}",
- i->get_val(),
- updated);
- i->set_val(updated);
- }
- }
-}
-
-
-LBAInternalNode::split_ret
-LBAInternalNode::split_entry(
- op_context_t c,
- laddr_t addr,
- internal_iterator_t iter, LBANodeRef entry)
-{
- if (!is_pending()) {
- auto mut = c.cache.duplicate_for_write(
- c.trans, this)->cast<LBAInternalNode>();
- auto mut_iter = mut->iter_idx(iter->get_offset());
- return mut->split_entry(c, addr, mut_iter, entry);
- }
-
- ceph_assert(!at_max_capacity());
- auto [left, right, pivot] = entry->make_split_children(c);
-
- journal_update(
- iter,
- maybe_generate_relative(left->get_paddr()),
- maybe_get_delta_buffer());
- journal_insert(
- iter + 1,
- pivot,
- maybe_generate_relative(right->get_paddr()),
- maybe_get_delta_buffer());
-
- c.cache.retire_extent(c.trans, entry);
-
- logger().debug(
- "LBAInternalNode::split_entry *this {} entry {} into left {} right {}",
- *this,
- *entry,
- *left,
- *right);
-
- return split_iertr::make_ready_future<LBANodeRef>(
- pivot > addr ? left : right
- );
-}
-
-LBAInternalNode::merge_ret
-LBAInternalNode::merge_entry(
- op_context_t c,
- laddr_t addr,
- internal_iterator_t iter,
- LBANodeRef entry,
- bool is_root)
-{
- if (!is_pending()) {
- auto mut = c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>();
- auto mut_iter = mut->iter_idx(iter->get_offset());
- return mut->merge_entry(c, addr, mut_iter, entry, is_root);
- }
-
- logger().debug(
- "LBAInternalNode: merge_entry: {}, {}",
- *this,
- *entry);
- auto donor_is_left = (iter + 1) == end();
- auto donor_iter = donor_is_left ? iter - 1 : iter + 1;
- return get_lba_btree_extent(
- c,
- this,
- get_meta().depth - 1,
- donor_iter->get_val(),
- get_paddr()
- ).si_then([=](auto donor) mutable {
- auto [l, r] = donor_is_left ?
- std::make_pair(donor, entry) : std::make_pair(entry, donor);
- auto [liter, riter] = donor_is_left ?
- std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);
- if (donor->at_min_capacity()) {
- auto replacement = l->make_full_merge(
- c,
- r);
-
- journal_update(
- liter,
- maybe_generate_relative(replacement->get_paddr()),
- maybe_get_delta_buffer());
- journal_remove(riter, maybe_get_delta_buffer());
-
- c.cache.retire_extent(c.trans, l);
- c.cache.retire_extent(c.trans, r);
-
- if (is_root && get_size() == 1) {
- return c.cache.get_root(c.trans).si_then([=](RootBlockRef croot) {
- {
- auto mut_croot = c.cache.duplicate_for_write(c.trans, croot);
- croot = mut_croot->cast<RootBlock>();
- }
- auto new_root_addr = begin()->get_val().maybe_relative_to(get_paddr());
- auto new_depth = get_meta().depth - 1;
- croot->get_root().lba_root = lba_root_t{
- new_root_addr,
- new_depth};
- c.trans.get_lba_tree_stats().depth = new_depth;
- logger().debug(
- "LBAInternalNode::merge_entry: collapsing root {} to addr {}",
- *this,
- new_root_addr);
- c.cache.retire_extent(c.trans, this);
- return merge_iertr::make_ready_future<LBANodeRef>(replacement);
- });
- } else {
- return merge_iertr::make_ready_future<LBANodeRef>(replacement);
- }
- } else {
- logger().debug(
- "LBAInternalEntry::merge_entry balanced l {} r {}",
- *l,
- *r);
- auto [replacement_l, replacement_r, pivot] =
- l->make_balanced(
- c,
- r,
- !donor_is_left);
-
- journal_update(
- liter,
- maybe_generate_relative(replacement_l->get_paddr()),
- maybe_get_delta_buffer());
- journal_replace(
- riter,
- pivot,
- maybe_generate_relative(replacement_r->get_paddr()),
- maybe_get_delta_buffer());
-
- c.cache.retire_extent(c.trans, l);
- c.cache.retire_extent(c.trans, r);
- return merge_iertr::make_ready_future<LBANodeRef>(
- addr >= pivot ? replacement_r : replacement_l
- );
- }
- });
-}
-
-
-LBAInternalNode::internal_iterator_t
-LBAInternalNode::get_containing_child(laddr_t laddr)
-{
- // TODO: binary search
- for (auto i = begin(); i != end(); ++i) {
- if (i.contains(laddr))
- return i;
- }
- ceph_assert(0 == "invalid");
- return end();
-}
-
-std::ostream &LBALeafNode::print_detail(std::ostream &out) const
-{
- return out << ", size=" << get_size()
- << ", meta=" << get_meta();
-}
-
-LBALeafNode::lookup_range_ret LBALeafNode::lookup_range(
- op_context_t c,
- laddr_t addr,
- extent_len_t len)
-{
- logger().debug(
- "LBALeafNode::lookup_range {}~{}",
- addr,
- len);
- auto ret = lba_pin_list_t();
- auto [i, end] = get_leaf_entries(addr, len);
- for (; i != end; ++i) {
- auto val = i->get_val();
- auto begin = i->get_key();
- ret.emplace_back(
- std::make_unique<BtreeLBAPin>(
- this,
- val.paddr.maybe_relative_to(get_paddr()),
- lba_node_meta_t{ begin, begin + val.len, 0}));
- }
- return lookup_range_iertr::make_ready_future<lba_pin_list_t>(
- std::move(ret));
-}
-
-LBALeafNode::lookup_pin_ret LBALeafNode::lookup_pin(
- op_context_t c,
- laddr_t addr)
-{
- logger().debug("LBALeafNode::lookup_pin {}", addr);
- auto iter = find(addr);
- if (iter == end()) {
- return crimson::ct_error::enoent::make();
- }
- auto val = iter->get_val();
- auto begin = iter->get_key();
- return lookup_pin_ret(
- interruptible::ready_future_marker{},
- std::make_unique<BtreeLBAPin>(
- this,
- val.paddr.maybe_relative_to(get_paddr()),
- lba_node_meta_t{ begin, begin + val.len, 0}));
-}
-
-LBALeafNode::insert_ret LBALeafNode::insert(
- op_context_t c,
- laddr_t laddr,
- lba_map_val_t val)
-{
- ceph_assert(!at_max_capacity());
-
- if (!is_pending()) {
- return c.cache.duplicate_for_write(c.trans, this
- )->cast<LBALeafNode>()->insert(c, laddr, val);
- }
-
- val.paddr = maybe_generate_relative(val.paddr);
- logger().debug(
- "LBALeafNode::insert: inserting {}~{} -> {}",
- laddr,
- val.len,
- val.paddr);
-
- auto insert_pt = lower_bound(laddr);
- journal_insert(insert_pt, laddr, val, maybe_get_delta_buffer());
-
- logger().debug(
- "LBALeafNode::insert: inserted {}~{} -> {}",
- insert_pt.get_key(),
- insert_pt.get_val().len,
- insert_pt.get_val().paddr);
- auto begin = insert_pt.get_key();
- return insert_ret(
- interruptible::ready_future_marker{},
- std::make_unique<BtreeLBAPin>(
- this,
- val.paddr.maybe_relative_to(get_paddr()),
- lba_node_meta_t{ begin, begin + val.len, 0}));
-}
-
-LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping(
- op_context_t c,
- laddr_t laddr,
- mutate_func_t &&f)
-{
- return mutate_mapping_internal(c, laddr, true, std::move(f));
-}
-
-LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping_internal(
- op_context_t c,
- laddr_t laddr,
- bool is_root,
- mutate_func_t &&f)
-{
- auto mutation_pt = find(laddr);
- if (mutation_pt == end()) {
- return crimson::ct_error::enoent::make();
- }
-
- if (!is_pending()) {
- return c.cache.duplicate_for_write(c.trans, this)->cast<LBALeafNode>(
- )->mutate_mapping_internal(
- c,
- laddr,
- is_root,
- std::move(f));
- }
-
- auto cur = mutation_pt.get_val();
- auto mutated = f(cur);
-
- mutated.paddr = maybe_generate_relative(mutated.paddr);
-
- logger().debug(
- "{}: mutate addr {}: {} -> {}",
- __func__,
- laddr,
- cur.paddr,
- mutated.paddr);
-
- if (mutated.refcount > 0) {
- journal_update(mutation_pt, mutated, maybe_get_delta_buffer());
- return mutate_mapping_ret(
- interruptible::ready_future_marker{},
- mutated);
- } else {
- ++(c.trans.get_lba_tree_stats().num_erases);
- journal_remove(mutation_pt, maybe_get_delta_buffer());
- return mutate_mapping_ret(
- interruptible::ready_future_marker{},
- mutated);
- }
-}
-
-LBALeafNode::mutate_internal_address_ret LBALeafNode::mutate_internal_address(
- op_context_t c,
- depth_t depth,
- laddr_t laddr,
- paddr_t paddr)
-{
- ceph_assert(0 == "Impossible");
- return mutate_internal_address_ret(
- interruptible::ready_future_marker{},
- paddr);
-}
-
-LBALeafNode::find_hole_ret LBALeafNode::find_hole(
- op_context_t c,
- laddr_t min,
- laddr_t max,
- extent_len_t len)
-{
- logger().debug(
- "LBALeafNode::find_hole min={} max={}, len={}, *this={}",
- min, max, len, *this);
- auto [liter, uiter] = bound(min, max);
- for (auto i = liter; i != uiter; ++i) {
- auto ub = i->get_key();
- if (min + len <= ub) {
- return find_hole_ret(
- interruptible::ready_future_marker{},
- min);
- } else {
- min = i->get_key() + i->get_val().len;
- }
- }
- if (min + len <= max) {
- return find_hole_ret(
- interruptible::ready_future_marker{},
- min);
- } else {
- return find_hole_ret(
- interruptible::ready_future_marker{},
- L_ADDR_MAX);
- }
-}
-
-LBALeafNode::scan_mappings_ret LBALeafNode::scan_mappings(
- op_context_t c,
- laddr_t begin,
- laddr_t end,
- scan_mappings_func_t &f)
-{
- auto [biter, eiter] = bound(begin, end);
- for (auto i = biter; i != eiter; ++i) {
- auto val = i->get_val();
- f(i->get_key(), val.paddr, val.len);
- }
- return scan_mappings_iertr::now();
-}
-
-LBALeafNode::scan_mapped_space_ret LBALeafNode::scan_mapped_space(
- op_context_t c,
- scan_mapped_space_func_t &f)
-{
- f(get_paddr(), get_length());
- for (auto i = begin(); i != end(); ++i) {
- auto val = i->get_val();
- f(val.paddr, val.len);
- }
- return scan_mappings_iertr::now();
-}
-
-
-void LBALeafNode::resolve_relative_addrs(paddr_t base)
-{
- for (auto i: *this) {
- if (i->get_val().paddr.is_relative()) {
- auto val = i->get_val();
- val.paddr = base.add_relative(val.paddr);
- logger().debug(
- "LBALeafNode::resolve_relative_addrs {} -> {}",
- i->get_val().paddr,
- val.paddr);
- i->set_val(val);
- }
- }
-}
-
-std::pair<LBALeafNode::internal_iterator_t, LBALeafNode::internal_iterator_t>
-LBALeafNode::get_leaf_entries(laddr_t addr, extent_len_t len)
-{
- return bound(addr, addr + len);
-}
-
-get_lba_node_ret get_lba_btree_extent(
- op_context_t c,
- CachedExtentRef parent,
- depth_t depth,
- paddr_t offset,
- paddr_t base)
-{
- offset = offset.maybe_relative_to(base);
- ceph_assert(depth > 0);
- if (depth > 1) {
- logger().debug(
- "get_lba_btree_extent: reading internal at offset {}, depth {}",
- offset,
- depth);
- return c.cache.get_extent<LBAInternalNode>(
- c.trans,
- offset,
- LBA_BLOCK_SIZE).si_then([c, parent](auto ret)
- -> get_lba_node_ret {
- auto meta = ret->get_meta();
- if (ret->get_size()) {
- ceph_assert(meta.begin <= ret->begin()->get_key());
- ceph_assert(meta.end > (ret->end() - 1)->get_key());
- }
- if (parent->has_been_invalidated() || ret->has_been_invalidated()) {
- logger().debug(
- "get_lba_btree_extent: parent {} or ret {} is invalid, transaction {} is conflicted: {}",
- *parent,
- *ret,
- (void*)&c.trans,
- c.trans.is_conflicted());
- assert(!(parent->has_been_invalidated() || ret->has_been_invalidated()));
- }
- if (!ret->is_pending() && !ret->pin.is_linked()) {
- ret->pin.set_range(meta);
- c.pins.add_pin(ret->pin);
- }
- return get_lba_node_ret(
- interruptible::ready_future_marker{},
- LBANodeRef(ret.detach(), /* add_ref = */ false));
- });
- } else {
- logger().debug(
- "get_lba_btree_extent: reading leaf at offset {}, depth {}",
- offset,
- depth);
- return c.cache.get_extent<LBALeafNode>(
- c.trans,
- offset,
- LBA_BLOCK_SIZE).si_then([offset, c, parent](auto ret)
- -> get_lba_node_ret {
- logger().debug(
- "get_lba_btree_extent: read leaf at offset {} {}, parent {}",
- offset,
- *ret,
- *parent);
- auto meta = ret->get_meta();
- if (ret->get_size()) {
- ceph_assert(meta.begin <= ret->begin()->get_key());
- ceph_assert(meta.end > (ret->end() - 1)->get_key());
- }
- if (parent->has_been_invalidated() || ret->has_been_invalidated()) {
- logger().debug(
- "get_lba_btree_extent: parent {} or ret {} is invalid, transaction {} is conflicted: {}",
- *parent,
- *ret,
- (void*)&c.trans,
- c.trans.is_conflicted());
- assert(!(parent->has_been_invalidated() || ret->has_been_invalidated()));
- }
- if (!ret->is_pending() && !ret->pin.is_linked()) {
- ret->pin.set_range(meta);
- c.pins.add_pin(ret->pin);
- }
- return get_lba_node_ret(
- interruptible::ready_future_marker{},
- LBANodeRef(ret.detach(), /* add_ref = */ false));
- });
- }
-}
-
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#pragma once
-
-#include <sys/mman.h>
-#include <string.h>
-
-#include <memory>
-#include <string.h>
-
-#include "include/buffer.h"
-
-#include "crimson/common/fixed_kv_node_layout.h"
-#include "crimson/common/errorator.h"
-#include "crimson/os/seastore/lba_manager.h"
-#include "crimson/os/seastore/seastore_types.h"
-#include "crimson/os/seastore/cache.h"
-#include "crimson/os/seastore/cached_extent.h"
-#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
-#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
-
-namespace crimson::os::seastore::lba_manager::btree {
-
-constexpr size_t LBA_BLOCK_SIZE = 4096;
-
-/**
- * lba_node_meta_le_t
- *
- * On disk layout for lba_node_meta_t
- */
-struct lba_node_meta_le_t {
- laddr_le_t begin = laddr_le_t(0);
- laddr_le_t end = laddr_le_t(0);
- depth_le_t depth = init_depth_le(0);
-
- lba_node_meta_le_t() = default;
- lba_node_meta_le_t(const lba_node_meta_le_t &) = default;
- explicit lba_node_meta_le_t(const lba_node_meta_t &val)
- : begin(ceph_le64(val.begin)),
- end(ceph_le64(val.end)),
- depth(init_depth_le(val.depth)) {}
-
- operator lba_node_meta_t() const {
- return lba_node_meta_t{ begin, end, depth };
- }
-};
-
-
-/**
- * LBAInternalNode
- *
- * Abstracts operations on and layout of internal nodes for the
- * LBA Tree.
- *
- * Layout (4k):
- * size : uint32_t[1] 4b
- * (padding) : 4b
- * meta : lba_node_meta_le_t[3] (1*24)b
- * keys : laddr_t[255] (254*8)b
- * values : paddr_t[255] (254*8)b
- * = 4096
-
- * TODO: make the above capacity calculation part of FixedKVNodeLayout
- * TODO: the above alignment probably isn't portable without further work
- */
-constexpr size_t INTERNAL_NODE_CAPACITY = 254;
-struct LBAInternalNode
- : LBANode,
- common::FixedKVNodeLayout<
- INTERNAL_NODE_CAPACITY,
- lba_node_meta_t, lba_node_meta_le_t,
- laddr_t, laddr_le_t,
- paddr_t, paddr_le_t> {
- using internal_iterator_t = const_iterator;
- template <typename... T>
- LBAInternalNode(T&&... t) :
- LBANode(std::forward<T>(t)...),
- FixedKVNodeLayout(get_bptr().c_str()) {}
-
- static constexpr extent_types_t TYPE = extent_types_t::LADDR_INTERNAL;
-
- lba_node_meta_t get_node_meta() const final { return get_meta(); }
-
- CachedExtentRef duplicate_for_write() final {
- assert(delta_buffer.empty());
- return CachedExtentRef(new LBAInternalNode(*this));
- };
-
- delta_buffer_t delta_buffer;
- delta_buffer_t *maybe_get_delta_buffer() {
- return is_mutation_pending() ? &delta_buffer : nullptr;
- }
-
- lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final;
-
- lookup_range_ret lookup_range(
- op_context_t c,
- laddr_t addr,
- extent_len_t len) final;
-
- lookup_pin_ret lookup_pin(
- op_context_t c,
- laddr_t addr) final;
-
- insert_ret insert(
- op_context_t c,
- laddr_t laddr,
- lba_map_val_t val) final;
-
- mutate_mapping_ret mutate_mapping(
- op_context_t c,
- laddr_t laddr,
- mutate_func_t &&f) final;
- mutate_mapping_ret mutate_mapping_internal(
- op_context_t c,
- laddr_t laddr,
- bool is_root,
- mutate_func_t &&f) final;
-
- mutate_internal_address_ret mutate_internal_address(
- op_context_t c,
- depth_t depth,
- laddr_t laddr,
- paddr_t paddr) final;
-
- find_hole_ret find_hole(
- op_context_t c,
- laddr_t min,
- laddr_t max,
- extent_len_t len) final;
-
- scan_mappings_ret scan_mappings(
- op_context_t c,
- laddr_t begin,
- laddr_t end,
- scan_mappings_func_t &f) final;
-
- scan_mapped_space_ret scan_mapped_space(
- op_context_t c,
- scan_mapped_space_func_t &f) final;
-
- std::tuple<LBANodeRef, LBANodeRef, laddr_t>
- make_split_children(op_context_t c) final {
- auto left = c.cache.alloc_new_extent<LBAInternalNode>(
- c.trans, LBA_BLOCK_SIZE);
- auto right = c.cache.alloc_new_extent<LBAInternalNode>(
- c.trans, LBA_BLOCK_SIZE);
- auto pivot = split_into(*left, *right);
- left->pin.set_range(left->get_meta());
- right->pin.set_range(right->get_meta());
- return std::make_tuple(
- left,
- right,
- pivot);
- }
-
- LBANodeRef make_full_merge(
- op_context_t c,
- LBANodeRef &right) final {
- auto replacement = c.cache.alloc_new_extent<LBAInternalNode>(
- c.trans, LBA_BLOCK_SIZE);
- replacement->merge_from(*this, *right->cast<LBAInternalNode>());
- replacement->pin.set_range(replacement->get_meta());
- return replacement;
- }
-
- std::tuple<LBANodeRef, LBANodeRef, laddr_t>
- make_balanced(
- op_context_t c,
- LBANodeRef &_right,
- bool prefer_left) final {
- ceph_assert(_right->get_type() == TYPE);
- auto &right = *_right->cast<LBAInternalNode>();
- auto replacement_left = c.cache.alloc_new_extent<LBAInternalNode>(
- c.trans, LBA_BLOCK_SIZE);
- auto replacement_right = c.cache.alloc_new_extent<LBAInternalNode>(
- c.trans, LBA_BLOCK_SIZE);
-
- auto pivot = balance_into_new_nodes(
- *this,
- right,
- prefer_left,
- *replacement_left,
- *replacement_right);
-
- replacement_left->pin.set_range(replacement_left->get_meta());
- replacement_right->pin.set_range(replacement_right->get_meta());
- return std::make_tuple(
- replacement_left,
- replacement_right,
- pivot);
- }
-
- /**
- * Internal relative addresses on read or in memory prior to commit
- * are either record or block relative depending on whether this
- * physical node is is_initial_pending() or just is_pending().
- *
- * User passes appropriate base depending on lifecycle and
- * resolve_relative_addrs fixes up relative internal references
- * based on base.
- */
- void resolve_relative_addrs(paddr_t base) final;
- void node_resolve_vals(iterator from, iterator to) const final {
- if (is_initial_pending()) {
- for (auto i = from; i != to; ++i) {
- if (i->get_val().is_relative()) {
- assert(i->get_val().is_block_relative());
- i->set_val(get_paddr().add_relative(i->get_val()));
- }
- }
- }
- }
- void node_unresolve_vals(iterator from, iterator to) const final {
- if (is_initial_pending()) {
- for (auto i = from; i != to; ++i) {
- if (i->get_val().is_relative()) {
- assert(i->get_val().is_record_relative());
- i->set_val(i->get_val() - get_paddr());
- }
- }
- }
- }
-
- extent_types_t get_type() const final {
- return TYPE;
- }
-
- std::ostream &print_detail(std::ostream &out) const final;
-
- ceph::bufferlist get_delta() final {
- ceph::buffer::ptr bptr(delta_buffer.get_bytes());
- delta_buffer.copy_out(bptr.c_str(), bptr.length());
- ceph::bufferlist bl;
- bl.push_back(bptr);
- return bl;
- }
-
- void apply_delta_and_adjust_crc(
- paddr_t base, const ceph::bufferlist &_bl) final {
- assert(_bl.length());
- ceph::bufferlist bl = _bl;
- bl.rebuild();
- delta_buffer_t buffer;
- buffer.copy_in(bl.front().c_str(), bl.front().length());
- buffer.replay(*this);
- set_last_committed_crc(get_crc32c());
- resolve_relative_addrs(base);
- }
-
- bool at_max_capacity() const final {
- return get_size() == get_capacity();
- }
-
- bool at_min_capacity() const {
- return get_size() == (get_capacity() / 2);
- }
-
- /// returns iterators containing [l, r)
- std::pair<internal_iterator_t, internal_iterator_t> bound(
- laddr_t l, laddr_t r) {
- // TODO: inefficient
- auto retl = begin();
- for (; retl != end(); ++retl) {
- if (retl->get_next_key_or_max() > l)
- break;
- }
- auto retr = retl;
- for (; retr != end(); ++retr) {
- if (retr->get_key() >= r)
- break;
- }
- return std::make_pair(retl, retr);
- }
-
- using split_iertr = base_iertr;
- using split_ret = split_iertr::future<LBANodeRef>;
- split_ret split_entry(
- op_context_t c,
- laddr_t addr,
- internal_iterator_t,
- LBANodeRef entry);
-
- using merge_iertr = base_iertr;
- using merge_ret = merge_iertr::future<LBANodeRef>;
- merge_ret merge_entry(
- op_context_t c,
- laddr_t addr,
- internal_iterator_t,
- LBANodeRef entry,
- bool is_root);
-
- /// returns iterator for subtree containing laddr
- internal_iterator_t get_containing_child(laddr_t laddr);
-};
-
-/**
- * LBALeafNode
- *
- * Abstracts operations on and layout of leaf nodes for the
- * LBA Tree.
- *
- * Layout (4k):
- * size : uint32_t[1] 4b
- * (padding) : 4b
- * meta : lba_node_meta_le_t[3] (1*24)b
- * keys : laddr_t[170] (145*8)b
- * values : lba_map_val_t[170] (145*20)b
- * = 4092
- *
- * TODO: update FixedKVNodeLayout to handle the above calculation
- * TODO: the above alignment probably isn't portable without further work
- */
-constexpr size_t LEAF_NODE_CAPACITY = 145;
-
-/**
- * lba_map_val_le_t
- *
- * On disk layout for lba_map_val_t.
- */
-struct lba_map_val_le_t {
- extent_len_le_t len = init_extent_len_le(0);
- paddr_le_t paddr;
- ceph_le32 refcount{0};
- ceph_le32 checksum{0};
-
- lba_map_val_le_t() = default;
- lba_map_val_le_t(const lba_map_val_le_t &) = default;
- explicit lba_map_val_le_t(const lba_map_val_t &val)
- : len(init_extent_len_le(val.len)),
- paddr(paddr_le_t(val.paddr)),
- refcount(val.refcount),
- checksum(val.checksum) {}
-
- operator lba_map_val_t() const {
- return lba_map_val_t{ len, paddr, refcount, checksum };
- }
-};
-
-struct LBALeafNode
- : LBANode,
- common::FixedKVNodeLayout<
- LEAF_NODE_CAPACITY,
- lba_node_meta_t, lba_node_meta_le_t,
- laddr_t, laddr_le_t,
- lba_map_val_t, lba_map_val_le_t> {
- using internal_iterator_t = const_iterator;
- template <typename... T>
- LBALeafNode(T&&... t) :
- LBANode(std::forward<T>(t)...),
- FixedKVNodeLayout(get_bptr().c_str()) {}
-
- static constexpr extent_types_t TYPE = extent_types_t::LADDR_LEAF;
-
- lba_node_meta_t get_node_meta() const final { return get_meta(); }
-
- CachedExtentRef duplicate_for_write() final {
- assert(delta_buffer.empty());
- return CachedExtentRef(new LBALeafNode(*this));
- };
-
- delta_buffer_t delta_buffer;
- delta_buffer_t *maybe_get_delta_buffer() {
- return is_mutation_pending() ? &delta_buffer : nullptr;
- }
-
- lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final
- {
- return lookup_ret(
- interruptible::ready_future_marker{},
- this);
- }
-
- lookup_range_ret lookup_range(
- op_context_t c,
- laddr_t addr,
- extent_len_t len) final;
-
- lookup_pin_ret lookup_pin(
- op_context_t c,
- laddr_t addr) final;
-
- insert_ret insert(
- op_context_t c,
- laddr_t laddr,
- lba_map_val_t val) final;
-
- mutate_mapping_ret mutate_mapping(
- op_context_t c,
- laddr_t laddr,
- mutate_func_t &&f) final;
- mutate_mapping_ret mutate_mapping_internal(
- op_context_t c,
- laddr_t laddr,
- bool is_root,
- mutate_func_t &&f) final;
-
- mutate_internal_address_ret mutate_internal_address(
- op_context_t c,
- depth_t depth,
- laddr_t laddr,
- paddr_t paddr) final;
-
- find_hole_ret find_hole(
- op_context_t c,
- laddr_t min,
- laddr_t max,
- extent_len_t len) final;
-
- scan_mappings_ret scan_mappings(
- op_context_t c,
- laddr_t begin,
- laddr_t end,
- scan_mappings_func_t &f) final;
-
- scan_mapped_space_ret scan_mapped_space(
- op_context_t c,
- scan_mapped_space_func_t &f) final;
-
- std::tuple<LBANodeRef, LBANodeRef, laddr_t>
- make_split_children(op_context_t c) final {
- auto left = c.cache.alloc_new_extent<LBALeafNode>(
- c.trans, LBA_BLOCK_SIZE);
- auto right = c.cache.alloc_new_extent<LBALeafNode>(
- c.trans, LBA_BLOCK_SIZE);
- auto pivot = split_into(*left, *right);
- left->pin.set_range(left->get_meta());
- right->pin.set_range(right->get_meta());
- return std::make_tuple(
- left,
- right,
- pivot);
- }
-
- LBANodeRef make_full_merge(
- op_context_t c,
- LBANodeRef &right) final {
- auto replacement = c.cache.alloc_new_extent<LBALeafNode>(
- c.trans, LBA_BLOCK_SIZE);
- replacement->merge_from(*this, *right->cast<LBALeafNode>());
- replacement->pin.set_range(replacement->get_meta());
- return replacement;
- }
-
- std::tuple<LBANodeRef, LBANodeRef, laddr_t>
- make_balanced(
- op_context_t c,
- LBANodeRef &_right,
- bool prefer_left) final {
- ceph_assert(_right->get_type() == TYPE);
- auto &right = *_right->cast<LBALeafNode>();
- auto replacement_left = c.cache.alloc_new_extent<LBALeafNode>(
- c.trans, LBA_BLOCK_SIZE);
- auto replacement_right = c.cache.alloc_new_extent<LBALeafNode>(
- c.trans, LBA_BLOCK_SIZE);
-
- auto pivot = balance_into_new_nodes(
- *this,
- right,
- prefer_left,
- *replacement_left,
- *replacement_right);
-
- replacement_left->pin.set_range(replacement_left->get_meta());
- replacement_right->pin.set_range(replacement_right->get_meta());
- return std::make_tuple(
- replacement_left,
- replacement_right,
- pivot);
- }
-
- // See LBAInternalNode, same concept
- void resolve_relative_addrs(paddr_t base) final;
- void node_resolve_vals(iterator from, iterator to) const final {
- if (is_initial_pending()) {
- for (auto i = from; i != to; ++i) {
- auto val = i->get_val();
- if (val.paddr.is_relative()) {
- assert(val.paddr.is_block_relative());
- val.paddr = get_paddr().add_relative(val.paddr);
- i->set_val(val);
- }
- }
- }
- }
- void node_unresolve_vals(iterator from, iterator to) const final {
- if (is_initial_pending()) {
- for (auto i = from; i != to; ++i) {
- auto val = i->get_val();
- if (val.paddr.is_relative()) {
- auto val = i->get_val();
- assert(val.paddr.is_record_relative());
- val.paddr = val.paddr - get_paddr();
- i->set_val(val);
- }
- }
- }
- }
-
- ceph::bufferlist get_delta() final {
- ceph::buffer::ptr bptr(delta_buffer.get_bytes());
- delta_buffer.copy_out(bptr.c_str(), bptr.length());
- ceph::bufferlist bl;
- bl.push_back(bptr);
- return bl;
- }
-
- void apply_delta_and_adjust_crc(
- paddr_t base, const ceph::bufferlist &_bl) final {
- assert(_bl.length());
- ceph::bufferlist bl = _bl;
- bl.rebuild();
- delta_buffer_t buffer;
- buffer.copy_in(bl.front().c_str(), bl.front().length());
- buffer.replay(*this);
- set_last_committed_crc(get_crc32c());
- resolve_relative_addrs(base);
- }
-
- extent_types_t get_type() const final {
- return TYPE;
- }
-
- std::ostream &print_detail(std::ostream &out) const final;
-
- bool at_max_capacity() const final {
- return get_size() == get_capacity();
- }
-
- bool at_min_capacity() const final {
- return get_size() == (get_capacity() / 2);
- }
-
- /// returns iterators <lb, ub> containing addresses [l, r)
- std::pair<internal_iterator_t, internal_iterator_t> bound(
- laddr_t l, laddr_t r) {
- // TODO: inefficient
- auto retl = begin();
- for (; retl != end(); ++retl) {
- if (retl->get_key() >= l || (retl->get_key() + retl->get_val().len) > l)
- break;
- }
- auto retr = retl;
- for (; retr != end(); ++retr) {
- if (retr->get_key() >= r)
- break;
- }
- return std::make_pair(retl, retr);
- }
-
- std::pair<internal_iterator_t, internal_iterator_t>
- get_leaf_entries(laddr_t addr, extent_len_t len);
-};
-using LBALeafNodeRef = TCachedExtentRef<LBALeafNode>;
-
-}
return ret;
}
- auto set_mapping(
- test_transaction_t &t,
- laddr_t addr,
- size_t len,
- paddr_t paddr) {
- auto [b, e] = get_overlap(t, addr, len);
- EXPECT_EQ(b, e);
-
- auto ret = with_trans_intr(
- *t.t,
- [=](auto &t) {
- return lba_manager->set_extent(t, addr, len, paddr);
- }).unsafe_get0();
- EXPECT_EQ(addr, ret->get_laddr());
- EXPECT_EQ(len, ret->get_length());
- EXPECT_EQ(paddr, ret->get_paddr());
- t.mappings.emplace(
- std::make_pair(
- ret->get_laddr(),
- test_extent_t{
- ret->get_paddr(),
- ret->get_length(),
- 1
- }
- ));
- return ret;
- }
-
auto decref_mapping(
test_transaction_t &t,
laddr_t addr) {