From b7eace1619d115e6003e0769977f8906a1264237 Mon Sep 17 00:00:00 2001
From: Xuehan Xu
Date: Fri, 15 May 2026 17:10:04 +0800
Subject: [PATCH] crimson/os/seastore: also update the mappings copied by
 client transactions when committing background rewriting transactions

With the 128-bit laddr key layout in place, SeaStore::rename would
involve copying mappings. These mappings must also be updated when the
logical extents they point to are rewritten.

Signed-off-by: Xuehan Xu
---
 .../os/seastore/btree/fixed_kv_btree.h        | 160 +++++++++++++++++-
 src/crimson/os/seastore/cache.cc              |   3 +
 src/crimson/os/seastore/cached_extent.cc      |  10 ++
 src/crimson/os/seastore/cached_extent.h       |   1 +
 .../os/seastore/lba/btree_lba_manager.cc      |  61 ++++++-
 .../os/seastore/lba/btree_lba_manager.h       |  13 ++
 src/crimson/os/seastore/linked_tree_node.h    |  15 ++
 src/crimson/os/seastore/transaction.h         |  33 +++-
 8 files changed, 280 insertions(+), 16 deletions(-)

diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h
index 8d70ad3e866f..c31b713ecff6 100644
--- a/src/crimson/os/seastore/btree/fixed_kv_btree.h
+++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h
@@ -26,6 +26,11 @@ phy_tree_root_t& get_phy_tree_root(root_t& r);
 using get_phy_tree_root_node_ret =
   std::pair>;
 
+template
+CachedExtentRef get_phy_tree_root_node_sync( // non-future counterpart of get_phy_tree_root_node
+  const RootBlockRef &root_block,
+  op_context_t c);
+
 template
 const get_phy_tree_root_node_ret get_phy_tree_root_node(
   const RootBlockRef &root_block,
@@ -467,6 +472,10 @@ public:
     return get_phy_tree_root_node(root_block, c);
   }
 
+  auto get_root_node_sync(op_context_t c) const { // root node as a plain CachedExtentRef
+    return get_phy_tree_root_node_sync(root_block, c);
+  }
+
   /// mkfs
   using mkfs_ret = phy_tree_root_t;
   static mkfs_ret mkfs(RootBlockRef &root_block, op_context_t c) {
@@ -521,6 +530,66 @@ public:
     return new cursor_t(c, leaf, leaf->modifications, key, it.get_val(), pos);
   }
 
+  iterator lower_bound_sync( // non-future lower_bound: walks in-memory child pointers only
+    op_context_t c,
+    node_key_t addr)
+  {
+    LOG_PREFIX(FixedKVBtree::lower_bound_sync);
+    auto depth = get_root().get_depth();
+#ifndef NDEBUG
+    iterator iter{depth, iterator::state_t::FULL};
+#else
+    iterator iter{depth};
+#endif
+    auto root_node = get_root_node_sync(c);
+    if (depth == 1) { // the root is itself the leaf
+      iter.leaf.node = root_node->template cast();
+      auto &root_entry = iter.leaf;
+      auto riter = root_entry.node->lower_bound(addr);
+      SUBTRACET(
+        seastore_fixedkv_tree,
+        "leaf addr {}, got ret offset {}, size {}, end {}",
+        c.trans,
+        addr,
+        riter.get_offset(),
+        root_entry.node->get_size(),
+        riter == root_entry.node->end());
+      root_entry.pos = riter.get_offset();
+      return iter;
+    }
+    iter.get_internal(depth).node =
+      root_node->template cast();
+    assert(depth > 1);
+    while (depth > 1) {
+      auto &entry = iter.get_internal(depth);
+      auto riter = entry.node->upper_bound(addr); // step back below: pick the child preceding upper_bound
+      assert(riter != entry.node->begin());
+      --riter;
+      entry.pos = riter.get_offset();
+      depth--;
+      if (depth > 1) {
+        auto child = entry.node->template get_child_sync(
+          c.trans, c.cache, entry.pos, riter.get_key());
+        iter.get_internal(depth).node = child;
+      } else {
+        auto child = entry.node->template get_child_sync(
+          c.trans, c.cache, entry.pos, riter.get_key());
+        iter.leaf.node = child;
+      }
+    }
+    auto it = iter.leaf.node->lower_bound(addr); // must agree with the depth == 1 path (was upper_bound)
+    iter.leaf.pos = it.get_offset(); // NOTE(review): no move to the next leaf when it == end() — confirm
+    SUBTRACET(
+      seastore_fixedkv_tree,
+      "leaf addr {}, got ret offset {}, size {}, end {}",
+      c.trans,
+      addr,
+      it.get_offset(),
+      iter.leaf.node->get_size(),
+      it == iter.leaf.node->end());
+    return iter;
+  }
+
   /**
    * lower_bound
    *
@@ -937,6 +1006,82 @@ public:
       });
   }
 
+  /**
+   * copy
+   *
+   * Copy is very similar to insert; the difference is that it
+   * inserts the val read from src_iter at the position cor-
+   * responding to laddr.
+   *
+   * The reason we are introducing this method is that, since rewrite
+   * transactions do not invalidate other ones, we can't allow a
+   * val retrieved from an iterator to be carried across continuation
+   * boundaries; the iterator to copy from must be passed instead.
+   */
+  using copy_iertr = base_iertr;
+  using copy_ret = copy_iertr::future>;
+  copy_ret copy(
+    op_context_t c,
+    iterator iter,
+    laddr_t laddr,
+    iterator src_iter,
+    BaseChildNode *child)
+  {
+    LOG_PREFIX(FixedKVBtree::copy);
+    SUBTRACET(
+      seastore_fixedkv_tree,
+      "copying laddr {} at iter {}",
+      c.trans,
+      laddr,
+      iter.is_end() ? min_max_t::max : iter.get_key());
+    if constexpr (std::is_same_v) {
+      // avoid unexpected default extent type for lba btree
+      assert(src_iter.get_val().type != extent_types_t::ROOT);
+    }
+    return seastar::do_with(
+      iter,
+      src_iter,
+      [this, c, laddr, child](auto &ret, auto &src_iter) {
+        return find_insertion(
+          c, laddr, ret
+        ).si_then([this, c, laddr, &ret, child, &src_iter] {
+          if (!ret.at_boundary() && ret.get_key() == laddr) { // key already present: nothing inserted
+            return copy_ret(
+              interruptible::ready_future_marker{},
+              std::make_pair(ret, false));
+          } else {
+            ++(get_tree_stats(c.trans).num_inserts);
+            return handle_split(
+              c, ret
+            ).si_then([c, laddr, &ret, child, &src_iter] {
+              if (!ret.leaf.node->is_mutable()) {
+                CachedExtentRef mut = c.cache.duplicate_for_write(
+                  c.trans, ret.leaf.node
+                );
+                ret.leaf.node = mut->cast();
+              }
+              auto iter = typename leaf_node_t::const_iterator(
+                ret.leaf.node.get(), ret.leaf.pos);
+              assert(iter == ret.leaf.node->lower_bound(laddr));
+              assert(iter == ret.leaf.node->end() || iter->get_key() > laddr);
+              assert(laddr >= ret.leaf.node->get_meta().begin &&
+                     laddr < ret.leaf.node->get_meta().end);
+              ret.leaf.node->insert(iter, laddr, src_iter.get_val()); // val is read here, not earlier
+              if constexpr (std::is_base_of_v<
+                            ParentNode, leaf_node_t>) {
+                ret.leaf.node->insert_child_ptr(
+                  ret.leaf.pos, child, ret.leaf.node->get_size() - 1);
+              }
+              (void)child;
+              return copy_ret(
+                interruptible::ready_future_marker{},
+                std::make_pair(ret, true));
+            });
+          }
+        });
+      });
+  }
+
   insert_ret insert(
     op_context_t c,
     node_key_t laddr,
@@ -959,9 +1104,7 @@ public:
    * @param val [in] val with which to update
    * @return iterator to newly updated element
    */
-  using update_iertr = base_iertr;
-  using update_ret = update_iertr::future;
-  update_ret update(
+  iterator update(
     op_context_t c,
     iterator iter,
     node_val_t val,
@@ -989,9 +1132,7 @@ public:
 	iter.leaf.node->update_child_ptr(iter.leaf.pos, child);
       }
     }
-    return update_ret(
-      interruptible::ready_future_marker{},
-      iter);
+    return iter;
   }
 
 
@@ -2386,6 +2527,13 @@ private:
 template
 struct is_fixed_kv_tree : std::false_type {};
 
+template
+tree_type_t get_btree_sync(op_context_t c) { // builds the tree from the root held by c.trans
+  auto root = c.trans.peek_root();
+  assert(!root->is_pending_io()); // the root must not have I/O pending
+  return tree_type_t(root);
+}
+
 template
 Cache::get_root_iertr::future
 get_btree(op_context_t c) {
diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc
index c6840d7670fe..5bbb8feeb034 100644
--- a/src/crimson/os/seastore/cache.cc
+++ b/src/crimson/os/seastore/cache.cc
@@ -2038,6 +2038,9 @@ void Cache::complete_commit(
       if (is_lba_backref_node(i->get_type())) {
 	committer.commit_data();
       }
+      if (i->is_logical()) { // only logical extents take part in the copied-key sync
+	committer.maybe_sync_copied_lba_key();
+      }
       touch_extent_fully(prior, &t_src, t.get_cache_hint());
       committer.sync_version();
       committer.unblock_trans(t);
diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc
index 18af8083d33a..08951ee8f8f9 100644
--- a/src/crimson/os/seastore/cached_extent.cc
+++ b/src/crimson/os/seastore/cached_extent.cc
@@ -409,6 +409,16 @@ void ExtentCommitter::commit_state() {
   extent.on_state_commit();
 }
 
+void ExtentCommitter::maybe_sync_copied_lba_key() { // forwards the extent's laddr/paddr to readers of prior
+  ceph_assert(extent.is_logical());
+  auto &lextent = static_cast(extent);
+  auto &prior = *extent.prior_instance;
+  for (auto &item : prior.read_transactions) { // each transaction registered on the prior instance
+    item.t->maybe_sync_copied_lba_key(
+      lextent.get_laddr(), lextent.get_paddr());
+  }
+}
+
 void ExtentCommitter::commit_and_share_paddr() {
   auto &prior = *extent.prior_instance;
   auto old_paddr = prior.get_prior_paddr_and_reset();
diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h
index 043edd618174..098fb1358dfa 100644
--- a/src/crimson/os/seastore/cached_extent.h
+++ b/src/crimson/os/seastore/cached_extent.h
@@ -296,6 +296,7 @@ public:
 
   void commit_and_share_paddr();
 
+  void maybe_sync_copied_lba_key(); // requires extent.is_logical()
 private:
   // the rewritten extent
   CachedExtent &extent;
diff --git a/src/crimson/os/seastore/lba/btree_lba_manager.cc b/src/crimson/os/seastore/lba/btree_lba_manager.cc
index ba9dc390737d..23746bbd7fea 100644
--- a/src/crimson/os/seastore/lba/btree_lba_manager.cc
+++ b/src/crimson/os/seastore/lba/btree_lba_manager.cc
@@ -58,6 +58,25 @@ template phy_tree_root_t&
 get_phy_tree_root<
   crimson::os::seastore::lba::LBABtree>(root_t &r);
 
+template <>
+CachedExtentRef get_phy_tree_root_node_sync<
+  crimson::os::seastore::lba::LBABtree>(
+  const RootBlockRef &root_block, op_context_t c)
+{
+  auto lba_root = root_block->lba_root_node;
+  if (!lba_root) { // no link on this root block: fall back to its prior instance
+    ceph_assert(root_block->is_pending());
+    auto &prior = static_cast(*root_block->get_prior_instance());
+    lba_root = prior.lba_root_node;
+  } else {
+    ceph_assert(lba_root->is_initial_pending()
+      == root_block->is_pending());
+  }
+  ceph_assert(lba_root);
+  auto ret = c.cache.peek_extent_viewable_by_trans(c.trans, lba_root);
+  return ret;
+}
+
 template <>
 const get_phy_tree_root_node_ret get_phy_tree_root_node<
   crimson::os::seastore::lba::LBABtree>(
@@ -935,7 +954,7 @@ BtreeLBAManager::_update_mapping(
     );
     co_return iter.get_cursor(c);
   } else {
-    iter = co_await btree.update(
+    iter = btree.update(
       c,
       iter,
       ret,
@@ -1108,6 +1127,29 @@ BtreeLBAManager::remap_mappings(
   co_return ret;
 }
 
+void BtreeLBAManager::update_paddr_sync(
+  Transaction &t,
+  laddr_t laddr,
+  paddr_t paddr)
+{
+  LOG_PREFIX(BtreeLBAManager::update_paddr_sync);
+  DEBUGT("laddr={}, paddr={}", t, laddr, paddr);
+  auto c = get_context(t);
+  auto btree = get_btree_sync(c);
+  auto iter = btree.lower_bound_sync(c, laddr); // NOTE(review): assumes laddr is still mapped in t — confirm
+  auto cursor = iter.get_cursor(c);
+  btree.update(
+    c,
+    std::move(iter),
+    lba_map_val_t{ // only the paddr changes; the other fields are carried over from the cursor
+      cursor->get_length(),
+      pladdr_t{std::move(paddr)},
+      cursor->get_refcount(),
+      cursor->get_checksum(),
+      cursor->get_extent_type()},
+    nullptr);
+}
+
 BtreeLBAManager::move_mapping_ret
 BtreeLBAManager::_copy_mapping(
   op_context_t c,
@@ -1128,6 +1170,8 @@ BtreeLBAManager::_copy_mapping(
   move_mapping_ret_t ret{std::move(src), std::move(dest)};
   auto &cursor = *ret.dest;
   auto iter = btree.make_partial_iter(c, cursor);
+  auto &scursor = *ret.src;
+  auto src_iter = btree.make_partial_iter(c, scursor);
   if (!iter.is_end()) {
     assert(iter.get_key() >= dest_laddr + ret.src->get_length());
   }
@@ -1139,16 +1183,17 @@ BtreeLBAManager::_copy_mapping(
   } else {
     addr = ret.src->get_paddr();
   }
-  auto [niter, inserted] = co_await btree.insert(
+  c.trans.new_lba_key_copied( // record src -> dest so a later rewrite of src can patch dest
+    ret.src->get_key(),
+    dest_laddr,
+    [this](Transaction &t, laddr_t laddr, paddr_t paddr) {
+      update_paddr_sync(t, laddr, paddr);
+    });
+  auto [niter, inserted] = co_await btree.copy(
     c,
     std::move(iter),
     dest_laddr,
-    lba_map_val_t{
-      ret.src->get_length(),
-      std::move(addr),
-      EXTENT_DEFAULT_REF_COUNT,
-      ret.src->is_indirect() ? 0 : ret.src->get_checksum(),
-      ret.src->get_extent_type()},
+    std::move(src_iter),
     extent ? extent : get_reserved_ptr());
   ceph_assert(inserted);
   ret.dest = niter.get_cursor(c);
diff --git a/src/crimson/os/seastore/lba/btree_lba_manager.h b/src/crimson/os/seastore/lba/btree_lba_manager.h
index 684cc86e38db..d04eb7d1e1c9 100644
--- a/src/crimson/os/seastore/lba/btree_lba_manager.h
+++ b/src/crimson/os/seastore/lba/btree_lba_manager.h
@@ -387,6 +387,19 @@ private:
     LBACursorRef dest,
     LogicalChildNode *extent);
 
+  /**
+   * update_paddr_sync
+   *
+   * Updates, without returning a future, the paddr of the mapping
+   * at laddr that transaction t copied, after a background rewrite
+   * transaction moved the extent to paddr.
+   */
+  void update_paddr_sync(
+    Transaction &t,
+    laddr_t laddr,
+    paddr_t paddr);
+
+
   /**
    * _update_mapping
    *
diff --git a/src/crimson/os/seastore/linked_tree_node.h b/src/crimson/os/seastore/linked_tree_node.h
index 737985e863f0..2f74dcc82871 100644
--- a/src/crimson/os/seastore/linked_tree_node.h
+++ b/src/crimson/os/seastore/linked_tree_node.h
@@ -326,6 +326,21 @@ public:
     return {viewable, find_pending_version(t, key, state)};
   }
 
+  template
+  TCachedExtentRef get_child_sync( // returns the child linked at pos; it must already be in memory
+    Transaction &t,                 // NOTE(review): t and etvr are unused here —
+    ExtentTransViewRetriever &etvr, // confirm children[pos] is the version t should see
+    btreenode_pos_t pos,
+    node_key_t key)
+  {
+    assert(children.capacity());
+    assert(key == down_cast().iter_idx(pos).get_key());
+    auto child = children[pos];
+    ceph_assert(!is_reserved_ptr(child));
+    assert(is_valid_child_ptr(child));
+    return static_cast(child);
+  }
+
   template
   get_child_ret_t get_child(
     Transaction &t,
diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h
index 2b56aef47fe6..a79d822de6a7 100644
--- a/src/crimson/os/seastore/transaction.h
+++ b/src/crimson/os/seastore/transaction.h
@@ -646,8 +646,34 @@ public:
 
   btree_cursor_stats_t cursor_stats;
 
-   bool need_wait_visibility = false;
-   
+  bool need_wait_visibility = false;
+
+  using update_copied_lba_key_func_t =
+    std::function;
+  void new_lba_key_copied( // remember that the mapping at src was copied to dest
+    laddr_t src,
+    laddr_t dest,
+    update_copied_lba_key_func_t &&func) {
+    copied_lba_keys.emplace(src, dest); // emplace keeps the first dest if src is already recorded
+    if (!update_copied_lba_key) { // only the first callback registered is retained
+      update_copied_lba_key = std::move(func);
+    }
+  }
+  void maybe_sync_copied_lba_key(laddr_t laddr, paddr_t paddr) { // no-op unless laddr was a copy source
+    if (likely(copied_lba_keys.empty())) {
+      return;
+    }
+    assert(update_copied_lba_key);
+    auto it = copied_lba_keys.find(laddr);
+    if (it == copied_lba_keys.end()) {
+      return;
+    }
+    laddr_t key = it->second; // the dest key recorded by new_lba_key_copied
+    update_copied_lba_key(*this, key, paddr);
+  }
+  RootBlockRef peek_root() { // hands out the root member as-is
+    return root;
+  }
 private:
   friend class Cache;
   friend Ref make_test_transaction();
@@ -869,6 +895,9 @@ private:
   backref_entry_refs_t backref_entries;
 
   cache_hint_t cache_hint = CACHE_HINT_TOUCH;
+
+  std::map copied_lba_keys; // NOTE(review): confirm this is cleared when the transaction is reset
+  update_copied_lba_key_func_t update_copied_lba_key;
 };
 using TransactionRef = Transaction::Ref;
 
-- 
2.47.3