]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore: also update the mappings copied by client 59476/head
authorXuehan Xu <xuxuehan@qianxin.com>
Fri, 15 May 2026 09:10:04 +0000 (17:10 +0800)
committerXuehan Xu <xuxuehan@qianxin.com>
Mon, 25 May 2026 08:55:32 +0000 (16:55 +0800)
transactions when committing background rewriting transactions

With the 128-bit laddr key layout in place, SeaStore::rename would
involve copying mappings. These mappings must also be updated when
the logical extents they point to are rewritten.

Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/crimson/os/seastore/btree/fixed_kv_btree.h
src/crimson/os/seastore/cache.cc
src/crimson/os/seastore/cached_extent.cc
src/crimson/os/seastore/cached_extent.h
src/crimson/os/seastore/lba/btree_lba_manager.cc
src/crimson/os/seastore/lba/btree_lba_manager.h
src/crimson/os/seastore/linked_tree_node.h
src/crimson/os/seastore/transaction.h

index 8d70ad3e866f5a0b6239dee45b219180c2d70fbb..c31b713ecff6e3939ef2ac4fd286c5165c29f29c 100644 (file)
@@ -26,6 +26,11 @@ phy_tree_root_t& get_phy_tree_root(root_t& r);
 using get_phy_tree_root_node_ret =
   std::pair<bool, get_child_iertr::future<CachedExtentRef>>;
 
+template <typename T>
+CachedExtentRef get_phy_tree_root_node_sync(  // hands back the root node extent itself, with no future wrapper
+  const RootBlockRef &root_block,
+  op_context_t c);
+
 template <typename T>
 const get_phy_tree_root_node_ret get_phy_tree_root_node(
   const RootBlockRef &root_block,
@@ -467,6 +472,10 @@ public:
     return get_phy_tree_root_node<self_type>(root_block, c);
   }
 
+  auto get_root_node_sync(op_context_t c) const {  // forwards to get_phy_tree_root_node_sync for this tree type
+    return get_phy_tree_root_node_sync<self_type>(root_block, c);
+  }
+
   /// mkfs
   using mkfs_ret = phy_tree_root_t;
   static mkfs_ret mkfs(RootBlockRef &root_block, op_context_t c) {
@@ -521,6 +530,66 @@ public:
     return new cursor_t(c, leaf, leaf->modifications, key, it.get_val(), pos);
   }
 
+  iterator lower_bound_sync(
+    op_context_t c,
+    node_key_t addr)
+  {
+    LOG_PREFIX(FixedKVBtree::lower_bound_sync);
+    auto depth = get_root().get_depth();
+#ifndef NDEBUG
+    iterator iter{depth, iterator::state_t::FULL};
+#else
+    iterator iter{depth};
+#endif
+    auto root_node = get_root_node_sync(c);
+    if (depth == 1) {
+      iter.leaf.node = root_node->template cast<leaf_node_t>();
+      auto &root_entry = iter.leaf;
+      auto riter = root_entry.node->lower_bound(addr);
+      SUBTRACET(
+        seastore_fixedkv_tree,
+        "leaf addr {}, got ret offset {}, size {}, end {}",
+        c.trans,
+        addr,
+        riter.get_offset(),
+        root_entry.node->get_size(),
+        riter == root_entry.node->end());
+      root_entry.pos = riter->get_offset();
+      return iter;
+    }
+    iter.get_internal(depth).node =
+      root_node->template cast<internal_node_t>();
+    assert(depth > 1);
+    while (depth > 1) {
+      auto &entry = iter.get_internal(depth);
+      auto riter = entry.node->upper_bound(addr);
+      assert(riter != entry.node->begin());
+      --riter;
+      entry.pos = riter.get_offset();
+      depth--;
+      if (depth > 1) {
+        auto child = entry.node->template get_child_sync<internal_node_t>(
+          c.trans, c.cache, entry.pos, riter.get_key());
+        iter.get_internal(depth).node = child;
+      } else {
+        auto child = entry.node->template get_child_sync<leaf_node_t>(
+          c.trans, c.cache, entry.pos, riter.get_key());
+        iter.leaf.node = child;
+      }
+    }
+    auto it = iter.leaf.node->upper_bound(addr);
+    iter.leaf.pos = it->get_offset();
+    SUBTRACET(
+      seastore_fixedkv_tree,
+      "leaf addr {}, got ret offset {}, size {}, end {}",
+      c.trans,
+      addr,
+      it.get_offset(),
+      iter.leaf.node->get_size(),
+      it == iter.leaf.node->end());
+    return iter;
+  }
+
   /**
    * lower_bound
    *
@@ -937,6 +1006,82 @@ public:
       });
   }
 
+  /**
+   * copy
+   *
+   * Copy is pretty similar as Insert, the difference is that it's
+   * inserting the val copied from src_iter into the position cor-
+   * responding to laddr.
+   *
+   * The reason we are introducing this method is that, since rewrite
+   * transactions are not invalidating other ones, we can't allow
+   * the val retrieved from one iterator be passed across the boundary
+   * of continuations, we must pass the iterator to be copied instead.
+   */
+  using copy_iertr = base_iertr;
+  using copy_ret = copy_iertr::future<std::pair<iterator, bool>>;
+  copy_ret copy(
+    op_context_t c,
+    iterator iter,
+    laddr_t laddr,
+    iterator src_iter,
+    BaseChildNode<leaf_node_t, node_key_t> *child)
+  {
+    LOG_PREFIX(FixedKVBtree::insert);
+    SUBTRACET(
+      seastore_fixedkv_tree,
+      "copying laddr {} at iter {}",
+      c.trans,
+      laddr,
+      iter.is_end() ? min_max_t<node_key_t>::max : iter.get_key());
+    if constexpr (std::is_same_v<node_key_t, laddr_t>) {
+      // avoid unexpect default extent type for lba btree
+      assert(src_iter.get_val().type != extent_types_t::ROOT);
+    }
+    return seastar::do_with(
+      iter,
+      src_iter,
+      [this, c, laddr, child](auto &ret, auto &src_iter) {
+        return find_insertion(
+          c, laddr, ret
+        ).si_then([this, c, laddr, &ret, child, &src_iter] {
+          if (!ret.at_boundary() && ret.get_key() == laddr) {
+            return insert_ret(
+              interruptible::ready_future_marker{},
+              std::make_pair(ret, false));
+          } else {
+            ++(get_tree_stats<self_type>(c.trans).num_inserts);
+            return handle_split(
+              c, ret
+            ).si_then([c, laddr, &ret, child, &src_iter] {
+              if (!ret.leaf.node->is_mutable()) {
+                CachedExtentRef mut = c.cache.duplicate_for_write(
+                  c.trans, ret.leaf.node
+                );
+                ret.leaf.node = mut->cast<leaf_node_t>();
+              }
+              auto iter = typename leaf_node_t::const_iterator(
+                  ret.leaf.node.get(), ret.leaf.pos);
+              assert(iter == ret.leaf.node->lower_bound(laddr));
+              assert(iter == ret.leaf.node->end() || iter->get_key() > laddr);
+              assert(laddr >= ret.leaf.node->get_meta().begin &&
+                     laddr < ret.leaf.node->get_meta().end);
+              ret.leaf.node->insert(iter, laddr, src_iter.get_val());
+              if constexpr (std::is_base_of_v<
+                  ParentNode<leaf_node_t, node_key_t>, leaf_node_t>) {
+                ret.leaf.node->insert_child_ptr(
+                  ret.leaf.pos, child, ret.leaf.node->get_size() - 1);
+              }
+              (void)child;
+              return insert_ret(
+                interruptible::ready_future_marker{},
+                std::make_pair(ret, true));
+            });
+          }
+        });
+      });
+  }
+
   insert_ret insert(
     op_context_t c,
     node_key_t laddr,
@@ -959,9 +1104,7 @@ public:
    * @param val [in] val with which to update
    * @return iterator to newly updated element
    */
-  using update_iertr = base_iertr;
-  using update_ret = update_iertr::future<iterator>;
-  update_ret update(
+  iterator update(
     op_context_t c,
     iterator iter,
     node_val_t val,
@@ -989,9 +1132,7 @@ public:
         iter.leaf.node->update_child_ptr(iter.leaf.pos, child);
       }
     }
-    return update_ret(
-      interruptible::ready_future_marker{},
-      iter);
+    return iter;
   }
 
 
@@ -2386,6 +2527,13 @@ private:
 template <typename T>
 struct is_fixed_kv_tree : std::false_type {};
 
+template <typename tree_type_t>
+tree_type_t get_btree_sync(op_context_t c) {  // builds a tree handle from whatever c.trans.peek_root() returns, with no future involved
+  assert(!c.trans.peek_root()->is_pending_io());  // debug-only check that the root is not pending IO
+  auto root = c.trans.peek_root();
+  return tree_type_t(root);
+}
+
 template <typename tree_type_t>
 Cache::get_root_iertr::future<tree_type_t>
 get_btree(op_context_t c) {
index c6840d7670feda7628de2f75e5aea87da2b278b6..5bbb8feeb034c83e2db1e3ea80ba647167c1bf37 100644 (file)
@@ -2038,6 +2038,9 @@ void Cache::complete_commit(
       if (is_lba_backref_node(i->get_type())) {
         committer.commit_data();
       }
+      if (i->is_logical()) {
+        committer.maybe_sync_copied_lba_key();
+      }
       touch_extent_fully(prior, &t_src, t.get_cache_hint());
       committer.sync_version();
       committer.unblock_trans(t);
index 18af8083d33a9791e2164ccece3542393a8ce3a8..08951ee8f8f9dd16bb7a65fa02ef812750427d9d 100644 (file)
@@ -409,6 +409,16 @@ void ExtentCommitter::commit_state() {
   extent.on_state_commit();
 }
 
+void ExtentCommitter::maybe_sync_copied_lba_key() {  // pass this extent's laddr/paddr to every entry in the prior instance's read_transactions
+  ceph_assert(extent.is_logical());  // the static_cast below is only valid for logical extents
+  auto &lextent = static_cast<LogicalChildNode&>(extent);
+  auto &prior = *extent.prior_instance;
+  for (auto &item : prior.read_transactions) {  // each transaction returns early on its own if it never copied this laddr
+    item.t->maybe_sync_copied_lba_key(
+      lextent.get_laddr(), lextent.get_paddr());
+  }
+}
+
 void ExtentCommitter::commit_and_share_paddr() {
   auto &prior = *extent.prior_instance;
   auto old_paddr = prior.get_prior_paddr_and_reset();
index 043edd61817470c0f2dc69b20bedc5a9c2f4492a..098fb1358dfaebe644230aa48a98f52aa3245d6e 100644 (file)
@@ -296,6 +296,7 @@ public:
 
   void commit_and_share_paddr();
 
+  void maybe_sync_copied_lba_key();
 private:
   // the rewritten extent
   CachedExtent &extent;
index ba9dc390737d3f2da428c7f676f925ff43f8829e..23746bbd7fea09f469d5c46074e46df5d00236ef 100644 (file)
@@ -58,6 +58,25 @@ template phy_tree_root_t&
 get_phy_tree_root<
   crimson::os::seastore::lba::LBABtree>(root_t &r);
 
+template <>
+CachedExtentRef get_phy_tree_root_node_sync<
+  crimson::os::seastore::lba::LBABtree>(
+  const RootBlockRef &root_block, op_context_t c)
+{
+  auto lba_root = root_block->lba_root_node;
+  if (!lba_root) {  // no root node linked on this root block: use the link held by its prior instance
+    ceph_assert(root_block->is_pending());
+    auto &prior = static_cast<RootBlock&>(*root_block->get_prior_instance());
+    lba_root = prior.lba_root_node;
+  } else {
+    ceph_assert(lba_root->is_initial_pending()
+      == root_block->is_pending());
+  }
+  ceph_assert(lba_root);
+  auto ret = c.cache.peek_extent_viewable_by_trans(c.trans, lba_root);  // presumably resolves to the version c.trans should see, going by the callee's name -- confirm
+  return ret;
+}
+
 template <>
 const get_phy_tree_root_node_ret get_phy_tree_root_node<
   crimson::os::seastore::lba::LBABtree>(
@@ -935,7 +954,7 @@ BtreeLBAManager::_update_mapping(
     );
     co_return iter.get_cursor(c);
   } else {
-    iter = co_await btree.update(
+    iter = btree.update(
       c,
       iter,
       ret,
@@ -1108,6 +1127,29 @@ BtreeLBAManager::remap_mappings(
   co_return ret;
 }
 
+void BtreeLBAManager::update_paddr_sync(
+  Transaction &t,
+  laddr_t laddr,
+  paddr_t paddr)
+{
+  LOG_PREFIX(BtreeLBAManager::update_mapping);
+  DEBUGT("laddr={}, paddr={}", t, laddr, paddr);
+  auto c = get_context(t);
+  auto btree = get_btree_sync<LBABtree>(c);
+  auto iter = btree.lower_bound_sync(c, laddr);
+  auto cursor = iter.get_cursor(c);
+  btree.update(
+    c,
+    std::move(iter),
+    lba_map_val_t{
+      cursor->get_length(),
+      pladdr_t{std::move(paddr)},
+      cursor->get_refcount(),
+      cursor->get_checksum(),
+      cursor->get_extent_type()},
+    nullptr);
+}
+
 BtreeLBAManager::move_mapping_ret
 BtreeLBAManager::_copy_mapping(
   op_context_t c,
@@ -1128,6 +1170,8 @@ BtreeLBAManager::_copy_mapping(
   move_mapping_ret_t ret{std::move(src), std::move(dest)};
   auto &cursor = *ret.dest;
   auto iter = btree.make_partial_iter(c, cursor);
+  auto &scursor = *ret.src;
+  auto src_iter = btree.make_partial_iter(c, scursor);
   if (!iter.is_end()) {
     assert(iter.get_key() >= dest_laddr + ret.src->get_length());
   }
@@ -1139,16 +1183,17 @@ BtreeLBAManager::_copy_mapping(
   } else {
     addr = ret.src->get_paddr();
   }
-  auto [niter, inserted] = co_await btree.insert(
+  c.trans.new_lba_key_copied(
+    ret.src->get_key(),
+    dest_laddr,
+    [this](Transaction &t, laddr_t laddr, paddr_t paddr) {
+      update_paddr_sync(t, laddr, paddr);
+    });
+  auto [niter, inserted] = co_await btree.copy(
       c,
       std::move(iter),
       dest_laddr,
-      lba_map_val_t{
-       ret.src->get_length(),
-        std::move(addr),
-       EXTENT_DEFAULT_REF_COUNT,
-       ret.src->is_indirect() ? 0 : ret.src->get_checksum(),
-       ret.src->get_extent_type()},
+      std::move(src_iter),
       extent ? extent : get_reserved_ptr<LBALeafNode, laddr_t>());
   ceph_assert(inserted);
   ret.dest = niter.get_cursor(c);
index 684cc86e38db2e2b54700f5ad9a83e32cc8c0ba1..d04eb7d1e1c9bfb605711e3c78e00c5a7ed640fc 100644 (file)
@@ -387,6 +387,19 @@ private:
     LBACursorRef dest,
     LogicalChildNode *extent);
 
+  /**
+   * update_paddr_sync
+   *
+   * This is basically for updating the paddr of the mapping
+   * that has been copied by the transaction t and modified
+   * by some background rewrite transaction.
+   */
+  void update_paddr_sync(
+    Transaction &t,
+    laddr_t laddr,
+    paddr_t paddr);
+
+
   /**
    * _update_mapping
    *
index 737985e863f08bb6798a867993a9c91f37916068..2f74dcc82871b154788a96f57c57933970c57449 100644 (file)
@@ -326,6 +326,21 @@ public:
     return {viewable, find_pending_version(t, key, state)};
   }
 
+  template <typename ChildT>
+  TCachedExtentRef<ChildT> get_child_sync(  // returns the child pointer stored at pos, with no future involved
+    Transaction &t,  // NOTE(review): t and etvr are not used in this body -- confirm whether the child should be resolved to t's view
+    ExtentTransViewRetriever &etvr,
+    btreenode_pos_t pos,
+    node_key_t key)
+  {
+    assert(children.capacity());
+    assert(key == down_cast().iter_idx(pos).get_key());  // key is used only for this debug cross-check against pos
+    auto child = children[pos];
+    ceph_assert(!is_reserved_ptr(child));
+    assert(is_valid_child_ptr(child));  // debug-only: when asserts are compiled out, a pointer failing this check is still returned
+    return static_cast<ChildT*>(child);
+  }
+
   template <typename ChildT>
   get_child_ret_t<T, ChildT> get_child(
     Transaction &t,
index 2b56aef47fe6cb38727fea3a081b0bbf08c1531e..a79d822de6a7af4a4e05ed23103a206b0d93301d 100644 (file)
@@ -646,8 +646,34 @@ public:
 
   btree_cursor_stats_t cursor_stats;
 
- bool need_wait_visibility = false;
-
+  bool need_wait_visibility = false;
+
+  using update_copied_lba_key_func_t =
+    std::function<void (Transaction&, laddr_t, paddr_t)>;
+  void new_lba_key_copied(  // record a src -> dest laddr pair and remember the callback used to update dest later
+    laddr_t src,
+    laddr_t dest,
+    update_copied_lba_key_func_t &&func) {
+    copied_lba_keys.emplace(src, dest);  // NOTE(review): std::map::emplace keeps the first dest if src is already recorded -- confirm a src is copied at most once per transaction
+    if (!update_copied_lba_key) {  // only the first callback supplied is stored; later ones are ignored
+      update_copied_lba_key = std::move(func);
+    }
+  }
+  void maybe_sync_copied_lba_key(laddr_t laddr, paddr_t paddr) {  // if laddr was recorded as a copy source, invoke the stored callback on its dest with paddr
+    if (likely(copied_lba_keys.empty())) {  // nothing was recorded by this transaction
+      return;
+    }
+    assert(update_copied_lba_key);
+    auto it = copied_lba_keys.find(laddr);
+    if (it == copied_lba_keys.end()) {  // laddr is not one of the recorded sources
+      return;
+    }
+    laddr_t key = it->second;  // the dest laddr recorded for this source
+    update_copied_lba_key(*this, key, paddr);
+  }
+  RootBlockRef peek_root() {  // returns the root member as-is
+    return root;
+  }
 private:
   friend class Cache;
   friend Ref make_test_transaction();
@@ -869,6 +895,9 @@ private:
   backref_entry_refs_t backref_entries;
 
   cache_hint_t cache_hint = CACHE_HINT_TOUCH;
+
+  std::map<laddr_t, laddr_t> copied_lba_keys;
+  std::function<void (Transaction&, laddr_t, paddr_t)> update_copied_lba_key;
 };
 using TransactionRef = Transaction::Ref;