]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore/lba: avoid paddr from crossing coroutines 69076/head
authorXuehan Xu <xuxuehan@qianxin.com>
Sun, 24 May 2026 08:45:01 +0000 (16:45 +0800)
committerXuehan Xu <xuxuehan@qianxin.com>
Wed, 17 Jun 2026 07:45:49 +0000 (15:45 +0800)
Previously, LBAManager::remap_mappings() works as follows:

1. get the mapping's val;
2. remove the mapping;
3. insert the remapped mappings.

With pessimistic cc in place, during the above step 2 and 3, a rewrite
transaction that modifies the same mapping might be committed and miss
the pending lba leaf nodes because it doesn't contain the mapping at the
time.

This commit change the above workflow as follows:

1. replace the mapping with the first remapped mapping;
2. insert the remaining remapped mappings.

Note that all the remaining mapped mappings' paddrs are calculated based
on the mapping before it in the same coroutine as the insertion, which
means it'll always see the modification of a background rewrite
transaction.

Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/crimson/os/seastore/btree/fixed_kv_btree.h
src/crimson/os/seastore/lba/btree_lba_manager.cc
src/crimson/os/seastore/lba/lba_btree_node.h
src/crimson/os/seastore/linked_tree_node.h

index d1891e1441a57778203d6371b1ddd24d078ad101..b701062610ab045b00a98573dbce9fbe5e4c3c31 100644 (file)
@@ -1135,6 +1135,89 @@ public:
     return iter;
   }
 
+  /**
+   * replace
+   *
+   * Replace the entry pointed by iter with the key and val. key
+   * must be within the range iter.get_key()~iter.get_length()
+   *
+   * @param c [in] op context
+   * @param iter [in] iterator to element to update, must not be end
+   * @param key [in] key with which to replace
+   * @param val [in] val with which to replace
+   * @return iterator to newly replaced element
+   */
+  using replace_iertr = base_iertr;
+  using replace_ret = replace_iertr::future<iterator>;
+  using get_val_func_t = std::function<node_val_t ()>;
+  replace_ret replace(
+    op_context_t c,
+    iterator iter,
+    node_key_t key,
+    get_val_func_t func,
+    BaseChildNode<leaf_node_t, node_key_t> *child)
+  {
+    LOG_PREFIX(FixedKVBtree::replace);
+    SUBTRACET(
+      seastore_fixedkv_tree,
+      "replace element at {} with {}",
+      c.trans,
+      iter.is_end() ? min_max_t<node_key_t>::max : iter.get_key(),
+      key);
+    if (likely(iter.leaf.node->get_meta().end > key)) {
+      // the replacing key is also within the range of the current
+      // leaf node, so do the replacing directly
+      if (!iter.leaf.node->is_mutable()) {
+        CachedExtentRef mut = c.cache.duplicate_for_write(
+          c.trans, iter.leaf.node
+        );
+        iter.leaf.node = mut->cast<leaf_node_t>();
+      }
+      ++(get_tree_stats<self_type>(c.trans).num_updates);
+      iter.leaf.node->replace(
+        iter.leaf.node->iter_idx(iter.leaf.pos),
+        key,
+        func());
+      if constexpr (std::is_base_of_v<
+          ParentNode<leaf_node_t, node_key_t>, leaf_node_t>) {
+        if (child) {
+          iter.leaf.node->update_child_ptr(iter.leaf.pos, child);
+        }
+      }
+      co_return iter;
+    }
+    // the replacing key fall beyond the end of the current leaf
+    // node, insert it into the next leaf node and remove the
+    // original key
+    auto niter = co_await iter.next(c);
+    assert(!niter.is_end());
+    assert(niter.get_key() > key);
+    co_await handle_split(c, niter);
+    if (!niter.leaf.node->is_mutable()) {
+      CachedExtentRef mut = c.cache.duplicate_for_write(
+        c.trans, niter.leaf.node
+      );
+      niter.leaf.node = mut->cast<leaf_node_t>();
+    }
+    auto it = typename leaf_node_t::const_iterator(
+        niter.leaf.node.get(), niter.leaf.pos);
+    assert(key >= niter.leaf.node->get_meta().begin &&
+           key < niter.leaf.node->get_meta().end);
+    niter.leaf.node->insert(it, key, func());
+    if constexpr (std::is_base_of_v<
+        ParentNode<leaf_node_t, node_key_t>, leaf_node_t>) {
+      niter.leaf.node->insert_child_ptr(
+        niter.leaf.pos, child, niter.leaf.node->get_size() - 1);
+    }
+    (void)child;
+    // TODO: the keys pointed by iter and niter are overlapped which
+    // is against the invariant of the lba/backref tree, but since
+    // remove directly removes the iter, it wouldn't cause any issue
+    // for the time being.
+    niter = co_await remove(c, std::move(iter));
+    assert(niter.get_key() == key);
+    co_return niter;
+  }
 
   /**
    * remove
index 0d71c0969de25ddf81e4faad9e23fe2a285594a8..6a42f5ae662274ceb8bd1e8239c3b9f4cdcdf138 100644 (file)
@@ -1086,38 +1086,54 @@ BtreeLBAManager::remap_mappings(
         (orig_val.pladdr.is_paddr() &&
          orig_val.pladdr.get_paddr().is_absolute()));
   auto type = cursor->get_extent_type();
-  cursor = co_await update_mapping_refcount(
-    c.trans, cursor, -1);
-  iter = btree.make_partial_iter(c, *cursor);
+  auto off = 0;
+  auto last_iter = iter;
   for (auto &remap : remaps) {
     assert(remap.offset + remap.len <= orig_len);
     assert((bool)remap.extent == !orig_indirect);
-    lba_map_val_t val = orig_val;
     auto new_key = (orig_laddr + remap.offset).checked_to_laddr();
-    val.len = remap.len;
-    if (val.pladdr.is_laddr()) {
-      DEBUGT("{} + {:#x}",
-        t,
-        val.pladdr.get_local_clone_id(),
-        remap.offset);
+    auto f = [&last_iter, &remap, &off, &t, FNAME, type] {
+      lba_map_val_t val = last_iter.get_val();
+      auto cur_off = remap.offset - off;
+      if (val.pladdr.is_laddr()) {
+        DEBUGT("{} + {:#x}",
+          t,
+          val.pladdr.get_local_clone_id(),
+          remap.offset);
+      } else {
+        auto paddr = val.pladdr.get_paddr();
+        val.pladdr = paddr + cur_off;
+      }
+      val.len = remap.len;
+      val.refcount = EXTENT_DEFAULT_REF_COUNT;
+      // Checksum will be updated when the committing the transaction
+      val.checksum = CRC_NULL;
+      val.type = type;
+      return val;
+    };
+    // committing the transaction
+    if (remap.offset == remaps.front().offset) {
+      iter = co_await btree.replace(
+        c, std::move(iter), new_key,
+        std::move(f),
+        orig_indirect
+          ? get_reserved_ptr<LBALeafNode, laddr_t>()
+          : remap.extent);
+      ret.push_back(iter.get_cursor(c));
+      iter = co_await iter.next(c);
     } else {
-      auto paddr = val.pladdr.get_paddr();
-      val.pladdr = paddr + remap.offset;
+      auto p = co_await btree.insert(
+        c, iter, new_key, f(),
+        orig_indirect
+          ? get_reserved_ptr<LBALeafNode, laddr_t>()
+          : remap.extent);
+      auto &[it, inserted] = p;
+      ceph_assert(inserted);
+      ret.push_back(it.get_cursor(c));
+      last_iter = it;
+      iter = co_await it.next(c);
     }
-    val.refcount = EXTENT_DEFAULT_REF_COUNT;
-    // Checksum will be updated when the committing the transaction
-    val.checksum = CRC_NULL;
-    val.type = type;
-    // committing the transaction
-    auto p = co_await btree.insert(
-      c, iter, new_key, std::move(val),
-      orig_indirect
-        ? get_reserved_ptr<LBALeafNode, laddr_t>()
-        : remap.extent);
-    auto &[it, inserted] = p;
-    ceph_assert(inserted);
-    ret.push_back(it.get_cursor(c));
-    iter = co_await it.next(c);
+    off = remap.offset;
   }
   co_await trans_intr::parallel_for_each(
     ret,
index dd15c2a8cf2e133732f051b168957ccff8ee2ccf..cabb652bebf8cc748450095a721f462341e2096e 100644 (file)
@@ -148,6 +148,24 @@ struct LBALeafNode
     laddr_t addr,
     lba_map_val_t val) final;
 
+  void replace(
+    internal_const_iterator_t iter,
+    laddr_t pivot,
+    lba_map_val_t val) {
+    LOG_PREFIX(FixedKVInternalNode::replace);
+    SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, old key {}, key {}",
+      this->pending_for_transaction,
+      iter.get_offset(),
+      iter.get_key(),
+      pivot);
+    this->on_modify();
+    return this->journal_replace(
+      iter,
+      pivot,
+      std::move(val),
+      maybe_get_delta_buffer());
+  }
+
   void remove(internal_const_iterator_t iter) final {
     LOG_PREFIX(LBALeafNode::remove);
     SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}",
index 3683f90972b28a004ae36c1f02c908a210e63da2..1b136c8a6a2fb860c766a23fa29bedf352accc28 100644 (file)
@@ -426,7 +426,9 @@ public:
 
   void update_child_ptr(btreenode_pos_t pos, BaseChildNode<T, node_key_t>* child) {
     children[pos] = child;
-    set_child_ptracker(child);
+    if (!is_reserved_ptr(child)) {
+      set_child_ptracker(child);
+    }
   }
 
   // copy dests points from a stable node back to its pending nodes