From: Xuehan Xu Date: Sun, 24 May 2026 08:45:01 +0000 (+0800) Subject: crimson/os/seastore/lba: avoid paddr from crossing coroutines X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=af137a0d86678fe868a5d8339d7c059eadef107a;p=ceph.git crimson/os/seastore/lba: avoid paddr from crossing coroutines Previously, LBAManager::remap_mappings() works as follows: 1. get the mapping's val; 2. remove the mapping; 3. insert the remapped mappings. With pessimistic cc in place, during the above step 2 and 3, a rewrite transaction that modifies the same mapping might be committed and miss the pending lba leaf nodes because it doesn't contain the mapping at the time. This commit change the above workflow as follows: 1. replace the mapping with the first remapped mapping; 2. insert the remaining remapped mappings. Note that all the remaining mapped mappings' paddrs are calculated based on the mapping before it in the same coroutine as the insertion, which means it'll always see the modification of a background rewrite transaction. Signed-off-by: Xuehan Xu --- diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index d1891e1441a..b701062610a 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -1135,6 +1135,89 @@ public: return iter; } + /** + * replace + * + * Replace the entry pointed by iter with the key and val. key + * must be within the range iter.get_key()~iter.get_length() + * + * @param c [in] op context + * @param iter [in] iterator to element to update, must not be end + * @param key [in] key with which to replace + * @param val [in] val with which to replace + * @return iterator to newly replaced element + */ + using replace_iertr = base_iertr; + using replace_ret = replace_iertr::future; + using get_val_func_t = std::function; + replace_ret replace( + op_context_t c, + iterator iter, + node_key_t key, + get_val_func_t func, + BaseChildNode *child) + { + LOG_PREFIX(FixedKVBtree::replace); + SUBTRACET( + seastore_fixedkv_tree, + "replace element at {} with {}", + c.trans, + iter.is_end() ? min_max_t::max : iter.get_key(), + key); + if (likely(iter.leaf.node->get_meta().end > key)) { + // the replacing key is also within the range of the current + // leaf node, so do the replacing directly + if (!iter.leaf.node->is_mutable()) { + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, iter.leaf.node + ); + iter.leaf.node = mut->cast(); + } + ++(get_tree_stats(c.trans).num_updates); + iter.leaf.node->replace( + iter.leaf.node->iter_idx(iter.leaf.pos), + key, + func()); + if constexpr (std::is_base_of_v< + ParentNode, leaf_node_t>) { + if (child) { + iter.leaf.node->update_child_ptr(iter.leaf.pos, child); + } + } + co_return iter; + } + // the replacing key fall beyond the end of the current leaf + // node, insert it into the next leaf node and remove the + // original key + auto niter = co_await iter.next(c); + assert(!niter.is_end()); + assert(niter.get_key() > key); + co_await handle_split(c, niter); + if (!niter.leaf.node->is_mutable()) { + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, niter.leaf.node + ); + niter.leaf.node = mut->cast(); + } + auto it = typename leaf_node_t::const_iterator( + niter.leaf.node.get(), niter.leaf.pos); + assert(key >= niter.leaf.node->get_meta().begin && + key < niter.leaf.node->get_meta().end); + niter.leaf.node->insert(it, key, func()); + if constexpr (std::is_base_of_v< + ParentNode, leaf_node_t>) { + niter.leaf.node->insert_child_ptr( + niter.leaf.pos, child, niter.leaf.node->get_size() - 1); + } + (void)child; + // TODO: the keys pointed by iter and niter are overlapped which + // is against the invariant of the lba/backref tree, but since + // remove directly removes the iter, it wouldn't cause any issue + // for the time being. + niter = co_await remove(c, std::move(iter)); + assert(niter.get_key() == key); + co_return niter; + } /** * remove diff --git a/src/crimson/os/seastore/lba/btree_lba_manager.cc b/src/crimson/os/seastore/lba/btree_lba_manager.cc index 0d71c0969de..6a42f5ae662 100644 --- a/src/crimson/os/seastore/lba/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba/btree_lba_manager.cc @@ -1086,38 +1086,54 @@ BtreeLBAManager::remap_mappings( (orig_val.pladdr.is_paddr() && orig_val.pladdr.get_paddr().is_absolute())); auto type = cursor->get_extent_type(); - cursor = co_await update_mapping_refcount( - c.trans, cursor, -1); - iter = btree.make_partial_iter(c, *cursor); + auto off = 0; + auto last_iter = iter; for (auto &remap : remaps) { assert(remap.offset + remap.len <= orig_len); assert((bool)remap.extent == !orig_indirect); - lba_map_val_t val = orig_val; auto new_key = (orig_laddr + remap.offset).checked_to_laddr(); - val.len = remap.len; - if (val.pladdr.is_laddr()) { - DEBUGT("{} + {:#x}", - t, - val.pladdr.get_local_clone_id(), - remap.offset); + auto f = [&last_iter, &remap, &off, &t, FNAME, type] { + lba_map_val_t val = last_iter.get_val(); + auto cur_off = remap.offset - off; + if (val.pladdr.is_laddr()) { + DEBUGT("{} + {:#x}", + t, + val.pladdr.get_local_clone_id(), + remap.offset); + } else { + auto paddr = val.pladdr.get_paddr(); + val.pladdr = paddr + cur_off; + } + val.len = remap.len; + val.refcount = EXTENT_DEFAULT_REF_COUNT; + // Checksum will be updated when the committing the transaction + val.checksum = CRC_NULL; + val.type = type; + return val; + }; + // committing the transaction + if (remap.offset == remaps.front().offset) { + iter = co_await btree.replace( + c, std::move(iter), new_key, + std::move(f), + orig_indirect + ? get_reserved_ptr() + : remap.extent); + ret.push_back(iter.get_cursor(c)); + iter = co_await iter.next(c); } else { - auto paddr = val.pladdr.get_paddr(); - val.pladdr = paddr + remap.offset; + auto p = co_await btree.insert( + c, iter, new_key, f(), + orig_indirect + ? get_reserved_ptr() + : remap.extent); + auto &[it, inserted] = p; + ceph_assert(inserted); + ret.push_back(it.get_cursor(c)); + last_iter = it; + iter = co_await it.next(c); } - val.refcount = EXTENT_DEFAULT_REF_COUNT; - // Checksum will be updated when the committing the transaction - val.checksum = CRC_NULL; - val.type = type; - // committing the transaction - auto p = co_await btree.insert( - c, iter, new_key, std::move(val), - orig_indirect - ? get_reserved_ptr() - : remap.extent); - auto &[it, inserted] = p; - ceph_assert(inserted); - ret.push_back(it.get_cursor(c)); - iter = co_await it.next(c); + off = remap.offset; } co_await trans_intr::parallel_for_each( ret, diff --git a/src/crimson/os/seastore/lba/lba_btree_node.h b/src/crimson/os/seastore/lba/lba_btree_node.h index dd15c2a8cf2..cabb652bebf 100644 --- a/src/crimson/os/seastore/lba/lba_btree_node.h +++ b/src/crimson/os/seastore/lba/lba_btree_node.h @@ -148,6 +148,24 @@ struct LBALeafNode laddr_t addr, lba_map_val_t val) final; + void replace( + internal_const_iterator_t iter, + laddr_t pivot, + lba_map_val_t val) { + LOG_PREFIX(FixedKVInternalNode::replace); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, old key {}, key {}", + this->pending_for_transaction, + iter.get_offset(), + iter.get_key(), + pivot); + this->on_modify(); + return this->journal_replace( + iter, + pivot, + std::move(val), + maybe_get_delta_buffer()); + } + void remove(internal_const_iterator_t iter) final { LOG_PREFIX(LBALeafNode::remove); SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}", diff --git a/src/crimson/os/seastore/linked_tree_node.h b/src/crimson/os/seastore/linked_tree_node.h index 3683f90972b..1b136c8a6a2 100644 --- a/src/crimson/os/seastore/linked_tree_node.h +++ b/src/crimson/os/seastore/linked_tree_node.h @@ -426,7 +426,9 @@ public: void update_child_ptr(btreenode_pos_t pos, BaseChildNode* child) { children[pos] = child; - set_child_ptracker(child); + if (!is_reserved_ptr(child)) { + set_child_ptracker(child); + } } // copy dests points from a stable node back to its pending nodes