From 02e511c5d5d2fb35a966bfec8a9a3650b5ab562b Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 21 May 2026 20:35:47 +0800 Subject: [PATCH] crimson/os/seastore: force rewrite transactions to conflict with others if it involve insertions on the lba tree The issue was introduced since bd0ce704f24afbe11830d31b46d8b22771f54456 Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/cache.cc | 35 ++++++++++--------- src/crimson/os/seastore/cached_extent.cc | 2 +- src/crimson/os/seastore/seastore_types.h | 28 --------------- src/crimson/os/seastore/transaction.h | 29 +++++++++++++++ .../os/seastore/transaction_manager.cc | 6 ++++ 5 files changed, 54 insertions(+), 46 deletions(-) diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 5bbb8feeb03..3d2efdbb6b2 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -929,7 +929,7 @@ void Cache::stage_visibility_handoff( assert(next->get_paddr().is_absolute() || next->get_paddr().is_root()); assert(next->version == prev->version + 1); const auto t_src = t.get_src(); - ceph_assert(should_use_no_conflict_publish(t_src, next->get_type())); + ceph_assert(should_use_no_conflict_publish(t, next->get_type())); bool was_stable_dirty = prev->is_stable_dirty(); if (!was_stable_dirty) { @@ -1386,7 +1386,7 @@ record_t Cache::prepare_record( DEBUGT("invalid mutated extent -- {}", t, *i); continue; } - if (should_use_no_conflict_publish(t.get_src(), i->get_type())) { + if (should_use_no_conflict_publish(t, i->get_type())) { i->new_committer(t); i->committer->block_trans(t); } @@ -1507,7 +1507,8 @@ record_t Cache::prepare_record( } if (i->is_mutation_pending()) { - const bool use_no_conflict = should_use_no_conflict_publish(t.get_src(), i->get_type()); + const bool use_no_conflict = + should_use_no_conflict_publish(t, i->get_type()); // Block the new extent readers until the journal commit completes. i->set_io_wait(CachedExtent::extent_state_t::DIRTY, use_no_conflict); @@ -1536,7 +1537,7 @@ record_t Cache::prepare_record( retire_stat.increment(extent->get_length()); DEBUGT("retired and remove extent {}~0x{:x} -- {}", t, extent->get_paddr(), extent->get_length(), *extent); - if (should_use_no_conflict_publish(t.get_src(), extent->get_type())) { + if (should_use_no_conflict_publish(t, extent->get_type())) { // avoid extent invalidation on retirement // only adjust dirty bookkeeping // we would invalidate them in complete_commit final stage @@ -1659,10 +1660,10 @@ record_t Cache::prepare_record( i->get_type())); } i->set_io_wait(CachedExtent::extent_state_t::CLEAN, - should_use_no_conflict_publish(t.get_src(), i->get_type())); + should_use_no_conflict_publish(t, i->get_type())); // Note, paddr is known until complete_commit(), // so add_extent() later. - if (should_use_no_conflict_publish(t.get_src(), i->get_type())) { + if (should_use_no_conflict_publish(t, i->get_type())) { assert(i->get_prior_instance()); assert(!i->committer); assert(!i->get_prior_instance()->committer); @@ -1671,7 +1672,8 @@ record_t Cache::prepare_record( auto &committer = *i->committer; committer.block_trans(t); i->get_prior_instance()->set_io_wait( - CachedExtent::extent_state_t::CLEAN, should_use_no_conflict_publish(t.get_src(), i->get_type())); + CachedExtent::extent_state_t::CLEAN, + should_use_no_conflict_publish(t, i->get_type())); } } @@ -1696,7 +1698,7 @@ record_t Cache::prepare_record( i->get_length(), i->get_type())); } - if (should_use_no_conflict_publish(t.get_src(), i->get_type())) { + if (should_use_no_conflict_publish(t, i->get_type())) { assert(i->get_prior_instance()); assert(!i->committer); assert(!i->get_prior_instance()->committer); @@ -1709,7 +1711,7 @@ record_t Cache::prepare_record( CachedExtent::extent_state_t::CLEAN, true); } i->set_io_wait(CachedExtent::extent_state_t::CLEAN, - should_use_no_conflict_publish(t.get_src(), i->get_type())); + should_use_no_conflict_publish(t, i->get_type())); // Note, paddr is (can be) known until complete_commit(), // so add_extent() later. } @@ -1758,7 +1760,7 @@ record_t Cache::prepare_record( } else { assert(i->is_exist_mutation_pending()); i->set_io_wait(CachedExtent::extent_state_t::DIRTY, - should_use_no_conflict_publish(t.get_src(), i->get_type())); + should_use_no_conflict_publish(t, i->get_type())); } // exist mutation pending extents must be in t.mutated_block_list @@ -1992,8 +1994,7 @@ void Cache::complete_commit( t, final_block_start, start_seq); for (auto &i: t.retired_set) { auto &extent = i.extent; - auto trans_src = t.get_src(); - if (should_use_no_conflict_publish(trans_src, extent->get_type())) { + if (should_use_no_conflict_publish(t, extent->get_type())) { // retired extents should remain valid through complete_commit(). // We only free space post-commit *AFTER* handoff. assert(extent->is_valid()); @@ -2023,7 +2024,7 @@ void Cache::complete_commit( i->pending_for_transaction = TRANS_ID_NULL; i->on_initial_write(); const auto t_src = t.get_src(); - if (should_use_no_conflict_publish(t_src, i->get_type())) { + if (should_use_no_conflict_publish(t, i->get_type())) { ceph_assert(i->committer); auto &committer = *i->committer; auto &prior = *i->get_prior_instance(); @@ -2106,7 +2107,7 @@ void Cache::complete_commit( if (i->version == 1 || is_root_type(i->get_type())) { i->dirty_from = start_seq; DEBUGT("commit extent done, become dirty -- {}", t, *i); - if (should_use_no_conflict_publish(t.get_src(), i->get_type())) { + if (should_use_no_conflict_publish(t, i->get_type())) { auto &prior = *i->get_prior_instance(); prior.dirty_from = start_seq; ceph_assert(i->committer); @@ -2117,7 +2118,7 @@ void Cache::complete_commit( DEBUGT("commit extent done -- {}", t, *i); } i->on_delta_write(final_block_start); - if (should_use_no_conflict_publish(t.get_src(), i->get_type())) { + if (should_use_no_conflict_publish(t, i->get_type())) { TRACET("committing paddr to prior for {}, prior={}", t, *i, *i->prior_instance); assert(i->committer); @@ -2163,13 +2164,13 @@ void Cache::complete_commit( } t.for_each_finalized_fresh_block([&t](const CachedExtentRef &i) { - if (should_use_no_conflict_publish(t.get_src(), i->get_type())) { + if (should_use_no_conflict_publish(t, i->get_type())) { i->set_invalid(t); } }); for (auto &i: t.mutated_block_list) { - if (should_use_no_conflict_publish(t.get_src(), i->get_type())) { + if (should_use_no_conflict_publish(t, i->get_type())) { i->set_invalid(t); } } diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index 08951ee8f8f..bf30cf163ec 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -502,7 +502,7 @@ void ExtentCommitter::_share_prior_data_to_pending_versions() } void CachedExtent::new_committer(Transaction &t) { - ceph_assert(should_use_no_conflict_publish(t.get_src(), this->get_type())); + ceph_assert(should_use_no_conflict_publish(t, this->get_type())); ceph_assert(!committer); committer = new ExtentCommitter(*this, t); assert(prior_instance); diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 878f6cf9ae2..6f29c9c9b70 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -2756,34 +2756,6 @@ constexpr bool is_modify_transaction(transaction_type_t type) { is_background_transaction(type)); } -/** - * should_use_no_conflict_publish() - * - * Returns true when this (transaction source, extent type) pair should take - * the no-conflict publish path (i.e avoid invalidate-and-retry and use the - * committer + visibility hand-off). - * - * Currently true for: - * - rewrite (background) transactions, for any non-root extent - * - * To be expanded to: - * - user (txn_manager) transactions that mutate LBA nodes - * - Onode/Omap nodes - */ -constexpr bool should_use_no_conflict_publish(transaction_type_t txn_type, - extent_types_t ext_type) { - // keep classic handling for ROOT - if (is_root_type(ext_type)) { - return false; - } - - // TODO: Extend this as support grows (e.g. Onode/OMAP nodes). - // is_user_transaction(txn_type) && is_lba_node(ext_type) - - return is_rewrite_transaction(txn_type); -} - - // Note: It is possible to statically introduce structs for OOL, which must be // more efficient, but that requires to specialize the RecordSubmitter as well. // Let's delay this optimization until necessary. diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index a79d822de6a..a6e2e1c7272 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -528,6 +528,7 @@ public: rewrite_stats = {}; conflicted = false; need_wait_visibility = false; + force_rewrite_conflict = false; assert(backref_entries.empty()); if (!has_reset) { has_reset = true; @@ -647,6 +648,7 @@ public: btree_cursor_stats_t cursor_stats; bool need_wait_visibility = false; + bool force_rewrite_conflict = false; using update_copied_lba_key_func_t = std::function; @@ -913,6 +915,33 @@ inline TransactionRef make_test_transaction() { CACHE_HINT_TOUCH) ); } +/** + * should_use_no_conflict_publish() + * + * Returns true when this (transaction source, extent type) pair should take + * the no-conflict publish path (i.e avoid invalidate-and-retry and use the + * committer + visibility hand-off). + * + * Currently true for: + * - rewrite (background) transactions, for any non-root extent + * + * To be expanded to: + * - user (txn_manager) transactions that mutate LBA nodes + * - Onode/Omap nodes + */ +constexpr bool should_use_no_conflict_publish(const Transaction &t, + extent_types_t ext_type) { + // keep classic handling for ROOT + if (is_root_type(ext_type)) { + return false; + } + + // TODO: Extend this as support grows (e.g. Onode/OMAP nodes). + // is_user_transaction(txn_type) && is_lba_node(ext_type) + + return !t.force_rewrite_conflict && is_rewrite_transaction(t.get_src()); +} + } diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 11b113b3378..408e7ef009e 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -750,6 +750,12 @@ TransactionManager::rewrite_logical_extent( extent_len_t off = 0; auto left = extent->get_length(); extent_ref_count_t refcount = 0; + // if rewriting a logical range resolves to multiple extents, + // the LBA update likely involves insertions/splits (structural + // btree changes), which the no-conflict publish-to-prior path + // does not currently cover safely. Fall back to optimistic + // conflict handling. + t.force_rewrite_conflict = (extents.size() > 1); for (auto &_nextent : extents) { auto nextent = _nextent->template cast(); bool first_extent = (off == 0); -- 2.47.3