From: Xuehan Xu Date: Mon, 18 May 2026 09:55:35 +0000 (+0800) Subject: crimson/os/seastore: demote/promote background processes are also X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=80d9dc2ab6efba57d6020cc5e78ae8eae5ed0dfa;p=ceph-ci.git crimson/os/seastore: demote/promote background processes are also rewrite transactions Signed-off-by: Xuehan Xu --- diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 03997631933..e82cf96080f 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -1804,7 +1804,7 @@ record_t Cache::prepare_record( if (i->is_exist_clean()) { assert(i->version == 0); - assert(!i->prior_instance); + assert(!i->prior_instance || t.get_src() == transaction_type_t::DEMOTE); // no set_io_wait(), skip complete_commit() assert(!i->is_pending_io()); i->pending_for_transaction = TRANS_ID_NULL; @@ -1815,13 +1815,26 @@ record_t Cache::prepare_record( should_use_no_conflict_publish(t.get_src(), i->get_type())); } - // exist mutation pending extents must be in t.mutated_block_list - add_extent(i); - const auto t_src = t.get_src(); - if (i->is_stable_dirty()) { - add_to_dirty(i, &t_src); + assert(i->is_logical()); + if (t.get_src() == transaction_type_t::DEMOTE) { + assert(!i->committer); + assert(!i->get_prior_instance()->committer); + i->new_committer(t); + assert(i->committer); + i->get_prior_instance()->committer = i->committer; + auto &committer = *i->committer; + committer.block_trans(t); + i->get_prior_instance()->set_io_wait( + CachedExtent::extent_state_t::CLEAN, true); } else { - touch_extent_fully(*i, &t_src, t.get_cache_hint()); + // exist mutation pending extents must be in t.mutated_block_list + add_extent(i); + const auto t_src = t.get_src(); + if (i->is_stable_dirty()) { + add_to_dirty(i, &t_src); + } else { + touch_extent_fully(*i, &t_src, t.get_cache_hint()); + } } alloc_delta.alloc_blk_ranges.emplace_back( @@ -2096,6 +2109,10 @@ void 
Cache::complete_commit( if (is_lba_backref_node(i->get_type())) { committer.commit_data(); } + if (i->is_logical() && + t.get_src() == transaction_type_t::PROMOTE) { + committer.commit_shadow_promote(t); + } touch_extent_fully(prior, &t_src, t.get_cache_hint()); committer.sync_version(); committer.unblock_trans(t); @@ -2205,6 +2222,32 @@ void Cache::complete_commit( continue; } epm.mark_space_used(i->get_paddr(), i->get_length()); + assert(i->is_logical()); + auto t_src = t.get_src(); + if (t.get_src() == transaction_type_t::DEMOTE) { + assert(i->committer); + auto &committer = *i->committer; + auto &prior = static_cast( + *i->get_prior_instance()); + ceph_assert(prior.is_valid()); + TRACET("committing rewritten extent into " + "existing -- {}, prior={}", + t, *i, prior); + prior.pending_for_transaction = TRANS_ID_NULL; + if (auto shadow = prior.get_shadow(); shadow) { + committer.commit_shadow_demote(t); + prior.reset_shadow(); + } + committer.commit_state(); + committer.sync_checksum(); + committer.commit_and_share_paddr(); + touch_extent_fully(prior, &t_src, t.get_cache_hint()); + committer.sync_version(); + committer.unblock_trans(t); + prior.complete_io(); + i->committer.reset(); + prior.committer.reset(); + } } for (auto &i: t.pre_alloc_list) { if (!i->is_valid()) { diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 0b3d216ffb6..93651568f0b 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -1682,6 +1682,9 @@ public: read_extent_futs, [](auto &fut) { return std::move(fut); }); } + bool is_on_cold_tier(paddr_t paddr) const { + return epm.is_cold_device(paddr.get_device_id()); + } private: void touch_extent_fully( CachedExtent &ext, diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index 7ee7c8980f6..e15a5618ac5 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -542,4 +542,43 @@ void 
ExtentCommitter::unblock_trans(Transaction &t) { } } +void ExtentCommitter::commit_shadow_demote(Transaction &t) { + LOG_PREFIX(ExtentCommitter::commit_shadow_demote); + assert(t.get_src() == transaction_type_t::DEMOTE); + auto &prior = *extent.prior_instance->template cast(); + auto shadow = prior.get_shadow(); + assert(shadow); + for (auto &trans_view : prior.retired_transactions) { + assert(trans_view.t != nullptr); + auto view_tid = trans_view.t->get_trans_id(); + if (view_tid == t.get_trans_id()) { + continue; + } + TRACET("removing shadow {} from retired_set of t.{}", t, *shadow, view_tid); + [[maybe_unused]] bool removed = + trans_view.t->remove_from_retired_set(*shadow); + assert(removed); + } +} + +void ExtentCommitter::commit_shadow_promote(Transaction &t) { + LOG_PREFIX(ExtentCommitter::commit_shadow_promote); + assert(t.get_src() == transaction_type_t::PROMOTE); + assert(extent.is_logical()); + auto &lprior = static_cast(*extent.prior_instance); + auto &lext = static_cast(extent); + auto shadow = lext.get_shadow(); + assert(shadow); + lprior.set_shadow(shadow); + for (auto &trans_view : lprior.retired_transactions) { + assert(trans_view.t != nullptr); + auto view_tid = trans_view.t->get_trans_id(); + if (view_tid == t.get_trans_id()) { + continue; + } + TRACET("adding shadow {} from t.{}", t, *shadow, view_tid); + trans_view.t->add_absent_to_retired_set(shadow); + } +} + } diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 9371c984544..89db7219b7a 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -300,6 +300,8 @@ public: void commit_and_share_paddr(); + void commit_shadow_demote(Transaction&); + void commit_shadow_promote(Transaction&); private: // the rewritten extent CachedExtent &extent; diff --git a/src/crimson/os/seastore/lba/btree_lba_manager.cc b/src/crimson/os/seastore/lba/btree_lba_manager.cc index fff5c5bf164..ee4f5e719d5 100644 --- 
a/src/crimson/os/seastore/lba/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba/btree_lba_manager.cc @@ -951,13 +951,22 @@ BtreeLBAManager::update_mappings( return this->_update_mapping( c.trans, *cursor, - [prev_addr, addr, len, checksum]( + [prev_addr, addr, len, checksum, extent, c]( const lba_map_val_t &in) { lba_map_val_t ret = in; ceph_assert(in.pladdr.is_paddr()); - ceph_assert(in.pladdr.get_paddr() == prev_addr); ceph_assert(in.len == len); - ret.pladdr = addr; + if (likely(in.pladdr.get_paddr() == prev_addr)) { + ret.pladdr = addr; + } else { + // this can only happen when the extent is EXIST_CLEAN + // and is demoted onto the cold tier by a DEMOTE trans. + assert(in.shadow_paddr == P_ADDR_NULL); + assert(extent->is_exist_clean()); + assert(extent->get_paddr() == in.pladdr.get_paddr()); + assert(c.cache.is_on_cold_tier(extent->get_paddr())); + assert(!c.cache.is_on_cold_tier(prev_addr)); + } ret.checksum = checksum; return ret; }, diff --git a/src/crimson/os/seastore/lba/lba_btree_node.h b/src/crimson/os/seastore/lba/lba_btree_node.h index 447a6caa9c0..6c95e434191 100644 --- a/src/crimson/os/seastore/lba/lba_btree_node.h +++ b/src/crimson/os/seastore/lba/lba_btree_node.h @@ -294,6 +294,7 @@ struct LBALeafNode iterator &iter) { LOG_PREFIX(LBALeafNode::merge_content_to); + SUBTRACET(seastore_lba, "merging with {}", t, pending_version); std::map modified; auto it = pending_version.begin(); while (it != pending_version.end() && iter != this->end()) { @@ -315,11 +316,17 @@ struct LBALeafNode ceph_abort(); } if (is_valid_child_ptr(child) && - (child->_is_mutable() || child->_is_pending_io())) { - // skip the ones that the pending version is also modifying + (// skip the ones that the pending version is also modifying + (child->_is_mutable() || child->_is_pending_io()) || + // EXIST_CLEAN extents created by DEMOTE transactions also + // update their paddrs, so they should also be skipped. 
+ (pending_version.t->get_src() == transaction_type_t::DEMOTE))) { + SUBTRACET(seastore_lba, "skipping {}~{}", t, it->get_key(), it->get_val()); it++; continue; } + SUBTRACET(seastore_lba, "examing v2: {}~{}, v1: {}~{}", + t, it->get_key(), it->get_val(), iter->get_key(), iter->get_val()); auto pending_key = it->get_key(); auto stable_key = iter->get_key(); auto stable_end = stable_key + v1.len; @@ -338,6 +345,11 @@ struct LBALeafNode auto paddr = v1.pladdr.get_paddr(); paddr = paddr + off; m_v2.pladdr = paddr; + if (v1.shadow_paddr == P_ADDR_NULL) { + m_v2.shadow_paddr = P_ADDR_NULL; + } else { + m_v2.shadow_paddr = (v1.shadow_paddr + off); + } SUBTRACET(seastore_lba, "merging to {}, paddr: {} -> {}", t, pending_version, m_v2.pladdr, paddr); if (!is_valid_child_ptr(child) || diff --git a/src/crimson/os/seastore/logical_child_node.h b/src/crimson/os/seastore/logical_child_node.h index ea4bb9c6a76..977b96aaf21 100644 --- a/src/crimson/os/seastore/logical_child_node.h +++ b/src/crimson/os/seastore/logical_child_node.h @@ -44,6 +44,20 @@ public: laddr_t get_end() const { return (get_laddr() + get_length()).checked_to_laddr(); } + + TCachedExtentRef get_shadow() const { + return shadow; + } + + void set_shadow(TCachedExtentRef &s) { + assert(!shadow); + shadow = s; + } + + void reset_shadow() { + shadow.reset(); + } + protected: void on_replace_prior(Transaction &t) final { assert(is_seen_by_users()); @@ -56,6 +70,8 @@ protected: void on_data_commit() final { ceph_abort("impossible"); } +private: + TCachedExtentRef shadow; }; using LogicalChildNodeRef = TCachedExtentRef; } // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index a000f2ee089..03dcd08c505 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -2756,7 +2756,9 @@ constexpr bool is_background_transaction(transaction_type_t type) { constexpr bool 
is_rewrite_transaction(transaction_type_t type) { return type == transaction_type_t::TRIM_DIRTY || type == transaction_type_t::CLEANER_MAIN || - type == transaction_type_t::CLEANER_COLD; + type == transaction_type_t::CLEANER_COLD || + type == transaction_type_t::DEMOTE || + type == transaction_type_t::PROMOTE; } constexpr bool is_trim_transaction(transaction_type_t type) { diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index b11f505c79e..4688c1057c2 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -363,6 +363,21 @@ public: } } + bool remove_from_retired_set(CachedExtent &ext) { + auto it = retired_set.find(ext.get_paddr()); + if (it == retired_set.end()) { + return false; + } + auto &extent = it->extent; + if (extent->get_paddr() != ext.get_paddr()) { + return false; + } else { + assert(ext.get_length() == extent->get_length()); + retired_set.erase(it); + return true; + } + } + std::pair pre_stable_extent_paddr_mod( read_set_item_t &item) { diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index f7b5aa8af44..54184acfbf6 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -259,13 +259,14 @@ TransactionManager::ref_ret TransactionManager::remove( auto laddr = ref->get_laddr(); cache->retire_absent_extent_addr_by_type( t, laddr, shadow_addr, length, ref->get_type(), - [laddr](auto &extent) { + [ref, laddr](auto &extent) { auto lextent = extent.template cast(); assert(extent.is_logical()); assert(!lextent->has_laddr()); assert(!extent.has_been_invalidated()); lextent->set_laddr(laddr); extent.set_shadow_extent(true); + ref->set_shadow(lextent); }); } } @@ -332,11 +333,32 @@ TransactionManager::_remove( LogicalChildNode >(); ceph_assert(extent); - cache->retire_extent(t, std::move(extent)); + cache->retire_extent(t, extent); + if (mapping.has_shadow_val()) { 
+ if (auto shadow = extent->get_shadow(); shadow) { + t.add_absent_to_retired_set(shadow); + } else { + auto laddr = mapping.get_intermediate_base(); + std::ignore = cache->retire_absent_extent_addr_by_type( + t, laddr, + mapping.get_shadow_val(), + mapping.get_intermediate_length(), + mapping.get_extent_type(), + [extent, laddr](auto &ext) { + auto lextent = ext.template cast(); + assert(ext.is_logical()); + assert(!lextent->has_laddr()); + assert(!ext.has_been_invalidated()); + lextent->set_laddr(laddr); + ext.set_shadow_extent(true); + extent->set_shadow(lextent); + }); + } + } } else { auto &child_pos = maybe_mapped_extent.get_child_pos(); auto laddr = mapping.get_intermediate_base(); - std::ignore = cache->retire_absent_extent_addr_by_type( + auto ext = cache->retire_absent_extent_addr_by_type( t, laddr, mapping.get_val(), mapping.get_intermediate_length(), @@ -349,13 +371,23 @@ TransactionManager::_remove( child_pos.link_child(lextent.get()); lextent->set_laddr(laddr); } - ); - } - if (mapping.has_shadow_val()) { - cache->retire_absent_extent_addr( - t, mapping.get_intermediate_base(), - mapping.get_shadow_val(), - mapping.get_intermediate_length()); + )->template cast(); + if (mapping.has_shadow_val()) { + std::ignore = cache->retire_absent_extent_addr_by_type( + t, mapping.get_intermediate_base(), + mapping.get_shadow_val(), + mapping.get_intermediate_length(), + mapping.get_extent_type(), + [laddr, ext](auto &extent) { + auto lextent = extent.template cast(); + assert(extent.is_logical()); + assert(!lextent->has_laddr()); + assert(!extent.has_been_invalidated()); + lextent->set_laddr(laddr); + extent.set_shadow_extent(true); + ext->set_shadow(lextent); + }); + } } } @@ -501,8 +533,9 @@ TransactionManager::relocate_shadow_extent( assert(mapping.has_shadow_val()); assert(!mapping.is_zero_reserved()); assert(mapping.is_viewable()); + assert(t.get_src() == transaction_type_t::DEMOTE); auto v = mapping.get_logical_extent(t); - CachedExtentRef extent; + 
LogicalChildNodeRef extent; auto laddr = mapping.get_key(); if (!v.has_child()) { auto &child_pos = v.get_child_pos(); @@ -520,26 +553,34 @@ TransactionManager::relocate_shadow_extent( child_pos.link_child(lextent.get()); lextent->set_laddr(laddr); } - ); + )->template cast(); } else { - auto extent = co_await std::move(v.get_child_fut()); + extent = co_await std::move(v.get_child_fut()); cache->retire_extent(t, extent); } - auto shadow_paddr = mapping.get_shadow_val(); - std::ignore = cache->retire_absent_extent_addr_by_type( - t, laddr, shadow_paddr, mapping.get_length(), mapping.get_extent_type(), - [laddr](auto &ext) { - auto lextent = ext.template cast(); - assert(ext.is_logical()); - assert(!lextent->has_laddr()); - assert(!ext.has_been_invalidated()); - lextent->set_laddr(laddr); - } - ); - co_return cache->alloc_remapped_extent_by_type( + if (auto shadow = extent->get_shadow(); shadow) { + t.add_absent_to_retired_set(shadow); + } else { + auto shadow_paddr = mapping.get_shadow_val(); + std::ignore = cache->retire_absent_extent_addr_by_type( + t, laddr, shadow_paddr, mapping.get_length(), mapping.get_extent_type(), + [laddr, extent](auto &ext) { + auto lextent = ext.template cast(); + assert(ext.is_logical()); + assert(!lextent->has_laddr()); + assert(!ext.has_been_invalidated()); + lextent->set_laddr(laddr); + ext.set_shadow_extent(true); + extent->set_shadow(lextent); + } + ); + } + auto nextent = cache->alloc_remapped_extent_by_type( t, mapping.get_extent_type(), laddr, mapping.get_shadow_val(), 0, mapping.get_length(), std::nullopt )->cast(); + nextent->set_prior_instance(extent); + co_return nextent; } TransactionManager::submit_transaction_iertr::future<> @@ -1194,6 +1235,8 @@ TransactionManager::promote_extent( slice_length, std::nullopt); remapped_cold_extent->set_shadow_extent(true); + auto lremapped = remapped_cold_extent->template cast(); + lext->set_shadow(lremapped); offset += slice_length; } @@ -1235,8 +1278,8 @@ 
TransactionManager::promote_extent( orig_ext->get_length(), std::nullopt); remapped_cold_extent->set_shadow_extent(true); - - remapped_cold_extent->set_shadow_extent(true); + auto lremapped = remapped_cold_extent->template cast(); + lext->set_shadow(lremapped); } auto cursor = co_await lba_manager->get_cursor( diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 64ac9c009de..e549572bd14 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -1543,7 +1543,7 @@ private: SUBTRACET(seastore_tm, "retire extent place holder...", t); auto &child_pos = ret.get_child_pos(); auto laddr = pin.get_key(); - std::ignore = cache->retire_absent_extent_addr_by_type( + auto ext = cache->retire_absent_extent_addr_by_type( t, laddr, original_paddr, original_len, pin.get_extent_type(), [&child_pos, laddr](auto &extent) mutable { auto lextent = extent.template cast(); @@ -1553,17 +1553,18 @@ private: child_pos.link_child(lextent.get()); lextent->set_laddr(laddr); } - ); + )->template cast(); if (pin.has_shadow_val()) { cache->retire_absent_extent_addr_by_type( t, pin.get_key(), pin.get_shadow_val(), original_len, pin.get_extent_type(), - [laddr](auto &extent) { + [laddr, ext](auto &extent) { auto lextent = extent.template cast(); assert(extent.is_logical()); assert(!lextent->has_laddr()); assert(!extent.has_been_invalidated()); lextent->set_laddr(laddr); + ext->set_shadow(lextent); } ); }