From 7cb01c0331bceb9b6b671f57b7d79cc1ab7e8f55 Mon Sep 17 00:00:00 2001 From: Zhang Song Date: Wed, 3 Sep 2025 16:01:35 +0800 Subject: [PATCH] crimson/os/seastore/lba: introduce shadow paddr Signed-off-by: Zhang Song Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/btree/btree_types.cc | 10 ++++-- src/crimson/os/seastore/btree/btree_types.h | 8 +++-- .../os/seastore/lba/btree_lba_manager.cc | 32 ++++++++++++++++--- .../os/seastore/lba/btree_lba_manager.h | 5 ++- src/crimson/os/seastore/lba/lba_btree_node.h | 18 ++++++++--- src/crimson/os/seastore/lba_manager.h | 2 +- src/crimson/os/seastore/lba_mapping.h | 12 +++++++ .../os/seastore/transaction_manager.cc | 27 ++++++++++++++++ src/crimson/os/seastore/transaction_manager.h | 31 ++++++++++++++++++ .../seastore/test_btree_lba_manager.cc | 4 +-- .../seastore/test_transaction_manager.cc | 6 ++-- 11 files changed, 134 insertions(+), 21 deletions(-) diff --git a/src/crimson/os/seastore/btree/btree_types.cc b/src/crimson/os/seastore/btree/btree_types.cc index bc5d0cafab7..56f4382ed9d 100644 --- a/src/crimson/os/seastore/btree/btree_types.cc +++ b/src/crimson/os/seastore/btree/btree_types.cc @@ -12,9 +12,13 @@ namespace lba { std::ostream& operator<<(std::ostream& out, const lba_map_val_t& v) { - return out << "lba_map_val_t(" - << v.pladdr - << "~0x" << std::hex << v.len + out << "lba_map_val_t("; + if (v.shadow_paddr != P_ADDR_NULL) { + out << '[' << v.pladdr << ',' << v.shadow_paddr << ']'; + } else { + out << v.pladdr; + } + return out << "~0x" << std::hex << v.len << ", type=" << (extent_types_t)v.type << ", checksum=0x" << v.checksum << ", refcount=" << std::dec << v.refcount diff --git a/src/crimson/os/seastore/btree/btree_types.h b/src/crimson/os/seastore/btree/btree_types.h index ade1e759899..5ee1b5d6874 100644 --- a/src/crimson/os/seastore/btree/btree_types.h +++ b/src/crimson/os/seastore/btree/btree_types.h @@ -110,6 +110,7 @@ struct lba_map_val_t { extent_len_t len = 0; ///< length of mapping pladdr_t pladdr; ///< direct addr of mapping or // laddr of a direct lba mapping(see btree_lba_manager.h) + paddr_t shadow_paddr; extent_ref_count_t refcount = 0; ///< refcount checksum_t checksum = 0; ///< checksum of original block written at paddr (TODO) extent_types_t type = extent_types_t::NONE; @@ -118,11 +119,12 @@ struct lba_map_val_t { lba_map_val_t( extent_len_t len, pladdr_t pladdr, + paddr_t shadow_paddr, extent_ref_count_t refcount, checksum_t checksum, extent_types_t type) - : len(len), pladdr(pladdr), refcount(refcount), - checksum(checksum), type(type) {} + : len(len), pladdr(pladdr), shadow_paddr(shadow_paddr), + refcount(refcount), checksum(checksum), type(type) {} bool operator==(const lba_map_val_t&) const = default; }; @@ -136,6 +138,7 @@ std::ostream& operator<<(std::ostream& out, const lba_map_val_t&); struct __attribute__((packed)) lba_map_val_le_t { extent_len_le_t len = init_extent_len_le(0); pladdr_le_t pladdr; + paddr_le_t shadow_paddr; extent_ref_count_le_t refcount{0}; checksum_le_t checksum{0}; extent_types_le_t type{EXTENT_TYPES_MAX}; @@ -153,6 +156,7 @@ struct __attribute__((packed)) lba_map_val_le_t { return lba_map_val_t{ len, pladdr, + shadow_paddr, refcount, checksum, static_cast(type)}; diff --git a/src/crimson/os/seastore/lba/btree_lba_manager.cc b/src/crimson/os/seastore/lba/btree_lba_manager.cc index 16f6a45ba1f..e593ce7a7bf 100644 --- a/src/crimson/os/seastore/lba/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba/btree_lba_manager.cc @@ -220,6 +220,7 @@ BtreeLBAManager::resolve_indirect_cursor( const LBACursor &indirect_cursor) { ceph_assert(indirect_cursor.is_indirect()); + ceph_assert(!indirect_cursor.has_shadow_paddr()); return get_cursors( c, btree, @@ -265,6 +266,7 @@ BtreeLBAManager::reserve_region( lba_map_val_t val{ len, pladdr_t{P_ADDR_ZERO}, + P_ADDR_NULL, EXTENT_DEFAULT_REF_COUNT, 0, type}; @@ -300,6 +302,7 @@ BtreeLBAManager::alloc_extents( lba_map_val_t{ ext->get_length(), pladdr_t{ext->get_paddr()}, + P_ADDR_NULL, EXTENT_DEFAULT_REF_COUNT, ext->get_last_committed_crc(), ext->get_type()}, @@ -353,6 +356,7 @@ BtreeLBAManager::clone_mapping( lba_map_val_t{ len, pladdr_t{inter_key.get_local_clone_id()}, + P_ADDR_NULL, EXTENT_DEFAULT_REF_COUNT, 0, mapping->get_extent_type()}, @@ -716,7 +720,8 @@ BtreeLBAManager::scan_mappings( } ceph_assert((pos.get_key() + pos.get_val().len) > begin); if (pos.get_val().pladdr.is_paddr()) { - f(pos.get_key(), pos.get_val().pladdr.get_paddr(), pos.get_val().len); + f(pos.get_key(), pos.get_val().pladdr.get_paddr(), + pos.get_val().shadow_paddr, pos.get_val().len); } return LBABtree::iterate_repeat_ret_inner( interruptible::ready_future_marker{}, @@ -777,11 +782,19 @@ BtreeLBAManager::update_mapping( assert(!addr.is_null()); lba_map_val_t ret = in; ceph_assert(in.pladdr.is_paddr()); - ceph_assert(in.pladdr.get_paddr() == prev_addr); ceph_assert(in.len == prev_len); - ret.pladdr = addr; - ret.len = len; - ret.checksum = checksum; + if (prev_addr == in.pladdr.get_paddr()) { + ret.pladdr = addr; + ret.len = len; + ret.checksum = checksum; + if (ret.shadow_paddr != P_ADDR_NULL) { + ceph_assert( + addr.get_device_id() != ret.shadow_paddr.get_device_id()); + } + } else { + ceph_assert(in.shadow_paddr == prev_addr); + ret.shadow_paddr = addr; + } return ret; }, &nextent @@ -1083,6 +1096,9 @@ BtreeLBAManager::remap_mappings( } else { auto paddr = val.pladdr.get_paddr(); val.pladdr = paddr + remap.offset; + if (val.shadow_paddr != P_ADDR_NULL) { + val.shadow_paddr = val.shadow_paddr.add_offset(remap.offset); + } } val.refcount = EXTENT_DEFAULT_REF_COUNT; // Checksum will be updated when the committing the transaction @@ -1130,6 +1146,10 @@ BtreeLBAManager::_copy_mapping( if (!iter.is_end()) { assert(iter.get_key() >= dest_laddr + ret.src->get_length()); } + paddr_t shadow = P_ADDR_NULL; + if (!ret.src->is_indirect() && ret.src->has_shadow_paddr()) { + shadow = ret.src->get_shadow_paddr(); + } // insert the src mapping to dest // attach extent to the new mapping if it exists pladdr_t addr; @@ -1145,6 +1165,7 @@ BtreeLBAManager::_copy_mapping( lba_map_val_t{ ret.src->get_length(), std::move(addr), + shadow, EXTENT_DEFAULT_REF_COUNT, ret.src->is_indirect() ? 0 : ret.src->get_checksum(), ret.src->get_extent_type()}, @@ -1218,6 +1239,7 @@ BtreeLBAManager::move_and_clone_direct_mapping( lba_map_val_t val = in; val.pladdr = ret.dest->get_key().get_local_clone_id(); val.checksum = 0; + val.shadow_paddr = P_ADDR_NULL; return val; }, nullptr diff --git a/src/crimson/os/seastore/lba/btree_lba_manager.h b/src/crimson/os/seastore/lba/btree_lba_manager.h index 684cc86e38d..0db84a027d8 100644 --- a/src/crimson/os/seastore/lba/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba/btree_lba_manager.h @@ -115,6 +115,7 @@ public: laddr_t dest_laddr, LBACursorRef dest) final { assert(src->is_indirect()); + assert(!src->has_shadow_paddr()); return _move_mapping( t, std::move(src), dest_laddr, std::move(dest), nullptr); } @@ -313,6 +314,7 @@ private: { len, pladdr_t(P_ADDR_ZERO), + P_ADDR_NULL, EXTENT_DEFAULT_REF_COUNT, 0, type @@ -327,6 +329,7 @@ private: { len, pladdr_t(intermediate_key.get_local_clone_id()), + P_ADDR_NULL, EXTENT_DEFAULT_REF_COUNT, 0, // crc will only be used and checked with LBA direct mappings // also see pin_to_extent(_by_type) @@ -343,7 +346,7 @@ private: LogicalChildNode& extent) { return { laddr, - {len, pladdr_t(paddr), refcount, checksum, extent.get_type()}, + {len, pladdr_t(paddr), P_ADDR_NULL, refcount, checksum, extent.get_type()}, &extent }; } diff --git a/src/crimson/os/seastore/lba/lba_btree_node.h b/src/crimson/os/seastore/lba/lba_btree_node.h index 0f020f98fe3..447a6caa9c0 100644 --- a/src/crimson/os/seastore/lba/lba_btree_node.h +++ b/src/crimson/os/seastore/lba/lba_btree_node.h @@ -90,14 +90,14 @@ using LBAInternalNodeRef = LBAInternalNode::Ref; * checksum : ceph_le32[1] 4B * size : ceph_le32[1] 4B * meta : lba_node_meta_le_t[1] 36B - * keys : laddr_le_t[CAPACITY] (106*16)B - * values : lba_map_val_le_t[CAPACITY] (106*21)B - * = 4077B + * keys : laddr_le_t[CAPACITY] (88*16)B + * values : lba_map_val_le_t[CAPACITY] (88*30)B + * = 4092B * * TODO: update FixedKVNodeLayout to handle the above calculation * TODO: the above alignment probably isn't portable without further work */ -constexpr size_t LEAF_NODE_CAPACITY = 106; +constexpr size_t LEAF_NODE_CAPACITY = 88; struct LBALeafNode : FixedKVLeafNode< @@ -453,11 +453,21 @@ struct LBACursor : BtreeCursor { } extent_types_t get_extent_type() const { + assert(iter.get_val().type != extent_types_t::NONE); assert(is_viewable()); assert(!is_end()); return iter.get_val().type; } + bool has_shadow_paddr() const { + return iter.get_val().shadow_paddr != P_ADDR_NULL; + } + + paddr_t get_shadow_paddr() const { + assert(has_shadow_paddr()); + return iter.get_val().shadow_paddr; + } + base_iertr::future<> refresh(); private: diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h index 8826d96a2b4..a537a351418 100644 --- a/src/crimson/os/seastore/lba_manager.h +++ b/src/crimson/os/seastore/lba_manager.h @@ -253,7 +253,7 @@ public: using scan_mappings_iertr = base_iertr; using scan_mappings_ret = scan_mappings_iertr::future<>; using scan_mappings_func_t = std::function< - void(laddr_t, paddr_t, extent_len_t)>; + void(laddr_t, paddr_t, paddr_t, extent_len_t)>; virtual scan_mappings_ret scan_mappings( Transaction &t, laddr_t begin, diff --git a/src/crimson/os/seastore/lba_mapping.h b/src/crimson/os/seastore/lba_mapping.h index 395e51e9534..d72198ef101 100644 --- a/src/crimson/os/seastore/lba_mapping.h +++ b/src/crimson/os/seastore/lba_mapping.h @@ -132,6 +132,18 @@ public: return direct_cursor->get_length(); } + bool has_shadow_val() const { + assert(is_linked_direct()); + assert(!direct_cursor->is_end()); + return direct_cursor->has_shadow_paddr(); + } + + paddr_t get_shadow_val() const { + assert(is_linked_direct()); + assert(!direct_cursor->is_end()); + return direct_cursor->get_shadow_paddr(); + } + checksum_t get_checksum() const { assert(is_linked_direct()); assert(!direct_cursor->is_end()); diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 5479549190e..c20c3e0fac5 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -229,12 +229,33 @@ TransactionManager::ref_ret TransactionManager::remove( extent_ref_count_t refcount = cursor->get_refcount(); auto laddr = cursor->get_laddr(); auto length = cursor->get_length(); + paddr_t shadow_addr = P_ADDR_NULL; + if (cursor->has_shadow_paddr()) { + shadow_addr = cursor->get_shadow_paddr(); + } assert(refcount > 0); --refcount; co_await lba_manager->update_mapping_refcount( t, std::move(cursor), -1); if (refcount == 0) { cache->retire_extent(t, ref); + if (shadow_addr != P_ADDR_NULL) { + if (auto shadow = ref->get_shadow(); shadow) { + t.add_absent_to_retired_set(shadow); + } else { + auto laddr = ref->get_laddr(); + cache->retire_absent_extent_addr_by_type( + t, laddr, shadow_addr, length, ref->get_type(), + [laddr](auto &extent) { + auto lextent = extent.template cast(); + assert(extent.is_logical()); + assert(!lextent->has_laddr()); + assert(!extent.has_been_invalidated()); + lextent->set_laddr(laddr); + extent.set_shadow_extent(true); + }); + } + } } DEBUGT("removed {}~0x{:x} refcount={} -- {}", t, laddr, length, @@ -317,6 +338,12 @@ TransactionManager::_remove( } ); } + if (mapping.has_shadow_val()) { + cache->retire_absent_extent_addr( + t, mapping.get_intermediate_base(), + mapping.get_shadow_val(), + mapping.get_intermediate_length()); + } } LBACursorRef indirect_cursor; diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 5e5b1b64f21..17afb573643 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -1103,6 +1103,7 @@ public: if (!mapping.is_indirect() && mapping.is_zero_reserved()) { SUBDEBUGT(seastore_tm, "zero reserved, mapping {}, {} remaps", t, mapping, remaps); + ceph_assert(!mapping.has_shadow_val()); //TODO: drop this assert assert(mapping.get_extent_type() == extent_types_t::OBJECT_DATA_BLOCK); auto type = mapping.get_extent_type(); @@ -1469,6 +1470,20 @@ private: lextent->set_laddr(laddr); } ); + if (pin.has_shadow_val()) { + cache->retire_absent_extent_addr_by_type( + t, pin.get_key(), pin.get_shadow_val(), + original_len, pin.get_extent_type(), + [laddr](auto &extent) { + auto lextent = extent.template cast(); + assert(extent.is_logical()); + assert(!lextent->has_laddr()); + assert(!extent.has_been_invalidated()); + lextent->set_laddr(laddr); + } + ); + } + } } @@ -1498,6 +1513,11 @@ private: auto remap_len = remap.len; auto remap_laddr = (original_laddr + remap_offset).checked_to_laddr(); auto remap_paddr = original_paddr.add_offset(remap_offset); + auto shadow_paddr = P_ADDR_NULL; + if (pin.has_shadow_val()) { + assert(pin.get_shadow_val() != P_ADDR_NULL); + shadow_paddr = pin.get_shadow_val().add_offset(remap_offset); + } SUBDEBUGT(seastore_tm, "remap direct pin into {}~0x{:x} {} ...", t, remap_laddr, remap_len, remap_paddr); ceph_assert(remap_len < original_len); @@ -1512,6 +1532,17 @@ private: remap_offset, remap_len, original_bptr); + if (shadow_paddr != P_ADDR_NULL) { + SUBTRACET(seastore_tm, "remap shadow {}", t, shadow_paddr); + auto cold_ext = cache->alloc_remapped_extent( + t, + remap_laddr, + shadow_paddr, + remap_offset, + remap_len, + std::nullopt); + boost::ignore_unused(cold_ext); + } // user must initialize the logical extent themselves. remapped_extent->set_seen_by_users(); remap.extent = remapped_extent.get(); diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index bdf0f5d4a65..cb04f36cc38 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -313,7 +313,7 @@ struct lba_btree_test : btree_test_base { } static auto get_map_val(extent_len_t len, extent_types_t type) { - return lba_map_val_t{0, (pladdr_t)P_ADDR_NULL, len, 0, type}; + return lba_map_val_t{0, (pladdr_t)P_ADDR_NULL, P_ADDR_NULL, len, 0, type}; } device_off_t next_off = 0; @@ -715,7 +715,7 @@ struct btree_lba_manager_test : btree_test_base { *t.t, L_ADDR_MIN, L_ADDR_MAX, - [iter=t.mappings.begin(), &t](auto l, auto p, auto len) mutable { + [iter=t.mappings.begin(), &t](auto l, auto p, auto s, auto len) mutable { EXPECT_NE(iter, t.mappings.end()); EXPECT_EQ(l, iter->first); EXPECT_EQ(p, iter->second.addr); diff --git a/src/test/crimson/seastore/test_transaction_manager.cc b/src/test/crimson/seastore/test_transaction_manager.cc index 8461d151f89..25917d31e00 100644 --- a/src/test/crimson/seastore/test_transaction_manager.cc +++ b/src/test/crimson/seastore/test_transaction_manager.cc @@ -797,7 +797,7 @@ struct transaction_manager_test_t : t, get_laddr_hint(0), L_ADDR_MAX, - [iter=overlay.begin(), &overlay](auto l, auto p, auto len) mutable { + [iter=overlay.begin(), &overlay](auto l, auto p, auto s, auto len) mutable { EXPECT_NE(iter, overlay.end()); logger().debug( "check_mappings: scan {}", @@ -1892,10 +1892,10 @@ TEST_P(tm_random_block_device_test_t, scatter_allocation) laddr_t ADDR = get_laddr_hint(0xFF * 4096); epm->prefill_fragmented_devices(); auto t = create_transaction(); - for (int i = 0; i < 1974; i++) { + for (int i = 0; i < 1958; i++) { auto extents = alloc_extents(t, (ADDR + i * 16384).checked_to_laddr(), 16384, 'a'); } - alloc_extents_deemed_fail(t, (ADDR + 1974 * 16384).checked_to_laddr(), 16384, 'a'); + alloc_extents_deemed_fail(t, (ADDR + 1958 * 16384).checked_to_laddr(), 16384, 'a'); check_mappings(t); check(); submit_transaction(std::move(t)); -- 2.47.3