From: Zhang Song Date: Wed, 3 Sep 2025 07:54:40 +0000 (+0800) Subject: crimson/os/seastore: adapt copy on write for static onode prefix X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=9b56d37c73e4a23a1a87838a646dc784fd18f64c;p=ceph-ci.git crimson/os/seastore: adapt copy on write for static onode prefix Signed-off-by: Zhang Song Signed-off-by: Xuehan Xu --- diff --git a/src/crimson/os/seastore/lba/btree_lba_manager.cc b/src/crimson/os/seastore/lba/btree_lba_manager.cc index ad62f08ef96..18f585cda81 100644 --- a/src/crimson/os/seastore/lba/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba/btree_lba_manager.cc @@ -237,6 +237,17 @@ BtreeLBAManager::resolve_indirect_cursor( }); } +BtreeLBAManager::lower_bound_ret +BtreeLBAManager::lower_bound( + Transaction &t, + laddr_t laddr) +{ + auto c = get_context(t); + auto btree = co_await get_btree(cache, c); + auto iter = co_await btree.lower_bound(c, laddr); + co_return iter.get_cursor(c); +} + BtreeLBAManager::alloc_extent_ret BtreeLBAManager::reserve_region( Transaction &t, diff --git a/src/crimson/os/seastore/lba/btree_lba_manager.h b/src/crimson/os/seastore/lba/btree_lba_manager.h index 04944968ea5..684cc86e38d 100644 --- a/src/crimson/os/seastore/lba/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba/btree_lba_manager.h @@ -75,6 +75,10 @@ public: Transaction &t, LogicalChildNode &extent) final; + lower_bound_ret lower_bound( + Transaction &t, + laddr_t laddr) final; + alloc_extent_ret reserve_region( Transaction &t, LBACursorRef pos, diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h index 3e3d1d9f852..8826d96a2b4 100644 --- a/src/crimson/os/seastore/lba_manager.h +++ b/src/crimson/os/seastore/lba_manager.h @@ -55,6 +55,11 @@ public: Transaction &t, LogicalChildNode &extent) = 0; + using lower_bound_ret = base_iertr::future; + virtual lower_bound_ret lower_bound( + Transaction &t, + laddr_t laddr) = 0; + #ifdef UNIT_TESTS_BUILT using get_end_mapping_iertr = base_iertr; using get_end_mapping_ret = get_end_mapping_iertr::future; diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc index 9ebd21e003c..cfa344e935d 100644 --- a/src/crimson/os/seastore/object_data_handler.cc +++ b/src/crimson/os/seastore/object_data_handler.cc @@ -1714,29 +1714,42 @@ ObjectDataHandler::clone_ret ObjectDataHandler::copy_on_write( context_t ctx) { - return with_object_data( + return with_objects_data( ctx, - [ctx, this](auto &object_data) -> clone_ret { + [ctx, this](auto &object_data, auto &d_object_data) -> clone_ret + { auto mapping = co_await ctx.tm.get_pin( ctx.t, object_data.get_reserved_data_base() ).handle_error_interruptible( clone_iertr::pass_further{}, crimson::ct_error::assert_all{"unexpected enoent"} ); - object_data_t d_object_data = get_null_object_data(); co_await do_clone(ctx, object_data, d_object_data, mapping, false); auto old_base = object_data.get_reserved_data_base(); auto old_len = object_data.get_reserved_data_len(); - object_data.update_reserved( - d_object_data.get_reserved_data_base(), - d_object_data.get_reserved_data_len()); - ctx.onode.unset_need_cow(ctx.t); + assert(ctx.d_onode->need_cow()); + ctx.d_onode->unset_need_cow(ctx.t); co_await ctx.tm.remove_mappings_in_range( ctx.t, old_base, old_len, std::move(mapping), {false, true} ).handle_error_interruptible( clone_iertr::pass_further{}, crimson::ct_error::assert_all{"unexpected enoent"} ).discard_result(); + + auto old_md_start = old_base.with_metadata().with_offset_by_blocks(0); + auto md_mapping = co_await ctx.tm.lower_bound_pin(ctx.t, old_md_start); + if (md_mapping.is_end() || + md_mapping.get_key().get_clone_prefix() != + old_md_start.get_clone_prefix()) { + co_return; + } + auto new_prefix = d_object_data + .get_reserved_data_base() + .get_clone_prefix() + .with_metadata(); + auto md_dst_mapping = co_await ctx.tm.lower_bound_pin(ctx.t, new_prefix); + co_await ctx.tm.move_region( + ctx.t, md_mapping, md_dst_mapping, new_prefix, true); }); } @@ -1755,6 +1768,8 @@ ObjectDataHandler::do_clone( auto mapping = co_await prepare_data_reservation( ctx, *ctx.d_onode, d_object_data, old_len); ceph_assert(mapping.has_value()); + assert(old_base.get_object_prefix() == mapping->get_key().get_object_prefix()); + assert(old_base.get_clone_prefix() != mapping->get_key().get_clone_prefix()); DEBUGT("new obj reserve_data_base: {}, len 0x{:x}", ctx.t, d_object_data.get_reserved_data_base(), diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h index 2ee6ca32a71..83392ba7dc6 100644 --- a/src/crimson/os/seastore/onode.h +++ b/src/crimson/os/seastore/onode.h @@ -135,6 +135,7 @@ public: virtual void set_need_cow(Transaction&) = 0; virtual void unset_need_cow(Transaction&) = 0; virtual void swap_layout(Transaction&, Onode&) = 0; + virtual boost::intrusive_ptr offload_data_and_md(Transaction&) = 0; laddr_hint_t get_metadata_hint(uint64_t block_size = laddr_t::UNIT_SIZE) const { return get_hint(block_size, /*is_metadata*/true); diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h index 2d234eeb763..59603dfd15a 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h @@ -9,6 +9,57 @@ namespace crimson::os::seastore::onode { +struct FakeOnode final : Onode { + FakeOnode(const hobject_t &hobj, onode_layout_t layout) + : Onode(hobj), layout(layout) {} + + onode_layout_t layout{}; + + laddr_hint_t init_hint(extent_len_t block_size, bool is_metadata) const final { + ceph_abort("impossible"); + return LADDR_HINT_NULL; + } + laddr_hint_t generate_clone_hint( + local_object_id_t object_id, + extent_len_t block_size, + bool is_metadata) const final { + ceph_abort("impossible"); + return LADDR_HINT_NULL; + } + + bool is_alive() const final { return true; } + const onode_layout_t &get_layout() const final { + return layout; + } + void update_onode_size(Transaction &, uint32_t) final { + ceph_abort("impossible"); + } + void update_omap_root(Transaction &, omap_root_t &root) final { + ceph_abort("impossible"); + } + void update_xattr_root(Transaction &, omap_root_t &root) final { + ceph_abort("impossible"); + } + void update_object_data(Transaction &, object_data_t &data) final { + ceph_abort("impossible"); + } + void update_object_info(Transaction &, ceph::bufferlist &) final { + ceph_abort("impossible"); + } + void update_snapset(Transaction &, ceph::bufferlist &) final { + ceph_abort("impossible"); + } + void clear_object_info(Transaction &) final { ceph_abort("impossible"); } + void clear_snapset(Transaction &) final { ceph_abort("impossible"); } + void set_need_cow(Transaction &) final {} + void unset_need_cow(Transaction &) final {} + void swap_layout(Transaction &, Onode &o) final { ceph_abort("impossible"); } + boost::intrusive_ptr offload_data_and_md(Transaction &t) final { + ceph_abort("impossible"); + return nullptr; + } +}; + struct FLTreeOnode final : Onode, Value { static constexpr tree_conf_t TREE_CONF = { value_magic_t::ONODE, @@ -91,6 +142,20 @@ struct FLTreeOnode final : Onode, Value { _swap_layout(t, static_cast(onode)); } + boost::intrusive_ptr offload_data_and_md(Transaction & t) final { + assert(status != status_t::DELETED); + auto fake_onode = new FakeOnode(hobj, get_layout()); + object_data_t data{L_ADDR_NULL, 0}; + update_object_data(t, data); + omap_root_t root; + root.type = omap_type_t::OMAP; + update_omap_root(t, root); + root.type = omap_type_t::XATTR; + update_xattr_root(t, root); + root.type = omap_type_t::LOG; + return fake_onode; + } + void _swap_layout(Transaction &t, FLTreeOnode &other) { assert(status != status_t::DELETED); assert(other.status != status_t::DELETED); diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index dac5e97dcee..41f52ce9948 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -2062,41 +2062,45 @@ SeaStore::Shard::_do_transaction_step( ); } -SeaStore::Shard::tm_ret -SeaStore::Shard::_rename( - internal_context_t &ctx, - OnodeRef &onode, - OnodeRef &d_onode) +namespace { +void rename_onode_omap_metadata( + Transaction &t, Onode &src, Onode &dst) { - auto &objHandler = ObjectDataHandler(max_object_size); - co_await objHanlder.rename(ObjectDataHandler::context_t{ - *transaction_manager, *ctx.transaction, *onode, d_onode.get() - }); - auto get_prefix = [](Onode &onode) { - auto p = onode.get_clone_prefix(); - assert(p); - return *p; - }; - auto src_prefix = get_prefix(*onode); - auto dst_prefix = get_prefix(*d_onode); + auto src_prefix = *src.get_clone_prefix(); + auto dst_prefix = *dst.get_clone_prefix(); - auto rename_omap_root = [&](omap_type_t type) { - auto root = onode->get_root(type).get(d_onode->get_metadata_hint()); + auto rename_root = [&src, &dst, src_prefix, dst_prefix](omap_type_t type) { + auto root = src.get_root(type).get(dst.get_metadata_hint()); if (root.is_null()) { return root; } auto offset = root.addr.get_byte_distance(src_prefix); root.update( (dst_prefix + offset).checked_to_laddr(), - root.depth, d_onode->get_metadata_hint(), type); + root.depth, dst.get_metadata_hint(), type); return root; }; + auto omap_root = rename_root(omap_type_t::OMAP); + auto xattr_root = rename_root(omap_type_t::XATTR); + + dst.update_omap_root(t, omap_root); + dst.update_xattr_root(t, xattr_root); +} +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_rename( + internal_context_t &ctx, + OnodeRef &onode, + OnodeRef &d_onode) +{ + auto &objHandler = ObjectDataHandler(max_object_size); + co_await objHanlder.rename(ObjectDataHandler::context_t{ + *transaction_manager, *ctx.transaction, *onode, d_onode.get() + }); auto olayout = onode->get_layout(); uint32_t size = olayout.size; - auto omap_root = rename_omap_root(omap_type_t::OMAP); - auto xattr_root = rename_omap_root(omap_type_t::XATTR); - auto log_root = rename_omap_root(omap_type_t::LOG); auto oi_bl = ceph::bufferlist::static_from_mem( &olayout.oi[0], (uint32_t)olayout.oi_size); @@ -2105,11 +2109,9 @@ SeaStore::Shard::_rename( (uint32_t)olayout.ss_size); d_onode->update_onode_size(*ctx.transaction, size); - d_onode->update_omap_root(*ctx.transaction, omap_root); - d_onode->update_xattr_root(*ctx.transaction, xattr_root); - d_onode->update_log_root(*ctx.transaction, log_root); d_onode->update_object_info(*ctx.transaction, oi_bl); d_onode->update_snapset(*ctx.transaction, ss_bl); + rename_onode_omap_metadata(*ctx.transaction, *onode, *d_onode); co_await onode_manager->erase_onode( *ctx.transaction, onode ).handle_error_interruptible( @@ -2135,17 +2137,14 @@ SeaStore::Shard::_remove( ObjectDataHandler(max_object_size), [&onode, this, &ctx](auto &objhandler) { - auto fut = ObjectDataHandler::clone_iertr::now(); - auto objctx = ObjectDataHandler::context_t{ - *transaction_manager, - *ctx.transaction, - *onode, - }; - if (onode->need_cow()) { - fut = objhandler.copy_on_write(objctx); - } - return fut.si_then([&objhandler, objctx] { - return objhandler.clear(objctx); + return _maybe_copy_on_write(ctx, *onode, objhandler + ).si_then([&onode, this, &ctx, &objhandler] { + return objhandler.clear( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + *onode, + }); }); }); }).si_then([this, &ctx, &onode] { @@ -2188,20 +2187,20 @@ SeaStore::Shard::_write( return seastar::do_with( std::move(_bl), ObjectDataHandler(max_object_size), - [=, this, &ctx, &onode](auto &bl, auto &objhandler) { - auto fut = ObjectDataHandler::clone_iertr::now(); - auto objctx = ObjectDataHandler::context_t{ + [=, this, &ctx, &onode](auto &bl, auto &objhandler) + { + return _maybe_copy_on_write(ctx, onode, objhandler + ).si_then([&ctx, &onode, &objhandler, offset, &bl, this] { + return objhandler.write( + ObjectDataHandler::context_t{ *transaction_manager, *ctx.transaction, onode, - }; - if (onode.need_cow()) { - fut = objhandler.copy_on_write(objctx); - } - return fut.si_then([&objhandler, objctx, offset, &bl] { - return objhandler.write(objctx, offset, bl); - }); + }, + offset, + bl); }); + }); } SeaStore::Shard::tm_ret @@ -2225,7 +2224,13 @@ SeaStore::Shard::_clone( * the case where the *source* is not further mutated, so here we * reverse the two onodes so that HEAD will be the target. */ + auto id = onode.get_layout() + .object_data + .get() + .get_reserved_data_base() + .get_local_object_id(); onode.swap_layout(*ctx.transaction, d_onode); + onode.set_sibling_object_id(id); return objHandler.clone( ObjectDataHandler::context_t{ *transaction_manager, @@ -2256,6 +2261,27 @@ SeaStore::Shard::_clone( }); } +SeaStore::Shard::tm_ret +SeaStore::Shard::_maybe_copy_on_write( + internal_context_t &ctx, + Onode &onode, + ObjectDataHandler &handler) +{ + if (!onode.need_cow()) { + co_return; + } + auto fake_onode = onode.offload_data_and_md(*ctx.transaction); + onode.set_sibling_object_id(fake_onode->get_clone_prefix()->get_local_object_id()); + co_await handler.copy_on_write( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + *fake_onode, + &onode + }); + rename_onode_omap_metadata(*ctx.transaction, *fake_onode, onode); +} + SeaStore::Shard::tm_ret SeaStore::Shard::_clone_range( internal_context_t &ctx, @@ -2308,18 +2334,18 @@ SeaStore::Shard::_zero( std::max(offset + len, object_size)); return seastar::do_with( ObjectDataHandler(max_object_size), - [=, this, &ctx, &onode](auto &objhandler) { - auto fut = ObjectDataHandler::clone_iertr::now(); - auto objctx = ObjectDataHandler::context_t{ - *transaction_manager, - *ctx.transaction, - onode, - }; - if (onode.need_cow()) { - fut = objhandler.copy_on_write(objctx); - } - return fut.si_then([&objhandler, objctx, offset, len] { - return objhandler.zero(objctx, offset, len); + [=, this, &ctx, &onode](auto &objhandler) + { + return _maybe_copy_on_write(ctx, onode, objhandler + ).si_then([this, &ctx, &onode, &objhandler, offset, len] { + return objhandler.zero( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + onode, + }, + offset, + len); }); }); } @@ -2363,18 +2389,17 @@ SeaStore::Shard::_truncate( onode.update_onode_size(*ctx.transaction, size); return seastar::do_with( ObjectDataHandler(max_object_size), - [=, this, &ctx, &onode](auto &objhandler) { - auto fut = ObjectDataHandler::clone_iertr::now(); - auto objctx = ObjectDataHandler::context_t{ - *transaction_manager, - *ctx.transaction, - onode, - }; - if (onode.need_cow()) { - fut = objhandler.copy_on_write(objctx); - } - return fut.si_then([&objhandler, objctx, size] { - return objhandler.truncate(objctx, size); + [=, this, &ctx, &onode](auto &objhandler) + { + return _maybe_copy_on_write(ctx, onode, objhandler + ).si_then([this, &ctx, &onode, &objhandler, size] { + return objhandler.truncate( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + onode, + }, + size); }); }); } diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index 70f334efdb9..8ae335796bf 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -360,6 +360,10 @@ public: internal_context_t &ctx, Onode &onode, Onode &d_onode); + tm_ret _maybe_copy_on_write( + internal_context_t &ctx, + Onode &onode, + ObjectDataHandler &handler); tm_ret _rename( internal_context_t &ctx, OnodeRef &onode, diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index d1b7c11f859..2e59c701ea3 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -181,6 +181,13 @@ public: co_return ret; } + base_iertr::future lower_bound_pin( + Transaction &t, + laddr_t laddr) { + auto cursor = co_await lba_manager->lower_bound(t, laddr); + co_return co_await resolve_cursor_to_mapping(t, cursor); + } + /** * maybe_indirect_extent_t * diff --git a/src/test/crimson/seastore/test_object_data_handler.cc b/src/test/crimson/seastore/test_object_data_handler.cc index fa94bcf3126..903a1314a53 100644 --- a/src/test/crimson/seastore/test_object_data_handler.cc +++ b/src/test/crimson/seastore/test_object_data_handler.cc @@ -135,7 +135,27 @@ public: mlayout.ss_size = 0; }); } - + boost::intrusive_ptr offload_data_and_md(Transaction& t) final { + auto ret = new TestOnode(); + { + auto data = layout.object_data.get(); + ret->update_object_data(t, data); + auto root = layout.omap_root.get(LADDR_HINT_NULL); + ret->update_omap_root(t, root); + root = layout.xattr_root.get(LADDR_HINT_NULL); + ret->update_xattr_root(t, root); + } + { + auto data = object_data_t{L_ADDR_NULL, 0}; + update_object_data(t, data); + auto root = omap_root_t{}; + root.type = omap_type_t::OMAP; + update_omap_root(t, root); + root.type = omap_type_t::XATTR; + update_xattr_root(t, root); + } + return ret; + } }; struct object_data_handler_test_t: