From e29927358c5da767e62c0a435eb036f0bbe2103e Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 12 Aug 2025 16:24:08 +0800 Subject: [PATCH] crimson/os/seastore/object_data_handler: we don't have to maintain the symmetric indirect lba relationship at the time of clone OP_CLONE is done in the following way: 1. First, swap the layout of the head onode and the clone onode, so that clone onode's object_data, omap_root, xattr_root and log_root all point to the head onode's corresponding fields; 2. Do SeaStore::_clone() from the clone onode to the head onode, which is exactly what rollback is done. This makes the code of ObjectDataHandler::clone() and ObjectDataHandler::copy_on_write() even simpler, and can facilitate the clone/rollback scenarios when the "128-bit" lba key layout is involved. Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/lba_mapping.h | 1 + .../os/seastore/object_data_handler.cc | 88 ++++++++----------- src/crimson/os/seastore/object_data_handler.h | 19 ++++ src/crimson/os/seastore/onode.h | 9 ++ src/crimson/os/seastore/seastore.cc | 46 ++++++++-- src/crimson/os/seastore/seastore_types.h | 3 + src/crimson/os/seastore/transaction_manager.h | 15 +++- 7 files changed, 117 insertions(+), 64 deletions(-) diff --git a/src/crimson/os/seastore/lba_mapping.h b/src/crimson/os/seastore/lba_mapping.h index 8e9cf8462da77..b7c93d4287603 100644 --- a/src/crimson/os/seastore/lba_mapping.h +++ b/src/crimson/os/seastore/lba_mapping.h @@ -111,6 +111,7 @@ public: bool is_zero_reserved() const { return !is_indirect() && get_val().is_zero(); } + // true if the mapping corresponds to real data bool is_real() const { return !is_indirect() && !get_val().is_zero(); } diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc index 7a3692e6e78f4..4fb7fb98b72a2 100644 --- a/src/crimson/os/seastore/object_data_handler.cc +++ b/src/crimson/os/seastore/object_data_handler.cc @@ -1383,17 +1383,43 @@ ObjectDataHandler::clear_ret ObjectDataHandler::clear( }); } +ObjectDataHandler::clone_ret +ObjectDataHandler::do_clone( + context_t ctx, + object_data_t &object_data, + object_data_t &d_object_data, + LBAMapping first_mapping, + bool updateref) +{ + LOG_PREFIX("ObjectDataHandler::do_clone"); + assert(d_object_data.is_null()); + auto old_base = object_data.get_reserved_data_base(); + auto old_len = object_data.get_reserved_data_len(); + auto mapping = co_await prepare_data_reservation( + ctx, d_object_data, old_len); + ceph_assert(mapping.has_value()); + DEBUGT("new obj reserve_data_base: {}, len 0x{:x}", + ctx.t, + d_object_data.get_reserved_data_base(), + d_object_data.get_reserved_data_len()); + auto pos = co_await ctx.tm.remove(ctx.t, std::move(*mapping) + ).handle_error_interruptible( + clone_iertr::pass_further{}, + crimson::ct_error::assert_all{"unexpected enoent"} + ); + auto base = d_object_data.get_reserved_data_base(); + auto len = d_object_data.get_reserved_data_len(); + auto cr_ret = co_await ctx.tm.clone_range( + ctx.t, old_base, base, 0, len, std::move(pos), + std::move(first_mapping), updateref); + if (cr_ret.shared_direct_mapping) { + ctx.onode.set_need_cow(ctx.t); + } +} + ObjectDataHandler::clone_ret ObjectDataHandler::clone( context_t ctx) { - // the whole clone procedure can be seperated into the following steps: - // 1. let clone onode(d_object_data) take the head onode's - // object data base; - // 2. reserve a new region in lba tree for the head onode; - // 3. clone all extents of the clone onode, see transaction_manager.h - // for the details of clone_pin; - // 4. reserve the space between the head onode's size and its reservation - // length. return with_objects_data( ctx, [ctx, this](auto &object_data, auto &d_object_data) { @@ -1403,50 +1429,8 @@ ObjectDataHandler::clone_ret ObjectDataHandler::clone( } return ctx.tm.get_pin(ctx.t, object_data.get_reserved_data_base() ).si_then([this, &object_data, &d_object_data, ctx](auto mapping) { - auto old_base = object_data.get_reserved_data_base(); - auto old_len = object_data.get_reserved_data_len(); - return prepare_data_reservation( - ctx, - d_object_data, - object_data.get_reserved_data_len() - ).si_then([&object_data, &d_object_data, ctx](auto mapping) { - assert(!object_data.is_null()); - assert(mapping); - LOG_PREFIX(ObjectDataHandler::clone); - DEBUGT("cloned obj reserve_data_base: {}, len 0x{:x}", - ctx.t, - d_object_data.get_reserved_data_base(), - d_object_data.get_reserved_data_len()); - return ctx.tm.remove(ctx.t, std::move(*mapping)); - }).si_then([mapping, &d_object_data, ctx](auto pos) mutable { - auto base = d_object_data.get_reserved_data_base(); - auto len = d_object_data.get_reserved_data_len(); - return ctx.tm.clone_range( - ctx.t, base, len, std::move(pos), std::move(mapping), true); - }).si_then([ctx, &object_data, &d_object_data, this] { - object_data.clear(); - return prepare_data_reservation( - ctx, - object_data, - d_object_data.get_reserved_data_len() - ).si_then([ctx, &object_data](auto mapping) { - LOG_PREFIX("ObjectDataHandler::clone"); - DEBUGT("head obj reserve_data_base: {}, len 0x{:x}", - ctx.t, - object_data.get_reserved_data_base(), - object_data.get_reserved_data_len()); - return ctx.tm.remove(ctx.t, std::move(*mapping)); - }); - }).si_then([ctx, &object_data, mapping](auto pos) mutable { - auto base = object_data.get_reserved_data_base(); - auto len = object_data.get_reserved_data_len(); - return ctx.tm.clone_range( - ctx.t, base, len, std::move(pos), std::move(mapping), false); - }).si_then([ctx, mapping, old_base, old_len] { - return ctx.tm.remove_mappings_in_range( - ctx.t, old_base, old_len, std::move(mapping), {false, true} - ).discard_result(); - }); + ceph_assert(ctx.d_onode); + return do_clone(ctx, object_data, d_object_data, std::move(mapping), true); }).handle_error_interruptible( clone_iertr::pass_further{}, crimson::ct_error::assert_all{"unexpected enoent"} diff --git a/src/crimson/os/seastore/object_data_handler.h b/src/crimson/os/seastore/object_data_handler.h index aecacc2b5bee5..0438acff4c01c 100644 --- a/src/crimson/os/seastore/object_data_handler.h +++ b/src/crimson/os/seastore/object_data_handler.h @@ -323,6 +323,10 @@ public: clear_ret clear(context_t ctx); /// Clone data of an Onode + /// Note that the clone always assume that ctx.onode + /// is a snap onode, so, for OP_CLONE, the caller of + /// this method should swap the layout of the onode + /// and the dest_onode first. using clone_iertr = base_iertr; using clone_ret = clone_iertr::future<>; clone_ret clone(context_t ctx); @@ -337,6 +341,21 @@ private: std::optional &&bl, LBAMapping first_mapping); + /** + * do_clone + * + * Clone lba mappings from object_data to d_object_data. + * object_data must belong to ctx.onode, and d_object_data must belong to ctx.d_onode + * This implementation is asymmetric and optimizes for (but does not require) the case + * that source is not further mutated. + */ + clone_ret do_clone( + context_t ctx, + object_data_t &object_data, + object_data_t &d_object_data, + LBAMapping first_mapping, + bool updateref); + /// Ensures object_data reserved region is prepared write_iertr::future> prepare_data_reservation( diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h index 98369424011f3..4a0d982ebdf7c 100644 --- a/src/crimson/os/seastore/onode.h +++ b/src/crimson/os/seastore/onode.h @@ -93,6 +93,15 @@ public: virtual const onode_layout_t &get_layout() const = 0; virtual ~Onode() = default; + bool is_head() const { + return hobj.is_head(); + } + bool is_snap() const { + return hobj.is_snap(); + } + bool need_cow() const { + return get_layout().need_cow; + } virtual void update_onode_size(Transaction&, uint32_t) = 0; virtual void update_omap_root(Transaction&, omap_root_t&) = 0; virtual void update_log_root(Transaction&, omap_root_t&) = 0; diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index 17926425cf1ed..a46010d2c89c1 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -1957,21 +1957,49 @@ SeaStore::Shard::_clone( { auto &object_size = onode.get_layout().size; d_onode.update_onode_size(*ctx.transaction, object_size); - return objHandler.clone( - ObjectDataHandler::context_t{ - *transaction_manager, - *ctx.transaction, - onode, - &d_onode}); + if (onode.is_head()) { // OP_CLONE + assert(onode.is_head()); + assert(d_onode.is_snap()); + /* The most common usage of OP_CLONE is during a write operation. + * The osd will submit a transaction cloning HEAD to clone and + * then mutating HEAD. ObjectDataHandler::do_clone optimizes for + * the case where the *source* is not further mutated, so here we + * reverse the two onodes so that HEAD will be the target. + */ + onode.swap_layout(*ctx.transaction, d_onode); + return objHandler.clone( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + d_onode, + &onode}); + } else { // OP_ROLLBACK + assert(d_onode.is_head()); + return objHandler.clone( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + onode, + &d_onode}); + } }).si_then([&ctx, &onode, &d_onode, this] { return omaptree_clone( - *ctx.transaction, omap_type_t::XATTR, onode, d_onode); + *ctx.transaction, + omap_type_t::XATTR, + onode.is_head() ? d_onode : onode, + onode.is_head() ? onode : d_onode); }).si_then([&ctx, &onode, &d_onode, this] { return omaptree_clone( - *ctx.transaction, omap_type_t::OMAP, onode, d_onode); + *ctx.transaction, + omap_type_t::OMAP, + onode.is_head() ? d_onode : onode, + onode.is_head() ? onode : d_onode); }).si_then([&ctx, &onode, &d_onode, this] { return omaptree_clone( - *ctx.transaction, omap_type_t::LOG, onode, d_onode); + *ctx.transaction, + omap_type_t::LOG, + onode.is_head() ? d_onode : onode, + onode.is_head() ? onode : d_onode); }); } diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 976b6a59c39c4..f7b243f900655 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -1824,6 +1824,9 @@ public: reserved_data_len = 0; } }; +constexpr object_data_t get_null_object_data() { + return object_data_t{L_ADDR_NULL, 0}; +} struct __attribute__((packed)) object_data_le_t { laddr_le_t reserved_data_base = laddr_le_t(L_ADDR_NULL); diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 120dfbea670d2..40868c09adcdc 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -611,8 +611,10 @@ public: }); } + // clone the mappings in range base~len, returns true if there exists + // direct mappings that are cloned. using clone_iertr = base_iertr; - using clone_ret = clone_iertr::future<>; + using clone_ret = clone_iertr::future; clone_ret clone_range( Transaction &t, laddr_t base, @@ -628,9 +630,11 @@ public: std::move(pos), std::move(mapping), (extent_len_t)0, - [&t, this, updateref, base, len](auto &pos, auto &mapping, auto &offset) { + false, + [&t, this, updateref, base, len] + (auto &pos, auto &mapping, auto &offset, auto &ret) { return trans_intr::repeat( - [&t, this, &pos, &mapping, &offset, updateref, base, len]() + [&t, this, &pos, &mapping, &offset, updateref, base, len, &ret]() -> clone_iertr::future { if (offset >= len) { return clone_iertr::make_ready_future< @@ -657,6 +661,9 @@ public: crimson::ct_error::assert_all{"unexpected error"} ); } + if (mapping.is_real()) { + ret = true; + } return clone_pin( t, std::move(pos), std::move(mapping), (base + offset).checked_to_laddr(), updateref @@ -671,6 +678,8 @@ public: return seastar::stop_iteration::no; }); }); + }).si_then([&ret] { + return ret; }); }); } -- 2.39.5