From 484344d8d6e70a7ac3b6332cedbbb22a6a1608c0 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 14 Aug 2025 13:34:31 +0800 Subject: [PATCH] crimson/os/seastore: support OP_CLONERANGE2 Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/lba_mapping.h | 4 + .../os/seastore/object_data_handler.cc | 312 +++++++++++++++++- src/crimson/os/seastore/object_data_handler.h | 75 ++++- src/crimson/os/seastore/onode.h | 3 + src/crimson/os/seastore/seastore.cc | 52 ++- src/crimson/os/seastore/seastore.h | 7 + src/crimson/os/seastore/transaction_manager.h | 114 +++---- 7 files changed, 487 insertions(+), 80 deletions(-) diff --git a/src/crimson/os/seastore/lba_mapping.h b/src/crimson/os/seastore/lba_mapping.h index b7c93d4287603..13acb4b92174c 100644 --- a/src/crimson/os/seastore/lba_mapping.h +++ b/src/crimson/os/seastore/lba_mapping.h @@ -148,6 +148,10 @@ public: return direct_cursor->get_laddr(); } + laddr_t get_end() const { /* end address of this mapping: get_key() + get_length(), converted back to laddr_t */ + return (get_key() + get_length()).checked_to_laddr(); + } + // An lba pin may be indirect, see comments in lba/btree_lba_manager.h laddr_t get_intermediate_key() const { assert(is_indirect()); diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc index e99a56bcbdb8d..2db4b70104bb3 100644 --- a/src/crimson/os/seastore/object_data_handler.cc +++ b/src/crimson/os/seastore/object_data_handler.cc @@ -130,6 +130,9 @@ ObjectDataHandler::read_iertr::future> read_mapping( extent_len_t unaligned_len, bool for_zero /* whether this is for zero overwrite*/) { + LOG_PREFIX(ObjectDataHandler::read_mapping); + TRACET("{}~{} {} zero?{}", + ctx.t, unaligned_offset, unaligned_len, read_pos, for_zero); /* traces the requested offset~len, the mapping being read and the for_zero flag */ assert(unaligned_len != 0); if (read_pos.is_zero_reserved()) { if (for_zero) { @@ -164,23 +167,42 @@ ObjectDataHandler::read_iertr::future> read_mapping( } std::ostream& operator<<( - std::ostream &out, const overwrite_range_t &overwrite_range) { - return out << "overwrite_range_t{" << std::hex - << "unaligned_len=0x"
<< overwrite_range.unaligned_len - << ", unaligned_begin=0x" << overwrite_range.unaligned_begin - << ", aligned_begin=0x" << overwrite_range.aligned_begin - << ", unaligned_end=0x" << overwrite_range.unaligned_end - << ", aligned_end=0x" << overwrite_range.aligned_end - << ", aligned_len=0x" << overwrite_range.aligned_len << std::dec + std::ostream &out, const clone_range_t &clonerange_info) +{ /* log formatter for clone_range_t: the first src mapping, then src_base, dest_base, offset and len printed in hex */ + return out << "clone_range_t{" + << "first_src_mapping=" << clonerange_info.first_src_mapping + << ", src_base=" << std::hex << clonerange_info.src_base << std::dec + << ", dest_base=" << std::hex << clonerange_info.dest_base << std::dec + << ", offset=" << std::hex << clonerange_info.offset << std::dec + << ", len=" << std::hex << clonerange_info.len << std::dec << "}"; } +std::ostream& operator<<( + std::ostream &out, const overwrite_range_t &overwrite_range) { + out << "overwrite_range_t{" << std::hex + << "unaligned_len=0x" << overwrite_range.unaligned_len + << ", unaligned_begin=0x" << overwrite_range.unaligned_begin + << ", aligned_begin=0x" << overwrite_range.aligned_begin + << ", unaligned_end=0x" << overwrite_range.unaligned_end + << ", aligned_end=0x" << overwrite_range.aligned_end + << ", aligned_len=0x" << overwrite_range.aligned_len << std::dec; + if (overwrite_range.clonerange_info) { /* appended only when the optional clone-range description is present */ + out << ", clonerange_info=" << *overwrite_range.clonerange_info; + } + return out << "}"; +} + std::ostream& operator<<(std::ostream &out, const data_t &data) { return out << "data_t{" << std::hex << "headbl=0x" << (data.headbl ? data.headbl->length() : 0) + << ", head_padding=0x" << (data.head_padding + ? data.head_padding->length() : 0) << ", bl=0x" << (data.bl ? data.bl->length() : 0) - << ", tailbl=0x" << (data.tailbl ? data.tailbl->length() : 0) << std::dec - << "}"; + << ", tailbl=0x" << (data.tailbl ? data.tailbl->length() : 0) + << ", tail_padding=0x" << (data.tail_padding + ?
data.tail_padding->length() : 0) + << std::dec << "}"; } ObjectDataHandler::write_ret @@ -299,6 +321,94 @@ ObjectDataHandler::write_ret do_zero( ); } +ObjectDataHandler::clone_ret do_clonerange( /* fills the dest range described by overwrite_range for a range clone: an optional merged head block, the block-aligned clone of the src mappings, then an optional merged tail block; write_pos is the insertion position in the dest lba tree */ + context_t ctx, + LBAMapping write_pos, + const overwrite_range_t &overwrite_range, + data_t &data) +{ + LOG_PREFIX(ObjectDataHandler::do_clonerange); + DEBUGT("{} {} write_pos={}", ctx.t, overwrite_range, data, write_pos); + ceph_assert(overwrite_range.clonerange_info.has_value()); + assert(write_pos.is_end() || + write_pos.get_key() >= overwrite_range.aligned_end); + if (data.head_padding.has_value()) { + // merge data.headbl and data.head_padding, and write the merged data + // into the first 4KB region of the aligned cloned range + assert(data.head_padding->length() < ctx.tm.get_block_size()); + data.merge_head(ctx.tm.get_block_size()); + auto extents = co_await ctx.tm.alloc_data_extents<ObjectDataBlock>( + ctx.t, + overwrite_range.aligned_begin, + ctx.tm.get_block_size(), + std::move(write_pos) + ).handle_error_interruptible( + crimson::ct_error::enospc::assert_failure{"unexpected enospc"}, + TransactionManager::get_pin_iertr::pass_further{} + ); + assert(extents.size() == 1); + auto &extent = extents.back(); + assert(overwrite_range.aligned_begin == extent->get_laddr()); + auto iter = data.headbl->cbegin(); + iter.copy(extent->get_length(), extent->get_bptr().c_str()); + auto mapping = co_await ctx.tm.get_pin(ctx.t, *extent + ).handle_error_interruptible( + crimson::ct_error::enoent::assert_failure{"unexpected enoent"}, + TransactionManager::get_pin_iertr::pass_further{} + ); + write_pos = co_await mapping.next().handle_error_interruptible( + crimson::ct_error::enospc::assert_failure{"unexpected enospc"}, + TransactionManager::get_pin_iertr::pass_further{} + ); + } + // clone the src mappings + auto src = overwrite_range.clonerange_info->first_src_mapping; + auto offset = overwrite_range.clonerange_info->offset; + auto len = overwrite_range.clonerange_info->len; + auto src_base =
overwrite_range.clonerange_info->src_base; + auto dest_base = overwrite_range.clonerange_info->dest_base; + auto aligned_off = p2roundup(offset, ctx.tm.get_block_size()); /* only the whole blocks inside offset~len are cloned by mapping; the unaligned edges are written from the merged head/tail buffers */ + auto aligned_len = + p2align(offset + len, ctx.tm.get_block_size()) - aligned_off; + if (!is_aligned(offset, ctx.tm.get_block_size()) && + src_base + aligned_off == src.get_end()) { + // the first mapping of the src range ends exactly at the rounded up + // begin, we need to push the first mapping one step further in this + // case + src = co_await src.next(); + } + auto cr_ret = co_await ctx.tm.clone_range( + ctx.t, src_base, dest_base, aligned_off, aligned_len, + std::move(write_pos), std::move(src), true); + if (cr_ret.shared_direct_mapping) { + ctx.onode.set_need_cow(ctx.t); /* clone_range reported that a direct src mapping was cloned, so ctx.onode (the src) is flagged via set_need_cow */ + } + write_pos = std::move(cr_ret.next_mapping); + if (data.tail_padding.has_value()) { + // merge data.tailbl and data.tail_padding, and write the merged data + // into the last 4KB region of the aligned cloned range + assert(data.tail_padding->length() < ctx.tm.get_block_size()); + data.merge_tail(ctx.tm.get_block_size()); + auto extents = co_await ctx.tm.alloc_data_extents<ObjectDataBlock>( + ctx.t, + (overwrite_range.aligned_end - ctx.tm.get_block_size() + ).checked_to_laddr(), + ctx.tm.get_block_size(), + std::move(write_pos) + ).handle_error_interruptible( + crimson::ct_error::enospc::assert_failure{"unexpected enospc"}, + TransactionManager::get_pin_iertr::pass_further{} + ); + assert(extents.size() == 1); + auto &extent = extents.back(); + assert((overwrite_range.aligned_end - ctx.tm.get_block_size() + ).checked_to_laddr() == extent->get_laddr()); + auto iter = data.tailbl->cbegin(); + iter.copy(extent->get_length(), extent->get_bptr().c_str()); + } +} + + ObjectDataHandler::write_ret do_write( context_t ctx, LBAMapping write_pos, @@ -602,6 +712,14 @@ ObjectDataHandler::merge_into_pending_edge( t_bl.substr_of(*data.bl, 0, unaligned_offset); } data.bl = std::move(t_bl); + } else if (edge == edge_t::LEFT &&
data.head_padding.has_value()) { + assert(!data.headbl); + bl = std::move(*data.head_padding); /* bytes read from the src head are merged into the pending left edge in place of the zero fill used by the final else branch */ + data.head_padding.reset(); + } else if (edge == edge_t::RIGHT && data.tail_padding.has_value()) { + assert(!data.tailbl); + bl = std::move(*data.tail_padding); /* same for the right edge, using the bytes read from the src tail */ + data.tail_padding.reset(); } else { bl.append_zero(unaligned_len); } @@ -875,7 +993,7 @@ ObjectDataHandler::handle_single_mapping_overwrite( overwrite_range.aligned_begin, overwrite_range.aligned_len, op_type); - auto do_overwrite = [ctx, &overwrite_range, &data](auto pos) { + auto do_overwrite = [ctx, &overwrite_range, &data, op_type](auto pos) { if (overwrite_range.is_empty()) { // the overwrite is completed in the previous steps, // this can happen if delta based overwrites are involved. @@ -904,7 +1022,11 @@ ObjectDataHandler::handle_single_mapping_overwrite( if (data.bl) { return do_write(ctx, std::move(pos), overwrite_range, data); } else { - return do_zero(ctx, std::move(pos), overwrite_range, data); + if (op_type == op_type_t::OP_CLONERANGE) { /* data.bl is unset here: a range clone goes to do_clonerange, every other op keeps the previous do_zero path */ + return do_clonerange(ctx, std::move(pos), overwrite_range, data); + } else { + return do_zero(ctx, std::move(pos), overwrite_range, data); + } } }; @@ -959,7 +1081,7 @@ ObjectDataHandler::handle_multi_mapping_overwrite( { return punch_multi_mapping_hole( ctx, overwrite_range, data, std::move(first_mapping), op_type - ).si_then([ctx, &overwrite_range, &data](auto pos) { + ).si_then([ctx, &overwrite_range, &data, op_type](auto pos) { if (overwrite_range.is_empty()) { // the overwrite is completed in the previous steps, // this can happen if delta based overwrites are involved.
@@ -988,7 +1110,11 @@ ObjectDataHandler::handle_multi_mapping_overwrite( if (data.bl) { return do_write(ctx, std::move(pos), overwrite_range, data); } else { - return do_zero(ctx, std::move(pos), overwrite_range, data); + if (op_type == op_type_t::OP_CLONERANGE) { + return do_clonerange(ctx, std::move(pos), overwrite_range, data); + } else { + return do_zero(ctx, std::move(pos), overwrite_range, data); + } } }); } @@ -1014,7 +1140,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite( unaligned_begin.get_aligned_laddr(ctx.tm.get_block_size()), unaligned_end.get_roundup_laddr(ctx.tm.get_block_size())); return seastar::do_with( - data_t{std::nullopt, std::move(bl), std::nullopt}, + data_t{std::move(bl)}, overwrite_range_t{ len, unaligned_begin, @@ -1034,6 +1160,160 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite( }); } +ObjectDataHandler::read_iertr::future<> +ObjectDataHandler::read_edge_for_clone_range( /* reads the unaligned edges of src range offset~len: the bytes from the range start up to the next block boundary go to data.head_padding, the bytes from the last block boundary to the range end go to data.tail_padding; a range confined to one block is read whole into data.bl; returns immediately when both ends are block aligned */ + context_t ctx, + object_data_t &object_data, + extent_len_t offset, + extent_len_t len, + data_t &data) +{ + LOG_PREFIX(ObjectDataHandler::read_edge_for_clone_range); + auto base = object_data.get_reserved_data_base(); + TRACET("object_data: {}~{} range: {}~{}", + ctx.t, base, object_data.get_reserved_data_len(), offset, len); + auto block_size = ctx.tm.get_block_size(); + if (is_aligned(offset, block_size) && + is_aligned(offset + len, block_size)) { + return read_iertr::now(); + } + std::vector< + TransactionManager::get_pin_iertr::future<>> read_paddings; + auto begin = base + offset; + auto end = base + offset + len; + auto aligned_offset = begin.get_aligned_laddr(block_size); + auto aligned_end = end.get_aligned_laddr(block_size); + if (aligned_offset == aligned_end) { + // the cloned range is strictly within the same block, read the + // data from src and put it in data.bl.
The clone range is effectively + // turned into an overwrite + return ctx.tm.get_containing_pin(ctx.t, aligned_offset + ).si_then([begin, ctx, len](auto read_pos) { + auto unaligned_offset = begin.template get_byte_distance< + extent_len_t>(read_pos.get_key()); + return read_mapping( + ctx, std::move(read_pos), + unaligned_offset, len, false); + }).si_then([&data](auto bl) { + data.bl = std::move(bl); + }).handle_error_interruptible( + read_iertr::pass_further{}, + crimson::ct_error::assert_all{"unexpected error"} + ); + } + if (!begin.is_aligned(block_size)) { + // read the data of data.head_padding + read_paddings.emplace_back( + ctx.tm.get_containing_pin(ctx.t, aligned_offset + ).si_then([begin, ctx, block_size](auto read_pos) { + auto unaligned_offset = begin.template get_byte_distance< + extent_len_t>(read_pos.get_key()); + auto rounded_begin = begin.get_roundup_laddr(block_size); + auto len = rounded_begin.template get_byte_distance< + extent_len_t>(begin); /* head length = distance from the unaligned begin up to the next block boundary */ + return read_mapping( + ctx, std::move(read_pos), + unaligned_offset, len, false); + }).si_then([&data](auto head_padding) { + data.head_padding = std::move(head_padding); + }) + ); + } + if (!end.is_aligned(block_size)) { + // read the data of data.tail_padding + read_paddings.emplace_back( + ctx.tm.get_containing_pin(ctx.t, aligned_end + ).si_then([aligned_end, end, ctx](auto read_pos) { + auto aligned_offset = aligned_end.template get_byte_distance< + extent_len_t>(read_pos.get_key()); + auto len = end.template get_byte_distance< + extent_len_t>(aligned_end); /* tail length = distance from the last block boundary to the unaligned end */ + return read_mapping( + ctx, std::move(read_pos), + aligned_offset, len, false); + }).si_then([&data](auto tail_padding) { + data.tail_padding = std::move(tail_padding); + }) + ); + } + // TODO: when_all_succeed should be utilized here, however, it doesn't + // actually work with interruptible errorated futures for now.
+ return trans_intr::parallel_for_each( + read_paddings, [](auto &fut) { return std::move(fut); } + ).handle_error_interruptible( + read_iertr::pass_further{}, + crimson::ct_error::assert_all{"unexpected error"} + ); +} + +ObjectDataHandler::clone_ret ObjectDataHandler::clone_range( + context_t ctx, + extent_len_t srcoff, + extent_len_t len, + extent_len_t destoff) +{ + LOG_PREFIX(ObjectDataHandler::clone_range); + ceph_assert(ctx.d_onode); + DEBUGT("{}->{}, {}~{}", + ctx.t, + ctx.onode.get_hobj(), + ctx.d_onode->get_hobj(), + srcoff, len); + // doesn't support inconsistent range clone yet + ceph_assert(srcoff == destoff); + return with_objects_data( + ctx, + [ctx, this, srcoff, len](auto &object_data, auto &d_object_data) + -> clone_ret { + ceph_assert(!object_data.is_null()); + data_t data; + auto dest_mapping = co_await prepare_data_reservation( + ctx, d_object_data, object_data.get_reserved_data_len()); + if (!dest_mapping) { + auto d_base = d_object_data.get_reserved_data_base(); + auto laddr = (d_base + srcoff).get_aligned_laddr( + ctx.tm.get_block_size()); + dest_mapping = co_await ctx.tm.get_containing_pin(ctx.t, laddr + ).handle_error_interruptible( + clone_iertr::pass_further{}, + crimson::ct_error::assert_all{"unexpected enoent"} + ); + } + // For unaligned range cloning, we need to read data.head_padding + // and data.tail_padding from the src range, and later write into + // the dest range with data.headbl and data.tailbl.
+ co_await read_edge_for_clone_range( + ctx, object_data, srcoff, len, data); + auto base = object_data.get_reserved_data_base(); + auto begin = base + srcoff; + auto block_size = ctx.tm.get_block_size(); + auto src_mapping = co_await ctx.tm.get_containing_pin( + ctx.t, begin.get_aligned_laddr(block_size) + ).handle_error_interruptible( + clone_iertr::pass_further{}, + crimson::ct_error::assert_all{"unexpected enoent"} + ); + auto d_base = d_object_data.get_reserved_data_base(); + auto unaligned_begin = d_base + srcoff; /* srcoff doubles as the dest offset: the caller-side assert guarantees srcoff == destoff */ + auto unaligned_end = unaligned_begin + len; + auto overwrite_range = overwrite_range_t{ + len, + unaligned_begin, + unaligned_end, + ctx.tm.get_block_size(), + clone_range_t{std::move(src_mapping), base, d_base, srcoff, len}}; + if (overwrite_range.is_range_in_mapping(*dest_mapping)) { + co_await handle_single_mapping_overwrite( + ctx, overwrite_range, data, std::move(*dest_mapping), + op_type_t::OP_CLONERANGE); + } else { + co_await handle_multi_mapping_overwrite( + ctx, overwrite_range, data, std::move(*dest_mapping), + op_type_t::OP_CLONERANGE); + } + }); +} + ObjectDataHandler::zero_ret ObjectDataHandler::zero( context_t ctx, objaddr_t offset, @@ -1389,7 +1669,7 @@ ObjectDataHandler::copy_on_write( { return with_object_data( ctx, - [ctx, this](auto &object_data) -> clone_iertr::future<> { + [ctx, this](auto &object_data) -> clone_ret { auto mapping = co_await ctx.tm.get_pin( ctx.t, object_data.get_reserved_data_base() ).handle_error_interruptible( diff --git a/src/crimson/os/seastore/object_data_handler.h b/src/crimson/os/seastore/object_data_handler.h index 204a72e11e96a..e78285c198f96 100644 --- a/src/crimson/os/seastore/object_data_handler.h +++ b/src/crimson/os/seastore/object_data_handler.h @@ -79,6 +79,15 @@ private: mutable std::optional ptr = std::nullopt; }; +struct clone_range_t { /* describes the src side of a range clone; carried inside overwrite_range_t::clonerange_info */ + LBAMapping first_src_mapping; /* src mapping containing the block-aligned start of the range */ + laddr_t src_base = L_ADDR_NULL; /* reserved data base of the src object */ + laddr_t dest_base = L_ADDR_NULL; /* reserved data base of the dest object */ + extent_len_t offset = 0; /* byte offset of the range, applied to both bases */ + extent_len_t len = 0; /* byte length of the range */ +};
+std::ostream& operator<<(std::ostream &out, const clone_range_t &); + struct overwrite_range_t { objaddr_t unaligned_len = 0; laddr_offset_t unaligned_begin; @@ -86,6 +95,7 @@ struct overwrite_range_t { laddr_t aligned_begin = L_ADDR_NULL; laddr_t aligned_end = L_ADDR_NULL; objaddr_t aligned_len = 0; + std::optional<clone_range_t> clonerange_info; /* set only by the clone-range constructor below; empty for plain overwrite/zero/trim ranges */ overwrite_range_t( objaddr_t unaligned_len, laddr_offset_t unaligned_begin, @@ -100,6 +110,22 @@ struct overwrite_range_t { aligned_end.template get_byte_distance< extent_len_t>(aligned_begin)) {} + overwrite_range_t( /* same alignment computation as the constructor above, additionally recording the src description of a range clone */ + objaddr_t unaligned_len, + laddr_offset_t unaligned_begin, + laddr_offset_t unaligned_end, + extent_len_t block_size, + clone_range_t &&clonerange_info) + : unaligned_len(unaligned_len), + unaligned_begin(unaligned_begin), + unaligned_end(unaligned_end), + aligned_begin(unaligned_begin.get_aligned_laddr(block_size)), + aligned_end(unaligned_end.get_roundup_laddr(block_size)), + aligned_len( + aligned_end.template get_byte_distance< + extent_len_t>(aligned_begin)), + clonerange_info(std::move(clonerange_info)) + {} bool is_empty() const { return unaligned_begin == unaligned_end; @@ -165,10 +191,38 @@ struct overwrite_range_t { }; std::ostream& operator<<(std::ostream &, const overwrite_range_t &); +// |<-headbl->|<-head_padding->|<--------bl------->|<-tail_padding->|<-tailbl->| +// |----------4KB--------------|-------------------|----------4KB--------------| +// |------------------overwrite_range--------------------| struct data_t { std::optional<bufferlist> headbl; + std::optional<bufferlist> head_padding; std::optional<bufferlist> bl; std::optional<bufferlist> tailbl; + std::optional<bufferlist> tail_padding; + data_t() = default; + data_t(std::optional<bufferlist> &&_bl) : bl(std::move(_bl)) {} + void merge_head(extent_len_t block_size) { /* folds head_padding into headbl; without a headbl the front is zero-filled so the merged buffer is block_size long */ + assert(head_padding.has_value()); + if (headbl) { + headbl->append(*head_padding); + } else { + headbl = bufferlist{}; + headbl->append_zero(block_size - head_padding->length()); + headbl->append(*head_padding); + } + head_padding.reset(); + } + void
merge_tail(extent_len_t block_size) { /* appends tailbl (or a zero fill up to block_size when there is none) to tail_padding and stores the merged buffer in tailbl */ + assert(tail_padding.has_value()); + if (tailbl) { + tail_padding->append(*tailbl); + } else { + tail_padding->append_zero(block_size - tail_padding->length()); + } + tailbl = std::move(tail_padding); + tail_padding.reset(); + } }; std::ostream& operator<<(std::ostream &out, const data_t &data); @@ -331,8 +385,19 @@ public: using clone_ret = clone_iertr::future<>; clone_ret clone(context_t ctx); + /// Clone the object so that the later modification + /// won't be seen by other objects sharing the same + /// direct lba mappings. clone_ret copy_on_write(context_t ctx); + /// Clone the specified range from the src object + /// to the dest object; srcoff must equal destoff for now (asserted). + clone_ret clone_range( + context_t ctx, + extent_len_t srcoff, + extent_len_t len, + extent_len_t destoff); + private: /// Updates region [_offset, _offset + bl.length) to bl write_ret overwrite( @@ -380,7 +445,8 @@ private: enum op_type_t : uint8_t { OVERWRITE, ZERO, - TRIM + TRIM, + OP_CLONERANGE }; enum edge_handle_policy_t : uint8_t { DELTA_BASED_PUNCH, @@ -570,6 +636,13 @@ private: data_t &data, LBAMapping edge_mapping); + read_iertr::future<> read_edge_for_clone_range( + context_t ctx, + object_data_t &object_data, + extent_len_t offset, + extent_len_t len, + data_t &data); + private: /** * max_object_size diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h index 4a0d982ebdf7c..3aa4247855971 100644 --- a/src/crimson/os/seastore/onode.h +++ b/src/crimson/os/seastore/onode.h @@ -93,6 +93,9 @@ public: virtual const onode_layout_t &get_layout() const = 0; virtual ~Onode() = default; + const hobject_t &get_hobj() const { + return hobj; + } bool is_head() const { return hobj.is_head(); } diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index a5963800b2a1b..cb2cb52bc02ea 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -1617,7 +1617,8 @@ SeaStore::Shard::_do_transaction_step( }
OnodeRef& d_onode = onodes[op->dest_oid]; if ((op->op == Transaction::OP_CLONE - || op->op == Transaction::OP_COLL_MOVE_RENAME) + || op->op == Transaction::OP_COLL_MOVE_RENAME + || op->op == Transaction::OP_CLONERANGE2) && !d_onode) { const ghobject_t& dest_oid = i.get_oid(op->dest_oid); DEBUGT("op {}, get_or_create dest oid={} ...", @@ -1791,6 +1792,22 @@ SeaStore::Shard::_do_transaction_step( *ctx.transaction, oid, i.get_oid(op->dest_oid)); return _clone(ctx, *onode, *onodes[op->dest_oid]); } + case Transaction::OP_CLONERANGE2: + { + assert(op->off <= std::numeric_limits<extent_len_t>::max()); /* the transaction fields are wider than extent_len_t; make sure the narrowing casts below cannot truncate */ + assert(op->len <= std::numeric_limits<extent_len_t>::max()); + assert(op->dest_off <= std::numeric_limits<extent_len_t>::max()); + extent_len_t srcoff = (extent_len_t)op->off; + extent_len_t len = (extent_len_t)op->len; + extent_len_t dstoff = (extent_len_t)op->dest_off; + return _clone_range( + ctx, + onode, + onodes[op->dest_oid], + srcoff, + len, + dstoff); + } case Transaction::OP_COLL_MOVE_RENAME: { DEBUGT("op COLL_MOVE_RENAME, oid={}, dest oid={} ...", @@ -2013,6 +2030,39 @@ SeaStore::Shard::_clone( }); } +SeaStore::Shard::tm_ret +SeaStore::Shard::_clone_range( /* clones length bytes at srcoff of src_onode into dst_onode at dstoff: grows the dest object size if needed, then delegates to ObjectDataHandler::clone_range */ + internal_context_t &ctx, + OnodeRef &src_onode, + OnodeRef &dst_onode, + extent_len_t srcoff, + extent_len_t length, + extent_len_t dstoff) +{ + LOG_PREFIX(SeaStore::_clone_range); + DEBUGT("src_onode={}, dst_onode={}, src {}~{}, dst {}", + *ctx.transaction, *src_onode, *dst_onode, srcoff, length, dstoff); + const auto &d_object_size = dst_onode->get_layout().size; + if (dstoff + length > d_object_size) { /* the dest object ends at dstoff + length after the clone, so its size is derived from dstoff, not srcoff */ + dst_onode->update_onode_size( + *ctx.transaction, + std::max(dstoff + length, d_object_size)); + } + return seastar::do_with( + ObjectDataHandler(max_object_size), + [=, this, &ctx](auto &objHandler) { + return objHandler.clone_range( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + *src_onode, + dst_onode.get()}, + srcoff, + length, + dstoff); + }); +} + SeaStore::Shard::tm_ret SeaStore::Shard::_zero(
internal_context_t &ctx, diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index 09e65efdb7ac8..28811f342b96c 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -352,6 +352,13 @@ public: internal_context_t &ctx, OnodeRef &onode, OnodeRef &d_onode); + tm_ret _clone_range( + internal_context_t &ctx, + OnodeRef &src_onode, + OnodeRef &dst_onode, + extent_len_t srcoff, + extent_len_t length, + extent_len_t dstoff); tm_ret _zero( internal_context_t &ctx, Onode &onode, diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 1080d3491822d..7116b2461b3e8 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -616,79 +616,69 @@ public: }); } + struct clone_range_ret_t { + bool shared_direct_mapping = false; /* true when at least one of the cloned src mappings was a direct (is_real()) mapping */ + LBAMapping next_mapping; /* dst-side position following the last cloned or reserved mapping */ + }; // clone the mappings in range base~len, returns true if there exists + // direct mappings that are cloned. The flag is now reported as clone_range_ret_t::shared_direct_mapping, together with the dst position that follows the cloned range.
using clone_iertr = base_iertr; - using clone_ret = clone_iertr::future<bool>; + using clone_ret = clone_iertr::future<clone_range_ret_t>; clone_ret clone_range( Transaction &t, - laddr_t base, + laddr_t src_base, + laddr_t dst_base, + extent_len_t offset, extent_len_t len, LBAMapping pos, LBAMapping mapping, bool updateref) { LOG_PREFIX(TransactionManager::clone_range); - SUBDEBUGT(seastore_tm, "object_data={}~{} mapping={} updateref={}", - t, base, len, mapping, updateref); - return seastar::do_with( - std::move(pos), - std::move(mapping), - (extent_len_t)0, - false, - [&t, this, updateref, base, len] - (auto &pos, auto &mapping, auto &offset, auto &ret) { - return trans_intr::repeat( - [&t, this, &pos, &mapping, &offset, updateref, base, len, &ret]() - -> clone_iertr::future<seastar::stop_iteration> { - if (offset >= len) { - return clone_iertr::make_ready_future< - seastar::stop_iteration>(seastar::stop_iteration::yes); - } - if (!mapping.is_indirect() && mapping.is_zero_reserved()) { - return reserve_region( - t, - std::move(pos), - (base + offset).checked_to_laddr(), - mapping.get_length() - ).si_then([base, &offset](auto r) { - assert((base + offset).checked_to_laddr() == r.get_key()); - offset += r.get_length(); - return r.next(); - }).si_then([&pos, &mapping](auto r) { - pos = std::move(r); - return mapping.next(); - }).si_then([&mapping](auto p) { - mapping = std::move(p); - return seastar::stop_iteration::no; - }).handle_error_interruptible( - clone_iertr::pass_further{}, - crimson::ct_error::assert_all{"unexpected error"} - ); - } - if (mapping.is_real()) { - ret = true; - } - auto len = mapping.get_length(); - return clone_pin( - t, std::move(pos), std::move(mapping), - (base + offset).checked_to_laddr(), - 0, len, updateref - ).si_then([&offset, &pos, &mapping](auto ret) { - offset += ret.cloned_mapping.get_length(); - return ret.cloned_mapping.next( - ).si_then([&pos, ret=std::move(ret)](auto p) mutable { - pos = std::move(p); - return ret.orig_mapping.next(); - }).si_then([&mapping](auto p) { -
mapping = std::move(p); - return seastar::stop_iteration::no; - }); - }); - }).si_then([&ret] { - return ret; - }); - }); + SUBDEBUGT(seastore_tm, + "src_base={}, dst_base={}, {}~{}, mapping={}, pos={}, updateref={}", + t, src_base, dst_base, offset, len, mapping, pos, updateref); + pos = co_await pos.refresh(); + mapping = co_await mapping.refresh(); + auto left = len; /* bytes still to be cloned */ + bool shared_direct = false; + auto cloned_to = offset; /* offset, relative to both bases, up to which the range has been cloned */ + while (left != 0) { + auto src_offset = src_base.template get_byte_distance< + extent_len_t>(mapping.get_key()); /* NOTE(review): every other get_byte_distance call in this patch has the larger address as the receiver; here the receiver is src_base - confirm the helper accepts this order */ + ceph_assert(cloned_to >= src_offset); + extent_len_t clone_offset = cloned_to - src_offset; /* where inside the current src mapping this step starts */ + extent_len_t clone_len = mapping.get_length() - clone_offset; + clone_len = std::min(clone_len, left); + left -= clone_len; + if (!mapping.is_indirect() && mapping.get_val().is_zero()) { + auto r = co_await reserve_region( + t, + std::move(pos), + (dst_base + cloned_to).checked_to_laddr(), + clone_len + ).handle_error_interruptible( + clone_iertr::pass_further{}, + crimson::ct_error::assert_all{"unexpected error"} + ); + assert((dst_base + cloned_to).checked_to_laddr() == r.get_key()); + cloned_to += clone_len; + pos = co_await r.next(); + mapping = co_await mapping.next(); + continue; + } + if (mapping.is_real()) { + shared_direct = true; + } + auto ret = co_await clone_pin( + t, std::move(pos), std::move(mapping), + (dst_base + cloned_to).checked_to_laddr(), + clone_offset, clone_len, updateref); + cloned_to += clone_len; + pos = co_await ret.cloned_mapping.next(); + mapping = co_await ret.orig_mapping.next(); + } + co_return clone_range_ret_t{shared_direct, std::move(pos)}; } /* alloc_extents -- 2.39.5