git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore: support OP_CLONERANGE2
author Xuehan Xu <xuxuehan@qianxin.com>
Thu, 14 Aug 2025 05:34:31 +0000 (13:34 +0800)
committer Xuehan Xu <xuxuehan@qianxin.com>
Sun, 28 Sep 2025 06:48:59 +0000 (14:48 +0800)
Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/crimson/os/seastore/lba_mapping.h
src/crimson/os/seastore/object_data_handler.cc
src/crimson/os/seastore/object_data_handler.h
src/crimson/os/seastore/onode.h
src/crimson/os/seastore/seastore.cc
src/crimson/os/seastore/seastore.h
src/crimson/os/seastore/transaction_manager.h

index b7c93d428760390c0c3ddccdd4b8add3e65351bb..13acb4b92174c32d731cdfc1497f35d16f9f3f3d 100644 (file)
@@ -148,6 +148,10 @@ public:
     return direct_cursor->get_laddr();
   }
 
+  laddr_t get_end() const {  // address right after this mapping: get_key() + get_length()
+    return (get_key() + get_length()).checked_to_laddr();  // the sum is converted back to a laddr_t via checked_to_laddr()
+  }
+
    // An lba pin may be indirect, see comments in lba/btree_lba_manager.h
   laddr_t get_intermediate_key() const {
     assert(is_indirect());
index e99a56bcbdb8dacfc1968004098fc3894c81089f..2db4b70104bb31706c13065a6e398c724932bfab 100644 (file)
@@ -130,6 +130,9 @@ ObjectDataHandler::read_iertr::future<std::optional<bufferlist>> read_mapping(
   extent_len_t unaligned_len,
   bool for_zero /* whether this is for zero overwrite*/)
 {
+  LOG_PREFIX(ObjectDataHandler::read_mapping);
+  TRACET("{}~{} {} zero?{}",
+    ctx.t, unaligned_offset, unaligned_len, read_pos, for_zero);
   assert(unaligned_len != 0);
   if (read_pos.is_zero_reserved()) {
     if (for_zero) {
@@ -164,23 +167,42 @@ ObjectDataHandler::read_iertr::future<std::optional<bufferlist>> read_mapping(
 }
 
 std::ostream& operator<<(
-  std::ostream &out, const overwrite_range_t &overwrite_range) {
-  return out << "overwrite_range_t{" << std::hex
-    << "unaligned_len=0x" << overwrite_range.unaligned_len
-    << ", unaligned_begin=0x" << overwrite_range.unaligned_begin
-    << ", aligned_begin=0x" << overwrite_range.aligned_begin
-    << ", unaligned_end=0x" << overwrite_range.unaligned_end
-    << ", aligned_end=0x" << overwrite_range.aligned_end
-    << ", aligned_len=0x" << overwrite_range.aligned_len << std::dec
+  std::ostream &out, const clone_range_t &clonerange_info)  // debug printer for clone_range_t
+{
+  return out << "clone_range_t{"  // emits all five fields as "name=value" pairs inside braces
+    << "first_src_mapping=" << clonerange_info.first_src_mapping
+    << ", src_base=" << std::hex << clonerange_info.src_base << std::dec  // std::hex is toggled per field and restored with std::dec
+    << ", dest_base=" << std::hex << clonerange_info.dest_base << std::dec
+    << ", offset=" << std::hex << clonerange_info.offset << std::dec  // note: no "0x" prefix is written here
+    << ", len=" << std::hex << clonerange_info.len << std::dec
     << "}";
 }
 
+std::ostream& operator<<(  // debug printer for overwrite_range_t
+  std::ostream &out, const overwrite_range_t &overwrite_range) {
+  out << "overwrite_range_t{" << std::hex  // hex mode stays on for all six range fields below
+      << "unaligned_len=0x" << overwrite_range.unaligned_len
+      << ", unaligned_begin=0x" << overwrite_range.unaligned_begin
+      << ", aligned_begin=0x" << overwrite_range.aligned_begin
+      << ", unaligned_end=0x" << overwrite_range.unaligned_end
+      << ", aligned_end=0x" << overwrite_range.aligned_end
+      << ", aligned_len=0x" << overwrite_range.aligned_len << std::dec;
+  if (overwrite_range.clonerange_info) {  // the clone-range details are appended only when the optional is set
+    out << ", clonerange_info=" << *overwrite_range.clonerange_info;
+  }
+  return out << "}";
+}
+
 std::ostream& operator<<(std::ostream &out, const data_t &data) {  // debug printer: shows only buffer lengths, never contents
   return out << "data_t{" << std::hex
     << "headbl=0x" << (data.headbl ? data.headbl->length() : 0)  // an unset optional is printed as length 0
+    << ", head_padding=0x" << (data.head_padding 
+                               ? data.head_padding->length() : 0)
     << ", bl=0x" << (data.bl ? data.bl->length() : 0)
-    << ", tailbl=0x" << (data.tailbl ? data.tailbl->length() : 0) << std::dec
-    << "}";
+    << ", tailbl=0x" << (data.tailbl ? data.tailbl->length() : 0)
+    << ", tail_padding=0x" << (data.tail_padding 
+                               ? data.tail_padding->length() : 0)
+    << std::dec << "}";  // restore decimal output before closing the brace
 }
 
 ObjectDataHandler::write_ret
@@ -299,6 +321,94 @@ ObjectDataHandler::write_ret do_zero(
   );
 }
 
+ObjectDataHandler::clone_ret do_clonerange(
+  context_t ctx,
+  LBAMapping write_pos,
+  const overwrite_range_t &overwrite_range,
+  data_t &data)
+{
+  LOG_PREFIX(ObjectDataHandler::do_clonerange);
+  DEBUGT("{} {} write_pos={}", ctx.t, overwrite_range, data, write_pos);
+  ceph_assert(overwrite_range.clonerange_info.has_value());
+  assert(write_pos.is_end() ||
+    write_pos.get_key() >= overwrite_range.aligned_end);
+  if (data.head_padding.has_value()) {
+    // merge data.headbl and data.head_padding, and write the merged data
+    // into the first 4KB region of the aligned cloned range
+    assert(data.head_padding->length() < ctx.tm.get_block_size());
+    data.merge_head(ctx.tm.get_block_size());
+    auto extents = co_await ctx.tm.alloc_data_extents<ObjectDataBlock>(
+      ctx.t,
+      overwrite_range.aligned_begin,
+      ctx.tm.get_block_size(),
+      std::move(write_pos)
+    ).handle_error_interruptible(
+      crimson::ct_error::enospc::assert_failure{"unexpected enospc"},
+      TransactionManager::get_pin_iertr::pass_further{}
+    );
+    assert(extents.size() == 1);
+    auto &extent = extents.back();
+    assert(overwrite_range.aligned_begin == extent->get_laddr());
+    auto iter = data.headbl->cbegin();
+    iter.copy(extent->get_length(), extent->get_bptr().c_str());
+    auto mapping = co_await ctx.tm.get_pin(ctx.t, *extent
+    ).handle_error_interruptible(
+      crimson::ct_error::enoent::assert_failure{"unexpected enospc"},
+      TransactionManager::get_pin_iertr::pass_further{}
+    );
+    write_pos = co_await mapping.next().handle_error_interruptible(
+      crimson::ct_error::enospc::assert_failure{"unexpected enospc"},
+      TransactionManager::get_pin_iertr::pass_further{}
+    );
+  }
+  // clone the src mappings
+  auto src = overwrite_range.clonerange_info->first_src_mapping;
+  auto offset = overwrite_range.clonerange_info->offset;
+  auto len = overwrite_range.clonerange_info->len;
+  auto src_base = overwrite_range.clonerange_info->src_base;
+  auto dest_base = overwrite_range.clonerange_info->dest_base;
+  auto aligned_off = p2roundup(offset, ctx.tm.get_block_size());
+  auto aligned_len =
+    p2align(offset + len, ctx.tm.get_block_size()) - aligned_off;
+  if (!is_aligned(offset, ctx.tm.get_block_size()) &&
+      src_base + aligned_off == src.get_end()) {
+    // the first mapping of the src range ends exactly at the rounded up
+    // begin, we need to push the first mapping one step further in this
+    // case
+    src = co_await src.next();
+  }
+  auto cr_ret = co_await ctx.tm.clone_range(
+    ctx.t, src_base, dest_base, aligned_off, aligned_len,
+    std::move(write_pos), std::move(src), true);
+  if (cr_ret.shared_direct_mapping) {
+    ctx.onode.set_need_cow(ctx.t);
+  }
+  write_pos = std::move(cr_ret.next_mapping);
+  if (data.tail_padding.has_value()) {
+    // merge data.tailbl and data.tail_padding, and write the merged data
+    // into the last 4KB region of the aligned cloned range
+    assert(data.tail_padding->length() < ctx.tm.get_block_size());
+    data.merge_tail(ctx.tm.get_block_size());
+    auto extents = co_await ctx.tm.alloc_data_extents<ObjectDataBlock>(
+      ctx.t,
+      (overwrite_range.aligned_end - ctx.tm.get_block_size()
+       ).checked_to_laddr(),
+      ctx.tm.get_block_size(),
+      std::move(write_pos)
+    ).handle_error_interruptible(
+      crimson::ct_error::enospc::assert_failure{"unexpected enospc"},
+      TransactionManager::get_pin_iertr::pass_further{}
+    );
+    assert(extents.size() == 1);
+    auto &extent = extents.back();
+    assert((overwrite_range.aligned_end - ctx.tm.get_block_size()
+      ).checked_to_laddr() == extent->get_laddr());
+    auto iter = data.tailbl->cbegin();
+    iter.copy(extent->get_length(), extent->get_bptr().c_str());
+  }
+}
+
+
 ObjectDataHandler::write_ret do_write(
   context_t ctx,
   LBAMapping write_pos,
@@ -602,6 +712,14 @@ ObjectDataHandler::merge_into_pending_edge(
       t_bl.substr_of(*data.bl, 0, unaligned_offset);
     }
     data.bl = std::move(t_bl);
+  } else if (edge == edge_t::LEFT && data.head_padding.has_value()) {
+    assert(!data.headbl);
+    bl = std::move(*data.head_padding);
+    data.head_padding.reset();
+  } else if (edge == edge_t::RIGHT && data.tail_padding.has_value()) {
+    assert(!data.tailbl);
+    bl = std::move(*data.tail_padding);
+    data.tail_padding.reset();
   } else {
     bl.append_zero(unaligned_len);
   }
@@ -875,7 +993,7 @@ ObjectDataHandler::handle_single_mapping_overwrite(
     overwrite_range.aligned_begin,
     overwrite_range.aligned_len,
     op_type);
-  auto do_overwrite = [ctx, &overwrite_range, &data](auto pos) {
+  auto do_overwrite = [ctx, &overwrite_range, &data, op_type](auto pos) {
     if (overwrite_range.is_empty()) {
       // the overwrite is completed in the previous steps,
       // this can happen if delta based overwrites are involved.
@@ -904,7 +1022,11 @@ ObjectDataHandler::handle_single_mapping_overwrite(
     if (data.bl) {
       return do_write(ctx, std::move(pos), overwrite_range, data);
     } else {
-      return do_zero(ctx, std::move(pos), overwrite_range, data);
+      if (op_type == op_type_t::OP_CLONERANGE) {
+       return do_clonerange(ctx, std::move(pos), overwrite_range, data);
+      } else {
+       return do_zero(ctx, std::move(pos), overwrite_range, data);
+      }
     }
   };
 
@@ -959,7 +1081,7 @@ ObjectDataHandler::handle_multi_mapping_overwrite(
 {
   return punch_multi_mapping_hole(
     ctx, overwrite_range, data, std::move(first_mapping), op_type
-  ).si_then([ctx, &overwrite_range, &data](auto pos) {
+  ).si_then([ctx, &overwrite_range, &data, op_type](auto pos) {
     if (overwrite_range.is_empty()) {
       // the overwrite is completed in the previous steps,
       // this can happen if delta based overwrites are involved.
@@ -988,7 +1110,11 @@ ObjectDataHandler::handle_multi_mapping_overwrite(
     if (data.bl) {
       return do_write(ctx, std::move(pos), overwrite_range, data);
     } else {
-      return do_zero(ctx, std::move(pos), overwrite_range, data);
+      if (op_type == op_type_t::OP_CLONERANGE) {
+       return do_clonerange(ctx, std::move(pos), overwrite_range, data);
+      } else {
+       return do_zero(ctx, std::move(pos), overwrite_range, data);
+      }
     }
   });
 }
@@ -1014,7 +1140,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
     unaligned_begin.get_aligned_laddr(ctx.tm.get_block_size()),
     unaligned_end.get_roundup_laddr(ctx.tm.get_block_size()));
   return seastar::do_with(
-    data_t{std::nullopt, std::move(bl), std::nullopt},
+    data_t{std::move(bl)},
     overwrite_range_t{
       len,
       unaligned_begin,
@@ -1034,6 +1160,160 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
   });
 }
 
+ObjectDataHandler::read_iertr::future<>  // Reads the block-unaligned edges of the source range offset~len into `data`.
+ObjectDataHandler::read_edge_for_clone_range(  // offset is relative to object_data.get_reserved_data_base().
+  context_t ctx,
+  object_data_t &object_data,
+  extent_len_t offset,
+  extent_len_t len,
+  data_t &data)  // out: receives bl, head_padding and/or tail_padding depending on alignment
+{
+  LOG_PREFIX(ObjectDataHandler::read_edge_for_clone_range);
+  auto base = object_data.get_reserved_data_base();
+  TRACET("object_data: {}~{} range: {}~{}",
+    ctx.t, base, object_data.get_reserved_data_len(), offset, len);
+  auto block_size = ctx.tm.get_block_size();
+  if (is_aligned(offset, block_size) &&
+      is_aligned(offset + len, block_size)) {
+    return read_iertr::now();  // both ends block aligned: no edge data to read
+  }
+  std::vector<
+    TransactionManager::get_pin_iertr::future<>> read_paddings;  // at most two pending reads: head and tail
+  auto begin = base + offset;
+  auto end = base + offset + len;
+  auto aligned_offset = begin.get_aligned_laddr(block_size);  // block containing `begin`
+  auto aligned_end = end.get_aligned_laddr(block_size);  // block containing `end` (aligned down, not rounded up)
+  if (aligned_offset == aligned_end) {
+    // the cloned range is strictly within the same block, read the
+    // data from src and put it in data.bl. The clone range is effectively
+    // turned into an overwrite
+    return ctx.tm.get_containing_pin(ctx.t, aligned_offset
+    ).si_then([begin, ctx, len](auto read_pos) {
+      auto unaligned_offset = begin.template get_byte_distance<
+       extent_len_t>(read_pos.get_key());  // byte offset of `begin` inside the containing mapping
+      return read_mapping(
+       ctx, std::move(read_pos),
+       unaligned_offset, len, false);
+    }).si_then([&data](auto bl) {
+      data.bl = std::move(bl);
+    }).handle_error_interruptible(
+      read_iertr::pass_further{},
+      crimson::ct_error::assert_all{"unexpected error"}
+    );
+  }
+  if (!begin.is_aligned(block_size)) {
+    // read the data of data.head_padding
+    read_paddings.emplace_back(
+      ctx.tm.get_containing_pin(ctx.t, aligned_offset
+      ).si_then([begin, ctx, block_size](auto read_pos) {
+       auto unaligned_offset = begin.template get_byte_distance<
+         extent_len_t>(read_pos.get_key());
+       auto rounded_begin = begin.get_roundup_laddr(block_size);
+       auto len = rounded_begin.template get_byte_distance<
+         extent_len_t>(begin);  // bytes from `begin` up to the next block boundary
+       return read_mapping(
+         ctx, std::move(read_pos),
+         unaligned_offset, len, false);
+      }).si_then([&data](auto head_padding) {
+       data.head_padding = std::move(head_padding);
+      })
+    );
+  }
+  if (!end.is_aligned(block_size)) {
+    // read the data of data.tail_padding
+    read_paddings.emplace_back(
+      ctx.tm.get_containing_pin(ctx.t, aligned_end
+      ).si_then([aligned_end, end, ctx](auto read_pos) {
+       auto aligned_offset = aligned_end.template get_byte_distance<
+           extent_len_t>(read_pos.get_key());  // offset of the last block's start inside the containing mapping
+       auto len = end.template get_byte_distance<
+         extent_len_t>(aligned_end);  // bytes from the last block boundary up to `end`
+       return read_mapping(
+         ctx, std::move(read_pos),
+         aligned_offset, len, false);
+      }).si_then([&data](auto tail_padding) {
+       data.tail_padding = std::move(tail_padding);
+      })
+    );
+  }
+  // TODO: when_all_succeed should be utilized here, however, it doesn't
+  //       actually work with interruptible errorated futures for now.
+  return trans_intr::parallel_for_each(
+    read_paddings, [](auto &fut) { return std::move(fut); }  // each element is already a started future; just hand it over
+  ).handle_error_interruptible(
+    read_iertr::pass_further{},
+    crimson::ct_error::assert_all{"unexpected error"}
+  );
+}
+
+ObjectDataHandler::clone_ret ObjectDataHandler::clone_range(  // Clones srcoff~len of ctx.onode's data into ctx.d_onode.
+  context_t ctx,  // ctx.d_onode must be set (ceph_assert below)
+  extent_len_t srcoff,
+  extent_len_t len,
+  extent_len_t destoff)  // must currently equal srcoff (ceph_assert below)
+{
+  LOG_PREFIX(ObjectDataHandler::clone_range);
+  ceph_assert(ctx.d_onode);
+  DEBUGT("{}->{}, {}~{}",
+    ctx.t,
+    ctx.onode.get_hobj(),
+    ctx.d_onode->get_hobj(),
+    srcoff, len);
+  // doesn't support inconsistent range clone yet
+  ceph_assert(srcoff == destoff);
+  return with_objects_data(
+    ctx,
+    [ctx, this, srcoff, len](auto &object_data, auto &d_object_data)  // object_data: source, d_object_data: destination
+    -> clone_ret {
+    ceph_assert(!object_data.is_null());
+    data_t data;
+    auto dest_mapping = co_await prepare_data_reservation(
+      ctx, d_object_data, object_data.get_reserved_data_len());  // dest reservation is sized from the source's reserved length
+    if (!dest_mapping) {
+      auto d_base = d_object_data.get_reserved_data_base();  // no mapping returned: look up the one containing the aligned dest begin
+      auto laddr = (d_base + srcoff).get_aligned_laddr(
+       ctx.tm.get_block_size());
+      dest_mapping = co_await ctx.tm.get_containing_pin(ctx.t, laddr
+      ).handle_error_interruptible(
+       clone_iertr::pass_further{},
+       crimson::ct_error::assert_all{"unexpected enoent"}
+      );
+    }
+    // For unaligned range cloning, we need to read data.head_padding
+    // and data.tail_padding from the src range, and later write into
+    // the dest range with data.headbl and data.tailbl.
+    co_await read_edge_for_clone_range(
+      ctx, object_data, srcoff, len, data);
+    auto base = object_data.get_reserved_data_base();
+    auto begin = base + srcoff;
+    auto block_size = ctx.tm.get_block_size();
+    auto src_mapping = co_await ctx.tm.get_containing_pin(  // first source mapping: the one containing the aligned-down begin
+      ctx.t, begin.get_aligned_laddr(block_size)
+    ).handle_error_interruptible(
+      clone_iertr::pass_further{},
+      crimson::ct_error::assert_all{"unexpected enoent"}
+    );
+    auto d_base = d_object_data.get_reserved_data_base();
+    auto unaligned_begin = d_base + srcoff;  // destination range uses srcoff because srcoff == destoff is asserted above
+    auto unaligned_end = unaligned_begin + len;
+    auto overwrite_range = overwrite_range_t{
+      len,
+      unaligned_begin,
+      unaligned_end,
+      ctx.tm.get_block_size(),
+      clone_range_t{std::move(src_mapping), base, d_base, srcoff, len}};
+    if (overwrite_range.is_range_in_mapping(*dest_mapping)) {  // pick the single- or multi-mapping overwrite path
+      co_await handle_single_mapping_overwrite(
+       ctx, overwrite_range, data, std::move(*dest_mapping),
+       op_type_t::OP_CLONERANGE);
+    } else {
+      co_await handle_multi_mapping_overwrite(
+       ctx, overwrite_range, data, std::move(*dest_mapping),
+       op_type_t::OP_CLONERANGE);
+    }
+  });
+}
+
 ObjectDataHandler::zero_ret ObjectDataHandler::zero(
   context_t ctx,
   objaddr_t offset,
@@ -1389,7 +1669,7 @@ ObjectDataHandler::copy_on_write(
 {
   return with_object_data(
     ctx,
-    [ctx, this](auto &object_data) -> clone_iertr::future<> {
+    [ctx, this](auto &object_data) -> clone_ret {
     auto mapping = co_await ctx.tm.get_pin(
       ctx.t, object_data.get_reserved_data_base()
     ).handle_error_interruptible(
index 204a72e11e96afdcc899a7fd8e2f8ffb13a506d4..e78285c198f96d9455e948e7e0a528f42cfb054f 100644 (file)
@@ -79,6 +79,15 @@ private:
   mutable std::optional<ceph::bufferptr> ptr = std::nullopt;
 };
 
+struct clone_range_t {  // source-side description of a range clone, carried inside overwrite_range_t
+  LBAMapping first_src_mapping;  // source mapping containing the block-aligned begin of the cloned range
+  laddr_t src_base = L_ADDR_NULL;  // reserved data base of the source object
+  laddr_t dest_base = L_ADDR_NULL;  // reserved data base of the destination object
+  extent_len_t offset = 0;  // byte offset of the cloned range, relative to the bases
+  extent_len_t len = 0;  // byte length of the cloned range
+};
+std::ostream& operator<<(std::ostream &out, const clone_range_t &);
+
 struct overwrite_range_t {
   objaddr_t unaligned_len = 0;
   laddr_offset_t unaligned_begin;
@@ -86,6 +95,7 @@ struct overwrite_range_t {
   laddr_t aligned_begin = L_ADDR_NULL;
   laddr_t aligned_end = L_ADDR_NULL;
   objaddr_t aligned_len = 0;
+  std::optional<clone_range_t> clonerange_info;
   overwrite_range_t(
     objaddr_t unaligned_len,
     laddr_offset_t unaligned_begin,
@@ -100,6 +110,22 @@ struct overwrite_range_t {
        aligned_end.template get_byte_distance<
          extent_len_t>(aligned_begin))
   {}
+  overwrite_range_t(  // clone-range flavour: same range setup as the other constructor, plus clonerange_info
+    objaddr_t unaligned_len,
+    laddr_offset_t unaligned_begin,
+    laddr_offset_t unaligned_end,
+    extent_len_t block_size,
+    clone_range_t &&clonerange_info)
+    : unaligned_len(unaligned_len),
+      unaligned_begin(unaligned_begin),
+      unaligned_end(unaligned_end),
+      aligned_begin(unaligned_begin.get_aligned_laddr(block_size)),  // begin is aligned down
+      aligned_end(unaligned_end.get_roundup_laddr(block_size)),  // end is rounded up
+      aligned_len(
+       aligned_end.template get_byte_distance<
+         extent_len_t>(aligned_begin)),  // aligned_end - aligned_begin, in bytes
+      clonerange_info(std::move(clonerange_info))
+  {}
 
   bool is_empty() const {
     return unaligned_begin == unaligned_end;
@@ -165,10 +191,38 @@ struct overwrite_range_t {
 };
 std::ostream& operator<<(std::ostream &, const overwrite_range_t &);
 
+// |<-headbl->|<-head_padding->|<--------bl------->|<-tail_padding->|<-tailbl->|
+// |----------4KB--------------|-------------------|----------4KB--------------|
+//            |------------------overwrite_range--------------------|
 struct data_t {
   std::optional<bufferlist> headbl;
+  std::optional<bufferlist> head_padding;  // source bytes between the unaligned begin and the next block boundary (clone range)
   std::optional<bufferlist> bl;
   std::optional<bufferlist> tailbl;
+  std::optional<bufferlist> tail_padding;  // source bytes between the last block boundary and the unaligned end (clone range)
+  data_t() = default;
+  data_t(std::optional<bufferlist> &&_bl) : bl(std::move(_bl)) {}  // only bl is set; every other buffer stays empty
+  void merge_head(extent_len_t block_size) {  // folds head_padding into headbl; head_padding must be set
+    assert(head_padding.has_value());
+    if (headbl) {
+      headbl->append(*head_padding);
+    } else {
+      headbl = bufferlist{};
+      headbl->append_zero(block_size - head_padding->length());  // no existing head data: zero-fill up to block_size in front
+      headbl->append(*head_padding);
+    }
+    head_padding.reset();
+  }
+  void merge_tail(extent_len_t block_size) {  // folds tail_padding and tailbl into tailbl; tail_padding must be set
+    assert(tail_padding.has_value());
+    if (tailbl) {
+      tail_padding->append(*tailbl);  // padding comes first, existing tail data after it
+    } else {
+      tail_padding->append_zero(block_size - tail_padding->length());  // no existing tail data: zero-fill up to block_size behind
+    }
+    tailbl = std::move(tail_padding);
+    tail_padding.reset();  // leave tail_padding disengaged after the move
+  }
 };
 std::ostream& operator<<(std::ostream &out, const data_t &data);
 
@@ -331,8 +385,19 @@ public:
   using clone_ret = clone_iertr::future<>;
   clone_ret clone(context_t ctx);
 
+  /// Clone the object so that the later modification
+  /// won't be seen by other objects sharing the same
+  /// direct lba mappings.
   clone_ret copy_on_write(context_t ctx);
 
+  /// Clone the specified range from the src object
+  /// to the dest object
+  clone_ret clone_range(
+    context_t ctx,
+    extent_len_t srcoff,
+    extent_len_t len,
+    extent_len_t destoff);
+
 private:
   /// Updates region [_offset, _offset + bl.length) to bl
   write_ret overwrite(
@@ -380,7 +445,8 @@ private:
   enum op_type_t : uint8_t {
     OVERWRITE,
     ZERO,
-    TRIM
+    TRIM,
+    OP_CLONERANGE
   };
   enum edge_handle_policy_t : uint8_t {
     DELTA_BASED_PUNCH,
@@ -570,6 +636,13 @@ private:
     data_t &data,
     LBAMapping edge_mapping);
 
+  read_iertr::future<> read_edge_for_clone_range(
+    context_t ctx,
+    object_data_t &object_data,
+    extent_len_t offset,
+    extent_len_t len,
+    data_t &data);
+
 private:
   /**
    * max_object_size
index 4a0d982ebdf7cfb6161e25cb69dbbf0b286289cb..3aa4247855971ea7529250fb8c3999dca442c206 100644 (file)
@@ -93,6 +93,9 @@ public:
   virtual const onode_layout_t &get_layout() const = 0;
   virtual ~Onode() = default;
 
+  const hobject_t &get_hobj() const {  // read-only access to this onode's hobject_t
+    return hobj;
+  }
   bool is_head() const {
     return hobj.is_head();
   }
index a5963800b2a1b704a92eae79df2d27a2e0c6092f..cb2cb52bc02ea11b363aa42531bdb884c8edb13d 100644 (file)
@@ -1617,7 +1617,8 @@ SeaStore::Shard::_do_transaction_step(
     }
     OnodeRef& d_onode = onodes[op->dest_oid];
     if ((op->op == Transaction::OP_CLONE
-         || op->op == Transaction::OP_COLL_MOVE_RENAME)
+         || op->op == Transaction::OP_COLL_MOVE_RENAME
+         || op->op == Transaction::OP_CLONERANGE2)
        && !d_onode) {
       const ghobject_t& dest_oid = i.get_oid(op->dest_oid);
       DEBUGT("op {}, get_or_create dest oid={} ...",
@@ -1791,6 +1792,22 @@ SeaStore::Shard::_do_transaction_step(
                *ctx.transaction, oid, i.get_oid(op->dest_oid));
        return _clone(ctx, *onode, *onodes[op->dest_oid]);
       }
+      case Transaction::OP_CLONERANGE2:
+      {
+        assert(op->off <= std::numeric_limits<extent_len_t>::max());
+        assert(op->len <= std::numeric_limits<extent_len_t>::max());
+        assert(op->dest_off <= std::numeric_limits<extent_len_t>::max());
+        extent_len_t srcoff = (extent_len_t)op->off;
+        extent_len_t len = (extent_len_t)op->len;
+        extent_len_t dstoff = (extent_len_t)op->dest_off;
+        return _clone_range(
+          ctx,
+          onode,
+          onodes[op->dest_oid],
+          srcoff,
+          len,
+          dstoff);
+      }
       case Transaction::OP_COLL_MOVE_RENAME:
       {
         DEBUGT("op COLL_MOVE_RENAME, oid={}, dest oid={} ...",
@@ -2013,6 +2030,39 @@ SeaStore::Shard::_clone(
   });
 }
 
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_clone_range(
+  internal_context_t &ctx,
+  OnodeRef &src_onode,
+  OnodeRef &dst_onode,
+  extent_len_t srcoff,
+  extent_len_t length,
+  extent_len_t dstoff)
+{
+  LOG_PREFIX(SeaStore::_clone_range);
+  DEBUGT("src_onode={}, dst_onode={}, src {}~{}, dst {}",
+    *ctx.transaction, *src_onode, *dst_onode, srcoff, length, dstoff);
+  const auto &d_object_size = dst_onode->get_layout().size;
+  if (srcoff + length > d_object_size) {
+    dst_onode->update_onode_size(
+      *ctx.transaction,
+      std::max<uint64_t>(srcoff + length, d_object_size));
+  }
+  return seastar::do_with(
+    ObjectDataHandler(max_object_size),
+    [=, this, &ctx](auto &objHandler) {
+    return objHandler.clone_range(
+      ObjectDataHandler::context_t{
+       *transaction_manager,
+       *ctx.transaction,
+       *src_onode,
+       dst_onode.get()},
+      srcoff,
+      length,
+      dstoff);
+  });
+}
+
 SeaStore::Shard::tm_ret
 SeaStore::Shard::_zero(
   internal_context_t &ctx,
index 09e65efdb7ac890bddadafd1c72fcc23855fb335..28811f342b96c9828b798d61de7011599feafe31 100644 (file)
@@ -352,6 +352,13 @@ public:
       internal_context_t &ctx,
       OnodeRef &onode,
       OnodeRef &d_onode);
+    tm_ret _clone_range(
+      internal_context_t &ctx,
+      OnodeRef &src_onode,
+      OnodeRef &dst_onode,
+      extent_len_t srcoff,
+      extent_len_t length,
+      extent_len_t dstoff);
     tm_ret _zero(
       internal_context_t &ctx,
       Onode &onode,
index 1080d3491822de8b7d6452e8458e21249914b506..7116b2461b3e88e4e131f2414fac8a3a14e67fa0 100644 (file)
@@ -616,79 +616,69 @@ public:
     });
   }
 
+  struct clone_range_ret_t {  // result of clone_range()
+    bool shared_direct_mapping = false;  // set when at least one cloned source mapping satisfied is_real()
+    LBAMapping next_mapping;  // the `pos` cursor as it stands after the last cloned/reserved mapping
+  };
   // clone the mappings in range base~len, returns true if there exists
   // direct mappings that are cloned.
   using clone_iertr = base_iertr;
-  using clone_ret = clone_iertr::future<bool>;
+  using clone_ret = clone_iertr::future<clone_range_ret_t>;
   clone_ret clone_range(
     Transaction &t,
-    laddr_t base,
+    laddr_t src_base,
+    laddr_t dst_base,
+    extent_len_t offset,
     extent_len_t len,
     LBAMapping pos,
     LBAMapping mapping,
     bool updateref)
   {
     LOG_PREFIX(TransactionManager::clone_range);
-    SUBDEBUGT(seastore_tm, "object_data={}~{} mapping={} updateref={}",
-      t, base, len, mapping, updateref);
-    return seastar::do_with(
-      std::move(pos),
-      std::move(mapping),
-      (extent_len_t)0,
-      false,
-      [&t, this, updateref, base, len]
-      (auto &pos, auto &mapping, auto &offset, auto &ret) {
-      return trans_intr::repeat(
-       [&t, this, &pos, &mapping, &offset, updateref, base, len, &ret]()
-       -> clone_iertr::future<seastar::stop_iteration> {
-       if (offset >= len) {
-         return clone_iertr::make_ready_future<
-           seastar::stop_iteration>(seastar::stop_iteration::yes);
-       }
-       if (!mapping.is_indirect() && mapping.is_zero_reserved()) {
-         return reserve_region(
-           t,
-           std::move(pos),
-           (base + offset).checked_to_laddr(),
-           mapping.get_length()
-         ).si_then([base, &offset](auto r) {
-           assert((base + offset).checked_to_laddr() == r.get_key());
-           offset += r.get_length();
-           return r.next();
-         }).si_then([&pos, &mapping](auto r) {
-           pos = std::move(r);
-           return mapping.next();
-         }).si_then([&mapping](auto p) {
-           mapping = std::move(p);
-           return seastar::stop_iteration::no;
-         }).handle_error_interruptible(
-           clone_iertr::pass_further{},
-           crimson::ct_error::assert_all{"unexpected error"}
-         );
-       }
-       if (mapping.is_real()) {
-         ret = true;
-       }
-       auto len = mapping.get_length();
-       return clone_pin(
-         t, std::move(pos), std::move(mapping),
-         (base + offset).checked_to_laddr(),
-         0, len, updateref
-       ).si_then([&offset, &pos, &mapping](auto ret) {
-         offset += ret.cloned_mapping.get_length();
-         return ret.cloned_mapping.next(
-         ).si_then([&pos, ret=std::move(ret)](auto p) mutable {
-           pos = std::move(p);
-           return ret.orig_mapping.next();
-         }).si_then([&mapping](auto p) {
-           mapping = std::move(p);
-           return seastar::stop_iteration::no;
-         });
-       });
-      }).si_then([&ret] {
-       return ret;
-      });
-    });
+    SUBDEBUGT(seastore_tm,
+      "src_base={}, dst_base={}, {}~{}, mapping={}, pos={}, updateref={}",
+      t, src_base, dst_base, offset, len, mapping, pos, updateref);
+    pos = co_await pos.refresh();
+    mapping = co_await mapping.refresh();
+    auto left = len;
+    bool shared_direct = false;
+    auto cloned_to = offset;
+    while (left != 0) {
+      auto src_offset = src_base.template get_byte_distance<
+       extent_len_t>(mapping.get_key());
+      ceph_assert(cloned_to >= src_offset);
+      extent_len_t clone_offset = cloned_to - src_offset;
+      extent_len_t clone_len = mapping.get_length() - clone_offset;
+      clone_len = std::min(clone_len, left);
+      left -= clone_len;
+      if (!mapping.is_indirect() && mapping.get_val().is_zero()) {
+       auto r = co_await reserve_region(
+         t,
+         std::move(pos),
+         (dst_base + cloned_to).checked_to_laddr(),
+         clone_len
+       ).handle_error_interruptible(
+         clone_iertr::pass_further{},
+         crimson::ct_error::assert_all{"unexpected error"}
+       );
+       assert((dst_base + cloned_to).checked_to_laddr() == r.get_key());
+       cloned_to += clone_len;
+       pos = co_await r.next();
+       mapping = co_await mapping.next();
+       continue;
+      }
+      if (mapping.is_real()) {
+       shared_direct = true;
+      }
+      auto ret = co_await clone_pin(
+       t, std::move(pos), std::move(mapping),
+       (dst_base + cloned_to).checked_to_laddr(),
+       clone_offset, clone_len, updateref);
+      cloned_to += clone_len;
+      pos = co_await ret.cloned_mapping.next();
+      mapping = co_await ret.orig_mapping.next();
+    }
+    co_return clone_range_ret_t{shared_direct, std::move(pos)};
   }
 
   /* alloc_extents