git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore: implement OP_CLONE
author Xuehan Xu <xxhdx1985126@gmail.com>
Wed, 29 Mar 2023 10:07:20 +0000 (18:07 +0800)
committer Matan Breizman <mbreizma@redhat.com>
Thu, 19 Oct 2023 07:13:17 +0000 (07:13 +0000)
Signed-off-by: Xuehan Xu <xxhdx1985126@gmail.com>
(cherry picked from commit 9f303cde7b9d7aca14a2024948079b4280168a2d)

src/crimson/os/seastore/btree/btree_range_pin.h
src/crimson/os/seastore/lba_manager.h
src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
src/crimson/os/seastore/object_data_handler.cc
src/crimson/os/seastore/object_data_handler.h
src/crimson/os/seastore/seastore.cc
src/crimson/os/seastore/seastore.h
src/crimson/os/seastore/seastore_types.h
src/crimson/os/seastore/transaction_manager.h
src/test/crimson/seastore/test_btree_lba_manager.cc

index 684d81ce9917f58ed46517e83640484f80a73ef9..c753a1c3b035c34b8e232277953f8404f1f5be6b 100644 (file)
@@ -132,10 +132,6 @@ class BtreeNodeMapping : public PhysicalNodeMapping<key_t, val_t> {
   fixed_kv_node_meta_t<key_t> range;
   uint16_t pos = std::numeric_limits<uint16_t>::max();
 
-  pladdr_t _get_val() const final {
-    return value;
-  }
-
 public:
   using val_type = val_t;
   BtreeNodeMapping(op_context_t<key_t> ctx) : ctx(ctx) {}
@@ -186,7 +182,12 @@ public:
   }
 
   val_t get_val() const final { // returns the stored value converted to this mapping's val_t
-    return value;
+    if constexpr (std::is_same_v<val_t, paddr_t>) { // compile-time dispatch on val_t
+      return value.get_paddr();
+    } else {
+      static_assert(std::is_same_v<val_t, laddr_t>); // only paddr_t and laddr_t are accepted
+      return value.get_laddr();
+    }
   }
 
   key_t get_key() const final {
index f36a788344ac04a2f8b7280b2f5e5222f5de70b2..6275d4dbbf5cac732c78961e4b45d532cf206ce8 100644 (file)
@@ -81,7 +81,19 @@ public:
     laddr_t hint,
     extent_len_t len,
     paddr_t addr,
-    LogicalCachedExtent *nextent) = 0;
+    LogicalCachedExtent &nextent) = 0;
+
+  virtual alloc_extent_ret clone_extent(
+    Transaction &t,
+    laddr_t hint,
+    extent_len_t len,
+    laddr_t intermediate_key,
+    paddr_t actual_addr) = 0;
+
+  virtual alloc_extent_ret reserve_region(
+    Transaction &t,
+    laddr_t hint,
+    extent_len_t len) = 0;
 
   struct ref_update_result_t {
     unsigned refcount = 0;
index f109b8a99822f2e39d99ca2958483fd02cd2e1cb..c1bfc25dd06632625df0d8a13c373e75aee3faed 100644 (file)
@@ -205,11 +205,12 @@ BtreeLBAManager::get_mapping(
 }
 
 BtreeLBAManager::alloc_extent_ret
-BtreeLBAManager::alloc_extent(
+BtreeLBAManager::_alloc_extent(
   Transaction &t,
   laddr_t hint,
   extent_len_t len,
-  paddr_t addr,
+  pladdr_t addr,
+  paddr_t actual_addr,
   LogicalCachedExtent* nextent)
 {
   struct state_t {
@@ -221,7 +222,7 @@ BtreeLBAManager::alloc_extent(
     state_t(laddr_t hint) : last_end(hint) {}
   };
 
-  LOG_PREFIX(BtreeLBAManager::alloc_extent);
+  LOG_PREFIX(BtreeLBAManager::_alloc_extent);
   TRACET("{}~{}, hint={}", t, addr, len, hint);
   auto c = get_context(t);
   ++stats.num_alloc_extents;
@@ -272,21 +273,30 @@ BtreeLBAManager::alloc_extent(
            c,
            *state.insert_iter,
            state.last_end,
-           lba_map_val_t{len, pladdr_t(addr), 1, 0}
+           lba_map_val_t{len, pladdr_t(addr), 1, 0},
            nextent
          ).si_then([&state, FNAME, c, addr, len, hint, nextent](auto &&p) {
            auto [iter, inserted] = std::move(p);
            TRACET("{}~{}, hint={}, inserted at {}",
                   c.trans, addr, len, hint, state.last_end);
            if (nextent) {
+             ceph_assert(addr.is_paddr());
              nextent->set_laddr(iter.get_key());
            }
            ceph_assert(inserted);
            state.ret = iter;
          });
        });
-    }).si_then([c](auto &&state) {
-      return state.ret->get_pin(c);
+    }).si_then([c, actual_addr, addr](auto &&state) {
+      auto ret_pin = state.ret->get_pin(c);
+      if (actual_addr != P_ADDR_NULL) {
+       ceph_assert(addr.is_laddr());
+       ret_pin->set_paddr(actual_addr);
+      } else {
+       ceph_assert(addr.is_paddr());
+      }
+      return alloc_extent_iertr::make_ready_future<LBAMappingRef>(
+       std::move(ret_pin));
     });
 }
 
index 7c5d42cec792cd70235170c6643c0d38237c2eb8..396b024ec62fdddd747abaa325c1c50d7dfed5af 100644 (file)
@@ -89,12 +89,34 @@ public:
     Transaction &t,
     laddr_t offset) final;
 
+  alloc_extent_ret reserve_region( // maps [hint-derived laddr, +len) without attaching an extent
+    Transaction &t,
+    laddr_t hint,
+    extent_len_t len)
+  {
+    return _alloc_extent(t, hint, len, P_ADDR_ZERO, P_ADDR_NULL, nullptr); // value is P_ADDR_ZERO, no actual_addr, no extent
+  }
+
+  alloc_extent_ret clone_extent( // inserts a mapping whose stored value is the laddr intermediate_key
+    Transaction &t,
+    laddr_t hint,
+    extent_len_t len,
+    laddr_t intermediate_key,
+    paddr_t actual_addr)
+  {
+    return _alloc_extent(t, hint, len, intermediate_key, actual_addr, nullptr); // intermediate_key converts implicitly to pladdr_t; no extent is attached
+  }
+
   alloc_extent_ret alloc_extent(
     Transaction &t,
     laddr_t hint,
     extent_len_t len,
     paddr_t addr,
-    LogicalCachedExtent*) final;
+    LogicalCachedExtent &ext) final
+  {
+    assert(ext);
+    return _alloc_extent(t, hint, len, addr, P_ADDR_NULL, &ext);
+  }
 
   ref_ret decref_extent(
     Transaction &t,
@@ -187,6 +209,14 @@ private:
     laddr_t addr,
     update_func_t &&f,
     LogicalCachedExtent*);
+
+  alloc_extent_ret _alloc_extent(
+    Transaction &t,
+    laddr_t hint,
+    extent_len_t len,
+    pladdr_t addr,
+    paddr_t actual_addr,
+    LogicalCachedExtent*);
 };
 using BtreeLBAManagerRef = std::unique_ptr<BtreeLBAManager>;
 
index c1cd5f32267437fecafb22f817a7d967e0e720e2..9ac406f3eb7529a35ccbd0d1dbea0036735200c6 100644 (file)
@@ -883,6 +883,31 @@ auto with_object_data(
     });
 }
 
+template <typename F>
+auto with_objects_data( // runs f on copies of the source and destination object_data, then writes back changes
+  ObjectDataHandler::context_t ctx,
+  F &&f)
+{
+  ceph_assert(ctx.d_onode); // a destination onode is mandatory for this helper
+  return seastar::do_with(
+    ctx.onode.get_layout().object_data.get(),
+    ctx.d_onode->get_layout().object_data.get(),
+    std::forward<F>(f),
+    [ctx](auto &object_data, auto &d_object_data, auto &f) {
+      return std::invoke(f, object_data, d_object_data // f receives both working copies by reference
+      ).si_then([ctx, &object_data, &d_object_data] {
+       if (object_data.must_update()) { // persist the source copy only if it reports a pending update
+         ctx.onode.get_mutable_layout(ctx.t).object_data.update(object_data);
+       }
+       if (d_object_data.must_update()) { // same for the destination copy
+         ctx.d_onode->get_mutable_layout(
+           ctx.t).object_data.update(d_object_data);
+       }
+       return seastar::now();
+      });
+    });
+}
+
 ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation(
   context_t ctx,
   object_data_t &object_data,
@@ -1456,4 +1481,126 @@ ObjectDataHandler::clear_ret ObjectDataHandler::clear(
     });
 }
 
+ObjectDataHandler::clone_ret ObjectDataHandler::clone_extents( // re-populates object_data's reserved range from pins
+  context_t ctx,
+  object_data_t &object_data,
+  lba_pin_list_t &pins, // source mappings; their keys are interpreted relative to data_base
+  laddr_t data_base)
+{
+  LOG_PREFIX(ObjectDataHandler::clone_extents);
+  TRACET(" object_data: {}~{}, data_base: {}",
+    ctx.t,
+    object_data.get_reserved_data_base(),
+    object_data.get_reserved_data_len(),
+    data_base);
+  return ctx.tm.dec_ref( // dec_ref the mapping keyed at the reserved base before filling the range again
+    ctx.t,
+    object_data.get_reserved_data_base()
+  ).si_then(
+    [&pins, &object_data, ctx, data_base](auto) mutable {
+      return seastar::do_with(
+       (extent_len_t)0, // last_pos: end offset of the previously handled pin, starts at 0
+       [&object_data, ctx, data_base, &pins](auto &last_pos) {
+       return trans_intr::do_for_each(
+         pins,
+         [&last_pos, &object_data, ctx, data_base](auto &pin) {
+         auto offset = pin->get_key() - data_base;
+         ceph_assert(offset == last_pos); // pins are required to be contiguous, starting at offset 0
+         auto fut = TransactionManager::alloc_extent_iertr
+           ::make_ready_future<LBAMappingRef>();
+         auto addr = object_data.get_reserved_data_base() + offset; // same offset inside the target range
+         if (pin->get_val().is_zero()) { // zero-valued mapping: reserve the range instead of cloning it
+           fut = ctx.tm.reserve_region(ctx.t, addr, pin->get_length());
+         } else {
+           fut = ctx.tm.clone_pin(ctx.t, addr, *pin);
+         }
+         return fut.si_then(
+           [&pin, &last_pos, offset](auto) {
+           last_pos = offset + pin->get_length(); // advance past the pin just handled
+           return seastar::now();
+         }).handle_error_interruptible(
+           crimson::ct_error::input_output_error::pass_further(),
+           crimson::ct_error::assert_all("not possible")
+         );
+       }).si_then([&last_pos, &object_data, ctx] {
+         if (last_pos != object_data.get_reserved_data_len()) { // pins did not cover the whole reservation
+           return ctx.tm.reserve_region( // reserve the remaining tail of the range
+             ctx.t,
+             object_data.get_reserved_data_base() + last_pos,
+             object_data.get_reserved_data_len() - last_pos
+           ).si_then([](auto) {
+             return seastar::now();
+           });
+         }
+         return TransactionManager::reserve_extent_iertr::now();
+       });
+      });
+    },
+    ObjectDataHandler::write_iertr::pass_further{},
+    crimson::ct_error::assert_all{
+      "object_data_handler::clone invalid error"
+    }
+  );
+}
+
+ObjectDataHandler::clone_ret ObjectDataHandler::clone(
+  context_t ctx)
+{
+  // the whole clone procedure can be seperated into the following steps:
+  //   1. let clone onode(d_object_data) take the head onode's
+  //      object data base;
+  //   2. reserve a new region in lba tree for the head onode;
+  //   3. clone all extents of the clone onode, see transaction_manager.h
+  //      for the details of clone_pin;
+  //   4. reserve the space between the head onode's size and its reservation
+  //      length.
+  return with_objects_data(
+    ctx,
+    [ctx, this](auto &object_data, auto &d_object_data) {
+    ceph_assert(d_object_data.is_null());
+    if (object_data.is_null()) {
+      return clone_iertr::now();
+    }
+    return prepare_data_reservation(
+      ctx,
+      d_object_data,
+      object_data.get_reserved_data_len()
+    ).si_then([&object_data, &d_object_data, ctx, this] {
+      assert(!object_data.is_null());
+      auto base = object_data.get_reserved_data_base();
+      auto len = object_data.get_reserved_data_len();
+      object_data.clear();
+      LOG_PREFIX(ObjectDataHandler::clone);
+      DEBUGT("cloned obj reserve_data_base: {}, len {}",
+       ctx.t,
+       d_object_data.get_reserved_data_base(),
+       d_object_data.get_reserved_data_len());
+      return prepare_data_reservation(
+       ctx,
+       object_data,
+       d_object_data.get_reserved_data_len()
+      ).si_then([&d_object_data, ctx, &object_data, base, len, this] {
+       LOG_PREFIX("ObjectDataHandler::clone");
+       DEBUGT("head obj reserve_data_base: {}, len {}",
+         ctx.t,
+         object_data.get_reserved_data_base(),
+         object_data.get_reserved_data_len());
+       return ctx.tm.get_pins(ctx.t, base, len
+       ).si_then([ctx, &object_data, &d_object_data, base, this](auto pins) {
+         return seastar::do_with(
+           std::move(pins),
+           [ctx, &object_data, &d_object_data, base, this](auto &pins) {
+           return clone_extents(ctx, object_data, pins, base
+           ).si_then([ctx, &d_object_data, base, &pins, this] {
+             return clone_extents(ctx, d_object_data, pins, base);
+           }).si_then([&pins, ctx] {
+             return do_removals(ctx, pins);
+           });
+         });
+       });
+      });
+    });
+  });
+}
+
 } // namespace crimson::os::seastore
index ca648f12c2e43048d7a80c34ed94be1db5bb288b..b5f432d5ac7760140895059a157740e0e68ae4e7 100644 (file)
@@ -58,6 +58,7 @@ public:
     TransactionManager &tm;
     Transaction &t;
     Onode &onode;
+    Onode *d_onode = nullptr; // The destination node in case of clone
   };
 
   /// Writes bl to [offset, offset + bl.length())
@@ -103,6 +104,11 @@ public:
   using clear_ret = clear_iertr::future<>;
   clear_ret clear(context_t ctx);
 
+  /// Clone data of an Onode
+  using clone_iertr = base_iertr;
+  using clone_ret = clone_iertr::future<>;
+  clone_ret clone(context_t ctx);
+
 private:
   /// Updates region [_offset, _offset + bl.length) to bl
   write_ret overwrite(
@@ -124,6 +130,13 @@ private:
     context_t ctx,
     object_data_t &object_data,
     extent_len_t size);
+
+  clone_ret clone_extents(
+    context_t ctx,
+    object_data_t &object_data,
+    lba_pin_list_t &pins,
+    laddr_t data_base);
+
 private:
   /**
    * max_object_size
index b188724ab9895d72629351530516d7ee89d21aaa..beed4e69280bee7c66ad521aa20e114cb1f121aa 100644 (file)
@@ -1196,8 +1196,9 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks(
     op_type_t::TRANSACTION,
     [this](auto &ctx) {
       return with_trans_intr(*ctx.transaction, [&, this](auto &t) {
-        return seastar::do_with(std::vector<OnodeRef>(ctx.iter.objects.size()),
-          std::vector<OnodeRef>(),
+        return seastar::do_with(
+         std::vector<OnodeRef>(ctx.iter.objects.size()),
+          std::vector<OnodeRef>(ctx.iter.objects.size()),
           [this, &ctx](auto& onodes, auto& d_onodes) mutable {
           return trans_intr::repeat(
             [this, &ctx, &onodes, &d_onodes]() mutable
@@ -1289,20 +1290,42 @@ SeaStore::Shard::_do_transaction_step(
         *ctx.transaction, i.get_oid(op->oid));
     }
   }
-  return fut.si_then([&, op, this](auto&& get_onode) -> tm_ret {
-    LOG_PREFIX(SeaStore::_do_transaction_step);
+  return fut.si_then([&, op](auto get_onode) {
     OnodeRef &o = onodes[op->oid];
     if (!o) {
       assert(get_onode);
       o = get_onode;
-      d_onodes.push_back(get_onode);
+      d_onodes[op->oid] = get_onode;
+    }
+    if (op->op == Transaction::OP_CLONE && !d_onodes[op->dest_oid]) {
+      //TODO: use when_all_succeed after making onode tree
+      //      support parallel extents loading
+      return onode_manager->get_or_create_onode(
+       *ctx.transaction, i.get_oid(op->dest_oid)
+      ).si_then([&, op](auto dest_onode) {
+       assert(dest_onode);
+       auto &d_o = onodes[op->dest_oid];
+       assert(!d_o);
+       assert(!d_onodes[op->dest_oid]);
+       d_o = dest_onode;
+       d_onodes[op->dest_oid] = dest_onode;
+       return seastar::now();
+      });
+    } else {
+      return OnodeManager::get_or_create_onode_iertr::now();
     }
+  }).si_then([&, op, this]() -> tm_ret {
+    LOG_PREFIX(SeaStore::_do_transaction_step);
     try {
       switch (op->op) {
       case Transaction::OP_REMOVE:
       {
        TRACET("removing {}", *ctx.transaction, i.get_oid(op->oid));
-        return _remove(ctx, onodes[op->oid]);
+        return _remove(ctx, onodes[op->oid]
+       ).si_then([&onodes, &d_onodes, op] {
+         onodes[op->oid].reset();
+         d_onodes[op->oid].reset();
+       });
       }
       case Transaction::OP_CREATE:
       case Transaction::OP_TOUCH:
@@ -1390,6 +1413,10 @@ SeaStore::Shard::_do_transaction_step(
         // TODO
         return tm_iertr::now();
       }
+      case Transaction::OP_CLONE:
+      {
+       return _clone(ctx, onodes[op->oid], d_onodes[op->dest_oid]);
+      }
       default:
         ERROR("bad op {}", static_cast<unsigned>(op->op));
         return crimson::ct_error::input_output_error::make();
@@ -1507,6 +1534,31 @@ SeaStore::Shard::_write(
     });
 }
 
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_clone( // copies onode's size and object data into d_onode
+  internal_context_t &ctx,
+  OnodeRef &onode,
+  OnodeRef &d_onode)
+{
+  LOG_PREFIX(SeaStore::_clone);
+  DEBUGT("onode={} d_onode={}", *ctx.transaction, *onode, *d_onode);
+  return seastar::do_with(
+    ObjectDataHandler(max_object_size), // handler is kept alive by do_with until clone() resolves
+    [this, &ctx, &onode, &d_onode](auto &objHandler) {
+    //TODO: currently, we only care about object data, leaving cloning
+    //      of xattr/omap for future work
+    auto &object_size = onode->get_layout().size;
+    auto &d_object_size = d_onode->get_mutable_layout(*ctx.transaction).size;
+    d_object_size = object_size; // the destination takes the source's size
+    return objHandler.clone(
+      ObjectDataHandler::context_t{
+       *transaction_manager,
+       *ctx.transaction,
+       *onode,
+       d_onode.get()}); // fourth member is context_t::d_onode, the clone destination
+  });
+}
+
 SeaStore::Shard::tm_ret
 SeaStore::Shard::_zero(
   internal_context_t &ctx,
index df4323df55736d95b0b8a8f99544f310055b1838..876fadca8c78753511cf99ee4d9e90c7eddb2e89 100644 (file)
@@ -353,6 +353,10 @@ public:
       uint64_t offset, size_t len,
       ceph::bufferlist &&bl,
       uint32_t fadvise_flags);
+    tm_ret _clone(
+      internal_context_t &ctx,
+      OnodeRef &onode,
+      OnodeRef &d_onode);
     tm_ret _zero(
       internal_context_t &ctx,
       OnodeRef &onode,
index 0713609fc87975cda7b2c51f020f48286ddc7049..17438ec0479da98bd634b5e874b24d88df211581 100644 (file)
@@ -1042,9 +1042,9 @@ struct pladdr_t {
 
   pladdr_t() = default;
   pladdr_t(const pladdr_t &) = default;
-  explicit pladdr_t(laddr_t laddr)
+  pladdr_t(laddr_t laddr)
     : pladdr(laddr) {}
-  explicit pladdr_t(paddr_t paddr)
+  pladdr_t(paddr_t paddr)
     : pladdr(paddr) {}
 
   bool is_laddr() const {
index 8f71323cde8db83664d45335db7fc014d931dd9f..10cc6f0e7ced5f7addc9c75237f072df84eed762 100644 (file)
@@ -282,7 +282,7 @@ public:
       laddr_hint,
       len,
       ext->get_paddr(),
-      ext.get()
+      *ext
     ).si_then([ext=std::move(ext), laddr_hint, &t](auto &&) mutable {
       LOG_PREFIX(TransactionManager::alloc_extent);
       SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint);
@@ -419,12 +419,50 @@ public:
     LOG_PREFIX(TransactionManager::reserve_region);
     SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}", t, len, hint);
     ceph_assert(is_aligned(hint, epm->get_block_size()));
-    return lba_manager->alloc_extent(
+    return lba_manager->reserve_region(
       t,
       hint,
-      len,
-      P_ADDR_ZERO,
-      nullptr);
+      len);
+  }
+
+  /*
+   * clone_pin
+   *
+   * Create an indirect lba mapping pointing to the physical
+   * lba mapping whose key is clone_offset. Refer to btree_lba_manager.h
+   * for the definition of "indirect lba mapping" and "physical lba mapping".
+   *
+   */
+  using clone_extent_iertr = alloc_extent_iertr;
+  using clone_extent_ret = clone_extent_iertr::future<LBAMappingRef>;
+  clone_extent_ret clone_pin( // adds a mapping at hint that refers to mapping's target, then inc_refs that target
+    Transaction &t,
+    laddr_t hint,
+    const LBAMapping &mapping) {
+    auto clone_offset = // laddr the new mapping will refer to
+      mapping.is_indirect()
+       ? mapping.get_intermediate_key() // already indirect: reuse the key it refers to
+       : mapping.get_key();
+
+    LOG_PREFIX(TransactionManager::clone_pin);
+    SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, clone_offset {}",
+      t, mapping.get_length(), hint, clone_offset);
+    ceph_assert(is_aligned(hint, epm->get_block_size())); // hint must be block aligned
+    return lba_manager->clone_extent(
+      t,
+      hint,
+      mapping.get_length(),
+      clone_offset,
+      mapping.get_val()
+    ).si_then([this, &t, clone_offset](auto pin) {
+      return inc_ref(t, clone_offset // take a reference on the mapping keyed at clone_offset
+      ).si_then([pin=std::move(pin)](auto) mutable {
+       return std::move(pin); // hand the new pin back to the caller
+      }).handle_error_interruptible(
+       crimson::ct_error::input_output_error::pass_further(),
+       crimson::ct_error::assert_all("not possible")
+      );
+    });
+  }
 
   /* alloc_extents
@@ -780,7 +818,7 @@ private:
       remap_laddr,
       remap_length,
       remap_paddr,
-      ext.get()
+      *ext
     ).si_then([remap_laddr, remap_length, remap_paddr](auto &&ref) {
       assert(ref->get_key() == remap_laddr);
       assert(ref->get_val() == remap_paddr);
index f55d0d6abd48486a9c4bcd9110cb5a9d25fbd420..95b165fab29fbe9937519ea7771418f277b6523c 100644 (file)
@@ -432,7 +432,7 @@ struct btree_lba_manager_test : btree_test_base {
            0,
            get_paddr());
        return lba_manager->alloc_extent(
-         t, hint, len, extent->get_paddr(), extent.get());
+         t, hint, len, extent->get_paddr(), *extent);
       }).unsafe_get0();
     logger().debug("alloc'd: {}", *ret);
     EXPECT_EQ(len, ret->get_length());