git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/os/seastore: adapt copy on write for static onode prefix
author Zhang Song <zhangsong02@qianxin.com>
Wed, 3 Sep 2025 07:54:40 +0000 (15:54 +0800)
committer Xuehan Xu <xuxuehan@qianxin.com>
Mon, 2 Feb 2026 11:37:02 +0000 (19:37 +0800)
Signed-off-by: Zhang Song <zhangsong02@qianxin.com>
src/crimson/os/seastore/lba/btree_lba_manager.cc
src/crimson/os/seastore/lba/btree_lba_manager.h
src/crimson/os/seastore/lba_manager.h
src/crimson/os/seastore/object_data_handler.cc
src/crimson/os/seastore/onode.h
src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h
src/crimson/os/seastore/seastore.cc
src/crimson/os/seastore/seastore.h
src/crimson/os/seastore/transaction_manager.h
src/test/crimson/seastore/test_object_data_handler.cc

index 5fd42857bf2e54599db394baab63cdbf3d2fdac2..d01d649b8effb07895f9d888f3f78b08f69e582d 100644 (file)
@@ -319,6 +319,29 @@ BtreeLBAManager::get_mapping(
   });
 }
 
+BtreeLBAManager::lower_bound_ret
+BtreeLBAManager::lower_bound(
+  Transaction &t,
+  laddr_t laddr)
+{
+  auto c = get_context(t);
+  return with_btree<LBABtree>(
+    cache,
+    c,
+    [c, laddr](LBABtree &btree)
+  {
+    return btree.lower_bound(c, laddr
+    ).si_then([c](LBABtree::iterator iter) {
+      auto cursor = iter.get_cursor(c);
+      if (cursor->is_indirect()) {
+       return LBAMapping::create_indirect(nullptr, std::move(cursor));
+      } else {
+       return LBAMapping::create_direct(std::move(cursor));
+      }
+    });
+  });
+}
+
 BtreeLBAManager::alloc_extent_ret
 BtreeLBAManager::reserve_region(
   Transaction &t,
index 7dbd3f1e62139f54469dd26cdb8b0027287585cf..6d6335317f21614751b6b0f5e461be6837222a46 100644 (file)
@@ -75,6 +75,10 @@ public:
     Transaction &t,
     LogicalChildNode &extent) final;
 
+  lower_bound_ret lower_bound(
+    Transaction &t,
+    laddr_t laddr) final;
+
   alloc_extent_ret reserve_region(
     Transaction &t,
     LBAMapping pos,
index c8ba193df4ad69699b3ec82ba429f18270009758..49c2c9273b351a1f8939950de6bd6fd592be3e8e 100644 (file)
@@ -63,6 +63,11 @@ public:
     laddr_t offset,
     bool search_containing = false) = 0;
 
+  using lower_bound_ret = base_iertr::future<LBAMapping>;
+  virtual lower_bound_ret lower_bound(
+    Transaction &t,
+    laddr_t laddr) = 0;
+
   /*
    * Fetches the mapping corresponding to the "extent"
    *
index 29e692a9efaf1838335c63846016e056803cd07a..9075ce4a5b25bc4b60a9032be732fce9f7fd1f05 100644 (file)
@@ -1685,29 +1685,41 @@ ObjectDataHandler::clone_ret
 ObjectDataHandler::copy_on_write(
   context_t ctx)
 {
-  return with_object_data(
+  return with_objects_data(
     ctx,
-    [ctx, this](auto &object_data) -> clone_ret {
+    [ctx, this](auto &object_data, auto &d_object_data) -> clone_ret
+  {
     auto mapping = co_await ctx.tm.get_pin(
       ctx.t, object_data.get_reserved_data_base()
     ).handle_error_interruptible(
       clone_iertr::pass_further{},
       crimson::ct_error::assert_all{"unexpected enoent"}
     );
-    object_data_t d_object_data = get_null_object_data();
     co_await do_clone(ctx, object_data, d_object_data, mapping, false);
     auto old_base = object_data.get_reserved_data_base();
     auto old_len = object_data.get_reserved_data_len();
-    object_data.update_reserved(
-      d_object_data.get_reserved_data_base(),
-      d_object_data.get_reserved_data_len());
-    ctx.onode.unset_need_cow(ctx.t);
+    assert(ctx.d_onode->need_cow());
+    ctx.d_onode->unset_need_cow(ctx.t);
     co_await ctx.tm.remove_mappings_in_range(
       ctx.t, old_base, old_len, std::move(mapping), {false, true}
     ).handle_error_interruptible(
       clone_iertr::pass_further{},
       crimson::ct_error::assert_all{"unexpected enoent"}
     ).discard_result();
+
+    auto old_md_start = old_base.with_metadata().with_offset_by_blocks(0);
+    auto md_mapping = co_await ctx.tm.lower_bound_pin(ctx.t, old_md_start);
+    if (md_mapping.is_end() ||
+       md_mapping.get_key().get_clone_prefix() !=
+       old_md_start.get_clone_prefix()) {
+      co_return;
+    }
+    auto new_prefix = d_object_data
+       .get_reserved_data_base()
+       .get_clone_prefix()
+       .with_metadata();
+    auto md_dst_mapping = co_await ctx.tm.lower_bound_pin(ctx.t, new_prefix);
+    co_await ctx.tm.move_region(ctx.t, md_mapping, md_dst_mapping, new_prefix);
   });
 }
 
@@ -1726,6 +1738,8 @@ ObjectDataHandler::do_clone(
   auto mapping = co_await prepare_data_reservation(
     ctx, *ctx.d_onode, d_object_data, old_len);
   ceph_assert(mapping.has_value());
+  assert(old_base.get_object_prefix() == mapping->get_key().get_object_prefix());
+  assert(old_base.get_clone_prefix() != mapping->get_key().get_clone_prefix());
   DEBUGT("new obj reserve_data_base: {}, len 0x{:x}",
     ctx.t,
     d_object_data.get_reserved_data_base(),
index 6cf47a37510f5b5a49c76a66e5590a8694af2b19..d105a8f940c2c30bd2b1f5e588caf3eb0e737e3c 100644 (file)
@@ -139,6 +139,7 @@ public:
   virtual void set_need_cow(Transaction&) = 0;
   virtual void unset_need_cow(Transaction&) = 0;
   virtual void swap_layout(Transaction&, Onode&) = 0;
+  virtual boost::intrusive_ptr<Onode> offload_data_and_md(Transaction&) = 0;
 
   laddr_hint_t get_metadata_hint(uint64_t block_size = laddr_t::UNIT_SIZE) const {
     return get_hint(block_size, /*is_metadata*/true);
index 0067cd5f968fe89dbc7b1470f9962bc719bc5280..fbd0c153b5acd9c472faf2ece219f9ce8f749667 100644 (file)
@@ -9,6 +9,60 @@
 
 namespace crimson::os::seastore::onode {
 
+struct FakeOnode final : Onode {
+  FakeOnode(const hobject_t &hobj, onode_layout_t layout)
+      : Onode(hobj), layout(layout) {}
+
+  onode_layout_t layout{};
+
+  laddr_hint_t init_hint(extent_len_t block_size, bool is_metadata) const final {
+    ceph_abort("impossible");
+    return LADDR_HINT_NULL;
+  }
+  laddr_hint_t generate_clone_hint(
+    local_object_id_t object_id,
+    extent_len_t block_size,
+    bool is_metadata) const final {
+    ceph_abort("impossible");
+    return LADDR_HINT_NULL;
+  }
+
+  bool is_alive() const final { return true; }
+  const onode_layout_t &get_layout() const final {
+    return layout;
+  }
+  void update_onode_size(Transaction &, uint32_t) final {
+    ceph_abort("impossible");
+  }
+  void update_omap_root(Transaction &, omap_root_t &root) final {
+    ceph_abort("impossible");
+  }
+  void update_log_root(Transaction &, omap_root_t &root) final {
+    ceph_abort("impossible");
+  }
+  void update_xattr_root(Transaction &, omap_root_t &root) final {
+    ceph_abort("impossible");
+  }
+  void update_object_data(Transaction &, object_data_t &data) final {
+    ceph_abort("impossible");
+  }
+  void update_object_info(Transaction &, ceph::bufferlist &) final {
+    ceph_abort("impossible");
+  }
+  void update_snapset(Transaction &, ceph::bufferlist &) final {
+    ceph_abort("impossible");
+  }
+  void clear_object_info(Transaction &) final { ceph_abort("impossible"); }
+  void clear_snapset(Transaction &) final { ceph_abort("impossible"); }
+  void set_need_cow(Transaction &) final {}
+  void unset_need_cow(Transaction &) final {}
+  void swap_layout(Transaction &, Onode &o) final { ceph_abort("impossible"); }
+  boost::intrusive_ptr<Onode> offload_data_and_md(Transaction &t) final {
+    ceph_abort("impossible");
+    return nullptr;
+  }
+};
+
 struct FLTreeOnode final : Onode, Value {
   static constexpr tree_conf_t TREE_CONF = {
     value_magic_t::ONODE,
@@ -92,6 +146,21 @@ struct FLTreeOnode final : Onode, Value {
     _swap_layout(t, static_cast<FLTreeOnode&>(onode));
   }
 
+  boost::intrusive_ptr<Onode> offload_data_and_md(Transaction & t) final {
+    assert(status != status_t::DELETED);
+    auto fake_onode = new FakeOnode(hobj, get_layout());
+    object_data_t data{L_ADDR_NULL, 0};
+    update_object_data(t, data);
+    omap_root_t root;
+    root.type = omap_type_t::OMAP;
+    update_omap_root(t, root);
+    root.type = omap_type_t::XATTR;
+    update_xattr_root(t, root);
+    root.type = omap_type_t::LOG;
+    update_log_root(t, root);
+    return fake_onode;
+  }
+
   void _swap_layout(Transaction &t, FLTreeOnode &other) {
     assert(status != status_t::DELETED);
     assert(other.status != status_t::DELETED);
index 1560f7a4d7aa4f7ff57f93544d2669026cda0db6..f1fce5756573f9ca5f9db2ecee71e0c47680709f 100644 (file)
@@ -1854,6 +1854,35 @@ SeaStore::Shard::_do_transaction_step(
   );
 }
 
+namespace {
+void rename_onode_omap_metadata(
+  Transaction &t, Onode &src, Onode &dst)
+{
+  auto src_prefix = *src.get_clone_prefix();
+  auto dst_prefix = *dst.get_clone_prefix();
+
+  auto rename_root = [&src, &dst, src_prefix, dst_prefix](omap_type_t type) {
+    auto root = src.get_root(type).get(dst.get_metadata_hint());
+    if (root.is_null()) {
+      return root;
+    }
+    auto offset = root.addr.get_byte_distance<loffset_t>(src_prefix);
+    root.update(
+      (dst_prefix + offset).checked_to_laddr(),
+      root.depth, dst.get_metadata_hint(), type);
+    return root;
+  };
+
+  auto omap_root = rename_root(omap_type_t::OMAP);
+  auto xattr_root = rename_root(omap_type_t::XATTR);
+  auto log_root = rename_root(omap_type_t::LOG);
+
+  dst.update_omap_root(t, omap_root);
+  dst.update_xattr_root(t, xattr_root);
+  dst.update_log_root(t, log_root);
+}
+}
+
 SeaStore::Shard::tm_ret
 SeaStore::Shard::_rename(
   internal_context_t &ctx,
@@ -1867,31 +1896,8 @@ SeaStore::Shard::_rename(
     return objHanlder.rename(ObjectDataHandler::context_t{
       *transaction_manager, *ctx.transaction, *onode, d_onode.get()
     }).si_then([&ctx, &onode, &d_onode] {
-      auto get_prefix = [](Onode &onode) {
-       auto p = onode.get_clone_prefix();
-       assert(p);
-       return *p;
-      };
-      auto src_prefix = get_prefix(*onode);
-      auto dst_prefix = get_prefix(*d_onode);
-
-      auto rename_omap_root = [&](omap_type_t type) {
-       auto root = onode->get_root(type).get(d_onode->get_metadata_hint());
-       if (root.is_null()) {
-         return root;
-       }
-       auto offset = root.addr.get_byte_distance<loffset_t>(src_prefix);
-       root.update(
-         (dst_prefix + offset).checked_to_laddr(),
-         root.depth, d_onode->get_metadata_hint(), type);
-       return root;
-      };
-
       auto olayout = onode->get_layout();
       uint32_t size = olayout.size;
-      auto omap_root = rename_omap_root(omap_type_t::OMAP);
-      auto xattr_root = rename_omap_root(omap_type_t::XATTR);
-      auto log_root = rename_omap_root(omap_type_t::LOG);
       auto oi_bl = ceph::bufferlist::static_from_mem(
        &olayout.oi[0],
        (uint32_t)olayout.oi_size);
@@ -1900,11 +1906,9 @@ SeaStore::Shard::_rename(
        (uint32_t)olayout.ss_size);
 
       d_onode->update_onode_size(*ctx.transaction, size);
-      d_onode->update_omap_root(*ctx.transaction, omap_root);
-      d_onode->update_xattr_root(*ctx.transaction, xattr_root);
-      d_onode->update_log_root(*ctx.transaction, log_root);
       d_onode->update_object_info(*ctx.transaction, oi_bl);
       d_onode->update_snapset(*ctx.transaction, ss_bl);
+      rename_onode_omap_metadata(*ctx.transaction, *onode, *d_onode);
     });
   }).si_then([this, &ctx, &onode] {
     return onode_manager->erase_onode(
@@ -1937,17 +1941,14 @@ SeaStore::Shard::_remove(
       ObjectDataHandler(max_object_size),
       [&onode, this, &ctx](auto &objhandler)
     {
-      auto fut = ObjectDataHandler::clone_iertr::now();
-      auto objctx = ObjectDataHandler::context_t{
-         *transaction_manager,
-         *ctx.transaction,
-         *onode,
-       };
-      if (onode->need_cow()) {
-       fut = objhandler.copy_on_write(objctx);
-      }
-      return fut.si_then([&objhandler, objctx] {
-       return objhandler.clear(objctx);
+      return _maybe_copy_on_write(ctx, *onode, objhandler
+      ).si_then([&onode, this, &ctx, &objhandler] {
+       return objhandler.clear(
+         ObjectDataHandler::context_t{
+           *transaction_manager,
+           *ctx.transaction,
+           *onode,
+         });
       });
     });
   }).si_then([this, &ctx, &onode] {
@@ -1990,20 +1991,20 @@ SeaStore::Shard::_write(
   return seastar::do_with(
     std::move(_bl),
     ObjectDataHandler(max_object_size),
-    [=, this, &ctx, &onode](auto &bl, auto &objhandler) {
-      auto fut = ObjectDataHandler::clone_iertr::now();
-      auto objctx = ObjectDataHandler::context_t{
+    [=, this, &ctx, &onode](auto &bl, auto &objhandler)
+  {
+    return _maybe_copy_on_write(ctx, onode, objhandler
+    ).si_then([&ctx, &onode, &objhandler, offset, &bl, this] {
+      return objhandler.write(
+       ObjectDataHandler::context_t{
          *transaction_manager,
          *ctx.transaction,
          onode,
-       };
-      if (onode.need_cow()) {
-       fut = objhandler.copy_on_write(objctx);
-      }
-      return fut.si_then([&objhandler, objctx, offset, &bl] {
-       return objhandler.write(objctx, offset, bl);
-      });
+       },
+       offset,
+       bl);
     });
+  });
 }
 
 SeaStore::Shard::tm_ret
@@ -2027,7 +2028,13 @@ SeaStore::Shard::_clone(
        * the case where the *source* is not further mutated, so here we
        * reverse the two onodes so that HEAD will be the target.
        */
+      auto id = onode.get_layout()
+         .object_data
+         .get()
+         .get_reserved_data_base()
+         .get_local_object_id();
       onode.swap_layout(*ctx.transaction, d_onode);
+      onode.set_sibling_object_id(id);
       return objHandler.clone(
        ObjectDataHandler::context_t{
          *transaction_manager,
@@ -2064,6 +2071,27 @@ SeaStore::Shard::_clone(
   });
 }
 
+/**
+ * _maybe_copy_on_write
+ *
+ * Does nothing unless onode.need_cow() is set.  Otherwise:
+ *  1. offloads the onode's data and metadata roots into a fake onode,
+ *  2. records the fake onode's local object id as the onode's sibling
+ *     object id,
+ *  3. runs ObjectDataHandler::copy_on_write with the fake onode as the
+ *     context onode and the real onode as d_onode,
+ *  4. re-bases the OMAP/XATTR/LOG roots from the fake onode onto the
+ *     real one via rename_onode_omap_metadata.
+ */
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_maybe_copy_on_write(
+  internal_context_t &ctx,
+  Onode &onode,
+  ObjectDataHandler &handler)
+{
+  if (!onode.need_cow()) {
+    co_return;
+  }
+  auto fake_onode = onode.offload_data_and_md(*ctx.transaction);
+  // The prefix is dereferenced below, so check it is present first.
+  auto old_prefix = fake_onode->get_clone_prefix();
+  assert(old_prefix);
+  onode.set_sibling_object_id(old_prefix->get_local_object_id());
+  co_await handler.copy_on_write(
+    ObjectDataHandler::context_t{
+      *transaction_manager,
+      *ctx.transaction,
+      *fake_onode,
+      &onode
+    });
+  rename_onode_omap_metadata(*ctx.transaction, *fake_onode, onode);
+}
+
 SeaStore::Shard::tm_ret
 SeaStore::Shard::_clone_range(
   internal_context_t &ctx,
@@ -2116,18 +2144,18 @@ SeaStore::Shard::_zero(
     std::max<uint64_t>(offset + len, object_size));
   return seastar::do_with(
     ObjectDataHandler(max_object_size),
-    [=, this, &ctx, &onode](auto &objhandler) {
-    auto fut = ObjectDataHandler::clone_iertr::now();
-    auto objctx = ObjectDataHandler::context_t{
-       *transaction_manager,
-       *ctx.transaction,
-       onode,
-      };
-    if (onode.need_cow()) {
-      fut = objhandler.copy_on_write(objctx);
-    }
-    return fut.si_then([&objhandler, objctx, offset, len] {
-      return objhandler.zero(objctx, offset, len);
+    [=, this, &ctx, &onode](auto &objhandler)
+  {
+    return _maybe_copy_on_write(ctx, onode, objhandler
+    ).si_then([this, &ctx, &onode, &objhandler, offset, len] {
+      return objhandler.zero(
+       ObjectDataHandler::context_t{
+         *transaction_manager,
+         *ctx.transaction,
+         onode,
+       },
+       offset,
+       len);
     });
   });
 }
@@ -2176,18 +2204,17 @@ SeaStore::Shard::_truncate(
   onode.update_onode_size(*ctx.transaction, size);
   return seastar::do_with(
     ObjectDataHandler(max_object_size),
-    [=, this, &ctx, &onode](auto &objhandler) {
-    auto fut = ObjectDataHandler::clone_iertr::now();
-    auto objctx = ObjectDataHandler::context_t{
-       *transaction_manager,
-       *ctx.transaction,
-       onode,
-      };
-    if (onode.need_cow()) {
-      fut = objhandler.copy_on_write(objctx);
-    }
-    return fut.si_then([&objhandler, objctx, size] {
-      return objhandler.truncate(objctx, size);
+    [=, this, &ctx, &onode](auto &objhandler)
+  {
+    return _maybe_copy_on_write(ctx, onode, objhandler
+    ).si_then([this, &ctx, &onode, &objhandler, size] {
+      return objhandler.truncate(
+       ObjectDataHandler::context_t{
+         *transaction_manager,
+         *ctx.transaction,
+         onode,
+       },
+       size);
     });
   });
 }
index b4a059fcbe1794201f3db75787a3922b79415cb0..e0bc5330b52420e00521c5c02cb74ecca0c19511 100644 (file)
@@ -348,6 +348,10 @@ public:
       internal_context_t &ctx,
       Onode &onode,
       Onode &d_onode);
+    tm_ret _maybe_copy_on_write(
+      internal_context_t &ctx,
+      Onode &onode,
+      ObjectDataHandler &handler);
     tm_ret _rename(
       internal_context_t &ctx,
       OnodeRef &onode,
index 8353cea834145226f563094b37b739b639484760..0faa3e2d71f6a24781357299c5c037c0bd0be0c0 100644 (file)
@@ -181,6 +181,12 @@ public:
     });
   }
 
+  /**
+   * lower_bound_pin
+   *
+   * Forwards to LBAManager::lower_bound(t, laddr) and returns its result.
+   */
+  base_iertr::future<LBAMapping> lower_bound_pin(
+    Transaction &t,
+    laddr_t laddr)
+  {
+    auto &lba = *lba_manager;
+    return lba.lower_bound(t, laddr);
+  }
+
   /**
    * maybe_indirect_extent_t
    *
index 127ec6413df0d6bb774c9066331dd63d3f8742bb..6a95e5ad294644a6f1edc46e754a68ed748e1c31 100644 (file)
@@ -142,7 +142,31 @@ public:
       mlayout.ss_size = 0;
     });
   }
-
+  boost::intrusive_ptr<Onode> offload_data_and_md(Transaction& t) final {
+    auto ret = new TestOnode();
+    {
+      auto data = layout.object_data.get();
+      ret->update_object_data(t, data);
+      auto root = layout.omap_root.get(LADDR_HINT_NULL);
+      ret->update_omap_root(t, root);
+      root = layout.xattr_root.get(LADDR_HINT_NULL);
+      ret->update_xattr_root(t, root);
+      root = layout.log_root.get(LADDR_HINT_NULL);
+      ret->update_log_root(t, root);
+    }
+    {
+      auto data = object_data_t{L_ADDR_NULL, 0};
+      update_object_data(t, data);
+      auto root = omap_root_t{};
+      root.type = omap_type_t::OMAP;
+      update_omap_root(t, root);
+      root.type = omap_type_t::XATTR;
+      update_xattr_root(t, root);
+      root.type = omap_type_t::LOG;
+      update_log_root(t, root);
+    }
+    return ret;
+  }
 };
 
 struct object_data_handler_test_t: