From 9f303cde7b9d7aca14a2024948079b4280168a2d Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 29 Mar 2023 18:07:20 +0800 Subject: [PATCH] crimson/os/seastore: implement OP_CLONE Signed-off-by: Xuehan Xu --- .../os/seastore/btree/btree_range_pin.h | 11 +- src/crimson/os/seastore/lba_manager.h | 14 +- .../lba_manager/btree/btree_lba_manager.cc | 22 ++- .../lba_manager/btree/btree_lba_manager.h | 32 +++- .../os/seastore/object_data_handler.cc | 147 ++++++++++++++++++ src/crimson/os/seastore/object_data_handler.h | 13 ++ src/crimson/os/seastore/seastore.cc | 64 +++++++- src/crimson/os/seastore/seastore.h | 4 + src/crimson/os/seastore/seastore_types.h | 4 +- src/crimson/os/seastore/transaction_manager.h | 50 +++++- .../seastore/test_btree_lba_manager.cc | 2 +- 11 files changed, 335 insertions(+), 28 deletions(-) diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h index 684d81ce991..c753a1c3b03 100644 --- a/src/crimson/os/seastore/btree/btree_range_pin.h +++ b/src/crimson/os/seastore/btree/btree_range_pin.h @@ -132,10 +132,6 @@ class BtreeNodeMapping : public PhysicalNodeMapping { fixed_kv_node_meta_t range; uint16_t pos = std::numeric_limits::max(); - pladdr_t _get_val() const final { - return value; - } - public: using val_type = val_t; BtreeNodeMapping(op_context_t ctx) : ctx(ctx) {} @@ -186,7 +182,12 @@ public: } val_t get_val() const final { - return value; + if constexpr (std::is_same_v) { + return value.get_paddr(); + } else { + static_assert(std::is_same_v); + return value.get_laddr(); + } } key_t get_key() const final { diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h index f36a788344a..6275d4dbbf5 100644 --- a/src/crimson/os/seastore/lba_manager.h +++ b/src/crimson/os/seastore/lba_manager.h @@ -81,7 +81,19 @@ public: laddr_t hint, extent_len_t len, paddr_t addr, - LogicalCachedExtent *nextent) = 0; + LogicalCachedExtent &nextent) = 0; + + virtual 
alloc_extent_ret clone_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + laddr_t intermediate_key, + paddr_t actual_addr) = 0; + + virtual alloc_extent_ret reserve_region( + Transaction &t, + laddr_t hint, + extent_len_t len) = 0; struct ref_update_result_t { unsigned refcount = 0; diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index f109b8a9982..c1bfc25dd06 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -205,11 +205,12 @@ BtreeLBAManager::get_mapping( } BtreeLBAManager::alloc_extent_ret -BtreeLBAManager::alloc_extent( +BtreeLBAManager::_alloc_extent( Transaction &t, laddr_t hint, extent_len_t len, - paddr_t addr, + pladdr_t addr, + paddr_t actual_addr, LogicalCachedExtent* nextent) { struct state_t { @@ -221,7 +222,7 @@ BtreeLBAManager::alloc_extent( state_t(laddr_t hint) : last_end(hint) {} }; - LOG_PREFIX(BtreeLBAManager::alloc_extent); + LOG_PREFIX(BtreeLBAManager::_alloc_extent); TRACET("{}~{}, hint={}", t, addr, len, hint); auto c = get_context(t); ++stats.num_alloc_extents; @@ -272,21 +273,30 @@ BtreeLBAManager::alloc_extent( c, *state.insert_iter, state.last_end, - lba_map_val_t{len, pladdr_t(addr), 1, 0} + lba_map_val_t{len, pladdr_t(addr), 1, 0}, nextent ).si_then([&state, FNAME, c, addr, len, hint, nextent](auto &&p) { auto [iter, inserted] = std::move(p); TRACET("{}~{}, hint={}, inserted at {}", c.trans, addr, len, hint, state.last_end); if (nextent) { + ceph_assert(addr.is_paddr()); nextent->set_laddr(iter.get_key()); } ceph_assert(inserted); state.ret = iter; }); }); - }).si_then([c](auto &&state) { - return state.ret->get_pin(c); + }).si_then([c, actual_addr, addr](auto &&state) { + auto ret_pin = state.ret->get_pin(c); + if (actual_addr != P_ADDR_NULL) { + ceph_assert(addr.is_laddr()); + ret_pin->set_paddr(actual_addr); + } else { + 
ceph_assert(addr.is_paddr()); + } + return alloc_extent_iertr::make_ready_future( + std::move(ret_pin)); }); } diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index 7c5d42cec79..396b024ec62 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -89,12 +89,34 @@ public: Transaction &t, laddr_t offset) final; + alloc_extent_ret reserve_region( + Transaction &t, + laddr_t hint, + extent_len_t len) + { + return _alloc_extent(t, hint, len, P_ADDR_ZERO, P_ADDR_NULL, nullptr); + } + + alloc_extent_ret clone_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + laddr_t intermediate_key, + paddr_t actual_addr) + { + return _alloc_extent(t, hint, len, intermediate_key, actual_addr, nullptr); + } + alloc_extent_ret alloc_extent( Transaction &t, laddr_t hint, extent_len_t len, paddr_t addr, - LogicalCachedExtent*) final; + LogicalCachedExtent &ext) final + { + assert(ext); + return _alloc_extent(t, hint, len, addr, P_ADDR_NULL, &ext); + } ref_ret decref_extent( Transaction &t, @@ -187,6 +209,14 @@ private: laddr_t addr, update_func_t &&f, LogicalCachedExtent*); + + alloc_extent_ret _alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + pladdr_t addr, + paddr_t actual_addr, + LogicalCachedExtent*); }; using BtreeLBAManagerRef = std::unique_ptr; diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc index 3de23176a42..e4f85e74d3c 100644 --- a/src/crimson/os/seastore/object_data_handler.cc +++ b/src/crimson/os/seastore/object_data_handler.cc @@ -872,6 +872,31 @@ auto with_object_data( }); } +template +auto with_objects_data( + ObjectDataHandler::context_t ctx, + F &&f) +{ + ceph_assert(ctx.d_onode); + return seastar::do_with( + ctx.onode.get_layout().object_data.get(), + ctx.d_onode->get_layout().object_data.get(), + 
std::forward(f), + [ctx](auto &object_data, auto &d_object_data, auto &f) { + return std::invoke(f, object_data, d_object_data + ).si_then([ctx, &object_data, &d_object_data] { + if (object_data.must_update()) { + ctx.onode.get_mutable_layout(ctx.t).object_data.update(object_data); + } + if (d_object_data.must_update()) { + ctx.d_onode->get_mutable_layout( + ctx.t).object_data.update(d_object_data); + } + return seastar::now(); + }); + }); +} + ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation( context_t ctx, object_data_t &object_data, @@ -1445,4 +1470,126 @@ ObjectDataHandler::clear_ret ObjectDataHandler::clear( }); } +ObjectDataHandler::clone_ret ObjectDataHandler::clone_extents( + context_t ctx, + object_data_t &object_data, + lba_pin_list_t &pins, + laddr_t data_base) +{ + LOG_PREFIX(ObjectDataHandler::clone_extents); + TRACET(" object_data: {}~{}, data_base: {}", + ctx.t, + object_data.get_reserved_data_base(), + object_data.get_reserved_data_len(), + data_base); + return ctx.tm.dec_ref( + ctx.t, + object_data.get_reserved_data_base() + ).si_then( + [&pins, &object_data, ctx, data_base](auto) mutable { + return seastar::do_with( + (extent_len_t)0, + [&object_data, ctx, data_base, &pins](auto &last_pos) { + return trans_intr::do_for_each( + pins, + [&last_pos, &object_data, ctx, data_base](auto &pin) { + auto offset = pin->get_key() - data_base; + ceph_assert(offset == last_pos); + auto fut = TransactionManager::alloc_extent_iertr + ::make_ready_future(); + auto addr = object_data.get_reserved_data_base() + offset; + if (pin->get_val().is_zero()) { + fut = ctx.tm.reserve_region(ctx.t, addr, pin->get_length()); + } else { + fut = ctx.tm.clone_pin(ctx.t, addr, *pin); + } + return fut.si_then( + [&pin, &last_pos, offset](auto) { + last_pos = offset + pin->get_length(); + return seastar::now(); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further(), + crimson::ct_error::assert_all("not possible") + ); + 
}).si_then([&last_pos, &object_data, ctx] { + if (last_pos != object_data.get_reserved_data_len()) { + return ctx.tm.reserve_region( + ctx.t, + object_data.get_reserved_data_base() + last_pos, + object_data.get_reserved_data_len() - last_pos + ).si_then([](auto) { + return seastar::now(); + }); + } + return TransactionManager::reserve_extent_iertr::now(); + }); + }); + }, + ObjectDataHandler::write_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "object_data_handler::clone invalid error" + } + ); +} + +ObjectDataHandler::clone_ret ObjectDataHandler::clone( + context_t ctx) +{ + // the whole clone procedure can be separated into the following steps: + // 1. let clone onode(d_object_data) take the head onode's + // object data base; + // 2. reserve a new region in lba tree for the head onode; + // 3. clone all extents of the clone onode, see transaction_manager.h + // for the details of clone_pin; + // 4. reserve the space between the head onode's size and its reservation + // length. 
+ return with_objects_data( + ctx, + [ctx, this](auto &object_data, auto &d_object_data) { + ceph_assert(d_object_data.is_null()); + if (object_data.is_null()) { + return clone_iertr::now(); + } + return prepare_data_reservation( + ctx, + d_object_data, + object_data.get_reserved_data_len() + ).si_then([&object_data, &d_object_data, ctx, this] { + assert(!object_data.is_null()); + auto base = object_data.get_reserved_data_base(); + auto len = object_data.get_reserved_data_len(); + object_data.clear(); + LOG_PREFIX(ObjectDataHandler::clone); + DEBUGT("cloned obj reserve_data_base: {}, len {}", + ctx.t, + d_object_data.get_reserved_data_base(), + d_object_data.get_reserved_data_len()); + return prepare_data_reservation( + ctx, + object_data, + d_object_data.get_reserved_data_len() + ).si_then([&d_object_data, ctx, &object_data, base, len, this] { + LOG_PREFIX("ObjectDataHandler::clone"); + DEBUGT("head obj reserve_data_base: {}, len {}", + ctx.t, + object_data.get_reserved_data_base(), + object_data.get_reserved_data_len()); + return ctx.tm.get_pins(ctx.t, base, len + ).si_then([ctx, &object_data, &d_object_data, base, this](auto pins) { + return seastar::do_with( + std::move(pins), + [ctx, &object_data, &d_object_data, base, this](auto &pins) { + return clone_extents(ctx, object_data, pins, base + ).si_then([ctx, &d_object_data, base, &pins, this] { + return clone_extents(ctx, d_object_data, pins, base); + }).si_then([&pins, ctx] { + return do_removals(ctx, pins); + }); + }); + }); + }); + }); + }); +} + } // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/object_data_handler.h b/src/crimson/os/seastore/object_data_handler.h index ca648f12c2e..b5f432d5ac7 100644 --- a/src/crimson/os/seastore/object_data_handler.h +++ b/src/crimson/os/seastore/object_data_handler.h @@ -58,6 +58,7 @@ public: TransactionManager &tm; Transaction &t; Onode &onode; + Onode *d_onode = nullptr; // The destination node in case of clone }; /// Writes bl to [offset, offset 
+ bl.length()) @@ -103,6 +104,11 @@ public: using clear_ret = clear_iertr::future<>; clear_ret clear(context_t ctx); + /// Clone data of an Onode + using clone_iertr = base_iertr; + using clone_ret = clone_iertr::future<>; + clone_ret clone(context_t ctx); + private: /// Updates region [_offset, _offset + bl.length) to bl write_ret overwrite( @@ -124,6 +130,13 @@ private: context_t ctx, object_data_t &object_data, extent_len_t size); + + clone_ret clone_extents( + context_t ctx, + object_data_t &object_data, + lba_pin_list_t &pins, + laddr_t data_base); + private: /** * max_object_size diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index b44d6696701..350d78e084e 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -1196,8 +1196,9 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( op_type_t::TRANSACTION, [this](auto &ctx) { return with_trans_intr(*ctx.transaction, [&, this](auto &t) { - return seastar::do_with(std::vector(ctx.iter.objects.size()), - std::vector(), + return seastar::do_with( + std::vector(ctx.iter.objects.size()), + std::vector(ctx.iter.objects.size()), [this, &ctx](auto& onodes, auto& d_onodes) mutable { return trans_intr::repeat( [this, &ctx, &onodes, &d_onodes]() mutable @@ -1289,20 +1290,42 @@ SeaStore::Shard::_do_transaction_step( *ctx.transaction, i.get_oid(op->oid)); } } - return fut.si_then([&, op, this](auto&& get_onode) -> tm_ret { - LOG_PREFIX(SeaStore::_do_transaction_step); + return fut.si_then([&, op](auto get_onode) { OnodeRef &o = onodes[op->oid]; if (!o) { assert(get_onode); o = get_onode; - d_onodes.push_back(get_onode); + d_onodes[op->oid] = get_onode; + } + if (op->op == Transaction::OP_CLONE && !d_onodes[op->dest_oid]) { + //TODO: use when_all_succeed after making onode tree + // support parallel extents loading + return onode_manager->get_or_create_onode( + *ctx.transaction, i.get_oid(op->dest_oid) + ).si_then([&, op](auto 
dest_onode) { + assert(dest_onode); + auto &d_o = onodes[op->dest_oid]; + assert(!d_o); + assert(!d_onodes[op->dest_oid]); + d_o = dest_onode; + d_onodes[op->dest_oid] = dest_onode; + return seastar::now(); + }); + } else { + return OnodeManager::get_or_create_onode_iertr::now(); } + }).si_then([&, op, this]() -> tm_ret { + LOG_PREFIX(SeaStore::_do_transaction_step); try { switch (op->op) { case Transaction::OP_REMOVE: { TRACET("removing {}", *ctx.transaction, i.get_oid(op->oid)); - return _remove(ctx, onodes[op->oid]); + return _remove(ctx, onodes[op->oid] + ).si_then([&onodes, &d_onodes, op] { + onodes[op->oid].reset(); + d_onodes[op->oid].reset(); + }); } case Transaction::OP_CREATE: case Transaction::OP_TOUCH: @@ -1390,6 +1413,10 @@ SeaStore::Shard::_do_transaction_step( // TODO return tm_iertr::now(); } + case Transaction::OP_CLONE: + { + return _clone(ctx, onodes[op->oid], d_onodes[op->dest_oid]); + } default: ERROR("bad op {}", static_cast(op->op)); return crimson::ct_error::input_output_error::make(); @@ -1507,6 +1534,31 @@ SeaStore::Shard::_write( }); } +SeaStore::Shard::tm_ret +SeaStore::Shard::_clone( + internal_context_t &ctx, + OnodeRef &onode, + OnodeRef &d_onode) +{ + LOG_PREFIX(SeaStore::_clone); + DEBUGT("onode={} d_onode={}", *ctx.transaction, *onode, *d_onode); + return seastar::do_with( + ObjectDataHandler(max_object_size), + [this, &ctx, &onode, &d_onode](auto &objHandler) { + //TODO: currently, we only care about object data, leaving cloning + // of xattr/omap for future work + auto &object_size = onode->get_layout().size; + auto &d_object_size = d_onode->get_mutable_layout(*ctx.transaction).size; + d_object_size = object_size; + return objHandler.clone( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + *onode, + d_onode.get()}); + }); +} + SeaStore::Shard::tm_ret SeaStore::Shard::_zero( internal_context_t &ctx, diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index 
df4323df557..876fadca8c7 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -353,6 +353,10 @@ public: uint64_t offset, size_t len, ceph::bufferlist &&bl, uint32_t fadvise_flags); + tm_ret _clone( + internal_context_t &ctx, + OnodeRef &onode, + OnodeRef &d_onode); tm_ret _zero( internal_context_t &ctx, OnodeRef &onode, diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 90e25455a27..c26b56d0677 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -1042,9 +1042,9 @@ struct pladdr_t { pladdr_t() = default; pladdr_t(const pladdr_t &) = default; - explicit pladdr_t(laddr_t laddr) + pladdr_t(laddr_t laddr) : pladdr(laddr) {} - explicit pladdr_t(paddr_t paddr) + pladdr_t(paddr_t paddr) : pladdr(paddr) {} bool is_laddr() const { diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 8f71323cde8..10cc6f0e7ce 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -282,7 +282,7 @@ public: laddr_hint, len, ext->get_paddr(), - ext.get() + *ext ).si_then([ext=std::move(ext), laddr_hint, &t](auto &&) mutable { LOG_PREFIX(TransactionManager::alloc_extent); SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint); @@ -419,12 +419,50 @@ public: LOG_PREFIX(TransactionManager::reserve_region); SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}", t, len, hint); ceph_assert(is_aligned(hint, epm->get_block_size())); - return lba_manager->alloc_extent( + return lba_manager->reserve_region( t, hint, - len, - P_ADDR_ZERO, - nullptr); + len); + } + + /* + * clone_pin + * + * create an indirect lba mapping pointing to the physical + * lba mapping whose key is clone_offset. 
Refer to btree_lba_manager.h + * for the definition of "indirect lba mapping" and "physical lba mapping" + * + */ + using clone_extent_iertr = alloc_extent_iertr; + using clone_extent_ret = clone_extent_iertr::future; + clone_extent_ret clone_pin( + Transaction &t, + laddr_t hint, + const LBAMapping &mapping) { + auto clone_offset = + mapping.is_indirect() + ? mapping.get_intermediate_key() + : mapping.get_key(); + + LOG_PREFIX(TransactionManager::clone_pin); + SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, clone_offset {}", + t, mapping.get_length(), hint, clone_offset); + ceph_assert(is_aligned(hint, epm->get_block_size())); + return lba_manager->clone_extent( + t, + hint, + mapping.get_length(), + clone_offset, + mapping.get_val() + ).si_then([this, &t, clone_offset](auto pin) { + return inc_ref(t, clone_offset + ).si_then([pin=std::move(pin)](auto) mutable { + return std::move(pin); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further(), + crimson::ct_error::assert_all("not possible") + ); + }); } /* alloc_extents @@ -780,7 +818,7 @@ private: remap_laddr, remap_length, remap_paddr, - ext.get() + *ext ).si_then([remap_laddr, remap_length, remap_paddr](auto &&ref) { assert(ref->get_key() == remap_laddr); assert(ref->get_val() == remap_paddr); diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index f55d0d6abd4..95b165fab29 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -432,7 +432,7 @@ struct btree_lba_manager_test : btree_test_base { 0, get_paddr()); return lba_manager->alloc_extent( - t, hint, len, extent->get_paddr(), extent.get()); + t, hint, len, extent->get_paddr(), *extent); }).unsafe_get0(); logger().debug("alloc'd: {}", *ret); EXPECT_EQ(len, ret->get_length()); -- 2.39.5