fixed_kv_node_meta_t<key_t> range;
uint16_t pos = std::numeric_limits<uint16_t>::max();
- pladdr_t _get_val() const final {
- return value;
- }
-
public:
using val_type = val_t;
BtreeNodeMapping(op_context_t<key_t> ctx) : ctx(ctx) {}
}
val_t get_val() const final { // Unwraps the stored `value` into this mapping's val_t.
- return value;
+ if constexpr (std::is_same_v<val_t, paddr_t>) { // branch is selected at compile time per instantiation
+ return value.get_paddr();
+ } else {
+ static_assert(std::is_same_v<val_t, laddr_t>); // any val_t other than paddr_t/laddr_t fails to compile
+ return value.get_laddr();
+ }
}
key_t get_key() const final {
laddr_t hint,
extent_len_t len,
paddr_t addr,
- LogicalCachedExtent *nextent) = 0;
+ LogicalCachedExtent &nextent) = 0;
+
+ virtual alloc_extent_ret clone_extent( // Allocates a len-byte mapping (placed via hint) that refers to another mapping's key.
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ laddr_t intermediate_key, // laddr the new mapping points at; BtreeLBAManager stores it as the mapping's pladdr_t value
+ paddr_t actual_addr) = 0; // physical address behind intermediate_key; BtreeLBAManager sets it on the returned pin
+
+ virtual alloc_extent_ret reserve_region( // Allocates a len-byte mapping (placed via hint) without attaching an extent.
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len) = 0; // BtreeLBAManager implements this by mapping the range to P_ADDR_ZERO
struct ref_update_result_t {
unsigned refcount = 0;
}
BtreeLBAManager::alloc_extent_ret
-BtreeLBAManager::alloc_extent(
+BtreeLBAManager::_alloc_extent(
Transaction &t,
laddr_t hint,
extent_len_t len,
- paddr_t addr,
+ pladdr_t addr,
+ paddr_t actual_addr,
LogicalCachedExtent* nextent)
{
struct state_t {
state_t(laddr_t hint) : last_end(hint) {}
};
- LOG_PREFIX(BtreeLBAManager::alloc_extent);
+ LOG_PREFIX(BtreeLBAManager::_alloc_extent);
TRACET("{}~{}, hint={}", t, addr, len, hint);
auto c = get_context(t);
++stats.num_alloc_extents;
c,
*state.insert_iter,
state.last_end,
- lba_map_val_t{len, pladdr_t(addr), 1, 0}
+ lba_map_val_t{len, pladdr_t(addr), 1, 0},
nextent
).si_then([&state, FNAME, c, addr, len, hint, nextent](auto &&p) {
auto [iter, inserted] = std::move(p);
TRACET("{}~{}, hint={}, inserted at {}",
c.trans, addr, len, hint, state.last_end);
if (nextent) {
+ ceph_assert(addr.is_paddr());
nextent->set_laddr(iter.get_key());
}
ceph_assert(inserted);
state.ret = iter;
});
});
- }).si_then([c](auto &&state) {
- return state.ret->get_pin(c);
+ }).si_then([c, actual_addr, addr](auto &&state) {
+ auto ret_pin = state.ret->get_pin(c);
+ if (actual_addr != P_ADDR_NULL) {
+ ceph_assert(addr.is_laddr());
+ ret_pin->set_paddr(actual_addr);
+ } else {
+ ceph_assert(addr.is_paddr());
+ }
+ return alloc_extent_iertr::make_ready_future<LBAMappingRef>(
+ std::move(ret_pin));
});
}
Transaction &t,
laddr_t offset) final;
+  /**
+   * reserve_region
+   *
+   * Implements the manager interface's pure-virtual reserve_region:
+   * allocates a len-byte mapping (placed via hint) whose value is
+   * P_ADDR_ZERO, with no extent attached.  Marked final like the other
+   * overrides in this class so a signature drift against the interface
+   * is caught at compile time.
+   */
+  alloc_extent_ret reserve_region(
+    Transaction &t,
+    laddr_t hint,
+    extent_len_t len) final
+  {
+    // P_ADDR_NULL as actual_addr tells _alloc_extent this is a direct
+    // (paddr-valued) mapping rather than a clone.
+    return _alloc_extent(t, hint, len, P_ADDR_ZERO, P_ADDR_NULL, nullptr);
+  }
+
+  /**
+   * clone_extent
+   *
+   * Implements the manager interface's pure-virtual clone_extent:
+   * allocates a len-byte mapping (placed via hint) whose stored value is
+   * the laddr intermediate_key, and sets actual_addr as the paddr of the
+   * returned pin.  No extent is attached.  Marked final like the other
+   * overrides in this class so a signature drift against the interface
+   * is caught at compile time.
+   */
+  alloc_extent_ret clone_extent(
+    Transaction &t,
+    laddr_t hint,
+    extent_len_t len,
+    laddr_t intermediate_key,
+    paddr_t actual_addr) final
+  {
+    // _alloc_extent asserts that the stored value is an laddr whenever
+    // actual_addr is not P_ADDR_NULL.
+    return _alloc_extent(t, hint, len, intermediate_key, actual_addr, nullptr);
+  }
+
+ /// Allocates a len-byte mapping (placed via hint) whose value is the
+ /// physical address addr, and records the allocated laddr on ext
+ /// (_alloc_extent calls ext.set_laddr() with the inserted key).
alloc_extent_ret alloc_extent(
Transaction &t,
laddr_t hint,
extent_len_t len,
paddr_t addr,
- LogicalCachedExtent*) final;
+ LogicalCachedExtent &ext) final
+ {
+ // ext is a reference, so there is nothing to null-check here; the
+ // former assert(ext) applied assert() to a non-scalar expression.
+ return _alloc_extent(t, hint, len, addr, P_ADDR_NULL, &ext);
+ }
ref_ret decref_extent(
Transaction &t,
laddr_t addr,
update_func_t &&f,
LogicalCachedExtent*);
+
+ alloc_extent_ret _alloc_extent( // Shared implementation behind alloc_extent, clone_extent and reserve_region.
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ pladdr_t addr, // value stored in the new mapping: a paddr for direct mappings, an laddr for clones
+ paddr_t actual_addr, // P_ADDR_NULL for direct mappings; otherwise addr must be an laddr and this is set on the returned pin
+ LogicalCachedExtent*); // may be nullptr; when given, addr must be a paddr and the extent receives the allocated laddr
};
using BtreeLBAManagerRef = std::unique_ptr<BtreeLBAManager>;
});
}
+/**
+ * with_objects_data
+ *
+ * Two-onode helper used by clone.  The values returned by
+ * object_data.get() for ctx.onode and ctx.d_onode are parked in
+ * seastar::do_with storage, f is invoked with references to both, and
+ * once f's future resolves each value that reports must_update() is
+ * written back into the mutable layout of its onode.
+ *
+ * ctx.d_onode must be non-null.
+ */
+template <typename F>
+auto with_objects_data(
+  ObjectDataHandler::context_t ctx,
+  F &&f)
+{
+  ceph_assert(ctx.d_onode);
+  return seastar::do_with(
+    ctx.onode.get_layout().object_data.get(),
+    ctx.d_onode->get_layout().object_data.get(),
+    std::forward<F>(f),
+    [ctx](auto &src_data, auto &dst_data, auto &func) {
+      return std::invoke(func, src_data, dst_data
+      ).si_then([ctx, &src_data, &dst_data] {
+        // source onode first, then the destination onode
+        if (src_data.must_update()) {
+          ctx.onode.get_mutable_layout(ctx.t).object_data.update(src_data);
+        }
+        if (dst_data.must_update()) {
+          ctx.d_onode->get_mutable_layout(ctx.t).object_data.update(dst_data);
+        }
+        return seastar::now();
+      });
+    });
+}
+
ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation(
context_t ctx,
object_data_t &object_data,
});
}
+ObjectDataHandler::clone_ret ObjectDataHandler::clone_extents( // Rebuilds object_data's reserved range out of clones of `pins`.
+ context_t ctx,
+ object_data_t &object_data, // object whose reserved range [base, base + len) is re-populated
+ lba_pin_list_t &pins, // mappings to clone; each key is taken relative to data_base
+ laddr_t data_base) // laddr subtracted from each pin's key to get its offset in the range
+{
+ LOG_PREFIX(ObjectDataHandler::clone_extents);
+ TRACET(" object_data: {}~{}, data_base: {}",
+ ctx.t,
+ object_data.get_reserved_data_base(),
+ object_data.get_reserved_data_len(),
+ data_base);
+ return ctx.tm.dec_ref( // first drop a reference on the mapping at the reserved base; the range is re-mapped piecewise below
+ ctx.t,
+ object_data.get_reserved_data_base()
+ ).si_then(
+ [&pins, &object_data, ctx, data_base](auto) mutable {
+ return seastar::do_with(
+ (extent_len_t)0, // last_pos: end offset of the part of the range handled so far
+ [&object_data, ctx, data_base, &pins](auto &last_pos) {
+ return trans_intr::do_for_each(
+ pins,
+ [&last_pos, &object_data, ctx, data_base](auto &pin) {
+ auto offset = pin->get_key() - data_base;
+ ceph_assert(offset == last_pos); // pins must be contiguous: each starts where the previous one ended
+ auto fut = TransactionManager::alloc_extent_iertr
+ ::make_ready_future<LBAMappingRef>();
+ auto addr = object_data.get_reserved_data_base() + offset; // same offset, relocated into object_data's range
+ if (pin->get_val().is_zero()) { // zero-valued pin: reserve the range instead of cloning it
+ fut = ctx.tm.reserve_region(ctx.t, addr, pin->get_length());
+ } else {
+ fut = ctx.tm.clone_pin(ctx.t, addr, *pin);
+ }
+ return fut.si_then(
+ [&pin, &last_pos, offset](auto) {
+ last_pos = offset + pin->get_length();
+ return seastar::now();
+ }).handle_error_interruptible( // only input_output_error is passed on; anything else hits assert_all
+ crimson::ct_error::input_output_error::pass_further(),
+ crimson::ct_error::assert_all("not possible")
+ );
+ }).si_then([&last_pos, &object_data, ctx] {
+ if (last_pos != object_data.get_reserved_data_len()) { // pins ended short of the reservation: reserve the remaining tail
+ return ctx.tm.reserve_region(
+ ctx.t,
+ object_data.get_reserved_data_base() + last_pos,
+ object_data.get_reserved_data_len() - last_pos
+ ).si_then([](auto) {
+ return seastar::now();
+ });
+ }
+ return TransactionManager::reserve_extent_iertr::now();
+ });
+ });
+ },
+ ObjectDataHandler::write_iertr::pass_further{}, // error handlers for the dec_ref() future
+ crimson::ct_error::assert_all{
+ "object_data_handler::clone invalid error"
+ }
+ );
+}
+
+/**
+ * clone
+ *
+ * Clones the object data of ctx.onode (the head) into ctx.d_onode (the
+ * clone).  ctx.d_onode must be set and its object data must be null.
+ * If the head has no object data either, nothing is done.
+ */
+ObjectDataHandler::clone_ret ObjectDataHandler::clone(
+  context_t ctx)
+{
+  // the whole clone procedure can be separated into the following steps:
+  //   1. reserve a region for the clone onode (d_object_data) as large as
+  //      the head onode's reservation;
+  //   2. remember the head's old base/len, clear the head's object data
+  //      and reserve a fresh region of the same length for the head;
+  //   3. fetch the pins covering the head's old region;
+  //   4. clone those pins into the head's new region and then into the
+  //      clone's region, see transaction_manager.h for the details of
+  //      clone_pin; clone_extents also reserves whatever tail of the
+  //      reservation the pins do not cover;
+  //   5. hand the old pins to do_removals.
+  return with_objects_data(
+    ctx,
+    [ctx, this](auto &object_data, auto &d_object_data) {
+      ceph_assert(d_object_data.is_null());
+      if (object_data.is_null()) {
+        return clone_iertr::now();
+      }
+      return prepare_data_reservation(
+        ctx,
+        d_object_data,
+        object_data.get_reserved_data_len()
+      ).si_then([&object_data, &d_object_data, ctx, this] {
+        assert(!object_data.is_null());
+        // capture the old region before clear() forgets it
+        auto base = object_data.get_reserved_data_base();
+        auto len = object_data.get_reserved_data_len();
+        object_data.clear();
+        LOG_PREFIX(ObjectDataHandler::clone);
+        DEBUGT("cloned obj reserve_data_base: {}, len {}",
+          ctx.t,
+          d_object_data.get_reserved_data_base(),
+          d_object_data.get_reserved_data_len());
+        return prepare_data_reservation(
+          ctx,
+          object_data,
+          d_object_data.get_reserved_data_len()
+        ).si_then([&d_object_data, ctx, &object_data, base, len, this] {
+          // LOG_PREFIX takes the bare name, as everywhere else in this file
+          LOG_PREFIX(ObjectDataHandler::clone);
+          DEBUGT("head obj reserve_data_base: {}, len {}",
+            ctx.t,
+            object_data.get_reserved_data_base(),
+            object_data.get_reserved_data_len());
+          return ctx.tm.get_pins(ctx.t, base, len
+          ).si_then([ctx, &object_data, &d_object_data, base, this](auto pins) {
+            // keep the pin list alive across the whole continuation chain
+            return seastar::do_with(
+              std::move(pins),
+              [ctx, &object_data, &d_object_data, base, this](auto &pins) {
+                return clone_extents(ctx, object_data, pins, base
+                ).si_then([ctx, &d_object_data, base, &pins, this] {
+                  return clone_extents(ctx, d_object_data, pins, base);
+                }).si_then([&pins, ctx] {
+                  return do_removals(ctx, pins);
+                });
+              });
+          });
+        });
+      });
+    });
+}
+
} // namespace crimson::os::seastore
TransactionManager &tm;
Transaction &t;
Onode &onode;
+ Onode *d_onode = nullptr; // The destination node in case of clone
};
/// Writes bl to [offset, offset + bl.length())
using clear_ret = clear_iertr::future<>;
clear_ret clear(context_t ctx);
+ /// Clone the object data of ctx.onode into ctx.d_onode (which must be set)
+ using clone_iertr = base_iertr;
+ using clone_ret = clone_iertr::future<>;
+ clone_ret clone(context_t ctx);
+
private:
/// Updates region [_offset, _offset + bl.length) to bl
write_ret overwrite(
context_t ctx,
object_data_t &object_data,
extent_len_t size);
+
+ clone_ret clone_extents( // Re-populates object_data's reserved range with clones of `pins`.
+ context_t ctx,
+ object_data_t &object_data, // object whose reserved range receives the clones
+ lba_pin_list_t &pins, // mappings to clone (zero-valued ones are reserved instead)
+ laddr_t data_base); // laddr the pins' keys are measured from
+
private:
/**
* max_object_size
op_type_t::TRANSACTION,
[this](auto &ctx) {
return with_trans_intr(*ctx.transaction, [&, this](auto &t) {
- return seastar::do_with(std::vector<OnodeRef>(ctx.iter.objects.size()),
- std::vector<OnodeRef>(),
+ return seastar::do_with(
+ std::vector<OnodeRef>(ctx.iter.objects.size()),
+ std::vector<OnodeRef>(ctx.iter.objects.size()),
[this, &ctx](auto& onodes, auto& d_onodes) mutable {
return trans_intr::repeat(
[this, &ctx, &onodes, &d_onodes]() mutable
*ctx.transaction, i.get_oid(op->oid));
}
}
- return fut.si_then([&, op, this](auto&& get_onode) -> tm_ret {
- LOG_PREFIX(SeaStore::_do_transaction_step);
+ return fut.si_then([&, op](auto get_onode) {
OnodeRef &o = onodes[op->oid];
if (!o) {
assert(get_onode);
o = get_onode;
- d_onodes.push_back(get_onode);
+ d_onodes[op->oid] = get_onode;
+ }
+ if (op->op == Transaction::OP_CLONE && !d_onodes[op->dest_oid]) {
+ //TODO: use when_all_succeed after making onode tree
+ // support parallel extents loading
+ return onode_manager->get_or_create_onode(
+ *ctx.transaction, i.get_oid(op->dest_oid)
+ ).si_then([&, op](auto dest_onode) {
+ assert(dest_onode);
+ auto &d_o = onodes[op->dest_oid];
+ assert(!d_o);
+ assert(!d_onodes[op->dest_oid]);
+ d_o = dest_onode;
+ d_onodes[op->dest_oid] = dest_onode;
+ return seastar::now();
+ });
+ } else {
+ return OnodeManager::get_or_create_onode_iertr::now();
}
+ }).si_then([&, op, this]() -> tm_ret {
+ LOG_PREFIX(SeaStore::_do_transaction_step);
try {
switch (op->op) {
case Transaction::OP_REMOVE:
{
TRACET("removing {}", *ctx.transaction, i.get_oid(op->oid));
- return _remove(ctx, onodes[op->oid]);
+ return _remove(ctx, onodes[op->oid]
+ ).si_then([&onodes, &d_onodes, op] {
+ onodes[op->oid].reset();
+ d_onodes[op->oid].reset();
+ });
}
case Transaction::OP_CREATE:
case Transaction::OP_TOUCH:
// TODO
return tm_iertr::now();
}
+ case Transaction::OP_CLONE:
+ {
+ return _clone(ctx, onodes[op->oid], d_onodes[op->dest_oid]);
+ }
default:
ERROR("bad op {}", static_cast<unsigned>(op->op));
return crimson::ct_error::input_output_error::make();
});
}
+/**
+ * _clone
+ *
+ * Handles a clone op: copies the object size from onode's layout into
+ * d_onode's mutable layout, then lets an ObjectDataHandler clone the
+ * object data from onode to d_onode within ctx.transaction.
+ */
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_clone(
+  internal_context_t &ctx,
+  OnodeRef &onode,
+  OnodeRef &d_onode)
+{
+  LOG_PREFIX(SeaStore::_clone);
+  DEBUGT("onode={} d_onode={}", *ctx.transaction, *onode, *d_onode);
+  return seastar::do_with(
+    ObjectDataHandler(max_object_size),
+    [this, &ctx, &onode, &d_onode](auto &handler) {
+      //TODO: currently, we only care about object data, leaving cloning
+      //      of xattr/omap for future work
+      auto &src_size = onode->get_layout().size;
+      auto &dst_size = d_onode->get_mutable_layout(*ctx.transaction).size;
+      dst_size = src_size;
+      ObjectDataHandler::context_t clone_ctx{
+        *transaction_manager,
+        *ctx.transaction,
+        *onode,
+        d_onode.get()};
+      return handler.clone(clone_ctx);
+    });
+}
+
SeaStore::Shard::tm_ret
SeaStore::Shard::_zero(
internal_context_t &ctx,
uint64_t offset, size_t len,
ceph::bufferlist &&bl,
uint32_t fadvise_flags);
+ tm_ret _clone( // Clones object size and object data from onode into d_onode.
+ internal_context_t &ctx,
+ OnodeRef &onode, // source object
+ OnodeRef &d_onode); // destination object; its layout is mutated
tm_ret _zero(
internal_context_t &ctx,
OnodeRef &onode,
pladdr_t() = default;
pladdr_t(const pladdr_t &) = default;
- explicit pladdr_t(laddr_t laddr)
+ pladdr_t(laddr_t laddr) // non-explicit: an laddr_t can be passed directly where a pladdr_t is expected (e.g. _alloc_extent's addr)
: pladdr(laddr) {}
- explicit pladdr_t(paddr_t paddr)
+ pladdr_t(paddr_t paddr) // non-explicit: a paddr_t can be passed directly where a pladdr_t is expected (e.g. _alloc_extent's addr)
: pladdr(paddr) {}
bool is_laddr() const {
laddr_hint,
len,
ext->get_paddr(),
- ext.get()
+ *ext
).si_then([ext=std::move(ext), laddr_hint, &t](auto &&) mutable {
LOG_PREFIX(TransactionManager::alloc_extent);
SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint);
LOG_PREFIX(TransactionManager::reserve_region);
SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}", t, len, hint);
ceph_assert(is_aligned(hint, epm->get_block_size()));
- return lba_manager->alloc_extent(
+ return lba_manager->reserve_region(
t,
hint,
- len,
- P_ADDR_ZERO,
- nullptr);
+ len);
+ }
+
+ /*
+ * clone_pin
+ *
+ * create an indirect lba mapping pointing to the physical
+ * lba mapping whose key is clone_offset. Refer to btree_lba_manager.h
+ * for the definition of "indirect lba mapping" and "physical lba mapping"
+ *
+ */
+ using clone_extent_iertr = alloc_extent_iertr;
+ using clone_extent_ret = clone_extent_iertr::future<LBAMappingRef>;
+  clone_extent_ret clone_pin(
+    Transaction &t,
+    laddr_t hint,
+    const LBAMapping &mapping) {
+    // An indirect mapping is cloned through its intermediate key; any
+    // other mapping is cloned through its own key.
+    auto clone_offset = mapping.is_indirect()
+      ? mapping.get_intermediate_key()
+      : mapping.get_key();
+
+    LOG_PREFIX(TransactionManager::clone_pin);
+    SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, clone_offset {}",
+              t, mapping.get_length(), hint, clone_offset);
+    ceph_assert(is_aligned(hint, epm->get_block_size()));
+    return lba_manager->clone_extent(
+      t, hint, mapping.get_length(), clone_offset, mapping.get_val()
+    ).si_then([this, &t, clone_offset](auto new_pin) {
+      // the new mapping refers to clone_offset, so take a reference on
+      // it before handing the pin back
+      return inc_ref(t, clone_offset
+      ).si_then([new_pin=std::move(new_pin)](auto) mutable {
+        return std::move(new_pin);
+      }).handle_error_interruptible(
+        crimson::ct_error::input_output_error::pass_further(),
+        crimson::ct_error::assert_all("not possible")
+      );
+    });
+  }
/* alloc_extents
remap_laddr,
remap_length,
remap_paddr,
- ext.get()
+ *ext
).si_then([remap_laddr, remap_length, remap_paddr](auto &&ref) {
assert(ref->get_key() == remap_laddr);
assert(ref->get_val() == remap_paddr);
0,
get_paddr());
return lba_manager->alloc_extent(
- t, hint, len, extent->get_paddr(), extent.get());
+ t, hint, len, extent->get_paddr(), *extent);
}).unsafe_get0();
logger().debug("alloc'd: {}", *ret);
EXPECT_EQ(len, ret->get_length());