From: Xinyu Huang Date: Fri, 7 Apr 2023 00:54:36 +0000 (+0000) Subject: crimson/os/seastore: realize lazy read in split overwrite with overwrite refactor X-Git-Tag: v18.2.1~165^2~5 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=233f6c5cc6d0655da720823b28f339cca60a615b;p=ceph.git crimson/os/seastore: realize lazy read in split overwrite with overwrite refactor Signed-off-by: Xinyu Huang (cherry picked from commit 5d00ec0d00ddefe5b82ce82d2acfa134d964a883) --- diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc index 76e179e2414e4..c10ef8c32b774 100644 --- a/src/crimson/os/seastore/object_data_handler.cc +++ b/src/crimson/os/seastore/object_data_handler.cc @@ -25,12 +25,10 @@ using get_iertr = ObjectDataHandler::write_iertr; /** * extent_to_write_t * - * Encapsulates extents to be written out using do_insertions. + * Encapsulates smallest write operations in overwrite. * Indicates a zero/existing extent or a data extent based on whether * to_write is populate. - * The meaning of existing_paddr is that the new extent to be - * written is the part of exising extent on the disk. existing_paddr - * must be absolute. + * Should be handled by prepare_ops_list. */ struct extent_to_write_t { enum class type_t { @@ -38,16 +36,18 @@ struct extent_to_write_t { ZERO, EXISTING, }; - type_t type; + + /// pin of original extent, not nullptr if type == EXISTING + LBAMappingRef pin; + laddr_t addr; extent_len_t len; + /// non-nullopt if and only if type == DATA std::optional to_write; - /// non-nullopt if and only if type == EXISTING - std::optional existing_paddr; - extent_to_write_t(const extent_to_write_t &) = default; + extent_to_write_t(const extent_to_write_t &) = delete; extent_to_write_t(extent_to_write_t &&) = default; bool is_data() const { @@ -72,13 +72,14 @@ struct extent_to_write_t { } static extent_to_write_t create_zero( - laddr_t addr, extent_len_t len) { + laddr_t addr, extent_len_t len) { return extent_to_write_t(addr, len); } static extent_to_write_t create_existing( - laddr_t addr, paddr_t existing_paddr, extent_len_t len) { - return extent_to_write_t(addr, existing_paddr, len); + LBAMappingRef &&pin, laddr_t addr, extent_len_t len) { + assert(pin); + return extent_to_write_t(std::move(pin), addr, len); } private: @@ -89,12 +90,208 @@ private: extent_to_write_t(laddr_t addr, extent_len_t len) : type(type_t::ZERO), addr(addr), len(len) {} - extent_to_write_t(laddr_t addr, paddr_t existing_paddr, extent_len_t len) - : type(type_t::EXISTING), addr(addr), len(len), - to_write(std::nullopt), existing_paddr(existing_paddr) {} + extent_to_write_t(LBAMappingRef &&pin, laddr_t addr, extent_len_t len) + : type(type_t::EXISTING), pin(std::move(pin)), addr(addr), len(len) {} }; using extent_to_write_list_t = std::list; +// Encapsulates extents to be written out using do_remappings. +struct extent_to_remap_t { + enum class type_t { + REMAP, + OVERWRITE + }; + type_t type; + /// pin of original extent + LBAMappingRef pin; + /// offset of remapped extent or overwrite part of overwrite extent. + /// overwrite part of overwrite extent might correspond to mutiple + /// fresh write extent. + extent_len_t new_offset; + /// length of remapped extent or overwrite part of overwrite extent + extent_len_t new_len; + + extent_to_remap_t(const extent_to_remap_t &) = delete; + extent_to_remap_t(extent_to_remap_t &&) = default; + + bool is_remap() const { + return type == type_t::REMAP; + } + + bool is_overwrite() const { + assert((new_offset != 0) && (pin->get_length() != new_offset + new_len)); + return type == type_t::OVERWRITE; + } + + using remap_entry = TransactionManager::remap_entry; + remap_entry create_remap_entry() { + assert(is_remap()); + return remap_entry( + new_offset, + new_len); + } + + remap_entry create_left_remap_entry() { + assert(is_overwrite()); + return remap_entry( + 0, + new_offset); + } + + remap_entry create_right_remap_entry() { + assert(is_overwrite()); + return remap_entry( + new_offset + new_len, + pin->get_length() - new_offset - new_len); + } + + static extent_to_remap_t create_remap( + LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len) { + return extent_to_remap_t(type_t::REMAP, + std::move(pin), new_offset, new_len); + } + + static extent_to_remap_t create_overwrite( + LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len) { + return extent_to_remap_t(type_t::OVERWRITE, + std::move(pin), new_offset, new_len); + } + +private: + extent_to_remap_t(type_t type, + LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len) + : type(type), + pin(std::move(pin)), new_offset(new_offset), new_len(new_len) {} +}; +using extent_to_remap_list_t = std::list; + +// Encapsulates extents to be written out using do_insertions. +struct extent_to_insert_t { + enum class type_t { + DATA, + ZERO + }; + type_t type; + /// laddr of new extent + laddr_t addr; + /// length of new extent + extent_len_t len; + /// non-nullopt if type == DATA + std::optional bl; + + extent_to_insert_t(const extent_to_insert_t &) = default; + extent_to_insert_t(extent_to_insert_t &&) = default; + + bool is_data() const { + return type == type_t::DATA; + } + + bool is_zero() const { + return type == type_t::ZERO; + } + + static extent_to_insert_t create_data( + laddr_t addr, extent_len_t len, std::optional bl) { + return extent_to_insert_t(addr, len, bl); + } + + static extent_to_insert_t create_zero( + laddr_t addr, extent_len_t len) { + return extent_to_insert_t(addr, len); + } + +private: + extent_to_insert_t(laddr_t addr, extent_len_t len, + std::optional bl) + :type(type_t::DATA), addr(addr), len(len), bl(bl) {} + + extent_to_insert_t(laddr_t addr, extent_len_t len) + :type(type_t::ZERO), addr(addr), len(len) {} +}; +using extent_to_insert_list_t = std::list; + +// Encapsulates extents to be retired in do_removals. +using extent_to_remove_list_t = std::list; + +struct overwrite_ops_t { + extent_to_remap_list_t to_remap; + extent_to_insert_list_t to_insert; + extent_to_remove_list_t to_remove; +}; + +// prepare to_remap, to_retire, to_insert list +overwrite_ops_t prepare_ops_list( + lba_pin_list_t &pins_to_remove, + extent_to_write_list_t &to_write) { + assert(to_write.size() != 0 && pins_to_remove.size() != 0); + overwrite_ops_t ops; + ops.to_remove.swap(pins_to_remove); + auto& front = to_write.front(); + auto& back = to_write.back(); + long unsigned int visitted = 0; + + // prepare overwrite, happens in one original extent. + if (ops.to_remove.size() == 1 && + front.is_existing() && back.is_existing()) { + visitted += 2; + assert(to_write.size() > 2); + assert(front.addr == front.pin->get_key()); + assert(back.addr > back.pin->get_key()); + ops.to_remap.push_back(extent_to_remap_t::create_overwrite( + std::move(front.pin), + front.len, + back.addr - front.addr - front.len)); + ops.to_remove.pop_front(); + } else { + // prepare to_remap, happens in one or multiple extents + if (front.is_existing()) { + visitted++; + assert(to_write.size() > 1); + assert(front.addr == front.pin->get_key()); + ops.to_remap.push_back(extent_to_remap_t::create_remap( + std::move(front.pin), + 0, + front.len)); + ops.to_remove.pop_front(); + } + if (back.is_existing()) { + visitted++; + assert(to_write.size() > 1); + assert(back.addr + back.len == + back.pin->get_key() + back.pin->get_length()); + assert((*(to_write.begin())).addr == back.pin->get_key()); + ops.to_remap.push_back(extent_to_remap_t::create_remap( + std::move(back.pin), + back.addr - back.pin->get_key(), + back.len)); + ops.to_remove.pop_back(); + } + } + + // prepare to_insert + for (auto ®ion : to_write) { + if (region.is_data()) { + visitted++; + assert(region.to_write.has_value()); + ops.to_insert.push_back(extent_to_insert_t::create_data( + region.addr, region.len, region.to_write)); + } else if (region.is_zero()) { + visitted++; + assert(!(region.to_write.has_value())); + ops.to_insert.push_back(extent_to_insert_t::create_zero( + region.addr, region.len)); + } + } + + logger().debug( + "to_remap list size: {}" + " to_insert list size: {}" + " to_remove list size: {}", + ops.to_remap.size(), ops.to_insert.size(), ops.to_remove.size()); + assert(visitted == to_write.size()); + return ops; +} + /** * append_extent_to_write * @@ -134,13 +331,54 @@ void splice_extent_to_write( } } -/// Removes extents/mappings in pins +/// Creates remap extents in to_remap +ObjectDataHandler::write_ret do_remappings( + context_t ctx, + extent_to_remap_list_t &to_remap) +{ + return trans_intr::do_for_each( + to_remap, + [ctx](auto ®ion) { + if (region.is_remap()) { + return ctx.tm.remap_pin( + ctx.t, + std::move(region.pin), + std::array{ + region.create_remap_entry() + } + ).si_then([®ion](auto pins) { + ceph_assert(pins.size() == 1); + ceph_assert(region.new_len == pins[0]->get_length()); + return ObjectDataHandler::write_iertr::now(); + }); + } else if (region.is_overwrite()) { + return ctx.tm.remap_pin( + ctx.t, + std::move(region.pin), + std::array{ + region.create_left_remap_entry(), + region.create_right_remap_entry() + } + ).si_then([®ion](auto pins) { + ceph_assert(pins.size() == 2); + ceph_assert(region.pin->get_key() == pins[0]->get_key()); + ceph_assert(region.pin->get_key() + pins[0]->get_length() + + region.new_len == pins[1]->get_key()); + return ObjectDataHandler::write_iertr::now(); + }); + } else { + ceph_abort("impossible"); + return ObjectDataHandler::write_iertr::now(); + } + }); +} + ObjectDataHandler::write_ret do_removals( context_t ctx, - lba_pin_list_t &pins) + lba_pin_list_t &to_remove) { return trans_intr::do_for_each( - pins, + to_remove, [ctx](auto &pin) { LOG_PREFIX(object_data_handler.cc::do_removals); DEBUGT("decreasing ref: {}", @@ -159,19 +397,19 @@ ObjectDataHandler::write_ret do_removals( }); } -/// Creates zero/data extents in to_write +/// Creates zero/data extents in to_insert ObjectDataHandler::write_ret do_insertions( context_t ctx, - extent_to_write_list_t &to_write) + extent_to_insert_list_t &to_insert) { return trans_intr::do_for_each( - to_write, + to_insert, [ctx](auto ®ion) { LOG_PREFIX(object_data_handler.cc::do_insertions); if (region.is_data()) { assert_aligned(region.addr); assert_aligned(region.len); - ceph_assert(region.len == region.to_write->length()); + ceph_assert(region.len == region.bl->length()); DEBUGT("allocating extent: {}~{}", ctx.t, region.addr, @@ -190,7 +428,7 @@ ObjectDataHandler::write_ret do_insertions( } ceph_assert(extent->get_laddr() == region.addr); ceph_assert(extent->get_length() == region.len); - auto iter = region.to_write->cbegin(); + auto iter = region.bl->cbegin(); iter.copy(region.len, extent->get_bptr().c_str()); return ObjectDataHandler::write_iertr::now(); }); @@ -216,25 +454,8 @@ ObjectDataHandler::write_ret do_insertions( return ObjectDataHandler::write_iertr::now(); }); } else { - ceph_assert(region.is_existing()); - DEBUGT("map existing extent: laddr {} len {} {}", - ctx.t, region.addr, region.len, *region.existing_paddr); - return ctx.tm.map_existing_extent( - ctx.t, region.addr, *region.existing_paddr, region.len - ).handle_error_interruptible( - TransactionManager::alloc_extent_iertr::pass_further{}, - Device::read_ertr::assert_all{"ignore read error"} - ).si_then([FNAME, ctx, ®ion](auto extent) { - if (extent->get_laddr() != region.addr) { - ERRORT( - "inconsistent laddr: extent: {} region {}", - ctx.t, - extent->get_laddr(), - region.addr); - } - ceph_assert(extent->get_laddr() == region.addr); - return ObjectDataHandler::write_iertr::now(); - }); + ceph_abort("impossible"); + return ObjectDataHandler::write_iertr::now(); } }); } @@ -524,14 +745,14 @@ operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan assert(extent_len); std::optional left_to_write_extent = std::make_optional(extent_to_write_t::create_existing( - overwrite_plan.pin_begin, - overwrite_plan.left_paddr, + pin->duplicate(), + pin->get_key(), extent_len)); auto prepend_len = overwrite_plan.get_left_alignment_size(); if (prepend_len == 0) { return get_iertr::make_ready_future( - left_to_write_extent, + std::move(left_to_write_extent), std::nullopt); } else { return ctx.tm.read_pin( @@ -540,7 +761,7 @@ operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan left_to_write_extent=std::move(left_to_write_extent)] (auto left_extent) mutable { return get_iertr::make_ready_future( - left_to_write_extent, + std::move(left_to_write_extent), std::make_optional(bufferptr( left_extent->get_bptr(), prepend_offset, @@ -606,14 +827,14 @@ operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_pla assert(extent_len); std::optional right_to_write_extent = std::make_optional(extent_to_write_t::create_existing( + pin->duplicate(), overwrite_plan.aligned_data_end, - overwrite_plan.right_paddr.add_offset(overwrite_plan.aligned_data_end - right_pin_begin), extent_len)); auto append_len = overwrite_plan.get_right_alignment_size(); if (append_len == 0) { return get_iertr::make_ready_future( - right_to_write_extent, + std::move(right_to_write_extent), std::nullopt); } else { auto append_offset = overwrite_plan.data_end - right_pin_begin; @@ -623,7 +844,7 @@ operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_pla right_to_write_extent=std::move(right_to_write_extent)] (auto right_extent) mutable { return get_iertr::make_ready_future( - right_to_write_extent, + std::move(right_to_write_extent), std::make_optional(bufferptr( right_extent->get_bptr(), append_offset, @@ -721,41 +942,68 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation( object_data.get_reserved_data_len() - pin_offset)); return clear_iertr::now(); } else { - /* First pin overlaps the boundary and has data, read in extent - * and rewrite portion prior to size */ - return ctx.tm.read_pin( - ctx.t, - pin.duplicate() - ).si_then([ctx, size, pin_offset, &pin, &object_data, &to_write]( - auto extent) { - bufferlist bl; - bl.append( - bufferptr( - extent->get_bptr(), - 0, - size - pin_offset - )); - bl.append_zero(p2roundup(size, ctx.tm.get_block_size()) - size); - to_write.push_back(extent_to_write_t::create_data( - pin.get_key(), - bl)); + /* First pin overlaps the boundary and has data, remap it + * if aligned or rewrite it if not aligned to size */ + auto roundup_size = p2roundup(size, ctx.tm.get_block_size()); + auto append_len = roundup_size - size; + if (append_len == 0) { + LOG_PREFIX(ObjectDataHandler::trim_data_reservation); + TRACET("First pin overlaps the boundary and has aligned data" + "create existing at addr:{}, len:{}", + ctx.t, pin.get_key(), size - pin_offset); + to_write.push_back(extent_to_write_t::create_existing( + pin.duplicate(), + pin.get_key(), + size - pin_offset)); to_write.push_back(extent_to_write_t::create_zero( - object_data.get_reserved_data_base() + - p2roundup(size, ctx.tm.get_block_size()), - object_data.get_reserved_data_len() - - p2roundup(size, ctx.tm.get_block_size()))); - return clear_iertr::now(); - }); - } - }).si_then([ctx, &pins] { - return do_removals(ctx, pins); - }).si_then([ctx, &to_write] { - return do_insertions(ctx, to_write); - }).si_then([size, &object_data] { - if (size == 0) { - object_data.clear(); + object_data.get_reserved_data_base() + roundup_size, + object_data.get_reserved_data_len() - roundup_size)); + return clear_iertr::now(); + } else { + return ctx.tm.read_pin( + ctx.t, + pin.duplicate() + ).si_then([ctx, size, pin_offset, append_len, roundup_size, + &pin, &object_data, &to_write](auto extent) { + bufferlist bl; + bl.append( + bufferptr( + extent->get_bptr(), + 0, + size - pin_offset + )); + bl.append_zero(append_len); + LOG_PREFIX(ObjectDataHandler::trim_data_reservation); + TRACET("First pin overlaps the boundary and has unaligned data" + "create data at addr:{}, len:{}", + ctx.t, pin.get_key(), bl.length()); + to_write.push_back(extent_to_write_t::create_data( + pin.get_key(), + bl)); + to_write.push_back(extent_to_write_t::create_zero( + object_data.get_reserved_data_base() + roundup_size, + object_data.get_reserved_data_len() - roundup_size)); + return clear_iertr::now(); + }); + } } - return ObjectDataHandler::clear_iertr::now(); + }).si_then([ctx, size, &to_write, &object_data, &pins] { + assert(to_write.size()); + return seastar::do_with( + prepare_ops_list(pins, to_write), + [ctx, size, &object_data](auto &ops) { + return do_remappings(ctx, ops.to_remap + ).si_then([ctx, &ops] { + return do_removals(ctx, ops.to_remove); + }).si_then([ctx, &ops] { + return do_insertions(ctx, ops.to_insert); + }).si_then([size, &object_data] { + if (size == 0) { + object_data.clear(); + } + return ObjectDataHandler::clear_iertr::now(); + }); + }); }); }); } @@ -806,7 +1054,9 @@ extent_to_write_list_t get_to_writes_with_zero_buffer( } assert(bl.length() % block_size == 0); assert(bl.length() == (right - left)); - return {extent_to_write_t::create_data(left, bl)}; + extent_to_write_list_t ret; + ret.push_back(extent_to_write_t::create_data(left, bl)); + return ret; } else { // reserved section between ends, headptr and tailptr in different extents extent_to_write_list_t ret; @@ -931,9 +1181,16 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite( assert(pin_begin == to_write.front().addr); assert(pin_end == to_write.back().get_end_addr()); - return do_removals(ctx, pins); - }).si_then([ctx, &to_write] { - return do_insertions(ctx, to_write); + return seastar::do_with( + prepare_ops_list(pins, to_write), + [ctx](auto &ops) { + return do_remappings(ctx, ops.to_remap + ).si_then([ctx, &ops] { + return do_removals(ctx, ops.to_remove); + }).si_then([ctx, &ops] { + return do_insertions(ctx, ops.to_insert); + }); + }); }); }); }); diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 9dcc577e7773b..8f71323cde8db 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -410,68 +410,6 @@ public: }); } - /** - * map_existing_extent - * - * Allocates a new extent at given existing_paddr that must be absolute and - * reads disk to fill the extent. - * The common usage is that remove the LogicalCachedExtent (laddr~length at paddr) - * and map extent to multiple new extents. - * placement_hint and generation should follow the original extent. - */ - using map_existing_extent_iertr = - alloc_extent_iertr::extend_ertr; - template - using map_existing_extent_ret = - map_existing_extent_iertr::future>; - template - map_existing_extent_ret map_existing_extent( - Transaction &t, - laddr_t laddr_hint, - paddr_t existing_paddr, - extent_len_t length) { - LOG_PREFIX(TransactionManager::map_existing_extent); - // FIXME: existing_paddr can be absolute and pending - ceph_assert(existing_paddr.is_absolute()); - assert(t.is_retired(existing_paddr, length)); - - SUBDEBUGT(seastore_tm, " laddr_hint: {} existing_paddr: {} length: {}", - t, laddr_hint, existing_paddr, length); - auto bp = ceph::bufferptr(buffer::create_page_aligned(length)); - bp.zero(); - - // ExtentPlacementManager::alloc_new_extent will make a new - // (relative/temp) paddr, so make extent directly - auto ext = CachedExtent::make_cached_extent_ref(std::move(bp)); - - ext->init(CachedExtent::extent_state_t::EXIST_CLEAN, - existing_paddr, - PLACEMENT_HINT_NULL, - NULL_GENERATION, - t.get_trans_id()); - - t.add_fresh_extent(ext); - - return lba_manager->alloc_extent( - t, - laddr_hint, - length, - existing_paddr, - ext.get() - ).si_then([ext=std::move(ext), laddr_hint, this](auto &&ref) { - ceph_assert(laddr_hint == ref->get_key()); - return epm->read( - ext->get_paddr(), - ext->get_length(), - ext->get_bptr() - ).safe_then([ext=std::move(ext)] { - return map_existing_extent_iertr::make_ready_future> - (std::move(ext)); - }); - }); - } - - using reserve_extent_iertr = alloc_extent_iertr; using reserve_extent_ret = reserve_extent_iertr::future; reserve_extent_ret reserve_region(