From: Yingxin Cheng Date: Mon, 4 Nov 2024 02:50:56 +0000 (+0800) Subject: crimson/os/seastore: implement partial reads from cached_extent to object_data_handler X-Git-Tag: v20.0.0~619^2~8 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2770a61059f0ab076f5ef1e94f36d73230dc90e7;p=ceph.git crimson/os/seastore: implement partial reads from cached_extent to object_data_handler Signed-off-by: Yingxin Cheng Signed-off-by: Jianxin Li --- diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 9e371892639..e2df8b34960 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -10,6 +10,7 @@ #include "include/buffer.h" #include "crimson/common/errorator.h" +#include "crimson/common/errorator-loop.h" #include "crimson/os/seastore/cached_extent.h" #include "crimson/os/seastore/extent_placement_manager.h" #include "crimson/os/seastore/logging.h" @@ -410,14 +411,10 @@ public: ret->cast()); }); } else { - assert(!ret->is_mutable()); SUBDEBUGT(seastore_cache, "{} {}~0x{:x} is present on t without fully loaded, reading ... -- {}", t, T::TYPE, offset, length, *ret); - auto bp = create_extent_ptr_rand(ret->get_length()); - ret->set_bptr(std::move(bp)); - return read_extent( - ret->cast()); + return do_read_extent_maybe_partial(ret->cast(), 0, length); } } else { SUBTRACET(seastore_cache, "{} {}~0x{:x} is absent on t, query cache ...", @@ -438,12 +435,15 @@ public: * get_absent_extent * * The extent in query is supposed to be absent in Cache. + * partially load buffer from partial_off~partial_len if not present. 
*/ template get_extent_iertr::future> get_absent_extent( Transaction &t, paddr_t offset, extent_len_t length, + extent_len_t partial_off, + extent_len_t partial_len, Func &&extent_init_func) { CachedExtentRef ret; LOG_PREFIX(Cache::get_absent_extent); @@ -474,7 +474,8 @@ public: }; return trans_intr::make_interruptible( do_get_caching_extent( - offset, length, std::forward(extent_init_func), std::move(f)) + offset, length, partial_off, partial_len, + std::forward(extent_init_func), std::move(f)) ); } @@ -498,6 +499,16 @@ public: return get_absent_extent(t, offset, length, [](T &){}); } + template + get_extent_iertr::future> get_absent_extent( + Transaction &t, + paddr_t offset, + extent_len_t length, + Func &&extent_init_func) { + return get_absent_extent(t, offset, length, 0, length, + std::forward(extent_init_func)); + } + bool is_viewable_extent_stable( Transaction &t, CachedExtentRef extent) @@ -586,20 +597,10 @@ public: // user should not see RETIRED_PLACEHOLDER extents ceph_assert(!is_retired_placeholder_type(p_extent->get_type())); - if (!p_extent->is_fully_loaded()) { - assert(is_logical_type(p_extent->get_type())); - assert(!p_extent->is_mutable()); - ++access_stats.load_present; - ++stats.access.s.load_present; - LOG_PREFIX(Cache::get_extent_viewable_by_trans); - SUBDEBUG(seastore_cache, - "{} {}~0x{:x} is present without fully loaded, reading ... 
-- {}", - p_extent->get_type(), p_extent->get_paddr(), p_extent->get_length(), - *p_extent); - auto bp = create_extent_ptr_rand(p_extent->get_length()); - p_extent->set_bptr(std::move(bp)); - return read_extent(CachedExtentRef(p_extent)); - } + // for logical extents, handle partial load in TM::read_pin(), + // also see read_extent_maybe_partial() and get_absent_extent() + assert(is_logical_type(p_extent->get_type()) || + p_extent->is_fully_loaded()); return p_extent->wait_io( ).then([p_extent] { return get_extent_ertr::make_ready_future( @@ -621,6 +622,37 @@ public: }); } + // wait extent io or do partial reads + template + read_extent_ret read_extent_maybe_partial( + Transaction &t, + TCachedExtentRef extent, + extent_len_t partial_off, + extent_len_t partial_len) { + assert(is_logical_type(extent->get_type())); + if (!extent->is_range_loaded(partial_off, partial_len)) { + LOG_PREFIX(Cache::read_extent_maybe_partial); + SUBDEBUGT(seastore_cache, + "{} {}~0x{:x} is present on t without range 0x{:x}~0x{:x}, reading ... 
-- {}", + t, extent->get_type(), extent->get_paddr(), extent->get_length(), + partial_off, partial_len, *extent); + const auto t_src = t.get_src(); + extent_access_stats_t& access_stats = get_by_ext( + get_by_src(stats.access_by_src_ext, t_src), + extent->get_type()); + ++access_stats.load_present; + ++stats.access.s.load_present; + return do_read_extent_maybe_partial( + std::move(extent), partial_off, partial_len); + } else { + // TODO(implement fine-grained-wait): + // the range might be already loaded, but we don't know + return extent->wait_io().then([extent]() -> read_extent_ret { + return seastar::make_ready_future>(extent); + }); + } + } + extent_len_t get_block_size() const { return epm.get_block_size(); } @@ -632,54 +664,112 @@ public: } private: + /// Implements exclusive call to read_extent() for the extent + template + read_extent_ret do_read_extent_maybe_partial( + TCachedExtentRef&& extent, + extent_len_t partial_off, + extent_len_t partial_len) + { + LOG_PREFIX(Cache::do_read_extent_maybe_partial); + // They must be atomic: + // 1. checking missing range and wait io + // 2. checking missing range and read + // because the extents in Caches can be accessed concurrently + // + // TODO(implement fine-grained-wait) + assert(!extent->is_range_loaded(partial_off, partial_len)); + assert(!extent->is_mutable()); + if (extent->is_pending_io()) { + auto* p_extent = extent.get(); + return p_extent->wait_io( + ).then([extent=std::move(extent), partial_off, partial_len, this, FNAME]() mutable + -> read_extent_ret { + if (extent->is_range_loaded(partial_off, partial_len)) { + SUBDEBUG(seastore_cache, + "{} {}~0x{:x} got range 0x{:x}~0x{:x} ... 
-- {}", + extent->get_type(), extent->get_paddr(), extent->get_length(), + partial_off, partial_len, *extent); + // we don't know whether the target range is loading or not + if (extent->is_pending_io()) { + auto* p_extent = extent.get(); + return p_extent->wait_io( + ).then([extent=std::move(extent)]() mutable { + return seastar::make_ready_future>(std::move(extent)); + }); + } else { + return seastar::make_ready_future>(std::move(extent)); + } + } else { // range not loaded + SUBDEBUG(seastore_cache, + "{} {}~0x{:x} without range 0x{:x}~0x{:x} ... -- {}", + extent->get_type(), extent->get_paddr(), extent->get_length(), + partial_off, partial_len, *extent); + return do_read_extent_maybe_partial( + std::move(extent), partial_off, partial_len); + } + }); + } else { + SUBDEBUG(seastore_cache, + "{} {}~0x{:x} is not pending without range 0x{:x}~0x{:x}, reading ... -- {}", + extent->get_type(), extent->get_paddr(), extent->get_length(), + partial_off, partial_len, *extent); + return read_extent( + std::move(extent), partial_off, partial_len); + } + } + /** * do_get_caching_extent * * returns ref to extent at offset~length of type T either from * - extent_set if already in cache * - disk + * only load partial_off~partial_len */ using src_ext_t = std::pair; template read_extent_ret do_get_caching_extent( paddr_t offset, ///< [in] starting addr extent_len_t length, ///< [in] length + extent_len_t partial_off, ///< [in] offset of piece in extent + extent_len_t partial_len, ///< [in] length of piece in extent Func &&extent_init_func, ///< [in] init func for extent OnCache &&on_cache ) { LOG_PREFIX(Cache::do_get_caching_extent); auto cached = query_cache(offset); if (!cached) { - auto ret = CachedExtent::make_cached_extent_ref( - create_extent_ptr_rand(length)); + // partial read + TCachedExtentRef ret = CachedExtent::make_cached_extent_ref(length); ret->init(CachedExtent::extent_state_t::CLEAN_PENDING, offset, PLACEMENT_HINT_NULL, NULL_GENERATION, TRANS_ID_NULL); 
SUBDEBUG(seastore_cache, - "{} {}~0x{:x} is absent, add extent and reading ... -- {}", - T::TYPE, offset, length, *ret); + "{} {}~0x{:x} is absent, add extent and reading range 0x{:x}~0x{:x} ... -- {}", + T::TYPE, offset, length, partial_off, partial_len, *ret); add_extent(ret); // touch_extent() should be included in on_cache on_cache(*ret); extent_init_func(*ret); return read_extent( - std::move(ret)); + std::move(ret), partial_off, partial_len); } // extent PRESENT in cache if (is_retired_placeholder_type(cached->get_type())) { - auto ret = CachedExtent::make_cached_extent_ref( - create_extent_ptr_rand(length)); + // partial read + TCachedExtentRef ret = CachedExtent::make_cached_extent_ref(length); ret->init(CachedExtent::extent_state_t::CLEAN_PENDING, offset, PLACEMENT_HINT_NULL, NULL_GENERATION, TRANS_ID_NULL); SUBDEBUG(seastore_cache, - "{} {}~0x{:x} is absent(placeholder), add extent and reading ... -- {}", - T::TYPE, offset, length, *ret); + "{} {}~0x{:x} is absent(placeholder), add extent and reading range 0x{:x}~0x{:x} ... -- {}", + T::TYPE, offset, length, partial_off, partial_len, *ret); extents_index.replace(*ret, *cached); on_cache(*ret); @@ -692,30 +782,39 @@ private: cached->state = CachedExtent::extent_state_t::INVALID; extent_init_func(*ret); return read_extent( - std::move(ret)); + std::move(ret), partial_off, partial_len); } auto ret = TCachedExtentRef(static_cast(cached.get())); on_cache(*ret); - if (ret->is_fully_loaded()) { + if (ret->is_range_loaded(partial_off, partial_len)) { SUBTRACE(seastore_cache, - "{} {}~0x{:x} is present in cache -- {}", - T::TYPE, offset, length, *ret); + "{} {}~0x{:x} is present with range 0x{:x}~0x{:x} ... -- {}", + T::TYPE, offset, length, partial_off, partial_len, *ret); return ret->wait_io().then([ret] { // ret may be invalid, caller must check return seastar::make_ready_future>(ret); }); } else { SUBDEBUG(seastore_cache, - "{} {}~0x{:x} is present without fully loaded, reading ... 
-- {}", - T::TYPE, offset, length, *ret); - auto bp = create_extent_ptr_rand(length); - ret->set_bptr(std::move(bp)); - return read_extent( - std::move(ret)); + "{} {}~0x{:x} is present without range 0x{:x}~0x{:x}, reading ... -- {}", + T::TYPE, offset, length, partial_off, partial_len, *ret); + return do_read_extent_maybe_partial( + std::move(ret), partial_off, partial_len); } } + template + read_extent_ret do_get_caching_extent( + paddr_t offset, ///< [in] starting addr + extent_len_t length, ///< [in] length + Func &&extent_init_func, ///< [in] init func for extent + OnCache &&on_cache + ) { + return do_get_caching_extent(offset, length, 0, length, + std::forward(extent_init_func), + std::forward(on_cache)); + } // This is a workaround std::move_only_function not being available, // not really worth generalizing at this time. @@ -789,14 +888,10 @@ private: return seastar::make_ready_future(ret); }); } else { - assert(!ret->is_mutable()); SUBDEBUGT(seastore_cache, "{} {}~0x{:x} {} is present on t without fully loaded, reading ... -- {}", t, type, offset, length, laddr, *ret); - auto bp = create_extent_ptr_rand(ret->get_length()); - ret->set_bptr(std::move(bp)); - return read_extent( - std::move(ret)); + return do_read_extent_maybe_partial(std::move(ret), 0, length); } } else { SUBTRACET(seastore_cache, "{} {}~0x{:x} {} is absent on t, query cache ...", @@ -1817,39 +1912,69 @@ private: /// Introspect transaction when it is being destructed void on_transaction_destruct(Transaction& t); + /// Read the extent in range offset~length, + /// must be called exclusively for an extent, + /// also see do_read_extent_maybe_partial(). + /// + /// May return an invalid extent due to transaction conflict. 
template read_extent_ret read_extent( - TCachedExtentRef&& extent + TCachedExtentRef&& extent, + extent_len_t offset, + extent_len_t length ) { + LOG_PREFIX(Cache::read_extent); assert(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING || - extent->state == CachedExtent::extent_state_t::EXIST_CLEAN || - extent->state == CachedExtent::extent_state_t::CLEAN); + extent->state == CachedExtent::extent_state_t::EXIST_CLEAN || + extent->state == CachedExtent::extent_state_t::CLEAN); + assert(!extent->is_range_loaded(offset, length)); + assert(is_aligned(offset, get_block_size())); + assert(is_aligned(length, get_block_size())); extent->set_io_wait(); - return epm.read( - extent->get_paddr(), - extent->get_length(), - extent->get_bptr() - ).safe_then( - [extent=std::move(extent), this]() mutable { - LOG_PREFIX(Cache::read_extent); - if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) { - extent->state = CachedExtent::extent_state_t::CLEAN; - } - ceph_assert(extent->state == CachedExtent::extent_state_t::EXIST_CLEAN - || extent->state == CachedExtent::extent_state_t::CLEAN - || !extent->is_valid()); - if (extent->is_valid()) { - // crc will be checked against LBA leaf entry for logical extents, - // or check against in-extent crc for physical extents. 
- if (epm.get_checksum_needed(extent->get_paddr())) { - extent->last_committed_crc = extent->calc_crc32c(); - } else { - extent->last_committed_crc = CRC_NULL; - } - extent->on_clean_read(); - } + load_ranges_t to_read = extent->load_ranges(offset, length); + return seastar::do_with(to_read.ranges, [extent, this, FNAME](auto &read_ranges) { + return ExtentPlacementManager::read_ertr::parallel_for_each( + read_ranges, [extent, this, FNAME](auto &read_range) { + SUBDEBUG(seastore_cache, "reading extent {} 0x{:x}~0x{:x} ...", + extent->get_paddr(), read_range.offset, read_range.get_length()); + assert(is_aligned(read_range.offset, get_block_size())); + assert(is_aligned(read_range.get_length(), get_block_size())); + return epm.read( + extent->get_paddr() + read_range.offset, + read_range.get_length(), + read_range.ptr); + }); + }).safe_then( + [this, FNAME, extent=std::move(extent), offset, length]() mutable { + if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) { + extent->state = CachedExtent::extent_state_t::CLEAN; + } + ceph_assert(extent->state == CachedExtent::extent_state_t::EXIST_CLEAN + || extent->state == CachedExtent::extent_state_t::CLEAN + || !extent->is_valid()); + if (extent->is_valid()) { + if (extent->is_fully_loaded()) { + // crc will be checked against LBA leaf entry for logical extents, + // or check against in-extent crc for physical extents. 
+ if (epm.get_checksum_needed(extent->get_paddr())) { + extent->last_committed_crc = extent->calc_crc32c(); + } else { + extent->last_committed_crc = CRC_NULL; + } + // on_clean_read() may change the content, call after calc_crc32c() + extent->on_clean_read(); + SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done -- {}", + offset, length, *extent); + } else { + extent->last_committed_crc = CRC_NULL; + SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done (partial) -- {}", + offset, length, *extent); + } + } else { + SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done (invalidated) -- {}", + offset, length, *extent); + } extent->complete_io(); - SUBDEBUG(seastore_cache, "read extent done -- {}", *extent); return get_extent_ertr::make_ready_future>( std::move(extent)); }, diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 026e677bbbc..07db3bd488e 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -443,12 +443,12 @@ public: << ", modify_time=" << sea_time_point_printer_t{modify_time} << ", paddr=" << get_paddr() << ", prior_paddr=" << prior_poffset_str - << std::hex << ", length=0x" << get_length() << std::dec + << std::hex << ", length=0x" << get_length() + << ", loaded=0x" << get_loaded_length() << std::dec << ", state=" << state << ", last_committed_crc=" << last_committed_crc << ", refcount=" << use_count() << ", user_hint=" << user_hint - << ", fully_loaded=" << is_fully_loaded() << ", rewrite_gen=" << rewrite_gen_printer_t{rewrite_generation}; if (state != extent_state_t::INVALID && state != extent_state_t::CLEAN_PENDING) { @@ -632,12 +632,40 @@ public: bool is_fully_loaded() const { if (ptr.has_value()) { // length == 0 iff root + assert(length == loaded_length); + assert(!buffer_space.has_value()); return true; } else { // ptr is std::nullopt + assert(length > loaded_length); + assert(buffer_space.has_value()); return false; } } + /// Return true 
if range offset~_length is loaded + bool is_range_loaded(extent_len_t offset, extent_len_t _length) { + assert(is_aligned(offset, CEPH_PAGE_SIZE)); + assert(is_aligned(_length, CEPH_PAGE_SIZE)); + assert(_length > 0); + assert(offset + _length <= length); + if (is_fully_loaded()) { + return true; + } + return buffer_space->is_range_loaded(offset, _length); + } + + /// Get buffer by given offset and _length. + ceph::bufferlist get_range(extent_len_t offset, extent_len_t _length) { + assert(is_range_loaded(offset, _length)); + ceph::bufferlist res; + if (is_fully_loaded()) { + res.append(ceph::bufferptr(get_bptr(), offset, _length)); + } else { + res = buffer_space->get_buffer(offset, _length); + } + return res; + } + /** * get_paddr * @@ -651,13 +679,9 @@ public: return length; } - /// Returns length of lazily loaded extent data in cache + /// Returns length of partially loaded extent data in cache extent_len_t get_loaded_length() const { - if (ptr.has_value()) { - return ptr->length(); - } else { - return 0; - } + return loaded_length; } /// Returns version, get_version() == 0 iff is_clean() @@ -796,12 +820,19 @@ private: */ journal_seq_t dirty_from_or_retired_at; - /// cache data contents, std::nullopt iff lazily loaded + /// cache data contents, std::nullopt iff partially loaded std::optional ptr; /// disk data length, 0 iff root extent_len_t length; + /// loaded data length, buffer_space; + /// number of deltas since initial write extent_version_t version = 0; @@ -850,7 +881,8 @@ protected: /// construct a fully loaded CachedExtent CachedExtent(ceph::bufferptr &&_ptr) - : length(_ptr.length()) { + : length(_ptr.length()), + loaded_length(_ptr.length()) { ptr = std::move(_ptr); assert(ptr->is_page_aligned()); @@ -859,9 +891,11 @@ protected: // must call init() to fully initialize } - /// construct a lazily loaded CachedExtent + /// construct a partially loaded CachedExtent CachedExtent(extent_len_t _length) - : length(_length) { + : length(_length), + 
loaded_length(0), + buffer_space(std::in_place) { assert(is_aligned(length, CEPH_PAGE_SIZE)); assert(length > 0); assert(!is_fully_loaded()); @@ -873,6 +907,7 @@ protected: : state(other.state), dirty_from_or_retired_at(other.dirty_from_or_retired_at), length(other.get_length()), + loaded_length(other.get_loaded_length()), version(other.version), poffset(other.poffset) { // the extent must be fully loaded before CoW @@ -895,6 +930,7 @@ protected: dirty_from_or_retired_at(other.dirty_from_or_retired_at), ptr(other.ptr), length(other.get_length()), + loaded_length(other.get_loaded_length()), version(other.version), poffset(other.poffset) { // the extent must be fully loaded before CoW @@ -908,7 +944,8 @@ protected: struct root_construct_t {}; CachedExtent(root_construct_t) : ptr(ceph::bufferptr(0)), - length(0) { + length(0), + loaded_length(0) { assert(is_fully_loaded()); // must call init() to fully initialize } @@ -916,8 +953,9 @@ protected: struct retired_placeholder_construct_t {}; CachedExtent(retired_placeholder_construct_t, extent_len_t _length) : state(extent_state_t::CLEAN), - length(_length) { - assert(length > 0); + length(_length), + loaded_length(0), + buffer_space(std::in_place) { assert(!is_fully_loaded()); assert(is_aligned(length, CEPH_PAGE_SIZE)); // must call init() to fully initialize @@ -995,6 +1033,43 @@ protected: } } + /// Returns the ranges to load, convert to fully loaded if possible + load_ranges_t load_ranges(extent_len_t offset, extent_len_t _length) { + assert(is_aligned(offset, CEPH_PAGE_SIZE)); + assert(is_aligned(_length, CEPH_PAGE_SIZE)); + assert(_length > 0); + assert(offset + _length <= length); + assert(!is_fully_loaded()); + + if (loaded_length == 0 && _length == length) { + assert(offset == 0); + // skip rebuilding the buffer from buffer_space + ptr = create_extent_ptr_rand(length); + loaded_length = _length; + buffer_space.reset(); + assert(is_fully_loaded()); + load_ranges_t ret; + ret.push_back(offset, *ptr); + return ret; 
+ } + + load_ranges_t ret = buffer_space->load_ranges(offset, _length); + loaded_length += ret.length; + assert(length >= loaded_length); + if (length == loaded_length) { + // convert to fully loaded + ptr = buffer_space->to_full_ptr(length); + buffer_space.reset(); + assert(is_fully_loaded()); + // adjust ret since the ptr has been rebuilt + for (load_range_t& range : ret.ranges) { + auto range_length = range.ptr.length(); + range.ptr = ceph::bufferptr(*ptr, range.offset, range_length); + } + } + return ret; + } + friend class crimson::os::seastore::SegmentedAllocator; friend class crimson::os::seastore::TransactionManager; friend class crimson::os::seastore::ExtentPlacementManager; diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc index 83bb2c08af8..70e6fe58e3a 100644 --- a/src/crimson/os/seastore/object_data_handler.cc +++ b/src/crimson/os/seastore/object_data_handler.cc @@ -1499,11 +1499,18 @@ ObjectDataHandler::read_ret ObjectDataHandler::read( [FNAME, ctx, l_start, l_end, &l_current, &ret](auto &pin) -> read_iertr::future<> { auto pin_start = pin->get_key(); + extent_len_t read_start; + extent_len_t read_start_aligned; if (l_current == l_start) { // first pin may skip head ceph_assert(l_current.get_aligned_laddr() >= pin_start); + read_start = l_current.template + get_byte_distance(pin_start); + read_start_aligned = p2align(read_start, ctx.tm.get_block_size()); } else { // non-first pin must match start assert(l_current > l_start); ceph_assert(l_current == pin_start); + read_start = 0; + read_start_aligned = 0; } ceph_assert(l_current < l_end); @@ -1528,24 +1535,37 @@ ObjectDataHandler::read_ret ObjectDataHandler::read( } // non-zero pin + laddr_t l_current_end_aligned = l_current_end.get_roundup_laddr(); + extent_len_t read_len_aligned = + l_current_end_aligned.get_byte_distance(pin_start); + read_len_aligned -= read_start_aligned; + extent_len_t unalign_start_offset = read_start - 
read_start_aligned; DEBUGT("reading {}~{} from pin {}~{}", ctx.t, l_current, read_len, pin_start, pin_len); - extent_len_t e_current_off = - l_current.template get_byte_distance(pin_start); return ctx.tm.read_pin( ctx.t, - std::move(pin) + std::move(pin), + read_start_aligned, + read_len_aligned ).si_then([&ret, &l_current, l_current_end, - e_current_off, read_len](auto maybe_indirect_extent) { - ret.append( - bufferptr( - maybe_indirect_extent.get_bptr(), - e_current_off, - read_len)); + read_start_aligned, read_len_aligned, + unalign_start_offset, read_len](auto maybe_indirect_extent) { + auto aligned_bl = maybe_indirect_extent.get_range( + read_start_aligned, read_len_aligned); + if (read_len < read_len_aligned) { + ceph::bufferlist unaligned_bl; + unaligned_bl.substr_of( + aligned_bl, unalign_start_offset, read_len); + ret.append(std::move(unaligned_bl)); + } else { + assert(read_len == read_len_aligned); + assert(unalign_start_offset == 0); + ret.append(std::move(aligned_bl)); + } l_current = l_current_end; return seastar::now(); }).handle_error_interruptible( diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index b2e500d9d9c..8f2870a6a27 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -166,6 +166,20 @@ public: return extent->get_bptr(); } } + + ceph::bufferlist get_range( + extent_len_t offset, extent_len_t length) const { + if (is_indirect()) { + assert(maybe_indirect_info->intermediate_offset + offset + length <= + extent->get_length()); + assert(offset + length <= maybe_indirect_info->length); + return extent->get_range( + maybe_indirect_info->intermediate_offset + offset, + length); + } else { + return extent->get_range(offset, length); + } + } }; /** @@ -226,16 +240,28 @@ public: template base_iertr::future> read_pin( Transaction &t, - LBAMappingRef pin) + LBAMappingRef pin, + extent_len_t partial_off, + extent_len_t partial_len) { 
+ static_assert(is_logical_type(T::TYPE)); + assert(is_aligned(partial_off, get_block_size())); + assert(is_aligned(partial_len, get_block_size())); + + extent_len_t direct_partial_off = partial_off; bool is_clone = pin->is_clone(); std::optional maybe_indirect_info; if (pin->is_indirect()) { + auto intermediate_offset = pin->get_intermediate_offset(); + direct_partial_off = intermediate_offset + partial_off; maybe_indirect_info = indirect_info_t{ - pin->get_intermediate_offset(), pin->get_length()}; + intermediate_offset, pin->get_length()}; } + LOG_PREFIX(TransactionManager::read_pin); - SUBDEBUGT(seastore_tm, "{} {} ...", t, T::TYPE, *pin); + SUBDEBUGT(seastore_tm, "{} {} 0x{:x}~0x{:x} direct_off=0x{:x} ...", + t, T::TYPE, *pin, partial_off, partial_len, direct_partial_off); + auto fut = base_iertr::make_ready_future(); if (!pin->is_parent_viewable()) { if (pin->is_parent_valid()) { @@ -252,14 +278,19 @@ public: pin->maybe_fix_pos(); fut = base_iertr::make_ready_future(std::move(pin)); } - return fut.si_then([&t, this](auto npin) { + return fut.si_then([&t, this, direct_partial_off, partial_len](auto npin) { // checking the lba child must be atomic with creating // and linking the absent child auto ret = get_extent_if_linked(t, std::move(npin)); if (ret.index() == 1) { - return std::move(std::get<1>(ret)); + return std::get<1>(ret + ).si_then([direct_partial_off, partial_len, this, &t](auto extent) { + return cache->read_extent_maybe_partial( + t, std::move(extent), direct_partial_off, partial_len); + }); } else { - return this->pin_to_extent(t, std::move(std::get<0>(ret))); + return this->pin_to_extent( + t, std::move(std::get<0>(ret)), direct_partial_off, partial_len); } }).si_then([FNAME, maybe_indirect_info, is_clone, &t](TCachedExtentRef ext) { if (maybe_indirect_info.has_value()) { @@ -274,6 +305,15 @@ public: }); } + template + base_iertr::future> read_pin( + Transaction &t, + LBAMappingRef pin) + { + auto& pin_ref = *pin; + return read_pin(t, 
std::move(pin), 0, pin_ref.get_length()); + } + /// Obtain mutable copy of extent LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) { LOG_PREFIX(TransactionManager::get_mutable_extent); @@ -523,6 +563,7 @@ public: ? (ext && ext->is_fully_loaded()) : true); std::optional original_bptr; + // TODO: preserve the bufferspace if partially loaded if (ext && ext->is_fully_loaded()) { ceph_assert(!ext->is_mutable()); ceph_assert(ext->get_length() >= original_len); @@ -969,6 +1010,7 @@ private: * pin_to_extent * * Get extent mapped at pin. + * partially load buffer from direct_partial_off~partial_len if not present. */ using pin_to_extent_iertr = base_iertr; template @@ -977,19 +1019,28 @@ private: template pin_to_extent_ret pin_to_extent( Transaction &t, - LBAMappingRef pin) { - LOG_PREFIX(TransactionManager::pin_to_extent); - SUBTRACET(seastore_tm, "getting absent extent from pin {} ...", t, *pin); + LBAMappingRef pin, + extent_len_t direct_partial_off, + extent_len_t partial_len) { static_assert(is_logical_type(T::TYPE)); using ret = pin_to_extent_ret; auto &pref = *pin; auto direct_length = pref.is_indirect() ? 
pref.get_intermediate_length() : pref.get_length(); + if (full_extent_integrity_check) { + direct_partial_off = 0; + partial_len = direct_length; + } + LOG_PREFIX(TransactionManager::pin_to_extent); + SUBTRACET(seastore_tm, "getting absent extent from pin {}, 0x{:x}~0x{:x} ...", + t, *pin, direct_partial_off, partial_len); return cache->get_absent_extent( t, pref.get_val(), direct_length, + direct_partial_off, + partial_len, [&pref] (T &extent) mutable { assert(!extent.has_laddr()); @@ -1000,30 +1051,33 @@ private: extent.maybe_set_intermediate_laddr(pref); } ).si_then([FNAME, &t, pin=std::move(pin), this](auto ref) mutable -> ret { - auto crc = ref->calc_crc32c(); - SUBTRACET( - seastore_tm, - "got extent -- {}, chksum in the lba tree: {}, actual chksum: {}", - t, - *ref, - pin->get_checksum(), - crc); - assert(ref->is_fully_loaded()); - bool inconsistent = false; - if (full_extent_integrity_check) { - inconsistent = (pin->get_checksum() != crc); - } else { // !full_extent_integrity_check: remapped extent may be skipped - inconsistent = !(pin->get_checksum() == 0 || - pin->get_checksum() == crc); - } - if (unlikely(inconsistent)) { - SUBERRORT(seastore_tm, - "extent checksum inconsistent, recorded: {}, actual: {}, {}", + if (ref->is_fully_loaded()) { + auto crc = ref->calc_crc32c(); + SUBTRACET( + seastore_tm, + "got extent -- {}, chksum in the lba tree: {}, actual chksum: {}", t, + *ref, pin->get_checksum(), - crc, - *ref); - ceph_abort(); + crc); + bool inconsistent = false; + if (full_extent_integrity_check) { + inconsistent = (pin->get_checksum() != crc); + } else { // !full_extent_integrity_check: remapped extent may be skipped + inconsistent = !(pin->get_checksum() == 0 || + pin->get_checksum() == crc); + } + if (unlikely(inconsistent)) { + SUBERRORT(seastore_tm, + "extent checksum inconsistent, recorded: {}, actual: {}, {}", + t, + pin->get_checksum(), + crc, + *ref); + ceph_abort(); + } + } else { + assert(!full_extent_integrity_check); } return 
pin_to_extent_ret( interruptible::ready_future_marker{},