git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore: implement partial reads from cached_extent to object_data_handler
author Yingxin Cheng <yingxin.cheng@intel.com>
Mon, 4 Nov 2024 02:50:56 +0000 (10:50 +0800)
committer Yingxin Cheng <yingxin.cheng@intel.com>
Thu, 28 Nov 2024 01:32:51 +0000 (09:32 +0800)
Signed-off-by: Yingxin Cheng <yingxin.cheng@intel.com>
Signed-off-by: Jianxin Li <jianxin1.li@intel.com>
src/crimson/os/seastore/cache.h
src/crimson/os/seastore/cached_extent.h
src/crimson/os/seastore/object_data_handler.cc
src/crimson/os/seastore/transaction_manager.h

index 9e3718926399fd671157324c25e133cc895f1fbe..e2df8b349604f4dc8513c488acb9cbb8eb2d0ef2 100644 (file)
@@ -10,6 +10,7 @@
 #include "include/buffer.h"
 
 #include "crimson/common/errorator.h"
+#include "crimson/common/errorator-loop.h"
 #include "crimson/os/seastore/cached_extent.h"
 #include "crimson/os/seastore/extent_placement_manager.h"
 #include "crimson/os/seastore/logging.h"
@@ -410,14 +411,10 @@ public:
             ret->cast<T>());
         });
       } else {
-       assert(!ret->is_mutable());
         SUBDEBUGT(seastore_cache,
             "{} {}~0x{:x} is present on t without fully loaded, reading ... -- {}",
             t, T::TYPE, offset, length, *ret);
-        auto bp = create_extent_ptr_rand(ret->get_length());
-        ret->set_bptr(std::move(bp));
-        return read_extent<T>(
-          ret->cast<T>());
+        return do_read_extent_maybe_partial<T>(ret->cast<T>(), 0, length);
       }
     } else {
       SUBTRACET(seastore_cache, "{} {}~0x{:x} is absent on t, query cache ...",
@@ -438,12 +435,15 @@ public:
    * get_absent_extent
    *
    * The extent in query is supposed to be absent in Cache.
+   * partially load buffer from partial_off~partial_len if not present.
    */
   template <typename T, typename Func>
   get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent(
     Transaction &t,
     paddr_t offset,
     extent_len_t length,
+    extent_len_t partial_off,
+    extent_len_t partial_len,
     Func &&extent_init_func) {
     CachedExtentRef ret;
     LOG_PREFIX(Cache::get_absent_extent);
@@ -474,7 +474,8 @@ public:
     };
     return trans_intr::make_interruptible(
       do_get_caching_extent<T>(
-       offset, length, std::forward<Func>(extent_init_func), std::move(f))
+        offset, length, partial_off, partial_len,
+        std::forward<Func>(extent_init_func), std::move(f))
     );
   }
 
@@ -498,6 +499,16 @@ public:
     return get_absent_extent<T>(t, offset, length, [](T &){});
   }
 
+  template <typename T, typename Func>
+  get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent(
+    Transaction &t,
+    paddr_t offset,
+    extent_len_t length,
+    Func &&extent_init_func) {
+    return get_absent_extent<T>(t, offset, length, 0, length,
+      std::forward<Func>(extent_init_func));
+  }
+
   bool is_viewable_extent_stable(
     Transaction &t,
     CachedExtentRef extent)
@@ -586,20 +597,10 @@ public:
 
     // user should not see RETIRED_PLACEHOLDER extents
     ceph_assert(!is_retired_placeholder_type(p_extent->get_type()));
-    if (!p_extent->is_fully_loaded()) {
-      assert(is_logical_type(p_extent->get_type()));
-      assert(!p_extent->is_mutable());
-      ++access_stats.load_present;
-      ++stats.access.s.load_present;
-      LOG_PREFIX(Cache::get_extent_viewable_by_trans);
-      SUBDEBUG(seastore_cache,
-        "{} {}~0x{:x} is present without fully loaded, reading ... -- {}",
-        p_extent->get_type(), p_extent->get_paddr(), p_extent->get_length(),
-        *p_extent);
-      auto bp = create_extent_ptr_rand(p_extent->get_length());
-      p_extent->set_bptr(std::move(bp));
-      return read_extent<CachedExtent>(CachedExtentRef(p_extent));
-    }
+    // for logical extents, handle partial load in TM::read_pin(),
+    // also see read_extent_maybe_partial() and get_absent_extent()
+    assert(is_logical_type(p_extent->get_type()) ||
+           p_extent->is_fully_loaded());
     return p_extent->wait_io(
     ).then([p_extent] {
       return get_extent_ertr::make_ready_future<CachedExtentRef>(
@@ -621,6 +622,37 @@ public:
     });
   }
 
+  // wait extent io or do partial reads
+  template <typename T>
+  read_extent_ret<T> read_extent_maybe_partial(
+    Transaction &t,
+    TCachedExtentRef<T> extent,
+    extent_len_t partial_off,
+    extent_len_t partial_len) {
+    assert(is_logical_type(extent->get_type()));
+    if (!extent->is_range_loaded(partial_off, partial_len)) {
+      LOG_PREFIX(Cache::read_extent_maybe_partial);
+      SUBDEBUGT(seastore_cache,
+        "{} {}~0x{:x} is present on t without range 0x{:x}~0x{:x}, reading ... -- {}",
+        t, extent->get_type(), extent->get_paddr(), extent->get_length(),
+        partial_off, partial_len, *extent);
+      const auto t_src = t.get_src();
+      extent_access_stats_t& access_stats = get_by_ext(
+        get_by_src(stats.access_by_src_ext, t_src),
+        extent->get_type());
+      ++access_stats.load_present;
+      ++stats.access.s.load_present;
+      return do_read_extent_maybe_partial(
+          std::move(extent), partial_off, partial_len);
+    } else {
+      // TODO(implement fine-grained-wait):
+      // the range might be already loaded, but we don't know
+      return extent->wait_io().then([extent]() -> read_extent_ret<T> {
+        return seastar::make_ready_future<TCachedExtentRef<T>>(extent);
+      });
+    }
+  }
+
   extent_len_t get_block_size() const {
     return epm.get_block_size();
   }
@@ -632,54 +664,112 @@ public:
   }
 
 private:
+  /// Implements exclusive call to read_extent() for the extent
+  template <typename T>
+  read_extent_ret<T> do_read_extent_maybe_partial(
+    TCachedExtentRef<T>&& extent,
+    extent_len_t partial_off,
+    extent_len_t partial_len)
+  {
+    LOG_PREFIX(Cache::do_read_extent_maybe_partial);
+    // Each of the following must be atomic:
+    // 1. checking the missing range and waiting for io
+    // 2. checking the missing range and reading
+    // because the extents in Cache can be accessed concurrently
+    //
+    // TODO(implement fine-grained-wait)
+    assert(!extent->is_range_loaded(partial_off, partial_len));
+    assert(!extent->is_mutable());
+    if (extent->is_pending_io()) {
+      auto* p_extent = extent.get();
+      return p_extent->wait_io(
+      ).then([extent=std::move(extent), partial_off, partial_len, this, FNAME]() mutable
+             -> read_extent_ret<T> {
+        if (extent->is_range_loaded(partial_off, partial_len)) {
+          SUBDEBUG(seastore_cache,
+            "{} {}~0x{:x} got range 0x{:x}~0x{:x} ... -- {}",
+            extent->get_type(), extent->get_paddr(), extent->get_length(),
+            partial_off, partial_len, *extent);
+          // we don't know whether the target range is loading or not
+          if (extent->is_pending_io()) {
+            auto* p_extent = extent.get();
+            return p_extent->wait_io(
+            ).then([extent=std::move(extent)]() mutable {
+              return seastar::make_ready_future<TCachedExtentRef<T>>(std::move(extent));
+            });
+          } else {
+            return seastar::make_ready_future<TCachedExtentRef<T>>(std::move(extent));
+          }
+        } else { // range not loaded
+          SUBDEBUG(seastore_cache,
+            "{} {}~0x{:x} without range 0x{:x}~0x{:x} ... -- {}",
+            extent->get_type(), extent->get_paddr(), extent->get_length(),
+            partial_off, partial_len, *extent);
+          return do_read_extent_maybe_partial(
+              std::move(extent), partial_off, partial_len);
+        }
+      });
+    } else {
+      SUBDEBUG(seastore_cache,
+        "{} {}~0x{:x} is not pending without range 0x{:x}~0x{:x}, reading ... -- {}",
+        extent->get_type(), extent->get_paddr(), extent->get_length(),
+        partial_off, partial_len, *extent);
+      return read_extent<T>(
+        std::move(extent), partial_off, partial_len);
+    }
+  }
+
   /**
    * do_get_caching_extent
    *
    * returns ref to extent at offset~length of type T either from
    * - extent_set if already in cache
    * - disk
+   * only load partial_off~partial_len
    */
   using src_ext_t = std::pair<Transaction::src_t, extent_types_t>;
   template <typename T, typename Func, typename OnCache>
   read_extent_ret<T> do_get_caching_extent(
     paddr_t offset,                ///< [in] starting addr
     extent_len_t length,           ///< [in] length
+    extent_len_t partial_off,      ///< [in] offset of piece in extent
+    extent_len_t partial_len,      ///< [in] length of piece in extent
     Func &&extent_init_func,       ///< [in] init func for extent
     OnCache &&on_cache
   ) {
     LOG_PREFIX(Cache::do_get_caching_extent);
     auto cached = query_cache(offset);
     if (!cached) {
-      auto ret = CachedExtent::make_cached_extent_ref<T>(
-        create_extent_ptr_rand(length));
+      // partial read
+      TCachedExtentRef<T> ret = CachedExtent::make_cached_extent_ref<T>(length);
       ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
                 offset,
                 PLACEMENT_HINT_NULL,
                 NULL_GENERATION,
                TRANS_ID_NULL);
       SUBDEBUG(seastore_cache,
-          "{} {}~0x{:x} is absent, add extent and reading ... -- {}",
-          T::TYPE, offset, length, *ret);
+          "{} {}~0x{:x} is absent, add extent and reading range 0x{:x}~0x{:x} ... -- {}",
+          T::TYPE, offset, length, partial_off, partial_len, *ret);
       add_extent(ret);
       // touch_extent() should be included in on_cache
       on_cache(*ret);
       extent_init_func(*ret);
       return read_extent<T>(
-       std::move(ret));
+       std::move(ret), partial_off, partial_len);
     }
 
     // extent PRESENT in cache
     if (is_retired_placeholder_type(cached->get_type())) {
-      auto ret = CachedExtent::make_cached_extent_ref<T>(
-        create_extent_ptr_rand(length));
+      // partial read
+      TCachedExtentRef<T> ret = CachedExtent::make_cached_extent_ref<T>(length);
       ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
                 offset,
                 PLACEMENT_HINT_NULL,
                 NULL_GENERATION,
                TRANS_ID_NULL);
       SUBDEBUG(seastore_cache,
-          "{} {}~0x{:x} is absent(placeholder), add extent and reading ... -- {}",
-          T::TYPE, offset, length, *ret);
+          "{} {}~0x{:x} is absent(placeholder), add extent and reading range 0x{:x}~0x{:x} ... -- {}",
+          T::TYPE, offset, length, partial_off, partial_len, *ret);
       extents_index.replace(*ret, *cached);
       on_cache(*ret);
 
@@ -692,30 +782,39 @@ private:
       cached->state = CachedExtent::extent_state_t::INVALID;
       extent_init_func(*ret);
       return read_extent<T>(
-       std::move(ret));
+       std::move(ret), partial_off, partial_len);
     }
 
     auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get()));
     on_cache(*ret);
-    if (ret->is_fully_loaded()) {
+    if (ret->is_range_loaded(partial_off, partial_len)) {
       SUBTRACE(seastore_cache,
-          "{} {}~0x{:x} is present in cache -- {}",
-          T::TYPE, offset, length, *ret);
+          "{} {}~0x{:x} is present with range 0x{:x}~0x{:x} ... -- {}",
+          T::TYPE, offset, length, partial_off, partial_len, *ret);
       return ret->wait_io().then([ret] {
         // ret may be invalid, caller must check
         return seastar::make_ready_future<TCachedExtentRef<T>>(ret);
       });
     } else {
       SUBDEBUG(seastore_cache,
-          "{} {}~0x{:x} is present without fully loaded, reading ... -- {}",
-          T::TYPE, offset, length, *ret);
-      auto bp = create_extent_ptr_rand(length);
-      ret->set_bptr(std::move(bp));
-      return read_extent<T>(
-        std::move(ret));
+          "{} {}~0x{:x} is present without range 0x{:x}~0x{:x}, reading ... -- {}",
+          T::TYPE, offset, length, partial_off, partial_len, *ret);
+      return do_read_extent_maybe_partial(
+          std::move(ret), partial_off, partial_len);
     }
   }
 
+  template <typename T, typename Func, typename OnCache>
+  read_extent_ret<T> do_get_caching_extent(
+    paddr_t offset,                ///< [in] starting addr
+    extent_len_t length,           ///< [in] length
+    Func &&extent_init_func,       ///< [in] init func for extent
+    OnCache &&on_cache
+  ) {
+    return do_get_caching_extent<T>(offset, length, 0, length,
+      std::forward<Func>(extent_init_func),
+      std::forward<OnCache>(on_cache));
+  }
 
   // This is a workaround std::move_only_function not being available,
   // not really worth generalizing at this time.
@@ -789,14 +888,10 @@ private:
          return seastar::make_ready_future<CachedExtentRef>(ret);
         });
       } else {
-       assert(!ret->is_mutable());
         SUBDEBUGT(seastore_cache,
             "{} {}~0x{:x} {} is present on t without fully loaded, reading ... -- {}",
             t, type, offset, length, laddr, *ret);
-        auto bp = create_extent_ptr_rand(ret->get_length());
-        ret->set_bptr(std::move(bp));
-        return read_extent<CachedExtent>(
-          std::move(ret));
+        return do_read_extent_maybe_partial<CachedExtent>(std::move(ret), 0, length);
       }
     } else {
       SUBTRACET(seastore_cache, "{} {}~0x{:x} {} is absent on t, query cache ...",
@@ -1817,39 +1912,69 @@ private:
   /// Introspect transaction when it is being destructed
   void on_transaction_destruct(Transaction& t);
 
+  /// Read the extent in range offset~length,
+  /// must be called exclusively for an extent,
+  /// also see do_read_extent_maybe_partial().
+  ///
+  /// May return an invalid extent due to transaction conflict.
   template <typename T>
   read_extent_ret<T> read_extent(
-    TCachedExtentRef<T>&& extent
+    TCachedExtentRef<T>&& extent,
+    extent_len_t offset,
+    extent_len_t length
   ) {
+    LOG_PREFIX(Cache::read_extent);
     assert(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING ||
-      extent->state == CachedExtent::extent_state_t::EXIST_CLEAN ||
-      extent->state == CachedExtent::extent_state_t::CLEAN);
+           extent->state == CachedExtent::extent_state_t::EXIST_CLEAN ||
+           extent->state == CachedExtent::extent_state_t::CLEAN);
+    assert(!extent->is_range_loaded(offset, length));
+    assert(is_aligned(offset, get_block_size()));
+    assert(is_aligned(length, get_block_size()));
     extent->set_io_wait();
-    return epm.read(
-      extent->get_paddr(),
-      extent->get_length(),
-      extent->get_bptr()
-    ).safe_then(
-      [extent=std::move(extent), this]() mutable {
-        LOG_PREFIX(Cache::read_extent);
-       if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) {
-         extent->state = CachedExtent::extent_state_t::CLEAN;
-       }
-       ceph_assert(extent->state == CachedExtent::extent_state_t::EXIST_CLEAN
-         || extent->state == CachedExtent::extent_state_t::CLEAN
-         || !extent->is_valid());
-       if (extent->is_valid()) {
-         // crc will be checked against LBA leaf entry for logical extents,
-         // or check against in-extent crc for physical extents.
-         if (epm.get_checksum_needed(extent->get_paddr())) {
-           extent->last_committed_crc = extent->calc_crc32c();
-         } else {
-           extent->last_committed_crc = CRC_NULL;
-         }
-         extent->on_clean_read();
-       }
+    load_ranges_t to_read = extent->load_ranges(offset, length);
+    return seastar::do_with(to_read.ranges, [extent, this, FNAME](auto &read_ranges) {
+      return ExtentPlacementManager::read_ertr::parallel_for_each(
+          read_ranges, [extent, this, FNAME](auto &read_range) {
+        SUBDEBUG(seastore_cache, "reading extent {} 0x{:x}~0x{:x} ...",
+                 extent->get_paddr(), read_range.offset, read_range.get_length());
+        assert(is_aligned(read_range.offset, get_block_size()));
+        assert(is_aligned(read_range.get_length(), get_block_size()));
+        return epm.read(
+          extent->get_paddr() + read_range.offset,
+          read_range.get_length(),
+          read_range.ptr);
+      });
+    }).safe_then(
+      [this, FNAME, extent=std::move(extent), offset, length]() mutable {
+        if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) {
+          extent->state = CachedExtent::extent_state_t::CLEAN;
+        }
+        ceph_assert(extent->state == CachedExtent::extent_state_t::EXIST_CLEAN
+          || extent->state == CachedExtent::extent_state_t::CLEAN
+          || !extent->is_valid());
+        if (extent->is_valid()) {
+          if (extent->is_fully_loaded()) {
+            // crc will be checked against LBA leaf entry for logical extents,
+            // or check against in-extent crc for physical extents.
+            if (epm.get_checksum_needed(extent->get_paddr())) {
+              extent->last_committed_crc = extent->calc_crc32c();
+            } else {
+              extent->last_committed_crc = CRC_NULL;
+            }
+            // on_clean_read() may change the content, call after calc_crc32c()
+            extent->on_clean_read();
+            SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done -- {}",
+              offset, length, *extent);
+          } else {
+            extent->last_committed_crc = CRC_NULL;
+            SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done (partial) -- {}",
+              offset, length, *extent);
+          }
+        } else {
+          SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done (invalidated) -- {}",
+            offset, length, *extent);
+        }
         extent->complete_io();
-        SUBDEBUG(seastore_cache, "read extent done -- {}", *extent);
         return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
           std::move(extent));
       },
index 026e677bbbc166eb17d81e82531fd764d3db8191..07db3bd488e11b6b88fe1aee9e1745214833e612 100644 (file)
@@ -443,12 +443,12 @@ public:
        << ", modify_time=" << sea_time_point_printer_t{modify_time}
        << ", paddr=" << get_paddr()
        << ", prior_paddr=" << prior_poffset_str
-       << std::hex << ", length=0x" << get_length() << std::dec
+       << std::hex << ", length=0x" << get_length()
+       << ", loaded=0x" << get_loaded_length() << std::dec
        << ", state=" << state
        << ", last_committed_crc=" << last_committed_crc
        << ", refcount=" << use_count()
        << ", user_hint=" << user_hint
-       << ", fully_loaded=" << is_fully_loaded()
        << ", rewrite_gen=" << rewrite_gen_printer_t{rewrite_generation};
     if (state != extent_state_t::INVALID &&
         state != extent_state_t::CLEAN_PENDING) {
@@ -632,12 +632,40 @@ public:
   bool is_fully_loaded() const {
     if (ptr.has_value()) {
       // length == 0 iff root
+      assert(length == loaded_length);
+      assert(!buffer_space.has_value());
       return true;
     } else { // ptr is std::nullopt
+      assert(length > loaded_length);
+      assert(buffer_space.has_value());
       return false;
     }
   }
 
+  /// Return true if range offset~_length is loaded
+  bool is_range_loaded(extent_len_t offset, extent_len_t _length) {
+    assert(is_aligned(offset, CEPH_PAGE_SIZE));
+    assert(is_aligned(_length, CEPH_PAGE_SIZE));
+    assert(_length > 0);
+    assert(offset + _length <= length);
+    if (is_fully_loaded()) {
+      return true;
+    }
+    return buffer_space->is_range_loaded(offset, _length);
+  }
+
+  /// Get buffer by given offset and _length.
+  ceph::bufferlist get_range(extent_len_t offset, extent_len_t _length) {
+    assert(is_range_loaded(offset, _length));
+    ceph::bufferlist res;
+    if (is_fully_loaded()) {
+      res.append(ceph::bufferptr(get_bptr(), offset, _length));
+    } else {
+      res = buffer_space->get_buffer(offset, _length);
+    }
+    return res;
+  }
+
   /**
    * get_paddr
    *
@@ -651,13 +679,9 @@ public:
     return length;
   }
 
-  /// Returns length of lazily loaded extent data in cache
+  /// Returns length of partially loaded extent data in cache
   extent_len_t get_loaded_length() const {
-    if (ptr.has_value()) {
-      return ptr->length();
-    } else {
-      return 0;
-    }
+    return loaded_length;
   }
 
   /// Returns version, get_version() == 0 iff is_clean()
@@ -796,12 +820,19 @@ private:
    */
   journal_seq_t dirty_from_or_retired_at;
 
-  /// cache data contents, std::nullopt iff lazily loaded
+  /// cache data contents, std::nullopt iff partially loaded
   std::optional<ceph::bufferptr> ptr;
 
   /// disk data length, 0 iff root
   extent_len_t length;
 
+  /// loaded data length, <length iff partially loaded
+  extent_len_t loaded_length;
+
+  /// manager of buffer pieces for ObjectDataBlock
+  /// valid iff partially loaded
+  std::optional<BufferSpace> buffer_space;
+
   /// number of deltas since initial write
   extent_version_t version = 0;
 
@@ -850,7 +881,8 @@ protected:
 
   /// construct a fully loaded CachedExtent
   CachedExtent(ceph::bufferptr &&_ptr)
-    : length(_ptr.length()) {
+    : length(_ptr.length()),
+      loaded_length(_ptr.length()) {
     ptr = std::move(_ptr);
 
     assert(ptr->is_page_aligned());
@@ -859,9 +891,11 @@ protected:
     // must call init() to fully initialize
   }
 
-  /// construct a lazily loaded CachedExtent
+  /// construct a partially loaded CachedExtent
   CachedExtent(extent_len_t _length)
-    : length(_length) {
+    : length(_length),
+      loaded_length(0),
+      buffer_space(std::in_place) {
     assert(is_aligned(length, CEPH_PAGE_SIZE));
     assert(length > 0);
     assert(!is_fully_loaded());
@@ -873,6 +907,7 @@ protected:
     : state(other.state),
       dirty_from_or_retired_at(other.dirty_from_or_retired_at),
       length(other.get_length()),
+      loaded_length(other.get_loaded_length()),
       version(other.version),
       poffset(other.poffset) {
     // the extent must be fully loaded before CoW
@@ -895,6 +930,7 @@ protected:
       dirty_from_or_retired_at(other.dirty_from_or_retired_at),
       ptr(other.ptr),
       length(other.get_length()),
+      loaded_length(other.get_loaded_length()),
       version(other.version),
       poffset(other.poffset) {
     // the extent must be fully loaded before CoW
@@ -908,7 +944,8 @@ protected:
   struct root_construct_t {};
   CachedExtent(root_construct_t)
     : ptr(ceph::bufferptr(0)),
-      length(0) {
+      length(0),
+      loaded_length(0) {
     assert(is_fully_loaded());
     // must call init() to fully initialize
   }
@@ -916,8 +953,9 @@ protected:
   struct retired_placeholder_construct_t {};
   CachedExtent(retired_placeholder_construct_t, extent_len_t _length)
     : state(extent_state_t::CLEAN),
-      length(_length) {
-    assert(length > 0);
+      length(_length),
+      loaded_length(0),
+      buffer_space(std::in_place) {
     assert(!is_fully_loaded());
     assert(is_aligned(length, CEPH_PAGE_SIZE));
     // must call init() to fully initialize
@@ -995,6 +1033,43 @@ protected:
     }
   }
 
+  /// Returns the ranges to load, converts to fully loaded if possible
+  load_ranges_t load_ranges(extent_len_t offset, extent_len_t _length) {
+    assert(is_aligned(offset, CEPH_PAGE_SIZE));
+    assert(is_aligned(_length, CEPH_PAGE_SIZE));
+    assert(_length > 0);
+    assert(offset + _length <= length);
+    assert(!is_fully_loaded());
+
+    if (loaded_length == 0 && _length == length) {
+      assert(offset == 0);
+      // skip rebuilding the buffer from buffer_space
+      ptr = create_extent_ptr_rand(length);
+      loaded_length = _length;
+      buffer_space.reset();
+      assert(is_fully_loaded());
+      load_ranges_t ret;
+      ret.push_back(offset, *ptr);
+      return ret;
+    }
+
+    load_ranges_t ret = buffer_space->load_ranges(offset, _length);
+    loaded_length += ret.length;
+    assert(length >= loaded_length);
+    if (length == loaded_length) {
+      // convert to fully loaded
+      ptr = buffer_space->to_full_ptr(length);
+      buffer_space.reset();
+      assert(is_fully_loaded());
+      // adjust ret since the ptr has been rebuilt
+      for (load_range_t& range : ret.ranges) {
+        auto range_length = range.ptr.length();
+        range.ptr = ceph::bufferptr(*ptr, range.offset, range_length);
+      }
+    }
+    return ret;
+  }
+
   friend class crimson::os::seastore::SegmentedAllocator;
   friend class crimson::os::seastore::TransactionManager;
   friend class crimson::os::seastore::ExtentPlacementManager;
index 83bb2c08af843ff576bdcf0e070363251cc4d4a4..70e6fe58e3a5b80b914c678d4434900a8b4b5553 100644 (file)
@@ -1499,11 +1499,18 @@ ObjectDataHandler::read_ret ObjectDataHandler::read(
             [FNAME, ctx, l_start, l_end,
              &l_current, &ret](auto &pin) -> read_iertr::future<> {
             auto pin_start = pin->get_key();
+            extent_len_t read_start;
+            extent_len_t read_start_aligned;
             if (l_current == l_start) { // first pin may skip head
               ceph_assert(l_current.get_aligned_laddr() >= pin_start);
+              read_start = l_current.template
+                get_byte_distance<extent_len_t>(pin_start);
+              read_start_aligned = p2align(read_start, ctx.tm.get_block_size());
             } else { // non-first pin must match start
               assert(l_current > l_start);
               ceph_assert(l_current == pin_start);
+              read_start = 0;
+              read_start_aligned = 0;
             }
 
             ceph_assert(l_current < l_end);
@@ -1528,24 +1535,37 @@ ObjectDataHandler::read_ret ObjectDataHandler::read(
             }
 
             // non-zero pin
+            laddr_t l_current_end_aligned = l_current_end.get_roundup_laddr();
+            extent_len_t read_len_aligned =
+              l_current_end_aligned.get_byte_distance<extent_len_t>(pin_start);
+            read_len_aligned -= read_start_aligned;
+            extent_len_t unalign_start_offset = read_start - read_start_aligned;
             DEBUGT("reading {}~{} from pin {}~{}",
               ctx.t,
               l_current,
               read_len,
               pin_start,
               pin_len);
-            extent_len_t e_current_off =
-              l_current.template get_byte_distance<extent_len_t>(pin_start);
             return ctx.tm.read_pin<ObjectDataBlock>(
               ctx.t,
-              std::move(pin)
+              std::move(pin),
+              read_start_aligned,
+              read_len_aligned
             ).si_then([&ret, &l_current, l_current_end,
-                       e_current_off, read_len](auto maybe_indirect_extent) {
-              ret.append(
-                bufferptr(
-                  maybe_indirect_extent.get_bptr(),
-                  e_current_off,
-                  read_len));
+                       read_start_aligned, read_len_aligned,
+                       unalign_start_offset, read_len](auto maybe_indirect_extent) {
+              auto aligned_bl = maybe_indirect_extent.get_range(
+                  read_start_aligned, read_len_aligned);
+              if (read_len < read_len_aligned) {
+                ceph::bufferlist unaligned_bl;
+                unaligned_bl.substr_of(
+                    aligned_bl, unalign_start_offset, read_len);
+                ret.append(std::move(unaligned_bl));
+              } else {
+                assert(read_len == read_len_aligned);
+                assert(unalign_start_offset == 0);
+                ret.append(std::move(aligned_bl));
+              }
               l_current = l_current_end;
               return seastar::now();
             }).handle_error_interruptible(
index b2e500d9d9ca527af86bfc5ba1b0b495a383c728..8f2870a6a2776842b02811562482cc099a8b1d40 100644 (file)
@@ -166,6 +166,20 @@ public:
         return extent->get_bptr();
       }
     }
+
+    ceph::bufferlist get_range(
+        extent_len_t offset, extent_len_t length) const {
+      if (is_indirect()) {
+        assert(maybe_indirect_info->intermediate_offset + offset + length <=
+               extent->get_length());
+        assert(offset + length <= maybe_indirect_info->length);
+        return extent->get_range(
+            maybe_indirect_info->intermediate_offset + offset,
+            length);
+      } else {
+        return extent->get_range(offset, length);
+      }
+    }
   };
 
   /**
@@ -226,16 +240,28 @@ public:
   template <typename T>
   base_iertr::future<maybe_indirect_extent_t<T>> read_pin(
     Transaction &t,
-    LBAMappingRef pin)
+    LBAMappingRef pin,
+    extent_len_t partial_off,
+    extent_len_t partial_len)
   {
+    static_assert(is_logical_type(T::TYPE));
+    assert(is_aligned(partial_off, get_block_size()));
+    assert(is_aligned(partial_len, get_block_size()));
+
+    extent_len_t direct_partial_off = partial_off;
     bool is_clone = pin->is_clone();
     std::optional<indirect_info_t> maybe_indirect_info;
     if (pin->is_indirect()) {
+      auto intermediate_offset = pin->get_intermediate_offset();
+      direct_partial_off = intermediate_offset + partial_off;
       maybe_indirect_info = indirect_info_t{
-        pin->get_intermediate_offset(), pin->get_length()};
+        intermediate_offset, pin->get_length()};
     }
+
     LOG_PREFIX(TransactionManager::read_pin);
-    SUBDEBUGT(seastore_tm, "{} {} ...", t, T::TYPE, *pin);
+    SUBDEBUGT(seastore_tm, "{} {} 0x{:x}~0x{:x} direct_off=0x{:x} ...",
+              t, T::TYPE, *pin, partial_off, partial_len, direct_partial_off);
+
     auto fut = base_iertr::make_ready_future<LBAMappingRef>();
     if (!pin->is_parent_viewable()) {
       if (pin->is_parent_valid()) {
@@ -252,14 +278,19 @@ public:
       pin->maybe_fix_pos();
       fut = base_iertr::make_ready_future<LBAMappingRef>(std::move(pin));
     }
-    return fut.si_then([&t, this](auto npin) {
+    return fut.si_then([&t, this, direct_partial_off, partial_len](auto npin) {
       // checking the lba child must be atomic with creating
       // and linking the absent child
       auto ret = get_extent_if_linked<T>(t, std::move(npin));
       if (ret.index() == 1) {
-       return std::move(std::get<1>(ret));
+       return std::get<1>(ret
+        ).si_then([direct_partial_off, partial_len, this, &t](auto extent) {
+          return cache->read_extent_maybe_partial(
+            t, std::move(extent), direct_partial_off, partial_len);
+        });
       } else {
-       return this->pin_to_extent<T>(t, std::move(std::get<0>(ret)));
+       return this->pin_to_extent<T>(
+          t, std::move(std::get<0>(ret)), direct_partial_off, partial_len);
       }
     }).si_then([FNAME, maybe_indirect_info, is_clone, &t](TCachedExtentRef<T> ext) {
       if (maybe_indirect_info.has_value()) {
@@ -274,6 +305,15 @@ public:
     });
   }
 
+  template <typename T>
+  base_iertr::future<maybe_indirect_extent_t<T>> read_pin(
+    Transaction &t,
+    LBAMappingRef pin)
+  {
+    auto& pin_ref = *pin;
+    return read_pin<T>(t, std::move(pin), 0, pin_ref.get_length());
+  }
+
   /// Obtain mutable copy of extent
   LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) {
     LOG_PREFIX(TransactionManager::get_mutable_extent);
@@ -523,6 +563,7 @@ public:
              ? (ext && ext->is_fully_loaded())
              : true);
          std::optional<ceph::bufferptr> original_bptr;
+         // TODO: preserve the bufferspace if partially loaded
          if (ext && ext->is_fully_loaded()) {
            ceph_assert(!ext->is_mutable());
            ceph_assert(ext->get_length() >= original_len);
@@ -969,6 +1010,7 @@ private:
    * pin_to_extent
    *
    * Get extent mapped at pin.
+   * partially load buffer from direct_partial_off~partial_len if not present.
    */
   using pin_to_extent_iertr = base_iertr;
   template <typename T>
@@ -977,19 +1019,28 @@ private:
   template <typename T>
   pin_to_extent_ret<T> pin_to_extent(
     Transaction &t,
-    LBAMappingRef pin) {
-    LOG_PREFIX(TransactionManager::pin_to_extent);
-    SUBTRACET(seastore_tm, "getting absent extent from pin {} ...", t, *pin);
+    LBAMappingRef pin,
+    extent_len_t direct_partial_off,
+    extent_len_t partial_len) {
     static_assert(is_logical_type(T::TYPE));
     using ret = pin_to_extent_ret<T>;
     auto &pref = *pin;
     auto direct_length = pref.is_indirect() ?
       pref.get_intermediate_length() :
       pref.get_length();
+    if (full_extent_integrity_check) {
+      direct_partial_off = 0;
+      partial_len = direct_length;
+    }
+    LOG_PREFIX(TransactionManager::pin_to_extent);
+    SUBTRACET(seastore_tm, "getting absent extent from pin {}, 0x{:x}~0x{:x} ...",
+              t, *pin, direct_partial_off, partial_len);
     return cache->get_absent_extent<T>(
       t,
       pref.get_val(),
       direct_length,
+      direct_partial_off,
+      partial_len,
       [&pref]
       (T &extent) mutable {
        assert(!extent.has_laddr());
@@ -1000,30 +1051,33 @@ private:
        extent.maybe_set_intermediate_laddr(pref);
       }
     ).si_then([FNAME, &t, pin=std::move(pin), this](auto ref) mutable -> ret {
-      auto crc = ref->calc_crc32c();
-      SUBTRACET(
-       seastore_tm,
-       "got extent -- {}, chksum in the lba tree: {}, actual chksum: {}",
-       t,
-       *ref,
-       pin->get_checksum(),
-       crc);
-      assert(ref->is_fully_loaded());
-      bool inconsistent = false;
-      if (full_extent_integrity_check) {
-       inconsistent = (pin->get_checksum() != crc);
-      } else { // !full_extent_integrity_check: remapped extent may be skipped
-       inconsistent = !(pin->get_checksum() == 0 ||
-                        pin->get_checksum() == crc);
-      }
-      if (unlikely(inconsistent)) {
-       SUBERRORT(seastore_tm,
-         "extent checksum inconsistent, recorded: {}, actual: {}, {}",
+      if (ref->is_fully_loaded()) {
+        auto crc = ref->calc_crc32c();
+        SUBTRACET(
+         seastore_tm,
+         "got extent -- {}, chksum in the lba tree: {}, actual chksum: {}",
          t,
+         *ref,
          pin->get_checksum(),
-         crc,
-         *ref);
-       ceph_abort();
+         crc);
+        bool inconsistent = false;
+        if (full_extent_integrity_check) {
+         inconsistent = (pin->get_checksum() != crc);
+        } else { // !full_extent_integrity_check: remapped extent may be skipped
+         inconsistent = !(pin->get_checksum() == 0 ||
+                           pin->get_checksum() == crc);
+        }
+        if (unlikely(inconsistent)) {
+         SUBERRORT(seastore_tm,
+           "extent checksum inconsistent, recorded: {}, actual: {}, {}",
+           t,
+           pin->get_checksum(),
+           crc,
+           *ref);
+         ceph_abort();
+        }
+      } else {
+        assert(!full_extent_integrity_check);
       }
       return pin_to_extent_ret<T>(
        interruptible::ready_future_marker{},