git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd: add aio_read
author Yuan Lu <yuan.y.lu@intel.com>
Tue, 14 Apr 2020 07:58:28 +0000 (15:58 +0800)
committer Yuan Lu <yuan.y.lu@intel.com>
Mon, 27 Apr 2020 02:25:54 +0000 (10:25 +0800)
Signed-off-by: Peterson, Scott <scott.d.peterson@intel.com>
Signed-off-by: Li, Xiaoyan <xiaoyan.li@intel.com>
Signed-off-by: Lu, Yuan <yuan.y.lu@intel.com>
Signed-off-by: Chamarthy, Mahati <mahati.chamarthy@intel.com>
src/librbd/cache/ReplicatedWriteLog.cc
src/librbd/cache/ReplicatedWriteLog.h
src/librbd/cache/rwl/LogEntry.h
src/librbd/cache/rwl/Request.cc
src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc

index 661215085d33b1083e0871616b841d2e345b0826..533ba7eacb6a69e3e0958d804bea13b714ac4ac3 100644 (file)
@@ -52,6 +52,7 @@ ReplicatedWriteLog<I>::ReplicatedWriteLog(I &image_ctx, librbd::cache::rwl::Imag
       "librbd::cache::ReplicatedWriteLog::m_lock", this))),
     m_blockguard_lock(ceph::make_mutex(util::unique_lock_name(
       "librbd::cache::ReplicatedWriteLog::m_blockguard_lock", this))),
+    m_blocks_to_log_entries(image_ctx.cct),
     m_thread_pool(image_ctx.cct, "librbd::cache::ReplicatedWriteLog::thread_pool", "tp_rwl",
                   4,
                   ""),
@@ -565,6 +566,114 @@ template <typename I>
 void ReplicatedWriteLog<I>::aio_read(Extents&& image_extents,
                                      ceph::bufferlist* bl,
                                      int fadvise_flags, Context *on_finish) {
+  // TODO: handle writesame and discard case in later PRs
+  CephContext *cct = m_image_ctx.cct;
+  utime_t now = ceph_clock_now();
+  C_ReadRequest *read_ctx = new C_ReadRequest(cct, now, m_perfcounter, bl, on_finish);  // accumulates read_extents/miss_extents below; completed directly or handed to the lower layer at the end
+  ldout(cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id
+                 << "image_extents=" << image_extents << ", "  // NOTE(review): no separator is emitted between the id value and "image_extents=" -- confirm whether intended
+                 << "bl=" << bl << ", "
+                 << "on_finish=" << on_finish << dendl;
+
+  ceph_assert(m_initialized);
+  bl->clear();  // the caller's output bufferlist is emptied before any extent is processed
+  m_perfcounter->inc(l_librbd_rwl_rd_req, 1);
+
+  /*
+   * The strategy here is to look up all the WriteLogMapEntries that overlap
+   * this read, and iterate through those to separate this read into hits and
+   * misses. A new Extents object is produced here with Extents for each miss
+   * region. The miss Extents is then passed on to the read cache below RWL. We
+   * also produce an ImageExtentBufs for all the extents (hit or miss) in this
+   * read. When the read from the lower cache layer completes, we iterate
+   * through the ImageExtentBufs and insert buffers for each cache hit at the
+   * appropriate spot in the bufferlist returned from below for the miss
+   * read. The buffers we insert here refer directly to regions of various
+   * write log entry data buffers.
+   *
+   * Locking: These buffer objects hold a reference on the write log entries
+   * they refer to. Log entries can't be retired until there are no references.
+   * The GenericWriteLogEntry references are released by the buffer destructor.
+   */
+  for (auto &extent : image_extents) {
+    uint64_t extent_offset = 0;  // bytes of this extent already classified as hit or miss
+    RWLock::RLocker entry_reader_locker(m_entry_reader_lock);  // locker object is scoped to this loop iteration (one per image extent)
+    WriteLogMapEntries map_entries = m_blocks_to_log_entries.find_map_entries(block_extent(extent));
+    for (auto &map_entry : map_entries) {
+      Extent entry_image_extent(rwl::image_extent(map_entry.block_extent));
+      /* If this map entry starts after the current image extent offset ... */
+      if (entry_image_extent.first > extent.first + extent_offset) {
+        /* ... add range before map_entry to miss extents */
+        uint64_t miss_extent_start = extent.first + extent_offset;
+        uint64_t miss_extent_length = entry_image_extent.first - miss_extent_start;
+        Extent miss_extent(miss_extent_start, miss_extent_length);
+        read_ctx->miss_extents.push_back(miss_extent);
+        /* Add miss range to read extents */
+        ImageExtentBuf miss_extent_buf(miss_extent);  // constructed without a bufferlist, unlike the hit case below
+        read_ctx->read_extents.push_back(miss_extent_buf);
+        extent_offset += miss_extent_length;
+      }
+      ceph_assert(entry_image_extent.first <= extent.first + extent_offset);  // after the gap handling above, the entry must start at or before the current position
+      uint64_t entry_offset = 0;
+      /* If this map entry starts before the current image extent offset ... */
+      if (entry_image_extent.first < extent.first + extent_offset) {
+        /* ... compute offset into log entry for this read extent */
+        entry_offset = (extent.first + extent_offset) - entry_image_extent.first;
+      }
+      /* This read hit ends at the end of the extent or the end of the log
+         entry, whichever is less. */
+      uint64_t entry_hit_length = min(entry_image_extent.second - entry_offset,
+                                      extent.second - extent_offset);
+      Extent hit_extent(entry_image_extent.first, entry_hit_length);  // NOTE(review): starts at the map entry's start, not at start + entry_offset -- confirm consumers rely only on the length
+
+      /* Offset of the map entry into the log entry's buffer */
+      uint64_t map_entry_buffer_offset = entry_image_extent.first - map_entry.log_entry->ram_entry.image_offset_bytes;
+      /* Offset into the log entry buffer of this read hit */
+      uint64_t read_buffer_offset = map_entry_buffer_offset + entry_offset;
+      /* Create buffer object referring to pmem pool for this read hit */
+      auto write_entry = map_entry.log_entry;
+
+      /* Make a bl for this hit extent. This will add references to the write_entry->pmem_bp */
+      buffer::list hit_bl;
+
+      buffer::list entry_bl_copy;
+      write_entry->copy_pmem_bl(&entry_bl_copy);  // obtains the whole entry's buffer first; only the hit range is taken from it on the next line
+      entry_bl_copy.begin(read_buffer_offset).copy(entry_hit_length, hit_bl);
+
+      ceph_assert(hit_bl.length() == entry_hit_length);
+
+      /* Add hit extent to read extents */
+      ImageExtentBuf hit_extent_buf(hit_extent, hit_bl);
+      read_ctx->read_extents.push_back(hit_extent_buf);
+
+      /* Exclude RWL hit range from buffer and extent */
+      extent_offset += entry_hit_length;
+      ldout(cct, 20) << map_entry << dendl;
+    }
+    /* If the last map entry didn't consume the entire image extent ... */
+    if (extent.second > extent_offset) {
+      /* ... add the rest of this extent to miss extents */
+      uint64_t miss_extent_start = extent.first + extent_offset;
+      uint64_t miss_extent_length = extent.second - extent_offset;
+      Extent miss_extent(miss_extent_start, miss_extent_length);
+      read_ctx->miss_extents.push_back(miss_extent);
+      /* Add miss range to read extents */
+      ImageExtentBuf miss_extent_buf(miss_extent);
+      read_ctx->read_extents.push_back(miss_extent_buf);
+      extent_offset += miss_extent_length;  // NOTE(review): extent_offset is not read again before it goes out of scope
+    }
+  }
+
+  ldout(cct, 20) << "miss_extents=" << read_ctx->miss_extents << ", "
+                 << "miss_bl=" << read_ctx->miss_bl << dendl;
+
+  if (read_ctx->miss_extents.empty()) {
+    /* All of this read comes from RWL */
+    read_ctx->complete(0);
+  } else {
+    /* Pass the read misses on to the layer below RWL */
+    m_image_writeback.aio_read(std::move(read_ctx->miss_extents), &read_ctx->miss_bl, fadvise_flags, read_ctx);  // read_ctx goes in the final argument position (presumably the completion context -- confirm against ImageWriteback)
+  }
 }
 
 template <typename I>
@@ -2041,6 +2150,11 @@ void ReplicatedWriteLog<I>::internal_flush(Context *on_finish) {
   detain_guarded_request(nullptr, guarded_ctx, true);
 }
 
+template <typename I>
+void ReplicatedWriteLog<I>::add_into_log_map(GenericWriteLogEntries &log_entries) {  // public entry point; called as rwl.add_into_log_map() from C_WriteRequest<T>::setup_log_operations
+  m_blocks_to_log_entries.add_log_entries(log_entries);  // forwards the whole collection to the map that aio_read queries via find_map_entries()
+}
+
 } // namespace cache
 } // namespace librbd
 
index 88fa56e5a71a612405365a3e9da05f0a5281f2f4..668289889dc4a8a131a0eeca05e33619d75a5cae 100644 (file)
@@ -14,6 +14,7 @@
 #include "librbd/cache/Types.h"
 #include "librbd/cache/rwl/LogOperation.h"
 #include "librbd/cache/rwl/Request.h"
+#include "librbd/cache/rwl/LogMap.h"
 #include <functional>
 #include <list>
 
@@ -36,6 +37,8 @@ class GenericLogEntry;
 typedef std::list<std::shared_ptr<WriteLogEntry>> WriteLogEntries;
 typedef std::list<std::shared_ptr<GenericLogEntry>> GenericLogEntries;
 typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
+typedef LogMapEntries<GenericWriteLogEntry> WriteLogMapEntries;  // type aio_read uses to hold the result of find_map_entries()
+typedef LogMap<GenericWriteLogEntry> WriteLogMap;  // type of the m_blocks_to_log_entries member
 
 /**** Write log entries end ****/
 
@@ -125,6 +128,7 @@ public:
   uint32_t get_free_log_entries() {
     return m_free_log_entries;
   }
+  void add_into_log_map(rwl::GenericWriteLogEntries &log_entries);
 private:
   typedef std::list<rwl::C_WriteRequest<This> *> C_WriteRequests;
   typedef std::list<rwl::C_BlockIORequest<This> *> C_BlockIORequests;
@@ -220,6 +224,8 @@ private:
   rwl::GenericLogOperations m_ops_to_flush; /* Write ops needing flush in local log */
   rwl::GenericLogOperations m_ops_to_append; /* Write ops needing event append in local log */
 
+  rwl::WriteLogMap m_blocks_to_log_entries;  // filled by add_into_log_map(); queried by aio_read() through find_map_entries()
+
   /* New entries are at the back. Oldest at the front */
   rwl::GenericLogEntries m_log_entries;
   rwl::GenericLogEntries m_dirty_log_entries;
index 471102f6c5e837e987aea63df0cf31c28fec490c..9e5d3c53403df4b3032f966fbe5a45abab6cdcc3 100644 (file)
@@ -19,6 +19,8 @@ class SyncPointLogEntry;
 class GenericWriteLogEntry;
 class WriteLogEntry;
 
+typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
+
 class GenericLogEntry {
 public:
   WriteLogPmemEntry ram_entry;
@@ -106,6 +108,7 @@ public:
   std::shared_ptr<SyncPointLogEntry> get_sync_point_entry() override {
     return sync_point_entry;
   }
+  virtual void copy_pmem_bl(bufferlist *out_bl) = 0;  // pure virtual; overridden by the later copy_pmem_bl(...) override declaration in this header; aio_read calls it on map_entry.log_entry
   std::ostream &format(std::ostream &os) const;
   friend std::ostream &operator<<(std::ostream &os,
                                   const GenericWriteLogEntry &entry);
@@ -151,7 +154,7 @@ public:
   /* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */
   buffer::list &get_pmem_bl();
   /* Constructs a new bl containing copies of pmem_bp */
-  void copy_pmem_bl(bufferlist *out_bl);
+  void copy_pmem_bl(bufferlist *out_bl) override;
   void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
                  Context *ctx) override;
   std::ostream &format(std::ostream &os) const;
index 697832894090105009273f54a7f5a3be89738b73..a19dfbba469cd727d065eef7db20ee94896aba99 100644 (file)
@@ -14,8 +14,6 @@ namespace librbd {
 namespace cache {
 namespace rwl {
 
-typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
-
 template <typename T>
 C_BlockIORequest<T>::C_BlockIORequest(T &rwl, const utime_t arrived, io::Extents &&extents,
                                       bufferlist&& bl, const int fadvise_flags, Context *user_req)
@@ -191,6 +189,7 @@ void C_WriteRequest<T>::setup_buffer_resources(
 
 template <typename T>
 void C_WriteRequest<T>::setup_log_operations(DeferredContexts &on_exit) {
+  GenericWriteLogEntries log_entries;
   {
     std::lock_guard locker(m_lock);
     std::shared_ptr<SyncPoint> current_sync_point = rwl.get_current_sync_point();
@@ -232,6 +231,7 @@ void C_WriteRequest<T>::setup_log_operations(DeferredContexts &on_exit) {
       /* A WS is also a write */
       ldout(rwl.get_context(), 20) << "write_req=" << *this << " op_set=" << op_set.get()
                                    << " operation=" << operation << dendl;
+      log_entries.emplace_back(operation->log_entry);
       rwl.inc_last_op_sequence_num();
       operation->init(true, allocation, current_sync_gen,
                       rwl.get_last_op_sequence_num(), this->bl, buffer_offset, op_set->persist_on_flush);
@@ -248,6 +248,7 @@ void C_WriteRequest<T>::setup_log_operations(DeferredContexts &on_exit) {
   for (auto &operation : op_set->operations) {
     operation->copy_bl_to_pmem_buffer();
   }
+  rwl.add_into_log_map(log_entries);
 }
 
 template <typename T>
index d6b230fc6de591592ec8f2ab59dcc7c121230bfd..d2a07b92985302d75faab67764ec507fa6b405de 100644 (file)
@@ -45,6 +45,7 @@ inline ImageCtx *get_image_ctx(MockImageCtx *image_ctx) {
 #include "librbd/cache/rwl/Request.cc"
 #include "librbd/cache/rwl/Types.cc"
 #include "librbd/cache/rwl/LogOperation.cc"
+#include "librbd/cache/rwl/LogMap.cc"
 
 template class librbd::cache::ImageWriteback<librbd::MockImageCtx>;
 template class librbd::cache::rwl::ImageCacheState<librbd::MockImageCtx>;