]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd/cache: Implement aio_read operation 38796/head
authorMahati Chamarthy <mahati.chamarthy@intel.com>
Mon, 4 Jan 2021 10:27:59 +0000 (15:57 +0530)
committerMahati Chamarthy <mahati.chamarthy@intel.com>
Thu, 7 Jan 2021 04:24:25 +0000 (09:54 +0530)
... and retire entries, i.e. flush them to the OSD.
Support writesame, compare_and_write, discard
and invalidate IO operations with tests.

Signed-off-by: Lisa Li <xiaoyan.li@intel.com>
Signed-off-by: Mahati Chamarthy <mahati.chamarthy@intel.com>
Signed-off-by: Changcheng Liu <changcheng.liu@intel.com>
19 files changed:
src/librbd/CMakeLists.txt
src/librbd/cache/pwl/AbstractWriteLog.cc
src/librbd/cache/pwl/AbstractWriteLog.h
src/librbd/cache/pwl/Builder.h
src/librbd/cache/pwl/LogEntry.h
src/librbd/cache/pwl/ReadRequest.cc [deleted file]
src/librbd/cache/pwl/ReadRequest.h
src/librbd/cache/pwl/Types.h
src/librbd/cache/pwl/rwl/Builder.h
src/librbd/cache/pwl/rwl/ReadRequest.cc [new file with mode: 0644]
src/librbd/cache/pwl/rwl/ReadRequest.h [new file with mode: 0644]
src/librbd/cache/pwl/rwl/WriteLog.cc
src/librbd/cache/pwl/rwl/WriteLog.h
src/librbd/cache/pwl/ssd/Builder.h
src/librbd/cache/pwl/ssd/ReadRequest.cc [new file with mode: 0644]
src/librbd/cache/pwl/ssd/ReadRequest.h [new file with mode: 0644]
src/librbd/cache/pwl/ssd/WriteLog.cc
src/librbd/cache/pwl/ssd/WriteLog.h
src/test/librbd/cache/pwl/test_mock_SSDWriteLog.cc

index 44663f4af67e8e2b104de971de31dcae84bca1e1..c1bacb991dfad616f8551b1649af1745d95375c3 100644 (file)
@@ -265,7 +265,6 @@ if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE)
     cache/pwl/LogEntry.cc
     cache/pwl/LogMap.cc
     cache/pwl/LogOperation.cc
-    cache/pwl/ReadRequest.cc
     cache/pwl/Request.cc
     cache/pwl/ShutdownRequest.cc
     cache/pwl/SyncPoint.cc
@@ -276,6 +275,7 @@ if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE)
       set(rbd_plugin_pwl_srcs
         ${rbd_plugin_pwl_srcs}
         cache/pwl/ssd/LogEntry.cc
+        cache/pwl/ssd/ReadRequest.cc
         cache/pwl/ssd/Request.cc
         cache/pwl/ssd/WriteLog.cc)
     endif()
@@ -285,6 +285,7 @@ if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE)
         cache/pwl/rwl/WriteLog.cc
         cache/pwl/rwl/LogEntry.cc
         cache/pwl/rwl/LogOperation.cc
+        cache/pwl/rwl/ReadRequest.cc
         cache/pwl/rwl/Request.cc)
     endif()
 
index 075a8b15bfb63aa09c3ca0542bbd347355cda260..987f5258df8cceb3ccc2332c9b4dbae09726cefb 100644 (file)
@@ -17,7 +17,6 @@
 #include "librbd/asio/ContextWQ.h"
 #include "librbd/cache/pwl/ImageCacheState.h"
 #include "librbd/cache/pwl/LogEntry.h"
-#include "librbd/cache/pwl/ReadRequest.h"
 #include "librbd/plugin/Api.h"
 #include <map>
 #include <vector>
@@ -670,7 +669,8 @@ void AbstractWriteLog<I>::read(Extents&& image_extents,
   // TODO: handle writesame and discard case in later PRs
   CephContext *cct = m_image_ctx.cct;
   utime_t now = ceph_clock_now();
-  C_ReadRequest *read_ctx = new C_ReadRequest(cct, now, m_perfcounter, bl, on_finish);
+  C_ReadRequest *read_ctx = m_builder->create_read_request(
+      cct, now, m_perfcounter, bl, on_finish);
   ldout(cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id
                  << "image_extents=" << image_extents << ", "
                  << "bl=" << bl << ", "
@@ -680,6 +680,22 @@ void AbstractWriteLog<I>::read(Extents&& image_extents,
   bl->clear();
   m_perfcounter->inc(l_librbd_pwl_rd_req, 1);
 
+  std::vector<WriteLogCacheEntry*> log_entries_to_read;
+  std::vector<bufferlist*> bls_to_read;
+
+  Context *ctx = new LambdaContext(
+    [this, read_ctx, fadvise_flags](int r) {
+      if (read_ctx->miss_extents.empty()) {
+      /* All of this read comes from RWL */
+        read_ctx->complete(0);
+      } else {
+      /* Pass the read misses on to the layer below RWL */
+        m_image_writeback.aio_read(
+            std::move(read_ctx->miss_extents), &read_ctx->miss_bl,
+            fadvise_flags, read_ctx);
+      }
+    });
+
   /*
    * The strategy here is to look up all the WriteLogMapEntries that overlap
    * this read, and iterate through those to separate this read into hits and
@@ -699,14 +715,16 @@ void AbstractWriteLog<I>::read(Extents&& image_extents,
   for (auto &extent : image_extents) {
     uint64_t extent_offset = 0;
     RWLock::RLocker entry_reader_locker(m_entry_reader_lock);
-    WriteLogMapEntries map_entries = m_blocks_to_log_entries.find_map_entries(block_extent(extent));
+    WriteLogMapEntries map_entries = m_blocks_to_log_entries.find_map_entries(
+        block_extent(extent));
     for (auto &map_entry : map_entries) {
       Extent entry_image_extent(pwl::image_extent(map_entry.block_extent));
       /* If this map entry starts after the current image extent offset ... */
       if (entry_image_extent.first > extent.first + extent_offset) {
         /* ... add range before map_entry to miss extents */
         uint64_t miss_extent_start = extent.first + extent_offset;
-        uint64_t miss_extent_length = entry_image_extent.first - miss_extent_start;
+        uint64_t miss_extent_length = entry_image_extent.first -
+          miss_extent_start;
         Extent miss_extent(miss_extent_start, miss_extent_length);
         read_ctx->miss_extents.push_back(miss_extent);
         /* Add miss range to read extents */
@@ -726,10 +744,13 @@ void AbstractWriteLog<I>::read(Extents&& image_extents,
       uint64_t entry_hit_length = min(entry_image_extent.second - entry_offset,
                                       extent.second - extent_offset);
       Extent hit_extent(entry_image_extent.first, entry_hit_length);
-      if (0 == map_entry.log_entry->write_bytes() && 0 < map_entry.log_entry->bytes_dirty()) {
+      if (0 == map_entry.log_entry->write_bytes() &&
+          0 < map_entry.log_entry->bytes_dirty()) {
         /* discard log entry */
         auto discard_entry = map_entry.log_entry;
-        ldout(cct, 20) << "read hit on discard entry: log_entry=" << *discard_entry << dendl;
+        ldout(cct, 20) << "read hit on discard entry: log_entry="
+                       << *discard_entry
+                       << dendl;
         /* Discards read as zero, so we'll construct a bufferlist of zeros */
         bufferlist zero_bl;
         zero_bl.append_zero(entry_hit_length);
@@ -739,24 +760,14 @@ void AbstractWriteLog<I>::read(Extents&& image_extents,
       } else {
         /* write and writesame log entry */
         /* Offset of the map entry into the log entry's buffer */
-        uint64_t map_entry_buffer_offset = entry_image_extent.first - map_entry.log_entry->ram_entry.image_offset_bytes;
+        uint64_t map_entry_buffer_offset = entry_image_extent.first -
+          map_entry.log_entry->ram_entry.image_offset_bytes;
         /* Offset into the log entry buffer of this read hit */
         uint64_t read_buffer_offset = map_entry_buffer_offset + entry_offset;
-        /* Create buffer object referring to cache pool for this read hit */
-        auto write_entry = map_entry.log_entry;
-
-        /* Make a bl for this hit extent. This will add references to the write_entry->pmem_bp */
-        buffer::list hit_bl;
-
-        buffer::list entry_bl_copy;
-        write_entry->copy_cache_bl(&entry_bl_copy);
-        entry_bl_copy.begin(read_buffer_offset).copy(entry_hit_length, hit_bl);
-
-        ceph_assert(hit_bl.length() == entry_hit_length);
-
-        /* Add hit extent to read extents */
-        ImageExtentBuf hit_extent_buf(hit_extent, hit_bl);
-        read_ctx->read_extents.push_back(hit_extent_buf);
+        /* Create buffer object referring to cache pool for this read hit */
+        collect_read_extents(
+            read_buffer_offset, map_entry, log_entries_to_read, bls_to_read,
+            entry_hit_length, hit_extent, read_ctx);
       }
       /* Exclude RWL hit range from buffer and extent */
       extent_offset += entry_hit_length;
@@ -779,13 +790,7 @@ void AbstractWriteLog<I>::read(Extents&& image_extents,
   ldout(cct, 20) << "miss_extents=" << read_ctx->miss_extents << ", "
                  << "miss_bl=" << read_ctx->miss_bl << dendl;
 
-  if (read_ctx->miss_extents.empty()) {
-    /* All of this read comes from RWL */
-    read_ctx->complete(0);
-  } else {
-    /* Pass the read misses on to the layer below RWL */
-    m_image_writeback.aio_read(std::move(read_ctx->miss_extents), &read_ctx->miss_bl, fadvise_flags, read_ctx);
-  }
+  complete_read(log_entries_to_read, bls_to_read, ctx);
 }
 
 template <typename I>
index dc0058bf97fcff23fb8581ebb44a85f79ecf4916..52b24dee3c2980f9a8226eda48ba33c3f11f6af8 100644 (file)
@@ -12,6 +12,7 @@
 #include "librbd/BlockGuard.h"
 #include "librbd/cache/Types.h"
 #include "librbd/cache/pwl/LogOperation.h"
+#include "librbd/cache/pwl/ReadRequest.h"
 #include "librbd/cache/pwl/Request.h"
 #include "librbd/cache/pwl/LogMap.h"
 #include "librbd/cache/pwl/Builder.h"
@@ -365,6 +366,14 @@ protected:
   virtual void remove_pool_file() = 0;
   virtual void initialize_pool(Context *on_finish,
                                pwl::DeferredContexts &later) = 0;
+  virtual void collect_read_extents(
+      uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
+      std::vector<WriteLogCacheEntry*> &log_entries_to_read,
+      std::vector<bufferlist*> &bls_to_read, uint64_t entry_hit_length,
+      Extent hit_extent, pwl::C_ReadRequest *read_ctx) = 0;
+  virtual void complete_read(
+      std::vector<WriteLogCacheEntry*> &log_entries_to_read,
+      std::vector<bufferlist*> &bls_to_read, Context *ctx) = 0;
   virtual void write_data_to_buffer(
       std::shared_ptr<pwl::WriteLogEntry> ws_entry,
       pwl::WriteLogCacheEntry *cache_entry) {}
index 6cea7ac0d6a21bc0e0fa92dcccafcc422612e439..b108d0e1845a739cbf715a81d14a409f07eae6f8 100644 (file)
@@ -45,6 +45,9 @@ public:
       WriteLogOperationSet &set, uint64_t image_offset_bytes,
       uint64_t write_bytes, uint32_t data_len, CephContext *cct,
       std::shared_ptr<WriteLogEntry> writesame_log_entry) = 0;
+  virtual C_ReadRequest *create_read_request(CephContext *cct, utime_t arrived,
+      PerfCounters *perfcounter, ceph::bufferlist *bl, Context *on_finish) = 0;
+
 };
 
 } // namespace pwl
index 7c216ad9e97b54583ad11a5b676a80a937f72e42..b29d7fb88bcb3439fd2da04f53d90ef19c3055b7 100644 (file)
@@ -62,6 +62,9 @@ public:
   virtual bool is_write_entry() const {
     return false;
   }
+  virtual bool is_writesame_entry() const {
+    return false;
+  }
   virtual bool is_sync_point() const {
     return false;
   }
@@ -223,6 +226,9 @@ public:
   bool is_write_entry() const override {
     return true;
   }
+  bool is_writesame_entry() const override {
+    return is_writesame;
+  }
   std::ostream &format(std::ostream &os) const;
   friend std::ostream &operator<<(std::ostream &os,
                                   const WriteLogEntry &entry);
diff --git a/src/librbd/cache/pwl/ReadRequest.cc b/src/librbd/cache/pwl/ReadRequest.cc
deleted file mode 100644 (file)
index 766e33f..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "ReadRequest.h"
-
-#define dout_subsys ceph_subsys_rbd_pwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::pwl::ReadRequest: " << this << " " \
-                           <<  __func__ << ": "
-
-namespace librbd {
-namespace cache {
-namespace pwl {
-
-void C_ReadRequest::finish(int r) {
-  ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << dendl;
-  int hits = 0;
-  int misses = 0;
-  int hit_bytes = 0;
-  int miss_bytes = 0;
-  if (r >= 0) {
-    /*
-     * At this point the miss read has completed. We'll iterate through
-     * read_extents and produce *m_out_bl by assembling pieces of miss_bl
-     * and the individual hit extent bufs in the read extents that represent
-     * hits.
-     */
-    uint64_t miss_bl_offset = 0;
-    for (auto &extent : read_extents) {
-      if (extent.m_bl.length()) {
-        /* This was a hit */
-        ceph_assert(extent.second == extent.m_bl.length());
-        ++hits;
-        hit_bytes += extent.second;
-        m_out_bl->claim_append(extent.m_bl);
-      } else {
-        /* This was a miss. */
-        ++misses;
-        miss_bytes += extent.second;
-        bufferlist miss_extent_bl;
-        miss_extent_bl.substr_of(miss_bl, miss_bl_offset, extent.second);
-        /* Add this read miss bufferlist to the output bufferlist */
-        m_out_bl->claim_append(miss_extent_bl);
-        /* Consume these bytes in the read miss bufferlist */
-        miss_bl_offset += extent.second;
-      }
-    }
-  }
-  ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << " bl=" << *m_out_bl << dendl;
-  utime_t now = ceph_clock_now();
-  ceph_assert((int)m_out_bl->length() == hit_bytes + miss_bytes);
-  m_on_finish->complete(r);
-  m_perfcounter->inc(l_librbd_pwl_rd_bytes, hit_bytes + miss_bytes);
-  m_perfcounter->inc(l_librbd_pwl_rd_hit_bytes, hit_bytes);
-  m_perfcounter->tinc(l_librbd_pwl_rd_latency, now - m_arrived_time);
-  if (!misses) {
-    m_perfcounter->inc(l_librbd_pwl_rd_hit_req, 1);
-    m_perfcounter->tinc(l_librbd_pwl_rd_hit_latency, now - m_arrived_time);
-  } else {
-    if (hits) {
-      m_perfcounter->inc(l_librbd_pwl_rd_part_hit_req, 1);
-    }
-  }
-}
-
-} // namespace pwl
-} // namespace cache
-} // namespace librbd
index 7c953547c875d0301796db9fe7272faf67738e67..c188733e107537a2ebfc4367c466d5c5c3d8c83b 100644 (file)
@@ -24,13 +24,11 @@ public:
       m_arrived_time(arrived), m_perfcounter(perfcounter) {}
   ~C_ReadRequest() {}
 
-  void finish(int r) override;
-
   const char *get_name() const {
     return "C_ReadRequest";
   }
 
-private:
+protected:
   CephContext *m_cct;
   Context *m_on_finish;
   bufferlist *m_out_bl;
index d8bdcfa7ddbd34f5933ed7abc639be2d9a4c4e5e..0f4f9c00182132a0347947373b91a190436d9e85 100644 (file)
@@ -364,10 +364,20 @@ Context * override_ctx(int r, Context *ctx);
 class ImageExtentBuf : public io::Extent {
 public:
   bufferlist m_bl;
-  ImageExtentBuf(io::Extent extent)
-    : io::Extent(extent) { }
-  ImageExtentBuf(io::Extent extent, bufferlist bl)
-    : io::Extent(extent), m_bl(bl) { }
+  bool need_to_truncate;
+  int truncate_offset;
+  bool writesame;
+  ImageExtentBuf() {}
+  ImageExtentBuf(io::Extent extent,
+                 bool need_to_truncate = false, uint64_t truncate_offset = 0,
+                 bool writesame = false)
+    : io::Extent(extent), need_to_truncate(need_to_truncate),
+      truncate_offset(truncate_offset), writesame(writesame) {}
+  ImageExtentBuf(io::Extent extent, bufferlist bl,
+                 bool need_to_truncate = false, uint64_t truncate_offset = 0,
+                 bool writesame = false)
+    : io::Extent(extent), m_bl(bl), need_to_truncate(need_to_truncate),
+      truncate_offset(truncate_offset), writesame(writesame) {}
 };
 
 std::string unique_lock_name(const std::string &name, void *address);
index 9665a83afd39f7b667e6c040004cf282d34795f1..1321d711b927248dccfa399829c864c834d0d21c 100644 (file)
@@ -6,6 +6,7 @@
 
 #include <iostream>
 #include "LogEntry.h"
+#include "ReadRequest.h"
 #include "Request.h"
 #include "LogOperation.h"
 
@@ -84,6 +85,10 @@ public:
         set, image_offset_bytes, write_bytes, data_len, cct,
         writesame_log_entry);
   }
+  C_ReadRequest *create_read_request(CephContext *cct, utime_t arrived,
+      PerfCounters *perfcounter, ceph::bufferlist *bl, Context *on_finish) {
+    return new C_ReadRequest(cct, arrived, perfcounter, bl, on_finish);
+  }
 };
 
 } // namespace rwl
diff --git a/src/librbd/cache/pwl/rwl/ReadRequest.cc b/src/librbd/cache/pwl/rwl/ReadRequest.cc
new file mode 100644 (file)
index 0000000..c453023
--- /dev/null
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ReadRequest.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::ReadRequest: " << this << " " \
+                           <<  __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+void C_ReadRequest::finish(int r) {
+  ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << dendl;
+  int hits = 0;
+  int misses = 0;
+  int hit_bytes = 0;
+  int miss_bytes = 0;
+  if (r >= 0) {
+    /*
+     * At this point the miss read has completed. We'll iterate through
+     * read_extents and produce *m_out_bl by assembling pieces of miss_bl
+     * and the individual hit extent bufs in the read extents that represent
+     * hits.
+     */
+    uint64_t miss_bl_offset = 0;
+    for (auto &extent : read_extents) {
+      if (extent.m_bl.length()) {
+        /* This was a hit */
+        ceph_assert(extent.second == extent.m_bl.length());
+        ++hits;
+        hit_bytes += extent.second;
+        m_out_bl->claim_append(extent.m_bl);
+      } else {
+        /* This was a miss. */
+        ++misses;
+        miss_bytes += extent.second;
+        bufferlist miss_extent_bl;
+        miss_extent_bl.substr_of(miss_bl, miss_bl_offset, extent.second);
+        /* Add this read miss bufferlist to the output bufferlist */
+        m_out_bl->claim_append(miss_extent_bl);
+        /* Consume these bytes in the read miss bufferlist */
+        miss_bl_offset += extent.second;
+      }
+    }
+  }
+  ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << " bl=" << *m_out_bl << dendl;
+  utime_t now = ceph_clock_now();
+  ceph_assert((int)m_out_bl->length() == hit_bytes + miss_bytes);
+  m_on_finish->complete(r);
+  m_perfcounter->inc(l_librbd_pwl_rd_bytes, hit_bytes + miss_bytes);
+  m_perfcounter->inc(l_librbd_pwl_rd_hit_bytes, hit_bytes);
+  m_perfcounter->tinc(l_librbd_pwl_rd_latency, now - m_arrived_time);
+  if (!misses) {
+    m_perfcounter->inc(l_librbd_pwl_rd_hit_req, 1);
+    m_perfcounter->tinc(l_librbd_pwl_rd_hit_latency, now - m_arrived_time);
+  } else {
+    if (hits) {
+      m_perfcounter->inc(l_librbd_pwl_rd_part_hit_req, 1);
+    }
+  }
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/rwl/ReadRequest.h b/src/librbd/cache/pwl/rwl/ReadRequest.h
new file mode 100644 (file)
index 0000000..25168e8
--- /dev/null
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_READ_REQUEST_H
+#define CEPH_LIBRBD_CACHE_PWL_RWL_READ_REQUEST_H
+
+#include "librbd/cache/pwl/ReadRequest.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+typedef std::vector<pwl::ImageExtentBuf> ImageExtentBufs;
+
+class C_ReadRequest : public pwl::C_ReadRequest {
+protected:
+  using pwl::C_ReadRequest::m_cct;
+  using pwl::C_ReadRequest::m_on_finish;
+  using pwl::C_ReadRequest::m_out_bl;
+  using pwl::C_ReadRequest::m_arrived_time;
+  using pwl::C_ReadRequest::m_perfcounter;
+public:
+  C_ReadRequest(CephContext *cct, utime_t arrived, PerfCounters *perfcounter, bufferlist *out_bl, Context *on_finish)
+    : pwl::C_ReadRequest(cct, arrived, perfcounter, out_bl, on_finish) {}
+  void finish(int r) override;
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_RWL_READ_REQUEST_H
index f05962863e80bea2fd4a5b7d9a30e8217b331243..c597464f9a87b0734395205b3dbd6dfcb7da69e6 100644 (file)
@@ -57,6 +57,36 @@ WriteLog<I>::~WriteLog() {
   delete m_builderobj;
 }
 
+template <typename I>
+void WriteLog<I>::collect_read_extents(
+      uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
+      std::vector<WriteLogCacheEntry*> &log_entries_to_read,
+      std::vector<bufferlist*> &bls_to_read, uint64_t entry_hit_length,
+      Extent hit_extent, pwl::C_ReadRequest *read_ctx) {
+  /* Make a bl for this hit extent. This will add references to the
+   * write_entry->pmem_bp */
+  buffer::list hit_bl;
+
+  /* Create buffer object referring to pmem pool for this read hit */
+  auto write_entry = map_entry.log_entry;
+
+  buffer::list entry_bl_copy;
+  write_entry->copy_cache_bl(&entry_bl_copy);
+  entry_bl_copy.begin(read_buffer_offset).copy(entry_hit_length, hit_bl);
+  ceph_assert(hit_bl.length() == entry_hit_length);
+
+  /* Add hit extent to read extents */
+  ImageExtentBuf hit_extent_buf(hit_extent, hit_bl);
+  read_ctx->read_extents.push_back(hit_extent_buf);
+}
+
+template <typename I>
+void WriteLog<I>::complete_read(
+    std::vector<WriteLogCacheEntry*> &log_entries_to_read,
+    std::vector<bufferlist*> &bls_to_read, Context *ctx) {
+  ctx->complete(0);
+}
+
 /*
  * Allocate the (already reserved) write log entries for a set of operations.
  *
index 4d65a1de8b0f8327037079a25bcc06e38e19d2be..39d63776eca1db02f09fb607204632cdd00d2d1d 100644 (file)
@@ -42,6 +42,7 @@ public:
   WriteLog(const WriteLog&) = delete;
   WriteLog &operator=(const WriteLog&) = delete;
 
+  typedef io::Extent Extent;
   using This = AbstractWriteLog<ImageCtxT>;
   using C_WriteRequestT = pwl::C_WriteRequest<This>;
   using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
@@ -83,6 +84,14 @@ protected:
   void schedule_append_ops(pwl::GenericLogOperations &ops) override;
   void append_scheduled_ops(void) override;
   void reserve_cache(C_BlockIORequestT *req, bool &alloc_succeeds, bool &no_space) override;
+  void collect_read_extents(
+      uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
+      std::vector<WriteLogCacheEntry*> &log_entries_to_read,
+      std::vector<bufferlist*> &bls_to_read, uint64_t entry_hit_length,
+      Extent hit_extent, pwl::C_ReadRequest *read_ctx) override;
+  void complete_read(
+      std::vector<WriteLogCacheEntry*> &log_entries_to_read,
+      std::vector<bufferlist*> &bls_to_read, Context *ctx) override;
   bool retire_entries(const unsigned long int frees_per_tx) override;
   void persist_last_flushed_sync_gen() override;
   bool alloc_resources(C_BlockIORequestT *req) override;
index f79d6855dfe8338922d36ca84975233418fae1bb..e761d4815efb129613e26120cbdcf6aeab8dceee 100644 (file)
@@ -6,6 +6,7 @@
 
 #include <iostream>
 #include "LogEntry.h"
+#include "ReadRequest.h"
 #include "Request.h"
 
 #include "librbd/cache/ImageWriteback.h"
@@ -83,6 +84,10 @@ public:
         set, image_offset_bytes, write_bytes, data_len, cct,
         writesame_log_entry);
   }
+  C_ReadRequest *create_read_request(CephContext *cct, utime_t arrived,
+      PerfCounters *perfcounter, ceph::bufferlist *bl, Context *on_finish) {
+    return new C_ReadRequest(cct, arrived, perfcounter, bl, on_finish);
+  }
 };
 
 
diff --git a/src/librbd/cache/pwl/ssd/ReadRequest.cc b/src/librbd/cache/pwl/ssd/ReadRequest.cc
new file mode 100644 (file)
index 0000000..c04fdd8
--- /dev/null
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ReadRequest.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::ReadRequest: " << this << " " \
+                           <<  __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+void C_ReadRequest::finish(int r) {
+  ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << dendl;
+  int hits = 0;
+  int misses = 0;
+  int hit_bytes = 0;
+  int miss_bytes = 0;
+  if (r >= 0) {
+    /*
+     * At this point the miss read has completed. We'll iterate through
+     * read_extents and produce *m_out_bl by assembling pieces of miss_bl
+     * and the individual hit extent bufs in the read extents that represent
+     * hits.
+     */
+    uint64_t miss_bl_offset = 0;
+    for (auto &extent : read_extents) {
+      if (extent.m_bl.length()) {
+        /* This was a hit */
+        bufferlist data_bl;
+        if (extent.writesame) {
+          int data_len = extent.m_bl.length();
+          int read_buffer_offset = extent.truncate_offset;
+          if (extent.need_to_truncate && extent.truncate_offset >= data_len) {
+            read_buffer_offset = (extent.truncate_offset) % data_len;
+          }
+          // build data and truncate
+          bufferlist temp_bl;
+          uint64_t total_left_bytes = read_buffer_offset + extent.second;
+          while (total_left_bytes > 0) {
+            temp_bl.append(extent.m_bl);
+            total_left_bytes = total_left_bytes - data_len;
+          }
+          data_bl.substr_of(temp_bl, read_buffer_offset, extent.second);
+          m_out_bl->claim_append(data_bl);
+        } else if (extent.need_to_truncate) {
+          assert(extent.m_bl.length() >= extent.truncate_offset + extent.second);
+          data_bl.substr_of(extent.m_bl, extent.truncate_offset, extent.second);
+          m_out_bl->claim_append(data_bl);
+        } else {
+          assert(extent.second == extent.m_bl.length());
+          m_out_bl->claim_append(extent.m_bl);
+        }
+        ++hits;
+        hit_bytes += extent.second;
+      } else {
+        /* This was a miss. */
+        ++misses;
+        miss_bytes += extent.second;
+        bufferlist miss_extent_bl;
+        miss_extent_bl.substr_of(miss_bl, miss_bl_offset, extent.second);
+        /* Add this read miss bufferlist to the output bufferlist */
+        m_out_bl->claim_append(miss_extent_bl);
+        /* Consume these bytes in the read miss bufferlist */
+        miss_bl_offset += extent.second;
+      }
+    }
+  }
+  ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << " bl=" << *m_out_bl << dendl;
+  utime_t now = ceph_clock_now();
+  ceph_assert((int)m_out_bl->length() == hit_bytes + miss_bytes);
+  m_on_finish->complete(r);
+  m_perfcounter->inc(l_librbd_pwl_rd_bytes, hit_bytes + miss_bytes);
+  m_perfcounter->inc(l_librbd_pwl_rd_hit_bytes, hit_bytes);
+  m_perfcounter->tinc(l_librbd_pwl_rd_latency, now - m_arrived_time);
+  if (!misses) {
+    m_perfcounter->inc(l_librbd_pwl_rd_hit_req, 1);
+    m_perfcounter->tinc(l_librbd_pwl_rd_hit_latency, now - m_arrived_time);
+  } else {
+    if (hits) {
+      m_perfcounter->inc(l_librbd_pwl_rd_part_hit_req, 1);
+    }
+  }
+}
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/ssd/ReadRequest.h b/src/librbd/cache/pwl/ssd/ReadRequest.h
new file mode 100644 (file)
index 0000000..345c4aa
--- /dev/null
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_READ_REQUEST_H
+#define CEPH_LIBRBD_CACHE_PWL_SSD_READ_REQUEST_H
+
+#include "librbd/cache/pwl/ReadRequest.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+typedef std::vector<pwl::ImageExtentBuf> ImageExtentBufs;
+
+class C_ReadRequest : public pwl::C_ReadRequest {
+protected:
+  using pwl::C_ReadRequest::m_cct;
+  using pwl::C_ReadRequest::m_on_finish;
+  using pwl::C_ReadRequest::m_out_bl;
+  using pwl::C_ReadRequest::m_arrived_time;
+  using pwl::C_ReadRequest::m_perfcounter;
+public:
+  C_ReadRequest(CephContext *cct, utime_t arrived, PerfCounters *perfcounter, bufferlist *out_bl, Context *on_finish)
+    : pwl::C_ReadRequest(cct, arrived, perfcounter, out_bl, on_finish) {}
+  void finish(int r) override;
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_READ_REQUEST_H
index 6efd85ba1f5a26ab87cb2e8804282a88a7be41d3..c7d56f3dd3d5334039e355a223d441981851d97f 100644 (file)
@@ -56,6 +56,50 @@ WriteLog<I>::~WriteLog() {
   delete m_builderobj;
 }
 
+template <typename I>
+void WriteLog<I>::collect_read_extents(
+    uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
+    std::vector<WriteLogCacheEntry*> &log_entries_to_read,
+    std::vector<bufferlist*> &bls_to_read,
+    uint64_t entry_hit_length, Extent hit_extent,
+    pwl::C_ReadRequest *read_ctx) {
+    // Make a bl for this hit extent. This will add references to the
+    // write_entry->pmem_bp
+    auto write_entry = static_pointer_cast<WriteLogEntry>(map_entry.log_entry);
+    buffer::list hit_bl;
+    hit_bl = write_entry->get_cache_bl();
+
+    if(!hit_bl.length()) {
+      ImageExtentBuf hit_extent_buf;
+      bool writesame = write_entry->is_writesame_entry();
+      hit_extent_buf = ImageExtentBuf(
+          {hit_extent, true, read_buffer_offset, writesame});
+      read_ctx->read_extents.push_back(hit_extent_buf);
+      ImageExtentBuf &read_extent = read_ctx->read_extents.back();
+
+      log_entries_to_read.push_back(&write_entry->ram_entry);
+      bls_to_read.push_back(&read_extent.m_bl);
+    } else {
+      buffer::list new_bl;
+      new_bl.substr_of(hit_bl, read_buffer_offset, entry_hit_length);
+      assert(new_bl.length() == entry_hit_length);
+      ImageExtentBuf hit_extent_buf(hit_extent, new_bl);
+      read_ctx->read_extents.push_back(hit_extent_buf);
+    }
+}
+
+template <typename I>
+void WriteLog<I>::complete_read(
+    std::vector<WriteLogCacheEntry*> &log_entries_to_read,
+    std::vector<bufferlist*> &bls_to_read,
+    Context *ctx) {
+  if (!log_entries_to_read.empty()) {
+    aio_read_data_block(log_entries_to_read, bls_to_read, ctx);
+  } else {
+    ctx->complete(0);
+  }
+}
+
 template <typename I>
 void WriteLog<I>::initialize_pool(Context *on_finish,
                                   pwl::DeferredContexts &later) {
@@ -490,6 +534,8 @@ void WriteLog<I>::process_work() {
   CephContext *cct = m_image_ctx.cct;
   int max_iterations = 4;
   bool wake_up_requested = false;
+  uint64_t aggressive_high_water_bytes = m_log_pool_ring_buffer_size * AGGRESSIVE_RETIRE_HIGH_WATER;
+  uint64_t aggressive_high_water_entries = this->m_total_log_entries * AGGRESSIVE_RETIRE_HIGH_WATER;
   uint64_t high_water_bytes = m_log_pool_ring_buffer_size * RETIRE_HIGH_WATER;
   uint64_t high_water_entries = this->m_total_log_entries * RETIRE_HIGH_WATER;
 
@@ -509,11 +555,10 @@ void WriteLog<I>::process_work() {
                                  << ", allocated_entries > high_water="
                                  << (m_log_entries.size() > high_water_entries)
                                  << dendl;
-      //TODO: Implement and uncomment this in next PR
-      /*retire_entries((this->m_shutting_down || this->m_invalidating ||
+      retire_entries((this->m_shutting_down || this->m_invalidating ||
                     (m_bytes_allocated > aggressive_high_water_bytes) ||
                     (m_log_entries.size() > aggressive_high_water_entries))
-                    ? MAX_ALLOC_PER_TRANSACTION : MAX_FREE_PER_TRANSACTION);*/
+                    ? MAX_ALLOC_PER_TRANSACTION : MAX_FREE_PER_TRANSACTION);
     }
     this->dispatch_deferred_writes();
     this->process_writeback_dirty_entries();
@@ -533,6 +578,166 @@ void WriteLog<I>::process_work() {
   }
 }
 
+/**
+ * Retire up to MAX_ALLOC_PER_TRANSACTION of the oldest log entries
+ * that are eligible to be retired. Returns true if anything was
+ * retired.
+ *
+*/
+template <typename I>
+bool WriteLog<I>::retire_entries(const unsigned long int frees_per_tx) {
+  CephContext *cct = m_image_ctx.cct;
+  GenericLogEntriesVector retiring_entries;
+  uint32_t initial_first_valid_entry;
+  uint32_t first_valid_entry;
+
+  std::lock_guard retire_locker(this->m_log_retire_lock);
+  ldout(cct, 20) << "Look for entries to retire" << dendl;
+  {
+    // Entry readers can't be added while we hold m_entry_reader_lock
+    RWLock::WLocker entry_reader_locker(this->m_entry_reader_lock);
+    std::lock_guard locker(m_lock);
+    initial_first_valid_entry = m_first_valid_entry;
+    first_valid_entry = m_first_valid_entry;
+    while (retiring_entries.size() < frees_per_tx && !m_log_entries.empty()) {
+      GenericLogEntriesVector retiring_subentries;
+      auto entry = m_log_entries.front();
+      uint64_t control_block_pos = entry->log_entry_index;
+      uint64_t data_length = 0;
+      for (auto it = m_log_entries.begin(); it != m_log_entries.end(); ++it) {
+        if (this->can_retire_entry(*it)) {
+          // log_entry_index is valid after appending to SSD
+          if ((*it)->log_entry_index != control_block_pos) {
+            ldout(cct, 20) << "Old log_entry_index is " << control_block_pos
+                           << ",New log_entry_index is "
+                           << (*it)->log_entry_index
+                           << ",data length is " << data_length << dendl;
+            ldout(cct, 20) << "The log entry is " << *(*it) << dendl;
+            if ((*it)->log_entry_index < control_block_pos) {
+              ceph_assert((*it)->log_entry_index ==
+                (control_block_pos + data_length + MIN_WRITE_ALLOC_SSD_SIZE)
+                % this->m_log_pool_config_size + DATA_RING_BUFFER_OFFSET);
+            } else {
+              ceph_assert((*it)->log_entry_index == control_block_pos +
+                  data_length + MIN_WRITE_ALLOC_SSD_SIZE);
+            }
+            break;
+          } else {
+            retiring_subentries.push_back(*it);
+            if ((*it)->is_write_entry()) {
+              data_length += (*it)->get_aligned_data_size();
+            }
+          }
+        } else {
+          retiring_subentries.clear();
+          break;
+        }
+      }
+      // SSD: retiring_subentries in a span
+      if (!retiring_subentries.empty()) {
+        for (auto it = retiring_subentries.begin();
+            it != retiring_subentries.end(); it++) {
+          ceph_assert(m_log_entries.front() == *it);
+          m_log_entries.pop_front();
+          if (entry->is_write_entry()) {
+            auto write_entry = static_pointer_cast<WriteLogEntry>(entry);
+            this->m_blocks_to_log_entries.remove_log_entry(write_entry);
+          }
+        }
+        retiring_entries.insert(
+            retiring_entries.end(), retiring_subentries.begin(),
+            retiring_subentries.end());
+      } else {
+        break;
+      }
+    }
+  }
+  if (retiring_entries.size()) {
+    ldout(cct, 1) << "Retiring " << retiring_entries.size()
+                  << " entries" << dendl;
+
+    // Advance first valid entry and release buffers
+    uint64_t flushed_sync_gen;
+    std::lock_guard append_locker(this->m_log_append_lock);
+    {
+      std::lock_guard locker(m_lock);
+      flushed_sync_gen = this->m_flushed_sync_gen;
+    }
+
+    //calculate new first_valid_entry based on last entry to retire
+    auto entry = retiring_entries.back();
+    if (entry->is_write_entry() || entry->is_writesame_entry()) {
+      first_valid_entry = entry->ram_entry.write_data_pos +
+        entry->get_aligned_data_size();
+    } else {
+      first_valid_entry = entry->log_entry_index + MIN_WRITE_ALLOC_SSD_SIZE;
+    }
+    if (first_valid_entry >= this->m_log_pool_config_size) {
+        first_valid_entry = first_valid_entry % this->m_log_pool_config_size +
+          DATA_RING_BUFFER_OFFSET;
+    }
+    ceph_assert(first_valid_entry != initial_first_valid_entry);
+    auto new_root = std::make_shared<WriteLogPoolRoot>(pool_root);
+    new_root->flushed_sync_gen = flushed_sync_gen;
+    new_root->first_valid_entry = first_valid_entry;
+    pool_root.flushed_sync_gen = flushed_sync_gen;
+    pool_root.first_valid_entry = first_valid_entry;
+
+    Context *ctx = new LambdaContext(
+          [this, flushed_sync_gen, first_valid_entry,
+          initial_first_valid_entry, retiring_entries](int r) {
+          uint64_t allocated_bytes = 0;
+          uint64_t cached_bytes = 0;
+          uint64_t former_log_pos = 0;
+          for (auto &entry : retiring_entries) {
+            ceph_assert(entry->log_entry_index != 0);
+            if (entry->log_entry_index != former_log_pos ) {
+              // Space for control blocks
+              allocated_bytes  += MIN_WRITE_ALLOC_SSD_SIZE;
+              former_log_pos = entry->log_entry_index;
+            }
+            if (entry->is_write_entry()) {
+              cached_bytes += entry->write_bytes();
+              //space for userdata
+              allocated_bytes += entry->get_aligned_data_size();
+            }
+          }
+          {
+            std::lock_guard locker(m_lock);
+            m_first_valid_entry = first_valid_entry;
+            ceph_assert(m_first_valid_entry % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+            this->m_free_log_entries += retiring_entries.size();
+            ceph_assert(this->m_bytes_cached >= cached_bytes);
+            this->m_bytes_cached -= cached_bytes;
+
+            ldout(m_image_ctx.cct, 20)
+              << "Finished root update: " << "initial_first_valid_entry="
+              << initial_first_valid_entry << ", " << "m_first_valid_entry="
+              << m_first_valid_entry << "," << "release space = "
+              << allocated_bytes << "," << "m_bytes_allocated="
+              << m_bytes_allocated << "," << "release cached space="
+              << allocated_bytes << "," << "m_bytes_cached="
+              << this->m_bytes_cached << dendl;
+
+            this->m_alloc_failed_since_retire = false;
+            this->wake_up();
+            m_async_update_superblock--;
+            this->m_async_op_tracker.finish_op();
+          }
+
+          this->dispatch_deferred_writes();
+          this->process_writeback_dirty_entries();
+        });
+
+      std::lock_guard locker(m_lock);
+      schedule_update_root(new_root, ctx);
+  } else {
+    ldout(cct, 20) << "Nothing to retire" << dendl;
+    return false;
+  }
+  return true;
+}
+
 template <typename I>
 void WriteLog<I>::append_ops(GenericLogOperations &ops, Context *ctx,
                              uint64_t* new_first_free_entry,
index 3bc72bb5c07667454cc06bc13cd9a9391f15310b..e8236be76df3ce6e744e423d8efff796e71d9d2f 100644 (file)
@@ -42,6 +42,7 @@ public:
   WriteLog(const WriteLog&) = delete;
   WriteLog &operator=(const WriteLog&) = delete;
 
+  typedef io::Extent Extent;
   using This = AbstractWriteLog<ImageCtxT>;
   using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
   using C_WriteRequestT = pwl::C_WriteRequest<This>;
@@ -106,7 +107,16 @@ private:
 
   Builder<This>* create_builder();
   void load_existing_entries(pwl::DeferredContexts &later);
+  void collect_read_extents(
+      uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
+      std::vector<WriteLogCacheEntry*> &log_entries_to_read,
+      std::vector<bufferlist*> &bls_to_read, uint64_t entry_hit_length,
+      Extent hit_extent, pwl::C_ReadRequest *read_ctx) override;
+  void complete_read(
+      std::vector<WriteLogCacheEntry*> &log_entries_to_read,
+      std::vector<bufferlist*> &bls_to_read, Context *ctx) override;
   void enlist_op_appender();
+  bool retire_entries(const unsigned long int frees_per_tx);
   bool has_sync_point_logs(GenericLogOperations &ops);
   void append_op_log_entries(GenericLogOperations &ops);
   void alloc_op_log_entries(GenericLogOperations &ops);
index 4eac15e0b7e88df942b1c79797c30f4bfd68e95a..22a9dcb1d0607d97a270594a0d1ecdde6b8a8e86 100644 (file)
@@ -67,7 +67,8 @@ struct TestMockCacheSSDWriteLog : public TestMockFixture {
 
   MockImageCacheStateSSD *get_cache_state(
       MockImageCtx& mock_image_ctx, MockApi& mock_api) {
-    MockImageCacheStateSSD *rwl_state = new MockImageCacheStateSSD(&mock_image_ctx, mock_api);
+    MockImageCacheStateSSD *rwl_state = new MockImageCacheStateSSD(
+        &mock_image_ctx, mock_api);
     return rwl_state;
   }
 
@@ -233,6 +234,381 @@ TEST_F(TestMockCacheSSDWriteLog, write) {
   ASSERT_EQ(0, finish_ctx3.wait());
 }
 
// Full cache hit: a read of an extent entirely covered by a prior cached
// write must be served from the write log and return the written data.
TEST_F(TestMockCacheSSDWriteLog, read_hit_rwl_cache) {
  librbd::ImageCtx *ictx;
  ASSERT_EQ(0, open_image(m_image_name, &ictx));

  MockImageCtx mock_image_ctx(*ictx);
  MockImageWriteback mock_image_writeback(mock_image_ctx);
  MockApi mock_api;
  MockSSDWriteLog rwl(
      mock_image_ctx, get_cache_state(mock_image_ctx, mock_api),
      mock_image_writeback, mock_api);
  expect_op_work_queue(mock_image_ctx);
  expect_metadata_set(mock_image_ctx);

  // Initialize the write-log cache.
  MockContextSSD finish_ctx1;
  expect_context_complete(finish_ctx1, 0);
  rwl.init(&finish_ctx1);
  ASSERT_EQ(0, finish_ctx1.wait());

  // Write 4096 bytes at offset 0; keep a copy for comparison since the
  // write consumes bl via std::move.
  MockContextSSD finish_ctx2;
  expect_context_complete(finish_ctx2, 0);
  Extents image_extents{{0, 4096}};
  bufferlist bl;
  bl.append(std::string(4096, '1'));
  bufferlist bl_copy = bl;
  int fadvise_flags = 0;
  rwl.write(std::move(image_extents), std::move(bl),
            fadvise_flags, &finish_ctx2);
  ASSERT_EQ(0, finish_ctx2.wait());

  // Read the same extent back: must succeed and match byte-for-byte.
  MockContextSSD finish_ctx_read;
  expect_context_complete(finish_ctx_read, 0);
  Extents image_extents_read{{0, 4096}};
  bufferlist read_bl;
  rwl.read(std::move(image_extents_read), &read_bl,
                     fadvise_flags, &finish_ctx_read);
  ASSERT_EQ(0, finish_ctx_read.wait());
  ASSERT_EQ(4096, read_bl.length());
  ASSERT_TRUE(bl_copy.contents_equal(read_bl));

  // Clean shutdown flushes/retires remaining state.
  MockContextSSD finish_ctx3;
  expect_context_complete(finish_ctx3, 0);
  rwl.shut_down(&finish_ctx3);
  ASSERT_EQ(0, finish_ctx3.wait());
}
+
+TEST_F(TestMockCacheSSDWriteLog, read_hit_part_rwl_cache) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockImageWriteback mock_image_writeback(mock_image_ctx);
+  MockApi mock_api;
+  MockSSDWriteLog rwl(
+      mock_image_ctx, get_cache_state(mock_image_ctx, mock_api),
+      mock_image_writeback, mock_api);
+  expect_op_work_queue(mock_image_ctx);
+  expect_metadata_set(mock_image_ctx);
+
+  MockContextSSD finish_ctx1;
+  expect_context_complete(finish_ctx1, 0);
+  rwl.init(&finish_ctx1);
+  ASSERT_EQ(0, finish_ctx1.wait());
+
+  MockContextSSD finish_ctx2;
+  expect_context_complete(finish_ctx2, 0);
+  Extents image_extents{{0, 8192}};
+  bufferlist bl;
+  bl.append(std::string(8192, '1'));
+  bufferlist bl_copy = bl;
+  int fadvise_flags = 0;
+  rwl.write(std::move(image_extents), std::move(bl),
+            fadvise_flags, &finish_ctx2);
+  ASSERT_EQ(0, finish_ctx2.wait());
+
+  MockContextSSD finish_ctx_read;
+  Extents image_extents_read{{4096, 4096}};
+  bufferlist hit_bl;
+  bl_copy.begin(4095).copy(4096, hit_bl);
+  expect_context_complete(finish_ctx_read, 0);
+  bufferlist read_bl;
+  rwl.read(std::move(image_extents_read), &read_bl,
+                     fadvise_flags, &finish_ctx_read);
+  ASSERT_EQ(0, finish_ctx_read.wait());
+  ASSERT_EQ(4096, read_bl.length());
+  bufferlist read_bl_hit;
+  read_bl.begin(0).copy(4096, read_bl_hit);
+  ASSERT_TRUE(hit_bl.contents_equal(read_bl_hit));
+
+  MockContextSSD finish_ctx3;
+  expect_context_complete(finish_ctx3, 0);
+  rwl.shut_down(&finish_ctx3);
+  ASSERT_EQ(0, finish_ctx3.wait());
+}
+
// Cache miss: read an extent ([4096, 8192)) that was never written to the
// cache, so the request falls through to the (mock) image writeback layer.
TEST_F(TestMockCacheSSDWriteLog, read_miss_rwl_cache) {
  librbd::ImageCtx *ictx;
  ASSERT_EQ(0, open_image(m_image_name, &ictx));

  MockImageCtx mock_image_ctx(*ictx);
  MockImageWriteback mock_image_writeback(mock_image_ctx);
  MockApi mock_api;
  MockSSDWriteLog rwl(
      mock_image_ctx, get_cache_state(mock_image_ctx, mock_api),
      mock_image_writeback, mock_api);
  expect_op_work_queue(mock_image_ctx);
  expect_metadata_set(mock_image_ctx);

  MockContextSSD finish_ctx1;
  expect_context_complete(finish_ctx1, 0);
  rwl.init(&finish_ctx1);
  ASSERT_EQ(0, finish_ctx1.wait());

  // Populate the cache only at [0, 4096).
  MockContextSSD finish_ctx2;
  expect_context_complete(finish_ctx2, 0);
  Extents image_extents{{0, 4096}};
  bufferlist bl;
  bl.append(std::string(4096, '1'));
  int fadvise_flags = 0;
  rwl.write(std::move(image_extents), std::move(bl),
            fadvise_flags, &finish_ctx2);
  ASSERT_EQ(0, finish_ctx2.wait());

  // Read an uncached extent; the context is expected to complete with
  // 4096 — presumably the miss byte count from the mock writeback path
  // (NOTE(review): confirm against MockImageWriteback's aio_read).
  MockContextSSD finish_ctx_read;
  Extents image_extents_read{{4096, 4096}};
  expect_context_complete(finish_ctx_read, 4096);
  bufferlist read_bl;
  ASSERT_EQ(0, read_bl.length());
  rwl.read(std::move(image_extents_read), &read_bl,
                     fadvise_flags, &finish_ctx_read);
  ASSERT_EQ(4096, finish_ctx_read.wait());
  // The miss extent is still materialized in the result buffer.
  ASSERT_EQ(4096, read_bl.length());

  MockContextSSD finish_ctx3;
  expect_context_complete(finish_ctx3, 0);
  rwl.shut_down(&finish_ctx3);
  ASSERT_EQ(0, finish_ctx3.wait());
}
+
// compare_and_write with a matching compare buffer: the write side must be
// applied, mismatch_offset left at 0, and a subsequent read must return
// the new data.
TEST_F(TestMockCacheSSDWriteLog, compare_and_write_compare_matched) {
  librbd::ImageCtx *ictx;
  ASSERT_EQ(0, open_image(m_image_name, &ictx));

  MockImageCtx mock_image_ctx(*ictx);
  MockImageWriteback mock_image_writeback(mock_image_ctx);
  MockApi mock_api;
  MockSSDWriteLog rwl(
      mock_image_ctx, get_cache_state(mock_image_ctx, mock_api),
      mock_image_writeback, mock_api);
  expect_op_work_queue(mock_image_ctx);
  expect_metadata_set(mock_image_ctx);

  MockContextSSD finish_ctx1;
  expect_context_complete(finish_ctx1, 0);
  rwl.init(&finish_ctx1);
  ASSERT_EQ(0, finish_ctx1.wait());

  // Seed [0, 4096) with '1's; com_bl is a copy used as the compare buffer.
  MockContextSSD finish_ctx2;
  expect_context_complete(finish_ctx2, 0);
  Extents image_extents{{0, 4096}};
  bufferlist bl1;
  bl1.append(std::string(4096, '1'));
  bufferlist com_bl = bl1;
  int fadvise_flags = 0;
  rwl.write(std::move(image_extents), std::move(bl1), fadvise_flags, &finish_ctx2);
  ASSERT_EQ(0, finish_ctx2.wait());

  // Compare matches, so the '2' buffer is written; mismatch_offset is
  // reset from its sentinel (-1) to 0 on success.
  MockContextSSD finish_ctx_cw;
  bufferlist bl2;
  bl2.append(std::string(4096, '2'));
  bufferlist bl2_copy = bl2;
  uint64_t mismatch_offset = -1;
  expect_context_complete(finish_ctx_cw, 0);
  rwl.compare_and_write({{0, 4096}}, std::move(com_bl), std::move(bl2),
                            &mismatch_offset, fadvise_flags, &finish_ctx_cw);
  ASSERT_EQ(0, finish_ctx_cw.wait());
  ASSERT_EQ(0, mismatch_offset);

  // Verify the new data is what a read now returns.
  MockContextSSD finish_ctx_read;
  bufferlist read_bl;
  expect_context_complete(finish_ctx_read, 0);
  rwl.read({{0, 4096}}, &read_bl, fadvise_flags, &finish_ctx_read);
  ASSERT_EQ(0, finish_ctx_read.wait());
  ASSERT_EQ(4096, read_bl.length());
  ASSERT_TRUE(bl2_copy.contents_equal(read_bl));

  MockContextSSD finish_ctx3;
  expect_context_complete(finish_ctx3, 0);
  rwl.shut_down(&finish_ctx3);

  ASSERT_EQ(0, finish_ctx3.wait());
}
+
// compare_and_write with a non-matching compare buffer: the operation must
// fail with -EILSEQ, report the first mismatching offset (0), and leave
// the original data untouched.
TEST_F(TestMockCacheSSDWriteLog, compare_and_write_compare_failed) {
  librbd::ImageCtx *ictx;
  ASSERT_EQ(0, open_image(m_image_name, &ictx));

  MockImageCtx mock_image_ctx(*ictx);
  MockImageWriteback mock_image_writeback(mock_image_ctx);
  MockApi mock_api;
  MockSSDWriteLog rwl(
      mock_image_ctx, get_cache_state(mock_image_ctx, mock_api),
      mock_image_writeback, mock_api);
  expect_op_work_queue(mock_image_ctx);
  expect_metadata_set(mock_image_ctx);

  MockContextSSD finish_ctx1;
  expect_context_complete(finish_ctx1, 0);
  rwl.init(&finish_ctx1);
  ASSERT_EQ(0, finish_ctx1.wait());

  // Seed [0, 4096) with '1's; keep a copy to verify the data survives.
  MockContextSSD finish_ctx2;
  expect_context_complete(finish_ctx2, 0);
  Extents image_extents{{0, 4096}};
  bufferlist bl1;
  bl1.append(std::string(4096, '1'));
  bufferlist bl1_copy = bl1;
  int fadvise_flags = 0;
  rwl.write(std::move(image_extents), std::move(bl1), fadvise_flags, &finish_ctx2);
  ASSERT_EQ(0, finish_ctx2.wait());

  // Compare buffer is all '2's against stored '1's -> mismatch at byte 0.
  MockContextSSD finish_ctx_cw;
  bufferlist bl2;
  bl2.append(std::string(4096, '2'));
  bufferlist com_bl = bl2;
  uint64_t mismatch_offset = -1;
  expect_context_complete(finish_ctx_cw, -EILSEQ);
  rwl.compare_and_write({{0, 4096}}, std::move(com_bl), std::move(bl2),
                            &mismatch_offset, fadvise_flags, &finish_ctx_cw);
  ASSERT_EQ(-EILSEQ, finish_ctx_cw.wait());
  ASSERT_EQ(0, mismatch_offset);

  // The failed compare must not have modified the cached data.
  MockContextSSD finish_ctx_read;
  bufferlist read_bl;
  expect_context_complete(finish_ctx_read, 0);
  rwl.read({{0, 4096}}, &read_bl, fadvise_flags, &finish_ctx_read);
  ASSERT_EQ(0, finish_ctx_read.wait());
  ASSERT_EQ(4096, read_bl.length());
  ASSERT_TRUE(bl1_copy.contents_equal(read_bl));

  MockContextSSD finish_ctx3;
  expect_context_complete(finish_ctx3, 0);
  rwl.shut_down(&finish_ctx3);
  ASSERT_EQ(0, finish_ctx3.wait());
}
+
// writesame: replicating a 512-byte pattern across a 4096-byte extent must
// read back as the pattern repeated 8 times.
TEST_F(TestMockCacheSSDWriteLog, writesame) {
  librbd::ImageCtx *ictx;
  ASSERT_EQ(0, open_image(m_image_name, &ictx));

  MockImageCtx mock_image_ctx(*ictx);
  MockImageWriteback mock_image_writeback(mock_image_ctx);
  MockApi mock_api;
  MockSSDWriteLog rwl(
      mock_image_ctx, get_cache_state(mock_image_ctx, mock_api),
      mock_image_writeback, mock_api);
  expect_op_work_queue(mock_image_ctx);
  expect_metadata_set(mock_image_ctx);

  MockContextSSD finish_ctx1;
  expect_context_complete(finish_ctx1, 0);
  rwl.init(&finish_ctx1);
  ASSERT_EQ(0, finish_ctx1.wait());

  // bl is the 512-byte source pattern; test_bl is the expected 4096-byte
  // expansion (all '1's, so repetition is trivially correct).
  MockContextSSD finish_ctx2;
  expect_context_complete(finish_ctx2, 0);
  bufferlist bl, test_bl;
  bl.append(std::string(512, '1'));
  test_bl.append(std::string(4096, '1'));
  int fadvise_flags = 0;
  rwl.writesame(0, 4096, std::move(bl), fadvise_flags, &finish_ctx2);
  ASSERT_EQ(0, finish_ctx2.wait());

  // Read the whole extent back and compare against the expansion.
  MockContextSSD finish_ctx_read;
  bufferlist read_bl;
  expect_context_complete(finish_ctx_read, 0);
  rwl.read({{0, 4096}}, &read_bl, fadvise_flags, &finish_ctx_read);
  ASSERT_EQ(0, finish_ctx_read.wait());
  ASSERT_EQ(4096, read_bl.length());
  ASSERT_TRUE(test_bl.contents_equal(read_bl));

  MockContextSSD finish_ctx3;
  expect_context_complete(finish_ctx3, 0);
  rwl.shut_down(&finish_ctx3);

  ASSERT_EQ(0, finish_ctx3.wait());
}
+
// discard: after discarding a previously written extent, a read of that
// extent must return all zeroes.
TEST_F(TestMockCacheSSDWriteLog, discard) {
  librbd::ImageCtx *ictx;
  ASSERT_EQ(0, open_image(m_image_name, &ictx));

  MockImageCtx mock_image_ctx(*ictx);
  MockImageWriteback mock_image_writeback(mock_image_ctx);
  MockApi mock_api;
  MockSSDWriteLog rwl(
      mock_image_ctx, get_cache_state(mock_image_ctx, mock_api),
      mock_image_writeback, mock_api);
  expect_op_work_queue(mock_image_ctx);
  expect_metadata_set(mock_image_ctx);

  MockContextSSD finish_ctx1;
  expect_context_complete(finish_ctx1, 0);
  rwl.init(&finish_ctx1);
  ASSERT_EQ(0, finish_ctx1.wait());

  // Seed [0, 4096) with non-zero data.
  MockContextSSD finish_ctx2;
  expect_context_complete(finish_ctx2, 0);
  Extents image_extents{{0, 4096}};
  bufferlist bl;
  bl.append(std::string(4096, '1'));
  bufferlist bl_copy = bl;
  int fadvise_flags = 0;
  rwl.write(std::move(image_extents), std::move(bl), fadvise_flags, &finish_ctx2);
  ASSERT_EQ(0, finish_ctx2.wait());

  // Discard the whole extent (discard_granularity_bytes = 1).
  MockContextSSD finish_ctx_discard;
  expect_context_complete(finish_ctx_discard, 0);
  rwl.discard(0, 4096, 1, &finish_ctx_discard);
  ASSERT_EQ(0, finish_ctx_discard.wait());

  // Discarded range must now read back as zeroes.
  MockContextSSD finish_ctx_read;
  bufferlist read_bl;
  expect_context_complete(finish_ctx_read, 0);
  rwl.read({{0, 4096}}, &read_bl, fadvise_flags, &finish_ctx_read);
  ASSERT_EQ(0, finish_ctx_read.wait());
  ASSERT_EQ(4096, read_bl.length());
  ASSERT_TRUE(read_bl.is_zero());

  MockContextSSD finish_ctx3;
  expect_context_complete(finish_ctx3, 0);
  rwl.shut_down(&finish_ctx3);

  ASSERT_EQ(0, finish_ctx3.wait());
}
+
+TEST_F(TestMockCacheSSDWriteLog, invalidate) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockImageWriteback mock_image_writeback(mock_image_ctx);
+  MockApi mock_api;
+  MockSSDWriteLog rwl(
+      mock_image_ctx, get_cache_state(mock_image_ctx, mock_api),
+      mock_image_writeback, mock_api);
+  expect_op_work_queue(mock_image_ctx);
+  expect_metadata_set(mock_image_ctx);
+
+  MockContextSSD finish_ctx1;
+  expect_context_complete(finish_ctx1, 0);
+  rwl.init(&finish_ctx1);
+  ASSERT_EQ(0, finish_ctx1.wait());
+
+  MockContextSSD finish_ctx2;
+  expect_context_complete(finish_ctx2, 0);
+  Extents image_extents{{0, 4096}};
+  bufferlist bl;
+  bl.append(std::string(4096, '1'));
+  bufferlist bl_copy = bl;
+  int fadvise_flags = 0;
+  rwl.write(std::move(image_extents), std::move(bl), fadvise_flags, &finish_ctx2);
+  ASSERT_EQ(0, finish_ctx2.wait());
+
+  MockContextSSD finish_ctx_invalidate;
+  expect_context_complete(finish_ctx_invalidate, 0);
+  rwl.invalidate(&finish_ctx_invalidate);
+  ASSERT_EQ(0, finish_ctx_invalidate.wait());
+
+  MockContextSSD finish_ctx3;
+  expect_context_complete(finish_ctx3, 0);
+  rwl.shut_down(&finish_ctx3);
+
+  ASSERT_EQ(0, finish_ctx3.wait());
+}
+
 } // namespace pwl
 } // namespace cache
 } // namespace librbd