librbd/cache: Implement aio_write operation

author Mahati Chamarthy <mahati.chamarthy@intel.com>

Fri, 6 Nov 2020 12:09:55 +0000 (17:39 +0530)

committer Mahati Chamarthy <mahati.chamarthy@intel.com>

Wed, 6 Jan 2021 13:36:32 +0000 (19:06 +0530)
author Mahati Chamarthy <mahati.chamarthy@intel.com>
Fri, 6 Nov 2020 12:09:55 +0000 (17:39 +0530)
committer Mahati Chamarthy <mahati.chamarthy@intel.com>
Wed, 6 Jan 2021 13:36:32 +0000 (19:06 +0530)
diff --git a/src/common/options.cc b/src/common/options.cc

index 57901687e77f52cd3e2cf66a57fb7a2789cdc794..8926a37e3688ea9fd37dffb67931208c853c721e 100644 (file)
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -7730,8 +7730,9 @@ static std::vector<Option> get_rbd_options() {
      .set_min(0)
      .set_description("maximum io delay (in milliseconds) for simple io scheduler (if set to 0 dalay is calculated based on latency stats)"),
  
-    Option("rbd_rwl_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
-    .set_default(false)
+    Option("rbd_persistent_cache_mode", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("disabled")
+    .set_enum_allowed({"disabled", "rwl", "ssd"})
      .set_description("enable persistent write back cache for this volume"),
  
      Option("rbd_rwl_log_periodic_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
diff --git a/src/librbd/CMakeLists.txt b/src/librbd/CMakeLists.txt

index 20bfd5427967b18d9876345b495ed3532ee0e159..44663f4af67e8e2b104de971de31dcae84bca1e1 100644 (file)
--- a/src/librbd/CMakeLists.txt
+++ b/src/librbd/CMakeLists.txt
@@ -16,7 +16,7 @@ endif()
  add_library(rbd_types STATIC
    ${librbd_types_srcs})
  
-if (WITH_RBD_RWL AND WITH_RBD_SSD_CACHE)
+if (WITH_RBD_RWL)
    target_link_libraries(rbd_types
      PRIVATE pmem::pmemobj)
  endif()
@@ -275,12 +275,17 @@ if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE)
      if(WITH_RBD_SSD_CACHE)
        set(rbd_plugin_pwl_srcs
          ${rbd_plugin_pwl_srcs}
-        cache/pwl/SSDWriteLog.cc)
+        cache/pwl/ssd/LogEntry.cc
+        cache/pwl/ssd/Request.cc
+        cache/pwl/ssd/WriteLog.cc)
      endif()
      if(WITH_RBD_RWL)
        set(rbd_plugin_pwl_srcs
          ${rbd_plugin_pwl_srcs}
-        cache/pwl/ReplicatedWriteLog.cc)
+        cache/pwl/rwl/WriteLog.cc
+        cache/pwl/rwl/LogEntry.cc
+        cache/pwl/rwl/LogOperation.cc
+        cache/pwl/rwl/Request.cc)
      endif()
  
    add_library(librbd_plugin_pwl_cache SHARED
diff --git a/src/librbd/cache/Types.h b/src/librbd/cache/Types.h

index 682d30c91edc451cdd1f42e0fd6824c304082c64..b791d590873732dc6d85858318a8896315d9ffab 100644 (file)
--- a/src/librbd/cache/Types.h
+++ b/src/librbd/cache/Types.h
@@ -15,6 +15,7 @@ namespace cache {
  enum ImageCacheType {
    IMAGE_CACHE_TYPE_RWL = 1,
    IMAGE_CACHE_TYPE_SSD,
+  IMAGE_CACHE_TYPE_UNKNOWN
  };
  
  typedef std::list<Context *> Contexts;
diff --git a/src/librbd/cache/Utils.h b/src/librbd/cache/Utils.h

index e338899c09e8d696dc57544ac18e679023cd24c6..cd2eb7c3b003fa4591d13b2ccd405e73dc8ee6da 100644 (file)
--- a/src/librbd/cache/Utils.h
+++ b/src/librbd/cache/Utils.h
@@ -5,6 +5,7 @@
  #define CEPH_LIBRBD_CACHE_UTILS_H
  
  #include "acconfig.h"
+#include <string>
  
  class Context;
  
@@ -17,8 +18,9 @@ namespace util {
  
  template <typename T>
  bool is_pwl_enabled(T& image_ctx) {
-#if defined(WITH_RBD_RWL)
-  return image_ctx.config.template get_val<bool>("rbd_rwl_enabled");
+#if defined(WITH_RBD_RWL) || defined(WITH_RBD_SSD_CACHE)
+  auto value = image_ctx.config.template get_val<std::string>("rbd_persistent_cache_mode");
+  return value == "disabled" ? false : true;
  #else
    return false;
  #endif // WITH_RBD_RWL
diff --git a/src/librbd/cache/pwl/AbstractWriteLog.cc b/src/librbd/cache/pwl/AbstractWriteLog.cc

index d5bac1b509399498dfedd1d5c7586f2bdfee1e8b..36552cd2cdacd9c362473479e9759ab9db7f1e20 100644 (file)
--- a/src/librbd/cache/pwl/AbstractWriteLog.cc
+++ b/src/librbd/cache/pwl/AbstractWriteLog.cc
@@ -25,8 +25,8 @@
  #undef dout_subsys
  #define dout_subsys ceph_subsys_rbd_pwl
  #undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::pwl::AbstractWriteLog: " << this << " " \
-                           <<  __func__ << ": "
+#define dout_prefix *_dout << "librbd::cache::pwl::AbstractWriteLog: " << this \
+                           << " " <<  __func__ << ": "
  
  namespace librbd {
  namespace cache {
@@ -38,16 +38,19 @@ typedef AbstractWriteLog<ImageCtx>::Extent Extent;
  typedef AbstractWriteLog<ImageCtx>::Extents Extents;
  
  template <typename I>
-AbstractWriteLog<I>::AbstractWriteLog(I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
-    cache::ImageWritebackInterface& image_writeback,
+AbstractWriteLog<I>::AbstractWriteLog(
+    I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
+    Builder<This> *builder, cache::ImageWritebackInterface& image_writeback,
      plugin::Api<I>& plugin_api)
-  : m_write_log_guard(image_ctx.cct),
+  : m_builder(builder),
+    m_write_log_guard(image_ctx.cct),
      m_deferred_dispatch_lock(ceph::make_mutex(pwl::unique_lock_name(
        "librbd::cache::pwl::AbstractWriteLog::m_deferred_dispatch_lock", this))),
      m_blockguard_lock(ceph::make_mutex(pwl::unique_lock_name(
        "librbd::cache::pwl::AbstractWriteLog::m_blockguard_lock", this))),
      m_thread_pool(
-        image_ctx.cct, "librbd::cache::pwl::AbstractWriteLog::thread_pool", "tp_pwl", 4, ""),
+        image_ctx.cct, "librbd::cache::pwl::AbstractWriteLog::thread_pool",
+        "tp_pwl", 4, ""),
      m_cache_state(cache_state),
      m_image_ctx(image_ctx),
      m_log_pool_config_size(DEFAULT_POOL_SIZE),
@@ -92,7 +95,8 @@ AbstractWriteLog<I>::~AbstractWriteLog() {
  
  template <typename I>
  void AbstractWriteLog<I>::perf_start(std::string name) {
-  PerfCountersBuilder plb(m_image_ctx.cct, name, l_librbd_pwl_first, l_librbd_pwl_last);
+  PerfCountersBuilder plb(m_image_ctx.cct, name, l_librbd_pwl_first,
+                          l_librbd_pwl_last);
  
    // Latency axis configuration for op histograms, values are in nanoseconds
    PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
@@ -336,50 +340,50 @@ void AbstractWriteLog<I>::arm_periodic_stats() {
  
  template <typename I>
  void AbstractWriteLog<I>::update_entries(std::shared_ptr<GenericLogEntry> log_entry,
-    WriteLogPmemEntry *pmem_entry, std::map<uint64_t, bool> &missing_sync_points,
+    WriteLogCacheEntry *cache_entry, std::map<uint64_t, bool> &missing_sync_points,
      std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> &sync_point_entries,
      int entry_index) {
-    bool writer = pmem_entry->is_writer();
-    if (pmem_entry->is_sync_point()) {
+    bool writer = cache_entry->is_writer();
+    if (cache_entry->is_sync_point()) {
        ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
-                                 << " is a sync point. pmem_entry=[" << *pmem_entry << "]" << dendl;
-      auto sync_point_entry = std::make_shared<SyncPointLogEntry>(pmem_entry->sync_gen_number);
+                                 << " is a sync point. cache_entry=[" << *cache_entry << "]" << dendl;
+      auto sync_point_entry = std::make_shared<SyncPointLogEntry>(cache_entry->sync_gen_number);
        log_entry = sync_point_entry;
-      sync_point_entries[pmem_entry->sync_gen_number] = sync_point_entry;
-      missing_sync_points.erase(pmem_entry->sync_gen_number);
-      m_current_sync_gen = pmem_entry->sync_gen_number;
-    } else if (pmem_entry->is_write()) {
+      sync_point_entries[cache_entry->sync_gen_number] = sync_point_entry;
+      missing_sync_points.erase(cache_entry->sync_gen_number);
+      m_current_sync_gen = cache_entry->sync_gen_number;
+    } else if (cache_entry->is_write()) {
        ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
-                                 << " is a write. pmem_entry=[" << *pmem_entry << "]" << dendl;
+                                 << " is a write. cache_entry=[" << *cache_entry << "]" << dendl;
        auto write_entry =
-        std::make_shared<WriteLogEntry>(nullptr, pmem_entry->image_offset_bytes, pmem_entry->write_bytes);
-      write_data_to_buffer(write_entry, pmem_entry);
+        m_builder->create_write_log_entry(nullptr, cache_entry->image_offset_bytes, cache_entry->write_bytes);
+      write_data_to_buffer(write_entry, cache_entry);
        log_entry = write_entry;
-    } else if (pmem_entry->is_writesame()) {
+    } else if (cache_entry->is_writesame()) {
        ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
-                                 << " is a write same. pmem_entry=[" << *pmem_entry << "]" << dendl;
+                                 << " is a write same. cache_entry=[" << *cache_entry << "]" << dendl;
        auto ws_entry =
-        std::make_shared<WriteSameLogEntry>(nullptr, pmem_entry->image_offset_bytes,
-                                            pmem_entry->write_bytes, pmem_entry->ws_datalen);
-      write_data_to_buffer(ws_entry, pmem_entry);
+        m_builder->create_writesame_log_entry(nullptr, cache_entry->image_offset_bytes,
+                                              cache_entry->write_bytes, cache_entry->ws_datalen);
+      write_data_to_buffer(ws_entry, cache_entry);
        log_entry = ws_entry;
-    } else if (pmem_entry->is_discard()) {
+    } else if (cache_entry->is_discard()) {
        ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
-                                 << " is a discard. pmem_entry=[" << *pmem_entry << "]" << dendl;
+                                 << " is a discard. cache_entry=[" << *cache_entry << "]" << dendl;
        auto discard_entry =
-        std::make_shared<DiscardLogEntry>(nullptr, pmem_entry->image_offset_bytes, pmem_entry->write_bytes,
+        std::make_shared<DiscardLogEntry>(nullptr, cache_entry->image_offset_bytes, cache_entry->write_bytes,
                                            m_discard_granularity_bytes);
        log_entry = discard_entry;
      } else {
        lderr(m_image_ctx.cct) << "Unexpected entry type in entry " << entry_index
-                             << ", pmem_entry=[" << *pmem_entry << "]" << dendl;
+                             << ", cache_entry=[" << *cache_entry << "]" << dendl;
      }
  
      if (writer) {
        ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
-                                 << " writes. pmem_entry=[" << *pmem_entry << "]" << dendl;
-      if (!sync_point_entries[pmem_entry->sync_gen_number]) {
-        missing_sync_points[pmem_entry->sync_gen_number] = true;
+                                 << " writes. cache_entry=[" << *cache_entry << "]" << dendl;
+      if (!sync_point_entries[cache_entry->sync_gen_number]) {
+        missing_sync_points[cache_entry->sync_gen_number] = true;
        }
      }
  }
@@ -387,7 +391,7 @@ void AbstractWriteLog<I>::update_entries(std::shared_ptr<GenericLogEntry> log_en
  template <typename I>
  void AbstractWriteLog<I>::update_sync_points(std::map<uint64_t, bool> &missing_sync_points,
      std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> &sync_point_entries,
-    DeferredContexts &later) {
+    DeferredContexts &later, uint32_t alloc_size ) {
    /* Create missing sync points. These must not be appended until the
     * entry reload is complete and the write map is up to
     * date. Currently this is handled by the deferred contexts object
@@ -440,7 +444,7 @@ void AbstractWriteLog<I>::update_sync_points(std::map<uint64_t, bool> &missing_s
            }
            if (log_entry->write_bytes() == log_entry->bytes_dirty()) {
              /* This entry is a basic write */
-            uint64_t bytes_allocated = MIN_WRITE_ALLOC_SIZE;
+            uint64_t bytes_allocated = alloc_size;
              if (gen_write_entry->ram_entry.write_bytes > bytes_allocated) {
                bytes_allocated = gen_write_entry->ram_entry.write_bytes;
              }
@@ -738,14 +742,14 @@ void AbstractWriteLog<I>::read(Extents&& image_extents,
          uint64_t map_entry_buffer_offset = entry_image_extent.first - map_entry.log_entry->ram_entry.image_offset_bytes;
          /* Offset into the log entry buffer of this read hit */
          uint64_t read_buffer_offset = map_entry_buffer_offset + entry_offset;
-        /* Create buffer object referring to pmem pool for this read hit */
+        /* Create buffer object referring to cache pool for this read hit */
          auto write_entry = map_entry.log_entry;
  
          /* Make a bl for this hit extent. This will add references to the write_entry->pmem_bp */
          buffer::list hit_bl;
  
          buffer::list entry_bl_copy;
-        write_entry->copy_pmem_bl(&entry_bl_copy);
+        write_entry->copy_cache_bl(&entry_bl_copy);
          entry_bl_copy.begin(read_buffer_offset).copy(entry_hit_length, hit_bl);
  
          ceph_assert(hit_bl.length() == entry_hit_length);
@@ -798,9 +802,9 @@ void AbstractWriteLog<I>::write(Extents &&image_extents,
  
    ceph_assert(m_initialized);
  
-  auto *write_req =
-    new C_WriteRequestT(*this, now, std::move(image_extents), std::move(bl), fadvise_flags,
-                        m_lock, m_perfcounter, on_finish);
+  C_WriteRequestT *write_req =
+    m_builder->create_write_request(*this, now, std::move(image_extents), std::move(bl),
+                                    fadvise_flags, m_lock, m_perfcounter, on_finish);
    m_perfcounter->inc(l_librbd_pwl_wr_bytes, write_req->image_extents_summary.total_bytes);
  
    /* The lambda below will be called when the block guard for all
@@ -934,9 +938,9 @@ void AbstractWriteLog<I>::writesame(uint64_t offset, uint64_t length,
     * as long as the length of the bl here, which is the pattern that's repeated
     * in the image for the entire length of this WS. Read hits and flushing of
     * write sames are different than normal writes. */
-  auto *ws_req =
-    new C_WriteSameRequestT(*this, now, std::move(ws_extents), std::move(bl),
-                            fadvise_flags, m_lock, m_perfcounter, on_finish);
+  C_WriteSameRequestT *ws_req =
+    m_builder->create_writesame_request(*this, now, std::move(ws_extents), std::move(bl),
+                                        fadvise_flags, m_lock, m_perfcounter, on_finish);
    m_perfcounter->inc(l_librbd_pwl_ws_bytes, ws_req->image_extents_summary.total_bytes);
  
    /* The lambda below will be called when the block guard for all
@@ -966,9 +970,10 @@ void AbstractWriteLog<I>::compare_and_write(Extents &&image_extents,
    /* A compare and write request is also a write request. We only allocate
     * resources and dispatch this write request if the compare phase
     * succeeds. */
-  auto *cw_req =
-    new C_CompAndWriteRequestT(*this, now, std::move(image_extents), std::move(cmp_bl), std::move(bl),
-                               mismatch_offset, fadvise_flags, m_lock, m_perfcounter, on_finish);
+  C_WriteRequestT *cw_req =
+    m_builder->create_comp_and_write_request(
+        *this, now, std::move(image_extents), std::move(cmp_bl), std::move(bl),
+        mismatch_offset, fadvise_flags, m_lock, m_perfcounter, on_finish);
    m_perfcounter->inc(l_librbd_pwl_cmp_bytes, cw_req->image_extents_summary.total_bytes);
  
    /* The lambda below will be called when the block guard for all
@@ -1198,19 +1203,6 @@ void AbstractWriteLog<I>::append_scheduled(GenericLogOperations &ops, bool &ops_
    }
  }
  
-template <typename I>
-void AbstractWriteLog<I>::enlist_op_appender()
-{
-  m_async_append_ops++;
-  m_async_op_tracker.start_op();
-  Context *append_ctx = new LambdaContext([this](int r) {
-      append_scheduled_ops();
-      m_async_append_ops--;
-      m_async_op_tracker.finish_op();
-    });
-  m_work_queue.queue(append_ctx);
-}
-
  template <typename I>
  void AbstractWriteLog<I>::schedule_append(GenericLogOperationsVector &ops)
  {
@@ -1245,6 +1237,9 @@ void AbstractWriteLog<I>::complete_op_log_entries(GenericLogOperations &&ops,
        op->mark_log_entry_completed();
        dirty_entries.push_back(log_entry);
      }
+    if (log_entry->is_write_entry()) {
+      release_ram(log_entry);
+    }
      if (op->reserved_allocated()) {
        published_reserves++;
      }
@@ -1447,7 +1442,7 @@ bool AbstractWriteLog<I>::check_allocation(C_BlockIORequestT *req,
    }
  
    if (alloc_succeeds) {
-    reserve_pmem(req, alloc_succeeds, no_space);
+    reserve_cache(req, alloc_succeeds, no_space);
    }
  
    if (alloc_succeeds) {
@@ -1999,7 +1994,7 @@ void AbstractWriteLog<I>::internal_flush(bool invalidate, Context *on_finish) {
  template <typename I>
  void AbstractWriteLog<I>::add_into_log_map(GenericWriteLogEntries &log_entries,
                                             C_BlockIORequestT *req) {
-  copy_pmem(req);
+  req->copy_cache();
    m_blocks_to_log_entries.add_log_entries(log_entries);
  }
  
diff --git a/src/librbd/cache/pwl/AbstractWriteLog.h b/src/librbd/cache/pwl/AbstractWriteLog.h

index c96eb32686986d4c573db35e4b5583f944588d1d..dc0058bf97fcff23fb8581ebb44a85f79ecf4916 100644 (file)
--- a/src/librbd/cache/pwl/AbstractWriteLog.h
+++ b/src/librbd/cache/pwl/AbstractWriteLog.h
@@ -14,6 +14,7 @@
  #include "librbd/cache/pwl/LogOperation.h"
  #include "librbd/cache/pwl/Request.h"
  #include "librbd/cache/pwl/LogMap.h"
+#include "librbd/cache/pwl/Builder.h"
  #include <functional>
  #include <list>
  
@@ -27,14 +28,13 @@ struct ImageCtx;
  namespace plugin { template <typename> struct Api; }
  
  namespace cache {
-
  namespace pwl {
  
  class GenericLogEntry;
  class GenericWriteLogEntry;
  class SyncPointLogEntry;
  class WriteLogEntry;
-struct WriteLogPmemEntry;
+struct WriteLogCacheEntry;
  
  typedef std::list<std::shared_ptr<WriteLogEntry>> WriteLogEntries;
  typedef std::list<std::shared_ptr<GenericLogEntry>> GenericLogEntries;
@@ -49,7 +49,11 @@ typedef LogMap<GenericWriteLogEntry> WriteLogMap;
  typedef librbd::BlockGuard<GuardedRequest> WriteLogGuard;
  
  class DeferredContexts;
-template <typename> class ImageCacheState;
+template <typename>
+class ImageCacheState;
+
+template<typename T>
+class Builder;
  
  template <typename T>
  struct C_BlockIORequest;
@@ -64,9 +68,13 @@ template <typename ImageCtxT>
  class AbstractWriteLog {
  public:
    typedef io::Extent Extent;
-  typedef io::Extents Extents; 
+  typedef io::Extents Extents;
+  using This = AbstractWriteLog<ImageCtxT>;
+  Builder<This> *m_builder;
  
-  AbstractWriteLog(ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
+  AbstractWriteLog(ImageCtxT &image_ctx,
+                   librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
+                   Builder<This> *builder,
                     cache::ImageWritebackInterface& image_writeback,
                    plugin::Api<ImageCtxT>& plugin_api);
    virtual ~AbstractWriteLog();
@@ -103,13 +111,11 @@ public:
    void invalidate(Context *on_finish);
    void flush(Context *on_finish);
  
-  using This = AbstractWriteLog<ImageCtxT>;
    using C_WriteRequestT = pwl::C_WriteRequest<This>;
    using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
    using C_FlushRequestT = pwl::C_FlushRequest<This>;
    using C_DiscardRequestT = pwl::C_DiscardRequest<This>;
    using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
-  using C_CompAndWriteRequestT = pwl::C_CompAndWriteRequest<This>;
  
    CephContext * get_context();
    void release_guarded_request(BlockGuardCell *cell);
@@ -119,7 +125,8 @@ public:
        pwl::GenericLogOperationsVector &ops, bool do_early_flush) = 0;
    void schedule_append(pwl::GenericLogOperationsVector &ops);
    void schedule_append(pwl::GenericLogOperationSharedPtr op);
-  void flush_new_sync_point(C_FlushRequestT *flush_req, pwl::DeferredContexts &later);
+  void flush_new_sync_point(C_FlushRequestT *flush_req,
+                            pwl::DeferredContexts &later);
  
    std::shared_ptr<pwl::SyncPoint> get_current_sync_point() {
      return m_current_sync_point;
@@ -145,6 +152,10 @@ public:
    }
    void add_into_log_map(pwl::GenericWriteLogEntries &log_entries,
                          C_BlockIORequestT *req);
+  virtual void complete_user_request(Context *&user_req, int r) = 0;
+  virtual void copy_bl_to_buffer(
+      WriteRequestResources *resources,
+      std::unique_ptr<WriteLogOperationSet> &op_set) {}
  
  private:
   typedef std::list<pwl::C_WriteRequest<This> *> C_WriteRequests;
@@ -166,7 +177,6 @@ private:
    bool m_persist_on_write_until_flush = true;
  
   /* Debug counters for the places m_async_op_tracker is used */
-  std::atomic<int> m_async_append_ops = {0};
    std::atomic<int> m_async_complete_ops = {0};
    std::atomic<int> m_async_null_flush_finish = {0};
    std::atomic<int> m_async_process_work = {0};
@@ -186,7 +196,7 @@ private:
    Contexts m_flush_complete_contexts;
  
    std::shared_ptr<pwl::SyncPoint> m_current_sync_point = nullptr;
-  bool m_persist_on_flush = false; /* If false, persist each write before completion */
+  bool m_persist_on_flush = false; // If false, persist each write before completion
  
    int m_flush_ops_in_flight = 0;
    int m_flush_bytes_in_flight = 0;
@@ -208,7 +218,8 @@ private:
    uint32_t m_discard_granularity_bytes;
  
    BlockGuardCell* detain_guarded_request_helper(pwl::GuardedRequest &req);
-  BlockGuardCell* detain_guarded_request_barrier_helper(pwl::GuardedRequest &req);
+  BlockGuardCell* detain_guarded_request_barrier_helper(
+      pwl::GuardedRequest &req);
    void detain_guarded_request(C_BlockIORequestT *request,
                                pwl::GuardedRequestFunctionContext *guarded_ctx,
                                bool is_barrier);
@@ -224,16 +235,21 @@ private:
  
    void flush_dirty_entries(Context *on_finish);
    bool can_flush_entry(const std::shared_ptr<pwl::GenericLogEntry> log_entry);
-  bool handle_flushed_sync_point(std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
-  void sync_point_writer_flushed(std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
+  bool handle_flushed_sync_point(
+      std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
+  void sync_point_writer_flushed(
+      std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
  
    void init_flush_new_sync_point(pwl::DeferredContexts &later);
    void new_sync_point(pwl::DeferredContexts &later);
-  pwl::C_FlushRequest<AbstractWriteLog<ImageCtxT>>* make_flush_req(Context *on_finish);
-  void flush_new_sync_point_if_needed(C_FlushRequestT *flush_req, pwl::DeferredContexts &later);
+  pwl::C_FlushRequest<AbstractWriteLog<ImageCtxT>>* make_flush_req(
+      Context *on_finish);
+  void flush_new_sync_point_if_needed(C_FlushRequestT *flush_req,
+                                      pwl::DeferredContexts &later);
  
    void alloc_and_dispatch_io_req(C_BlockIORequestT *write_req);
-  void schedule_complete_op_log_entries(pwl::GenericLogOperations &&ops, const int r);
+  void schedule_complete_op_log_entries(pwl::GenericLogOperations &&ops,
+                                        const int r);
    void internal_flush(bool invalidate, Context *on_finish);
  
  protected:
@@ -274,6 +290,7 @@ protected:
    AsyncOpTracker m_async_op_tracker;
    /* Debug counters for the places m_async_op_tracker is used */
    std::atomic<int> m_async_flush_ops = {0};
+  std::atomic<int> m_async_append_ops = {0};
  
    /* Acquire locks in order declared here */
  
@@ -314,45 +331,58 @@ protected:
  
    void update_entries(
        std::shared_ptr<pwl::GenericLogEntry> log_entry,
-      pwl::WriteLogPmemEntry *pmem_entry, std::map<uint64_t, bool> &missing_sync_points,
-      std::map<uint64_t, std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
+      pwl::WriteLogCacheEntry *cache_entry,
+      std::map<uint64_t, bool> &missing_sync_points,
+      std::map<uint64_t,
+      std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
        int entry_index);
    void update_sync_points(
        std::map<uint64_t, bool> &missing_sync_points,
-      std::map<uint64_t, std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
-      pwl::DeferredContexts &later);
+      std::map<uint64_t,
+      std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
+      pwl::DeferredContexts &later, uint32_t alloc_size);
    Context *construct_flush_entry(
        const std::shared_ptr<pwl::GenericLogEntry> log_entry, bool invalidating);
    void process_writeback_dirty_entries();
    bool can_retire_entry(const std::shared_ptr<pwl::GenericLogEntry> log_entry);
  
    void dispatch_deferred_writes(void);
-  void enlist_op_appender();
    void complete_op_log_entries(pwl::GenericLogOperations &&ops, const int r);
  
    bool check_allocation(
        C_BlockIORequestT *req,
-      uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
+      uint64_t &bytes_cached, uint64_t &bytes_dirtied,
+      uint64_t &bytes_allocated,
        uint64_t &num_lanes, uint64_t &num_log_entries,
        uint64_t &num_unpublished_reserves, uint64_t bytes_allocated_cap);
    void append_scheduled(
-      pwl::GenericLogOperations &ops, bool &ops_remain, bool &appending, bool isRWL=false);
-  
+      pwl::GenericLogOperations &ops, bool &ops_remain, bool &appending,
+      bool isRWL=false);
+
    virtual void process_work() = 0;
    virtual void append_scheduled_ops(void) = 0;
    virtual void schedule_append_ops(pwl::GenericLogOperations &ops) = 0;
    virtual void remove_pool_file() = 0;
-  virtual void initialize_pool(Context *on_finish, pwl::DeferredContexts &later) = 0;
+  virtual void initialize_pool(Context *on_finish,
+                               pwl::DeferredContexts &later) = 0;
    virtual void write_data_to_buffer(
-      std::shared_ptr<pwl::WriteLogEntry> ws_entry, pwl::WriteLogPmemEntry *pmem_entry) {}
+      std::shared_ptr<pwl::WriteLogEntry> ws_entry,
+      pwl::WriteLogCacheEntry *cache_entry) {}
+  virtual void release_ram(
+      const std::shared_ptr<pwl::GenericLogEntry> log_entry) {}
    virtual void alloc_op_log_entries(pwl::GenericLogOperations &ops) {}
-  virtual bool retire_entries(const unsigned long int frees_per_tx) {return false;}
-  virtual void schedule_flush_and_append(pwl::GenericLogOperationsVector &ops) {}
-  virtual void copy_pmem(C_BlockIORequestT *req) {}
+  virtual bool retire_entries(const unsigned long int frees_per_tx) {
+    return false;
+  }
+  virtual void schedule_flush_and_append(
+      pwl::GenericLogOperationsVector &ops) {}
    virtual void persist_last_flushed_sync_gen() {}
-  virtual void reserve_pmem(C_BlockIORequestT *req, bool &alloc_succeeds, bool &no_space) {}
+  virtual void reserve_cache(C_BlockIORequestT *req, bool &alloc_succeeds,
+                             bool &no_space) {}
    virtual Context *construct_flush_entry_ctx(
-      const std::shared_ptr<pwl::GenericLogEntry> log_entry) {return nullptr;}
+      const std::shared_ptr<pwl::GenericLogEntry> log_entry) {
+    return nullptr;
+  }
  };
  
  } // namespace pwl
diff --git a/src/librbd/cache/pwl/Builder.h b/src/librbd/cache/pwl/Builder.h

new file mode 100644 (file)

index 0000000..6cea7ac
--- /dev/null
+++ b/src/librbd/cache/pwl/Builder.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_BUILDER_H
+#define CEPH_LIBRBD_CACHE_PWL_BUILDER_H
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+/**
+ * Abstract factory interface: pure-virtual creators for write/writesame log
+ * entries, write/writesame/compare-and-write requests and write log
+ * operations. T is the write-log type handed to the created requests.
+ *
+ * NOTE(review): this header has no includes of its own; it presumably relies
+ * on the includer having declared the types named below -- confirm.
+ */
+template <typename T>
+class Builder {
+public:
+  virtual ~Builder() = default;
+  virtual std::shared_ptr<WriteLogEntry> create_write_log_entry(
+      uint64_t image_offset_bytes, uint64_t write_bytes) = 0;
+  virtual std::shared_ptr<WriteLogEntry> create_write_log_entry(
+      std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+      uint64_t image_offset_bytes, uint64_t write_bytes) = 0;
+  virtual std::shared_ptr<WriteLogEntry> create_writesame_log_entry(
+      uint64_t image_offset_bytes, uint64_t write_bytes,
+      uint32_t data_length) = 0;
+  virtual std::shared_ptr<WriteLogEntry> create_writesame_log_entry(
+      std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+      uint64_t image_offset_bytes, uint64_t write_bytes,
+      uint32_t data_length) = 0;
+  virtual C_WriteRequest<T> *create_write_request(
+      T &pwl, utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req) = 0;
+  virtual C_WriteSameRequest<T> *create_writesame_request(
+      T &pwl, utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req) = 0;
+  virtual C_WriteRequest<T> *create_comp_and_write_request(
+      T &pwl, utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+      const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req) = 0;
+  virtual std::shared_ptr<WriteLogOperation> create_write_log_operation(
+      WriteLogOperationSet &set, uint64_t image_offset_bytes,
+      uint64_t write_bytes, CephContext *cct,
+      std::shared_ptr<WriteLogEntry> write_log_entry) = 0;
+  virtual std::shared_ptr<WriteLogOperation> create_write_log_operation(
+      WriteLogOperationSet &set, uint64_t image_offset_bytes,
+      uint64_t write_bytes, uint32_t data_len, CephContext *cct,
+      std::shared_ptr<WriteLogEntry> writesame_log_entry) = 0;
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_BUILDER_H
diff --git a/src/librbd/cache/pwl/ImageCacheState.cc b/src/librbd/cache/pwl/ImageCacheState.cc

index 09ebd15b841870ced142bd8a4739450edc161956..4320024b316daac8fc35b34602be579aa5f2ca40 100644 (file)
--- a/src/librbd/cache/pwl/ImageCacheState.cc
+++ b/src/librbd/cache/pwl/ImageCacheState.cc
@@ -41,6 +41,7 @@ ImageCacheState<I>::ImageCacheState(I *image_ctx, plugin::Api<I>& plugin_api) :
  
    ConfigProxy &config = image_ctx->config;
    log_periodic_stats = config.get_val<bool>("rbd_rwl_log_periodic_stats");
+  cache_type = config.get_val<std::string>("rbd_persistent_cache_mode");
  }
  
  template <typename I>
@@ -93,7 +94,7 @@ void ImageCacheState<I>::dump(ceph::Formatter *f) const {
    ::encode_json("present", present, f);
    ::encode_json("empty", empty, f);
    ::encode_json("clean", clean, f);
-  ::encode_json("cache_type", (int)get_image_cache_type(), f);
+  ::encode_json("cache_type", cache_type, f);
    ::encode_json("pwl_host", host, f);
    ::encode_json("pwl_path", path, f);
    ::encode_json("pwl_size", size, f);
@@ -143,6 +144,7 @@ ImageCacheState<I>* ImageCacheState<I>::create_image_cache_state(
      int cache_type = (int)f["cache_type"];
  
      switch (cache_type) {
+      case IMAGE_CACHE_TYPE_SSD:
        case IMAGE_CACHE_TYPE_RWL:
          if (!cache_exists) {
            cache_state = new ImageCacheState<I>(image_ctx, plugin_api);
diff --git a/src/librbd/cache/pwl/ImageCacheState.h b/src/librbd/cache/pwl/ImageCacheState.h

index 1da4306464f757a903ee54bd1cc44f1938af01f3..7ea1412e26ff88be254077f77b2a37ea7a6f29ae 100644 (file)
--- a/src/librbd/cache/pwl/ImageCacheState.h
+++ b/src/librbd/cache/pwl/ImageCacheState.h
@@ -2,7 +2,7 @@
  // vim: ts=8 sw=2 smarttab
  
  #ifndef CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
-#define CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H 
+#define CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
  
  #include "librbd/ImageCtx.h"
  #include "librbd/cache/Types.h"
@@ -31,6 +31,7 @@ public:
    bool clean = true;
    std::string host;
    std::string path;
+  std::string cache_type;
    uint64_t size = 0;
    bool log_periodic_stats;
  
@@ -42,7 +43,12 @@ public:
    ~ImageCacheState() {}
  
    ImageCacheType get_image_cache_type() const {
-    return IMAGE_CACHE_TYPE_RWL;
+    if (cache_type == "rwl") {
+      return IMAGE_CACHE_TYPE_RWL;
+    } else if (cache_type == "ssd") {
+      return IMAGE_CACHE_TYPE_SSD;
+    }
+    return IMAGE_CACHE_TYPE_UNKNOWN;
    }
  
  
diff --git a/src/librbd/cache/pwl/InitRequest.cc b/src/librbd/cache/pwl/InitRequest.cc

index 4b0962a81585e5e141fe14e9c1fb0a5a6f15012e..ea00d4fbc7664410a19b0f276c90a24d9030cd52 100644 (file)
--- a/src/librbd/cache/pwl/InitRequest.cc
+++ b/src/librbd/cache/pwl/InitRequest.cc
@@ -12,11 +12,11 @@
  #include "librbd/cache/WriteLogImageDispatch.h"
  #include "librbd/cache/ImageWriteback.h"
  #ifdef WITH_RBD_RWL
-#include "librbd/cache/pwl/ReplicatedWriteLog.h"
+#include "librbd/cache/pwl/rwl/WriteLog.h"
  #endif
  
  #ifdef WITH_RBD_SSD_CACHE
-#include "librbd/cache/pwl/SSDWriteLog.h"
+#include "librbd/cache/pwl/ssd/WriteLog.h"
  #endif
  
  #include "librbd/cache/Utils.h"
@@ -90,19 +90,19 @@ void InitRequest<I>::get_image_cache_state() {
      #ifdef WITH_RBD_RWL
      case cache::IMAGE_CACHE_TYPE_RWL:
        m_image_cache =
-        new librbd::cache::pwl::ReplicatedWriteLog<I>(m_image_ctx,
-                                                      cache_state,
-                                                      m_image_writeback,
-                                                      m_plugin_api);
+        new librbd::cache::pwl::rwl::WriteLog<I>(m_image_ctx,
+                                                 cache_state,
+                                                 m_image_writeback,
+                                                 m_plugin_api);
        break;
      #endif
      #ifdef WITH_RBD_SSD_CACHE
      case cache::IMAGE_CACHE_TYPE_SSD:
        m_image_cache =
-        new librbd::cache::pwl::SSDWriteLog<I>(m_image_ctx,
-                                               cache_state,
-                                               m_image_writeback,
-                                               m_plugin_api);
+        new librbd::cache::pwl::ssd::WriteLog<I>(m_image_ctx,
+                                                 cache_state,
+                                                 m_image_writeback,
+                                                 m_plugin_api);
        break;
      #endif
      default:
@@ -122,8 +122,8 @@ void InitRequest<I>::init_image_cache() {
    ldout(cct, 10) << dendl;
  
    using klass = InitRequest<I>;
-  Context *ctx = create_context_callback<klass, &klass::handle_init_image_cache>(
-    this);
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_init_image_cache>(this);
    m_image_cache->init(ctx);
  }
  
@@ -199,8 +199,8 @@ void InitRequest<I>::shutdown_image_cache() {
    ldout(cct, 10) << dendl;
  
    using klass = InitRequest<I>;
-  Context *ctx = create_context_callback<klass, &klass::handle_shutdown_image_cache>(
-    this);
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_shutdown_image_cache>(this);
    m_image_cache->shut_down(ctx);
  }
  
diff --git a/src/librbd/cache/pwl/LogEntry.cc b/src/librbd/cache/pwl/LogEntry.cc

index 06f7931ea8c7015a3a11a14564f32c69dafa7594..98224241b557e384e48da8e63ef4c507fec8786c 100644 (file)
--- a/src/librbd/cache/pwl/LogEntry.cc
+++ b/src/librbd/cache/pwl/LogEntry.cc
@@ -11,14 +11,12 @@
                             <<  __func__ << ": "
  
  namespace librbd {
-
  namespace cache {
-
  namespace pwl {
  
  std::ostream& GenericLogEntry::format(std::ostream &os) const {
    os << "ram_entry=[" << ram_entry << "], "
-     << "pmem_entry=" << (void*)pmem_entry << ", "
+     << "cache_entry=" << (void*)cache_entry << ", "
       << "log_entry_index=" << log_entry_index << ", "
       << "completed=" << completed;
    return os;
@@ -73,16 +71,9 @@ std::ostream &operator<<(std::ostream &os,
    return entry.format(os);
  }
  
-#ifdef WITH_RBD_RWL
-void WriteLogEntry::init_pmem_buffer(std::vector<WriteBufferAllocation>::iterator allocation) {
-  ram_entry.write_data = allocation->buffer_oid;
-  ceph_assert(!TOID_IS_NULL(ram_entry.write_data));
-  pmem_buffer = D_RW(ram_entry.write_data);
-}
-#endif
-
  void WriteLogEntry::init(bool has_data,
-                         uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush) {
+                         uint64_t current_sync_gen,
+                         uint64_t last_op_sequence_num, bool persist_on_flush) {
    ram_entry.has_data = 1;
    ram_entry.sync_gen_number = current_sync_gen;
    if (persist_on_flush) {
@@ -97,68 +88,21 @@ void WriteLogEntry::init(bool has_data,
    ram_entry.discard = 0;
  }
  
-void WriteLogEntry::init_pmem_bp() {
-  ceph_assert(!pmem_bp.have_raw());
-  pmem_bp = buffer::ptr(buffer::create_static(this->write_bytes(), (char*)pmem_buffer));
-}
-
-void WriteLogEntry::init_pmem_bl() {
-  pmem_bl.clear();
-  init_pmem_bp();
-  ceph_assert(pmem_bp.have_raw());
-  int before_bl = pmem_bp.raw_nref();
-  this->init_bl(pmem_bp, pmem_bl);
-  int after_bl = pmem_bp.raw_nref();
-  bl_refs = after_bl - before_bl;
-}
-
  unsigned int WriteLogEntry::reader_count() const {
-  if (pmem_bp.have_raw()) {
-    return (pmem_bp.raw_nref() - bl_refs - 1);
+  if (cache_bp.have_raw()) {
+    return (cache_bp.raw_nref() - bl_refs - 1);
    } else {
      return 0;
    }
  }
  
-/* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */
-buffer::list& WriteLogEntry::get_pmem_bl() {
-  if (0 == bl_refs) {
-    std::lock_guard locker(m_entry_bl_lock);
-    if (0 == bl_refs) {
-      init_pmem_bl();
-    }
-    ceph_assert(0 != bl_refs);
-  }
-  return pmem_bl;
-}
-
-/* Constructs a new bl containing copies of pmem_bp */
-void WriteLogEntry::copy_pmem_bl(bufferlist *out_bl) {
-  this->get_pmem_bl();
-  /* pmem_bp is now initialized */
-  buffer::ptr cloned_bp(pmem_bp.clone());
-  out_bl->clear();
-  this->init_bl(cloned_bp, *out_bl);
-}
-
-void WriteLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback,
-                              Context *ctx) {
-  /* Pass a copy of the pmem buffer to ImageWriteback (which may hang on to the bl even after flush()). */
-  bufferlist entry_bl;
-  buffer::list entry_bl_copy;
-  copy_pmem_bl(&entry_bl_copy);
-  entry_bl_copy.begin(0).copy(write_bytes(), entry_bl);
-  image_writeback.aio_write({{ram_entry.image_offset_bytes, ram_entry.write_bytes}},
-                            std::move(entry_bl), 0, ctx);
-}
-
  std::ostream& WriteLogEntry::format(std::ostream &os) const {
    os << "(Write) ";
    GenericWriteLogEntry::format(os);
    os << ", "
-     << "pmem_buffer=" << (void*)pmem_buffer << ", ";
-  os << "pmem_bp=" << pmem_bp << ", ";
-  os << "pmem_bl=" << pmem_bl << ", ";
+     << "cache_buffer=" << (void*)cache_buffer << ", ";
+  os << "cache_bp=" << cache_bp << ", ";
+  os << "cache_bl=" << cache_bl << ", ";
    os << "bl_refs=" << bl_refs;
    return os;
  }
@@ -168,13 +112,15 @@ std::ostream &operator<<(std::ostream &os,
    return entry.format(os);
  }
  
-void DiscardLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback,
-                                Context *ctx) {
-  image_writeback.aio_discard(ram_entry.image_offset_bytes, ram_entry.write_bytes,
+void DiscardLogEntry::writeback(
+    librbd::cache::ImageWritebackInterface &image_writeback, Context *ctx) {
+  image_writeback.aio_discard(ram_entry.image_offset_bytes,
+                              ram_entry.write_bytes,
                                m_discard_granularity_bytes, ctx);
  }
  
-void DiscardLogEntry::init(uint64_t current_sync_gen, bool persist_on_flush, uint64_t last_op_sequence_num) {
+void DiscardLogEntry::init(uint64_t current_sync_gen, bool persist_on_flush,
+                           uint64_t last_op_sequence_num) {
    ram_entry.sync_gen_number = current_sync_gen;
    if (persist_on_flush) {
      /* Persist on flush. Sequence #0 is never used. */
@@ -197,37 +143,6 @@ std::ostream &operator<<(std::ostream &os,
    return entry.format(os);
  }
  
-void WriteSameLogEntry::init_bl(buffer::ptr &bp, buffer::list &bl) {
-  for (uint64_t i = 0; i < ram_entry.write_bytes / ram_entry.ws_datalen; i++) {
-    bl.append(bp);
-  }
-  int trailing_partial = ram_entry.write_bytes % ram_entry.ws_datalen;
-  if (trailing_partial) {
-    bl.append(bp, 0, trailing_partial);
-  }
-}
-
-void WriteSameLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback,
-                                  Context *ctx) {
-  bufferlist entry_bl;
-  buffer::list entry_bl_copy;
-  copy_pmem_bl(&entry_bl_copy);
-  entry_bl_copy.begin(0).copy(write_bytes(), entry_bl);
-  image_writeback.aio_writesame(ram_entry.image_offset_bytes, ram_entry.write_bytes,
-                                std::move(entry_bl), 0, ctx);
-}
-
-std::ostream &WriteSameLogEntry::format(std::ostream &os) const {
-  os << "(WriteSame) ";
-  WriteLogEntry::format(os);
-  return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
-                         const WriteSameLogEntry &entry) {
-  return entry.format(os);
-}
-
  } // namespace pwl
  } // namespace cache
  } // namespace librbd
diff --git a/src/librbd/cache/pwl/LogEntry.h b/src/librbd/cache/pwl/LogEntry.h

index 6f477fe83bdd084e077759f1c818b0eff11501e5..7c216ad9e97b54583ad11a5b676a80a937f72e42 100644 (file)
--- a/src/librbd/cache/pwl/LogEntry.h
+++ b/src/librbd/cache/pwl/LogEntry.h
@@ -1,8 +1,8 @@
  // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
  // vim: ts=8 sw=2 smarttab
  
-#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
-#define CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
+#ifndef CEPH_LIBRBD_CACHE_PWL_LOG_ENTRY_H
+#define CEPH_LIBRBD_CACHE_PWL_LOG_ENTRY_H
  
  #include "common/ceph_mutex.h"
  #include "librbd/Utils.h"
@@ -23,11 +23,11 @@ typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
  
  class GenericLogEntry {
  public:
-  WriteLogPmemEntry ram_entry;
-  WriteLogPmemEntry *pmem_entry = nullptr;
+  WriteLogCacheEntry ram_entry;
+  WriteLogCacheEntry *cache_entry = nullptr;
    uint32_t log_entry_index = 0;
    bool completed = false;
-  GenericLogEntry(const uint64_t image_offset_bytes = 0, const uint64_t write_bytes = 0)
+  GenericLogEntry(uint64_t image_offset_bytes = 0, uint64_t write_bytes = 0)
      : ram_entry(image_offset_bytes, write_bytes) {
    };
    virtual ~GenericLogEntry() { };
@@ -55,6 +55,20 @@ public:
                           Context *ctx) {
      ceph_assert(false);
    };
+  virtual void writeback_bl(librbd::cache::ImageWritebackInterface &image_writeback,
+                 Context *ctx, ceph::bufferlist &&bl) {
+    ceph_assert(false);
+  }
+  virtual bool is_write_entry() const {
+    return false;
+  }
+  virtual bool is_sync_point() const {
+    return false;
+  }
+  virtual unsigned int get_aligned_data_size() const {
+    return 0;
+  }
+  virtual void remove_cache_bl() {}
    virtual std::ostream& format(std::ostream &os) const;
    friend std::ostream &operator<<(std::ostream &os,
                                    const GenericLogEntry &entry);
@@ -73,7 +87,7 @@ public:
    /* All writing entries using all prior sync gen numbers have been flushed */
    std::atomic<bool> prior_sync_point_flushed = {true};
    std::shared_ptr<SyncPointLogEntry> next_sync_point_entry = nullptr;
-  SyncPointLogEntry(const uint64_t sync_gen_number) {
+  SyncPointLogEntry(uint64_t sync_gen_number) {
      ram_entry.sync_gen_number = sync_gen_number;
      ram_entry.sync_point = 1;
    };
@@ -83,6 +97,9 @@ public:
    bool can_retire() const override {
      return this->completed;
    }
+  bool is_sync_point() const override {
+    return true;
+  }
    std::ostream& format(std::ostream &os) const;
    friend std::ostream &operator<<(std::ostream &os,
                                    const SyncPointLogEntry &entry);
@@ -93,9 +110,9 @@ public:
    uint32_t referring_map_entries = 0;
    std::shared_ptr<SyncPointLogEntry> sync_point_entry;
    GenericWriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
-                       const uint64_t image_offset_bytes, const uint64_t write_bytes)
+                       uint64_t image_offset_bytes, uint64_t write_bytes)
      : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(sync_point_entry) { }
-  GenericWriteLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+  GenericWriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes)
      : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(nullptr) { }
    ~GenericWriteLogEntry() override {};
    GenericWriteLogEntry(const GenericWriteLogEntry&) = delete;
@@ -120,7 +137,7 @@ public:
    std::shared_ptr<SyncPointLogEntry> get_sync_point_entry() override {
      return sync_point_entry;
    }
-  virtual void copy_pmem_bl(bufferlist *out_bl) = 0;
+  virtual void copy_cache_bl(bufferlist *out_bl) = 0;
    void set_flushed(bool flushed) override {
      m_flushed = flushed;
    }
@@ -137,53 +154,75 @@ private:
  
  class WriteLogEntry : public GenericWriteLogEntry {
  protected:
-  buffer::ptr pmem_bp;
-  buffer::list pmem_bl;
-  std::atomic<int> bl_refs = {0}; /* The refs held on pmem_bp by pmem_bl */
-  /* Used in WriteLogEntry::get_pmem_bl() to syncronize between threads making entries readable */
+  bool is_writesame = false;
+  buffer::ptr cache_bp;
+  buffer::list cache_bl;
+  std::atomic<int> bl_refs = {0}; /* The refs held on cache_bp by cache_bl */
+  /* Used in WriteLogEntry::get_cache_bl() to synchronize between threads making entries readable */
    mutable ceph::mutex m_entry_bl_lock;
  
-  void init_pmem_bp();
-
-  /* Write same will override */
-  virtual void init_bl(buffer::ptr &bp, buffer::list &bl) {
-    bl.append(bp);
-  }
-
-  void init_pmem_bl();
+  virtual void init_cache_bp() {}
  
+  virtual void init_bl(buffer::ptr &bp, buffer::list &bl) {}
  public:
-  uint8_t *pmem_buffer = nullptr;
+  uint8_t *cache_buffer = nullptr;
    WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
-                const uint64_t image_offset_bytes, const uint64_t write_bytes)
+                uint64_t image_offset_bytes, uint64_t write_bytes)
      : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes),
        m_entry_bl_lock(ceph::make_mutex(pwl::unique_lock_name(
          "librbd::cache::pwl::WriteLogEntry::m_entry_bl_lock", this)))
    { }
-  WriteLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+  WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes)
      : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes),
        m_entry_bl_lock(ceph::make_mutex(pwl::unique_lock_name(
          "librbd::cache::pwl::WriteLogEntry::m_entry_bl_lock", this)))
    { }
-  ~WriteLogEntry() override {};
+  WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+                    uint64_t image_offset_bytes, uint64_t write_bytes,
+                    uint32_t data_length)
+    : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {
+    ram_entry.writesame = 1;
+    ram_entry.ws_datalen = data_length;
+    is_writesame = true;
+  };
+  WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes,
+                    uint32_t data_length)
+    : WriteLogEntry(nullptr, image_offset_bytes, write_bytes) {
+    ram_entry.writesame = 1;
+    ram_entry.ws_datalen = data_length;
+    is_writesame = true;
+  };
+ ~WriteLogEntry() override {};
    WriteLogEntry(const WriteLogEntry&) = delete;
    WriteLogEntry &operator=(const WriteLogEntry&) = delete;
+  unsigned int write_bytes() const override {
+    // The valid bytes in this op's data buffer.
+    if(is_writesame) {
+      return ram_entry.ws_datalen;
+    }
+    return ram_entry.write_bytes;
+  };
+  unsigned int bytes_dirty() const override {
+    // The bytes in the image this op makes dirty.
+    return ram_entry.write_bytes;
+  };
    void init(bool has_data,
              uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush);
-  #ifdef WITH_RBD_RWL
-  void init_pmem_buffer(std::vector<WriteBufferAllocation>::iterator allocation);
-  #endif
+  virtual void init_cache_buffer(std::vector<WriteBufferAllocation>::iterator allocation) {}
+  virtual void init_cache_bl(bufferlist &src_bl, uint64_t off, uint64_t len) {}
+  /* Returns a ref to a bl containing bufferptrs to the entry cache buffer */
+  virtual buffer::list &get_cache_bl() = 0;
+
    BlockExtent block_extent();
    unsigned int reader_count() const;
-  /* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */
-  buffer::list &get_pmem_bl();
-  /* Constructs a new bl containing copies of pmem_bp */
-  void copy_pmem_bl(bufferlist *out_bl) override;
-  void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
-                 Context *ctx) override;
+  /* Constructs a new bl containing copies of cache_bp */
+  void copy_cache_bl(bufferlist *out_bl) override {};
    bool can_retire() const override {
      return (this->completed && this->get_flushed() && (0 == reader_count()));
    }
+  bool is_write_entry() const override {
+    return true;
+  }
    std::ostream &format(std::ostream &os) const;
    friend std::ostream &operator<<(std::ostream &os,
                                    const WriteLogEntry &entry);
@@ -192,13 +231,13 @@ public:
  class DiscardLogEntry : public GenericWriteLogEntry {
  public:
    DiscardLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
-                  const uint64_t image_offset_bytes, const uint64_t write_bytes,
+                  uint64_t image_offset_bytes, uint64_t write_bytes,
                    uint32_t discard_granularity_bytes)
      : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes),
        m_discard_granularity_bytes(discard_granularity_bytes) {
      ram_entry.discard = 1;
    };
-  DiscardLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+  DiscardLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes)
      : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes) {
      ram_entry.discard = 1;
    };
@@ -215,7 +254,7 @@ public:
    bool can_retire() const override {
      return this->completed;
    }
-  void copy_pmem_bl(bufferlist *out_bl) override {
+  void copy_cache_bl(bufferlist *out_bl) override {
      ceph_assert(false);
    }
    void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
@@ -228,43 +267,8 @@ private:
    uint32_t m_discard_granularity_bytes;
  };
  
-class WriteSameLogEntry : public WriteLogEntry {
-protected:
-  void init_bl(buffer::ptr &bp, buffer::list &bl) override;
-
-public:
-  WriteSameLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
-                    const uint64_t image_offset_bytes, const uint64_t write_bytes,
-                    const uint32_t data_length)
-    : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {
-    ram_entry.writesame = 1;
-    ram_entry.ws_datalen = data_length;
-  };
-  WriteSameLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes,
-                    const uint32_t data_length)
-    : WriteLogEntry(nullptr, image_offset_bytes, write_bytes) {
-    ram_entry.writesame = 1;
-    ram_entry.ws_datalen = data_length;
-  };
-  WriteSameLogEntry(const WriteSameLogEntry&) = delete;
-  WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete;
-  unsigned int write_bytes() const override {
-    /* The valid bytes in this ops data buffer. */
-    return ram_entry.ws_datalen;
-  };
-  unsigned int bytes_dirty() const override {
-    /* The bytes in the image this op makes dirty. */
-    return ram_entry.write_bytes;
-  };
-  void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
-                 Context *ctx) override;
-  std::ostream &format(std::ostream &os) const;
-  friend std::ostream &operator<<(std::ostream &os,
-                                  const WriteSameLogEntry &entry);
-};
-
  } // namespace pwl
  } // namespace cache
  } // namespace librbd
  
-#endif // CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
+#endif // CEPH_LIBRBD_CACHE_PWL_LOG_ENTRY_H
diff --git a/src/librbd/cache/pwl/LogOperation.cc b/src/librbd/cache/pwl/LogOperation.cc

index aca964031e1c84955242a5abdc917860f1baeb31..0bb4f092f7ee61df2958a3d4bbd2a3e13e9b4620 100644 (file)
--- a/src/librbd/cache/pwl/LogOperation.cc
+++ b/src/librbd/cache/pwl/LogOperation.cc
@@ -7,16 +7,15 @@
  
  #define dout_subsys ceph_subsys_rbd_pwl
  #undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::pwl::LogOperation: " << this << " " \
-                           <<  __func__ << ": "
+#define dout_prefix *_dout << "librbd::cache::pwl::LogOperation: " << this \
+                           << " " <<  __func__ << ": "
  
  namespace librbd {
-
  namespace cache {
-
  namespace pwl {
  
-GenericLogOperation::GenericLogOperation(const utime_t dispatch_time, PerfCounters *perfcounter)
+GenericLogOperation::GenericLogOperation(utime_t dispatch_time,
+                                         PerfCounters *perfcounter)
    : m_perfcounter(perfcounter), dispatch_time(dispatch_time) {
  }
  
@@ -36,10 +35,11 @@ std::ostream &operator<<(std::ostream &os,
  
  SyncPointLogOperation::SyncPointLogOperation(ceph::mutex &lock,
                                               std::shared_ptr<SyncPoint> sync_point,
-                                             const utime_t dispatch_time,
+                                             utime_t dispatch_time,
                                               PerfCounters *perfcounter,
                                               CephContext *cct)
-  : GenericLogOperation(dispatch_time, perfcounter), m_cct(cct), m_lock(lock), sync_point(sync_point) {
+  : GenericLogOperation(dispatch_time, perfcounter), m_cct(cct), m_lock(lock),
+    sync_point(sync_point) {
  }
  
  SyncPointLogOperation::~SyncPointLogOperation() { }
@@ -110,7 +110,7 @@ void SyncPointLogOperation::complete(int result) {
  }
  
  GenericWriteLogOperation::GenericWriteLogOperation(std::shared_ptr<SyncPoint> sync_point,
-                                                   const utime_t dispatch_time,
+                                                   utime_t dispatch_time,
                                                     PerfCounters *perfcounter,
                                                     CephContext *cct)
    : GenericLogOperation(dispatch_time, perfcounter),
@@ -158,35 +158,54 @@ void GenericWriteLogOperation::complete(int result) {
      on_write_persist = nullptr;
    }
    if (on_persist) {
-    ldout(m_cct, 20) << __func__ << " " << this << " on_persist=" << on_persist << dendl;
+    ldout(m_cct, 20) << __func__ << " " << this << " on_persist=" << on_persist
+                     << dendl;
      on_persist->complete(result);
    }
  }
  
-WriteLogOperation::WriteLogOperation(WriteLogOperationSet &set,
-                                     uint64_t image_offset_bytes, uint64_t write_bytes,
-                                     CephContext *cct)
-  : GenericWriteLogOperation(set.sync_point, set.dispatch_time, set.perfcounter, cct),
-    log_entry(std::make_shared<WriteLogEntry>(set.sync_point->log_entry, image_offset_bytes, write_bytes)) {
+WriteLogOperation::WriteLogOperation(
+    WriteLogOperationSet &set, uint64_t image_offset_bytes,
+    uint64_t write_bytes, CephContext *cct,
+    std::shared_ptr<WriteLogEntry> write_log_entry)
+  : GenericWriteLogOperation(set.sync_point, set.dispatch_time,
+                             set.perfcounter, cct),
+    log_entry(write_log_entry) {
    on_write_append = set.extent_ops_appending->new_sub();
    on_write_persist = set.extent_ops_persist->new_sub();
    log_entry->sync_point_entry->writes++;
    log_entry->sync_point_entry->bytes += write_bytes;
  }
  
+WriteLogOperation::WriteLogOperation(WriteLogOperationSet &set,
+                                     uint64_t image_offset_bytes,
+                                     uint64_t write_bytes,
+                                     uint32_t data_len,
+                                     CephContext *cct,
+                                     std::shared_ptr<WriteLogEntry> writesame_log_entry)
+  : WriteLogOperation(set, image_offset_bytes, write_bytes, cct,
+                      writesame_log_entry) {
+  is_writesame = true;
+}
+
  WriteLogOperation::~WriteLogOperation() { }
  
-void WriteLogOperation::init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation, uint64_t current_sync_gen,
-                             uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset,
+void WriteLogOperation::init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation,
+                             uint64_t current_sync_gen,
+                             uint64_t last_op_sequence_num,
+                             bufferlist &write_req_bl, uint64_t buffer_offset,
                               bool persist_on_flush) {
-  log_entry->init(has_data, current_sync_gen, last_op_sequence_num, persist_on_flush);
+  log_entry->init(has_data, current_sync_gen, last_op_sequence_num,
+                  persist_on_flush);
    buffer_alloc = &(*allocation);
-  bl.substr_of(write_req_bl, buffer_offset,
-               log_entry->write_bytes());
+  bl.substr_of(write_req_bl, buffer_offset, log_entry->write_bytes());
+  log_entry->init_cache_bl(write_req_bl, buffer_offset,
+                           log_entry->write_bytes());
  }
  
  std::ostream &WriteLogOperation::format(std::ostream &os) const {
-  os << "(Write) ";
+  string op_name = is_writesame ? "(Write Same) " : "(Write) ";
+  os << op_name;
    GenericWriteLogOperation::format(os);
    os << ", ";
    if (log_entry) {
@@ -215,22 +234,6 @@ void WriteLogOperation::complete(int result) {
    m_perfcounter->tinc(l_librbd_pwl_log_op_buf_to_app_t, log_append_time - buf_persist_time);
  }
  
-#ifdef WITH_RBD_RWL
-void WriteLogOperation::copy_bl_to_pmem_buffer(std::vector<WriteBufferAllocation>::iterator allocation) {
-  /* operation is a shared_ptr, so write_op is only good as long as operation is in scope */
-  bufferlist::iterator i(&bl);
-  m_perfcounter->inc(l_librbd_pwl_log_op_bytes, log_entry->write_bytes());
-  ldout(m_cct, 20) << bl << dendl;
-  log_entry->init_pmem_buffer(allocation);
-  i.copy((unsigned)log_entry->write_bytes(), (char*)log_entry->pmem_buffer);
-}
-
-void WriteLogOperation::flush_pmem_buf_to_cache(PMEMobjpool *log_pool) {
-  buf_persist_time = ceph_clock_now();
-  pmemobj_flush(log_pool, log_entry->pmem_buffer, log_entry->write_bytes());
-}
-#endif
-
  WriteLogOperationSet::WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr<SyncPoint> sync_point,
                                             bool persist_on_flush, CephContext *cct, Context *on_finish)
    : m_cct(cct), m_on_finish(on_finish),
@@ -270,10 +273,10 @@ std::ostream &operator<<(std::ostream &os,
  }
  
  DiscardLogOperation::DiscardLogOperation(std::shared_ptr<SyncPoint> sync_point,
-                                         const uint64_t image_offset_bytes,
-                                         const uint64_t write_bytes,
+                                         uint64_t image_offset_bytes,
+                                         uint64_t write_bytes,
                                           uint32_t discard_granularity_bytes,
-                                         const utime_t dispatch_time,
+                                         utime_t dispatch_time,
                                           PerfCounters *perfcounter,
                                           CephContext *cct)
    : GenericWriteLogOperation(sync_point, dispatch_time, perfcounter, cct),
@@ -312,30 +315,6 @@ std::ostream &operator<<(std::ostream &os,
    return op.format(os);
  }
  
-WriteSameLogOperation::WriteSameLogOperation(WriteLogOperationSet &set,
-                                             uint64_t image_offset_bytes,
-                                             uint64_t write_bytes,
-                                             uint32_t data_len,
-                                             CephContext *cct)
-  : WriteLogOperation(set, image_offset_bytes, write_bytes, cct) {
-  log_entry =
-    std::make_shared<WriteSameLogEntry>(set.sync_point->log_entry, image_offset_bytes, write_bytes, data_len);
-  ldout(m_cct, 20) << __func__ << " " << this << dendl;
-}
-
-WriteSameLogOperation::~WriteSameLogOperation() { }
-
-std::ostream &WriteSameLogOperation::format(std::ostream &os) const {
-  os << "(Write Same) ";
-  WriteLogOperation::format(os);
-  return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
-                         const WriteSameLogOperation &op) {
-  return op.format(os);
-}
-
  } // namespace pwl
  } // namespace cache
  } // namespace librbd
diff --git a/src/librbd/cache/pwl/LogOperation.h b/src/librbd/cache/pwl/LogOperation.h

index d3aa37e87c0f008fd4053a9b087b2633724c0c65..856c5fd2d9fec3cc5df8a1804330f3b469773111 100644 (file)
--- a/src/librbd/cache/pwl/LogOperation.h
+++ b/src/librbd/cache/pwl/LogOperation.h
@@ -11,6 +11,7 @@
  namespace librbd {
  namespace cache {
  namespace pwl {
+
  struct WriteBufferAllocation;
  
  class WriteLogOperationSet;
@@ -23,6 +24,9 @@ class SyncPointLogOperation;
  
  class GenericLogOperation;
  
+template <typename T>
+class AbstractWriteLog;
+
  using GenericLogOperationSharedPtr = std::shared_ptr<GenericLogOperation>;
  
  using GenericLogOperationsVector = std::vector<GenericLogOperationSharedPtr>;
@@ -36,7 +40,7 @@ public:
    utime_t buf_persist_comp_time; // When buffer persist completes
    utime_t log_append_time;       // When log append begins
    utime_t log_append_comp_time;  // When log append completes
-  GenericLogOperation(const utime_t dispatch_time, PerfCounters *perfcounter);
+  GenericLogOperation(utime_t dispatch_time, PerfCounters *perfcounter);
    virtual ~GenericLogOperation() { };
    GenericLogOperation(const GenericLogOperation&) = delete;
    GenericLogOperation &operator=(const GenericLogOperation&) = delete;
@@ -53,11 +57,8 @@ public:
    virtual bool is_writing_op() const {
      return false;
    }
-  #ifdef WITH_RBD_RWL
-  virtual void copy_bl_to_pmem_buffer(
+  virtual void copy_bl_to_cache_buffer(
        std::vector<WriteBufferAllocation>::iterator allocation) {};
-  virtual void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) {};
-  #endif
  };
  
  class SyncPointLogOperation : public GenericLogOperation {
@@ -71,7 +72,7 @@ public:
    std::shared_ptr<SyncPoint> sync_point;
    SyncPointLogOperation(ceph::mutex &lock,
                          std::shared_ptr<SyncPoint> sync_point,
-                        const utime_t dispatch_time,
+                        utime_t dispatch_time,
                          PerfCounters *perfcounter,
                          CephContext *cct);
    ~SyncPointLogOperation() override;
@@ -99,7 +100,7 @@ public:
    Context *on_write_persist = nullptr; /* Completion for things waiting on this
                                          * write to persist */
    GenericWriteLogOperation(std::shared_ptr<SyncPoint> sync_point,
-                           const utime_t dispatch_time,
+                           utime_t dispatch_time,
                             PerfCounters *perfcounter,
                             CephContext *cct);
    ~GenericWriteLogOperation() override;
@@ -129,14 +130,24 @@ public:
    using GenericWriteLogOperation::on_write_persist;
    std::shared_ptr<WriteLogEntry> log_entry;
    bufferlist bl;
+  bool is_writesame = false;
    WriteBufferAllocation *buffer_alloc = nullptr;
-  WriteLogOperation(WriteLogOperationSet &set, const uint64_t image_offset_bytes,
-                    const uint64_t write_bytes, CephContext *cct);
-  ~WriteLogOperation() override;
+  WriteLogOperation(WriteLogOperationSet &set,
+                    uint64_t image_offset_bytes,
+                    uint64_t write_bytes, CephContext *cct,
+                    std::shared_ptr<WriteLogEntry> write_log_entry);
+  WriteLogOperation(WriteLogOperationSet &set,
+                    uint64_t image_offset_bytes,
+                    uint64_t write_bytes, uint32_t data_len,
+                    CephContext *cct,
+                    std::shared_ptr<WriteLogEntry> writesame_log_entry);
+ ~WriteLogOperation() override;
    WriteLogOperation(const WriteLogOperation&) = delete;
    WriteLogOperation &operator=(const WriteLogOperation&) = delete;
-  void init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation, uint64_t current_sync_gen,
-            uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset,
+  void init(bool has_data,
+            std::vector<WriteBufferAllocation>::iterator allocation,
+            uint64_t current_sync_gen, uint64_t last_op_sequence_num,
+            bufferlist &write_req_bl, uint64_t buffer_offset,
              bool persist_on_flush);
    std::ostream &format(std::ostream &os) const;
    friend std::ostream &operator<<(std::ostream &os,
@@ -146,11 +157,6 @@ public:
    }
  
    void complete(int r) override;
-  #ifdef WITH_RBD_RWL
-  void copy_bl_to_pmem_buffer(
-      std::vector<WriteBufferAllocation>::iterator allocation) override;
-  void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) override;
-  #endif
  };
  
  
@@ -169,8 +175,10 @@ public:
    utime_t dispatch_time; /* When set created */
    PerfCounters *perfcounter = nullptr;
    std::shared_ptr<SyncPoint> sync_point;
-  WriteLogOperationSet(const utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr<SyncPoint> sync_point,
-                       const bool persist_on_flush, CephContext *cct, Context *on_finish);
+  WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter,
+                       std::shared_ptr<SyncPoint> sync_point,
+                       const bool persist_on_flush, CephContext *cct,
+                       Context *on_finish);
    ~WriteLogOperationSet();
    WriteLogOperationSet(const WriteLogOperationSet&) = delete;
    WriteLogOperationSet &operator=(const WriteLogOperationSet&) = delete;
@@ -186,10 +194,10 @@ public:
    using GenericWriteLogOperation::on_write_persist;
    std::shared_ptr<DiscardLogEntry> log_entry;
    DiscardLogOperation(std::shared_ptr<SyncPoint> sync_point,
-                      const uint64_t image_offset_bytes,
-                      const uint64_t write_bytes,
+                      uint64_t image_offset_bytes,
+                      uint64_t write_bytes,
                        uint32_t discard_granularity_bytes,
-                      const utime_t dispatch_time,
+                      utime_t dispatch_time,
                        PerfCounters *perfcounter,
                        CephContext *cct);
    ~DiscardLogOperation() override;
@@ -208,28 +216,6 @@ public:
                                    const DiscardLogOperation &op);
  };
  
-/*
- * Log operation for a write-same request, layered on WriteLogOperation.
- * Compared with the WriteLogOperation constructor it takes one extra
- * argument, data_len (presumably the length of the repeated pattern --
- * TODO confirm against the implementation). Instances are non-copyable.
- */
-class WriteSameLogOperation : public WriteLogOperation {
-public:
-  /* Re-export the base-class members used directly by this class */
-  using GenericWriteLogOperation::m_lock;
-  using GenericWriteLogOperation::sync_point;
-  using GenericWriteLogOperation::on_write_append;
-  using GenericWriteLogOperation::on_write_persist;
-  using WriteLogOperation::log_entry;
-  using WriteLogOperation::bl;
-  using WriteLogOperation::buffer_alloc;
-  WriteSameLogOperation(WriteLogOperationSet &set,
-                        const uint64_t image_offset_bytes,
-                        const uint64_t write_bytes,
-                        const uint32_t data_len,
-                        CephContext *cct);
-  ~WriteSameLogOperation();
-  WriteSameLogOperation(const WriteSameLogOperation&) = delete;
-  WriteSameLogOperation &operator=(const WriteSameLogOperation&) = delete;
-  /* Writes a textual description of this operation to os */
-  std::ostream &format(std::ostream &os) const;
-  friend std::ostream &operator<<(std::ostream &os,
-                                  const WriteSameLogOperation &op);
-};
-
  } // namespace pwl
  } // namespace cache
  } // namespace librbd
diff --git a/src/librbd/cache/pwl/ReplicatedWriteLog.cc b/src/librbd/cache/pwl/ReplicatedWriteLog.cc

deleted file mode 100644 (file)

index 200746f..0000000
--- a/src/librbd/cache/pwl/ReplicatedWriteLog.cc
+++ /dev/null
@@ -1,898 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "ReplicatedWriteLog.h"
-#include "include/buffer.h"
-#include "include/Context.h"
-#include "include/ceph_assert.h"
-#include "common/deleter.h"
-#include "common/dout.h"
-#include "common/environment.h"
-#include "common/errno.h"
-#include "common/WorkQueue.h"
-#include "common/Timer.h"
-#include "common/perf_counters.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/asio/ContextWQ.h"
-#include "librbd/cache/pwl/ImageCacheState.h"
-#include "librbd/cache/pwl/LogEntry.h"
-#include "librbd/plugin/Api.h"
-#include <map>
-#include <vector>
-
-#undef dout_subsys
-#define dout_subsys ceph_subsys_rbd_pwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::pwl::ReplicatedWriteLog: " << this << " " \
-                             <<  __func__ << ": "
-
-namespace librbd {
-namespace cache {
-namespace pwl {
-
-using namespace librbd::cache::pwl;
-
-const unsigned long int OPS_APPENDED_TOGETHER = MAX_ALLOC_PER_TRANSACTION;
-
-/*
- * Forwards all arguments to AbstractWriteLog and records the pmemobj
- * layout name (POBJ_LAYOUT_NAME(rbd_pwl)) that initialize_pool() passes
- * to pmemobj_create()/pmemobj_open().
- */
-template <typename I>
-ReplicatedWriteLog<I>::ReplicatedWriteLog(
-    I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
-    ImageWritebackInterface& image_writeback,
-    plugin::Api<I>& plugin_api)
-: AbstractWriteLog<I>(image_ctx, cache_state, image_writeback, plugin_api),
-  m_pwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_pwl))
-{ 
-}
-
-/*
- * Only clears the pool handle pointer. The pool is closed by
- * remove_pool_file() (pmemobj_close), not by the destructor.
- */
-template <typename I>
-ReplicatedWriteLog<I>::~ReplicatedWriteLog() {
-  m_log_pool = nullptr;
-}
-
-/*
- * Allocate the (already reserved) write log entries for a set of operations.
- *
- * Each operation takes the ring slot at m_first_free_entry, which is then
- * advanced modulo m_total_log_entries. The operation's log entry is pointed
- * at the matching pmem slot, marked valid in its RAM copy, and appended to
- * m_log_entries.
- *
- * Locking:
- * The caller must hold m_log_append_lock (asserted below).
- * Acquires m_lock for the duration of the allocation loop.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::alloc_op_log_entries(GenericLogOperations &ops)
-{
-  TOID(struct WriteLogPoolRoot) pool_root;
-  pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
-  /* Base address of the entry ring inside the pmem pool */
-  struct WriteLogPmemEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries);
-  
-  ceph_assert(ceph_mutex_is_locked_by_me(this->m_log_append_lock));
-  
-  /* Allocate the (already reserved) log entries */
-  std::lock_guard locker(m_lock);
-  
-  for (auto &operation : ops) {
-    uint32_t entry_index = this->m_first_free_entry;
-    /* Advance the free pointer, wrapping at the end of the ring */
-    this->m_first_free_entry = (this->m_first_free_entry + 1) % this->m_total_log_entries;
-    auto &log_entry = operation->get_log_entry();
-    log_entry->log_entry_index = entry_index;
-    log_entry->ram_entry.entry_index = entry_index;
-    log_entry->pmem_entry = &pmem_log_entries[entry_index];
-    log_entry->ram_entry.entry_valid = 1;
-    m_log_entries.push_back(log_entry);
-    ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl;
-  } 
-}
-
-/*
- * Write and persist the (already allocated) write log entries and
- * data buffer allocations for a set of ops. The data buffer for each
- * of these must already have been persisted to its reserved area.
- *
- * The RAM copy of every entry is copied into its pmem slot and flushed in
- * contiguous runs (a run ends where the ring wraps). After a single drain,
- * one pmemobj transaction advances the persistent first_free_entry and
- * publishes the buffer allocation of every op that reports
- * reserved_allocated().
- *
- * The caller must hold m_log_append_lock (asserted below).
- *
- * Returns 0 on success. The abort path asserts, so -EIO is only returned
- * if that assertion does not terminate the process.
- */
-template <typename I>
-int ReplicatedWriteLog<I>::append_op_log_entries(GenericLogOperations &ops)
-{
-  CephContext *cct = m_image_ctx.cct;
-  GenericLogOperationsVector entries_to_flush;
-  TOID(struct WriteLogPoolRoot) pool_root;
-  pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
-  int ret = 0;
-
-  ceph_assert(ceph_mutex_is_locked_by_me(this->m_log_append_lock));
-
-  if (ops.empty()) {
-    return 0;
-  }
-  entries_to_flush.reserve(OPS_APPENDED_TOGETHER);
-
-  /* Write log entries to ring and persist */
-  utime_t now = ceph_clock_now();
-  for (auto &operation : ops) {
-    if (!entries_to_flush.empty()) {
-      /* Flush these and reset the list if the current entry wraps to the
-       * tail of the ring */
-      if (entries_to_flush.back()->get_log_entry()->log_entry_index >
-          operation->get_log_entry()->log_entry_index) {
-        ldout(m_image_ctx.cct, 20) << "entries to flush wrap around the end of the ring at "
-                                   << "operation=[" << *operation << "]" << dendl;
-        flush_op_log_entries(entries_to_flush);
-        entries_to_flush.clear();
-        now = ceph_clock_now();
-      }
-    }
-    ldout(m_image_ctx.cct, 20) << "Copying entry for operation at index="
-                               << operation->get_log_entry()->log_entry_index << " "
-                               << "from " << &operation->get_log_entry()->ram_entry << " "
-                               << "to " << operation->get_log_entry()->pmem_entry << " "
-                               << "operation=[" << *operation << "]" << dendl;
-    /* Decimal 5: the former "05" was an octal literal with the same value */
-    ldout(m_image_ctx.cct, 5) << "APPENDING: index="
-                              << operation->get_log_entry()->log_entry_index << " "
-                              << "operation=[" << *operation << "]" << dendl;
-    operation->log_append_time = now;
-    *operation->get_log_entry()->pmem_entry = operation->get_log_entry()->ram_entry;
-    ldout(m_image_ctx.cct, 20) << "APPENDING: index="
-                               << operation->get_log_entry()->log_entry_index << " "
-                               << "pmem_entry=[" << *operation->get_log_entry()->pmem_entry
-                               << "]" << dendl;
-    entries_to_flush.push_back(operation);
-  }
-  flush_op_log_entries(entries_to_flush);
-
-  /* Drain once for all */
-  pmemobj_drain(m_log_pool);
-
-  /*
-   * Atomically advance the log head pointer and publish the
-   * allocations for all the data buffers they refer to.
-   */
-  utime_t tx_start = ceph_clock_now();
-  TX_BEGIN(m_log_pool) {
-    D_RW(pool_root)->first_free_entry = this->m_first_free_entry;
-    for (auto &operation : ops) {
-      if (operation->reserved_allocated()) {
-        /* Downcast through the shared_ptr API rather than reinterpreting
-         * the shared_ptr object itself as a different specialization. */
-        auto write_op = std::static_pointer_cast<WriteLogOperation>(operation);
-        pmemobj_tx_publish(&write_op->buffer_alloc->buffer_alloc_action, 1);
-      } else {
-        ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
-      }
-    }
-  } TX_ONCOMMIT {
-  } TX_ONABORT {
-    lderr(cct) << "failed to commit " << ops.size()
-               << " log entries (" << this->m_log_pool_name << ")" << dendl;
-    ceph_assert(false);
-    ret = -EIO;
-  } TX_FINALLY {
-  } TX_END;
-
-  utime_t tx_end = ceph_clock_now();
-  m_perfcounter->tinc(l_librbd_pwl_append_tx_t, tx_end - tx_start);
-  m_perfcounter->hinc(
-    l_librbd_pwl_append_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), ops.size());
-  for (auto &operation : ops) {
-    operation->log_append_comp_time = tx_end;
-  }
-
-  return ret;
-}
-
-/*
- * Issue a pmem flush covering the log entries of all ops in one call.
- * The entries are expected to occupy one contiguous, ascending range of
- * persistent memory; with more than one op this is sanity-checked by
- * comparing the first and last entry addresses.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::flush_op_log_entries(GenericLogOperationsVector &ops)
-{
-  if (ops.empty()) {
-    return;
-  }
-
-  auto *first_pmem_entry = ops.front()->get_log_entry()->pmem_entry;
-  if (ops.size() > 1) {
-    ceph_assert(first_pmem_entry < ops.back()->get_log_entry()->pmem_entry);
-  }
-
-  /* One fixed-size pmem entry per op, starting at the first entry */
-  const size_t flush_bytes = ops.size() * sizeof(*first_pmem_entry);
-  ldout(m_image_ctx.cct, 20) << "entry count=" << ops.size() << " "
-                             << "start address="
-                             << first_pmem_entry << " "
-                             << "bytes="
-                             << flush_bytes
-                             << dendl;
-  pmemobj_flush(m_log_pool, first_pmem_entry, flush_bytes);
-}
-
-/*
- * Close the pmem pool (if open) and, when the cache state is clean,
- * delete the pool file. On successful removal the cache state is updated
- * to clean/empty/not present. A dirty cache keeps its pool file.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::remove_pool_file() {
-  if (m_log_pool) {
-    ldout(m_image_ctx.cct, 6) << "closing pmem pool" << dendl;
-    pmemobj_close(m_log_pool);
-    /* The handle is invalid after close; do not leave it dangling */
-    m_log_pool = nullptr;
-  }
-  if (m_cache_state->clean) {
-    ldout(m_image_ctx.cct, 5) << "Removing empty pool file: " << this->m_log_pool_name << dendl;
-    if (remove(this->m_log_pool_name.c_str()) != 0) {
-      /* remove() is a libc call: its failure reason is in errno, not in
-       * pmemobj_errormsg(). Capture it before logging can overwrite it. */
-      int remove_errno = errno;
-      lderr(m_image_ctx.cct) << "failed to remove empty pool \"" << this->m_log_pool_name << "\": "
-                             << cpp_strerror(remove_errno) << dendl;
-    } else {
-      m_cache_state->clean = true;
-      m_cache_state->empty = true;
-      m_cache_state->present = false;
-    }
-  } else {
-    ldout(m_image_ctx.cct, 5) << "Not removing pool file: " << this->m_log_pool_name << dendl;
-  }
-}
-
-/*
- * Create the pmem pool file if it does not exist, otherwise open and
- * validate the existing one, then set up the in-memory ring state.
- *
- * New pool: sizes the entry ring from the configured pool size, writes the
- * pool root in one transaction and starts with an empty ring.
- * Existing pool: checks layout version and block size, restores the ring
- * pointers, derives the free entry count and reloads the existing entries.
- *
- * on_finish is completed with a negative error code on every failure path
- * visible here; it is not completed by this function on success.
- * The caller must hold m_lock (asserted below).
- */
-template <typename I>
-void ReplicatedWriteLog<I>::initialize_pool(Context *on_finish, pwl::DeferredContexts &later) {
-  CephContext *cct = m_image_ctx.cct;
-  TOID(struct WriteLogPoolRoot) pool_root;
-  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
-  if (access(this->m_log_pool_name.c_str(), F_OK) != 0) {
-    if ((m_log_pool =
-         pmemobj_create(this->m_log_pool_name.c_str(),
-                        this->m_pwl_pool_layout_name,
-                        this->m_log_pool_config_size,
-                        (S_IWUSR | S_IRUSR))) == nullptr) {
-      /* Capture errno first: logging below may overwrite it */
-      int r = -errno;
-      lderr(cct) << "failed to create pool (" << this->m_log_pool_name << ")"
-                 << pmemobj_errormsg() << dendl;
-      m_cache_state->present = false;
-      m_cache_state->clean = true;
-      m_cache_state->empty = true;
-      /* TODO: filter/replace errnos that are meaningless to the caller */
-      on_finish->complete(r);
-      return;
-    }
-    m_cache_state->present = true;
-    m_cache_state->clean = true;
-    m_cache_state->empty = true;
-    pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
-
-    /* new pool, calculate and store metadata */
-    size_t effective_pool_size = (size_t)(this->m_log_pool_config_size * USABLE_SIZE);
-    size_t small_write_size = MIN_WRITE_ALLOC_SIZE + BLOCK_ALLOC_OVERHEAD_BYTES + sizeof(struct WriteLogPmemEntry);
-    uint64_t num_small_writes = (uint64_t)(effective_pool_size / small_write_size);
-    if (num_small_writes > MAX_LOG_ENTRIES) {
-      num_small_writes = MAX_LOG_ENTRIES;
-    }
-    if (num_small_writes <= 2) {
-      lderr(cct) << "num_small_writes needs to > 2" << dendl;
-      on_finish->complete(-EINVAL);
-      return;
-    }
-    this->m_log_pool_actual_size = this->m_log_pool_config_size;
-    this->m_bytes_allocated_cap = effective_pool_size;
-    /* Log ring empty */
-    m_first_free_entry = 0;
-    m_first_valid_entry = 0;
-    /* Record an abort here and act on it after TX_END, so the transaction
-     * block is always left through TX_END instead of an early return. */
-    bool tx_aborted = false;
-    int tx_error = 0;
-    TX_BEGIN(m_log_pool) {
-      TX_ADD(pool_root);
-      D_RW(pool_root)->header.layout_version = RWL_POOL_VERSION;
-      D_RW(pool_root)->log_entries =
-        TX_ZALLOC(struct WriteLogPmemEntry,
-                  sizeof(struct WriteLogPmemEntry) * num_small_writes);
-      D_RW(pool_root)->pool_size = this->m_log_pool_actual_size;
-      D_RW(pool_root)->flushed_sync_gen = this->m_flushed_sync_gen;
-      D_RW(pool_root)->block_size = MIN_WRITE_ALLOC_SIZE;
-      D_RW(pool_root)->num_log_entries = num_small_writes;
-      D_RW(pool_root)->first_free_entry = m_first_free_entry;
-      D_RW(pool_root)->first_valid_entry = m_first_valid_entry;
-    } TX_ONCOMMIT {
-      this->m_total_log_entries = D_RO(pool_root)->num_log_entries;
-      this->m_free_log_entries = D_RO(pool_root)->num_log_entries - 1; // leave one free
-    } TX_ONABORT {
-      this->m_total_log_entries = 0;
-      this->m_free_log_entries = 0;
-      lderr(cct) << "failed to initialize pool (" << this->m_log_pool_name << ")" << dendl;
-      tx_aborted = true;
-      tx_error = -pmemobj_tx_errno();
-    } TX_FINALLY {
-    } TX_END;
-    if (tx_aborted) {
-      on_finish->complete(tx_error);
-      return;
-    }
-  } else {
-    m_cache_state->present = true;
-    /* Open existing pool */
-    if ((m_log_pool =
-         pmemobj_open(this->m_log_pool_name.c_str(),
-                      this->m_pwl_pool_layout_name)) == nullptr) {
-      /* Capture errno first: logging below may overwrite it */
-      int r = -errno;
-      lderr(cct) << "failed to open pool (" << this->m_log_pool_name << "): "
-                 << pmemobj_errormsg() << dendl;
-      on_finish->complete(r);
-      return;
-    }
-    pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
-    if (D_RO(pool_root)->header.layout_version != RWL_POOL_VERSION) {
-      // TODO: will handle upgrading version in the future
-      lderr(cct) << "Pool layout version is " << D_RO(pool_root)->header.layout_version
-                 << " expected " << RWL_POOL_VERSION << dendl;
-      on_finish->complete(-EINVAL);
-      return;
-    }
-    if (D_RO(pool_root)->block_size != MIN_WRITE_ALLOC_SIZE) {
-      lderr(cct) << "Pool block size is " << D_RO(pool_root)->block_size
-                 << " expected " << MIN_WRITE_ALLOC_SIZE << dendl;
-      on_finish->complete(-EINVAL);
-      return;
-    }
-    this->m_log_pool_actual_size = D_RO(pool_root)->pool_size;
-    this->m_flushed_sync_gen = D_RO(pool_root)->flushed_sync_gen;
-    this->m_total_log_entries = D_RO(pool_root)->num_log_entries;
-    m_first_free_entry = D_RO(pool_root)->first_free_entry;
-    m_first_valid_entry = D_RO(pool_root)->first_valid_entry;
-    if (m_first_free_entry < m_first_valid_entry) {
-      /* Valid entries wrap around the end of the ring, so first_free is lower
-       * than first_valid.  If first_valid was == first_free+1, the entry at
-       * first_free would be empty. The last entry is never used, so in
-       * that case there would be zero free log entries.
-       *
-       * The free slots are exactly the gap [first_free, first_valid), minus
-       * the one slot that is always kept unused. */
-      this->m_free_log_entries = (m_first_valid_entry - m_first_free_entry) - 1;
-    } else {
-      /* first_valid is <= first_free. If they are == we have zero valid log
-       * entries, and n-1 free log entries */
-      this->m_free_log_entries = this->m_total_log_entries - (m_first_free_entry - m_first_valid_entry) -1;
-    }
-    size_t effective_pool_size = (size_t)(this->m_log_pool_config_size * USABLE_SIZE);
-    this->m_bytes_allocated_cap = effective_pool_size;
-    load_existing_entries(later);
-    m_cache_state->clean = this->m_dirty_log_entries.empty();
-    m_cache_state->empty = m_log_entries.empty();
-  }
-}
-
-/*
- * Loads the log entries from an existing log.
- *
- * Creates the in-memory structures to represent the state of the
- * re-opened log.
- *
- * Finds the last appended sync point, and any sync points referred to
- * in log entries, but missing from the log. These missing sync points
- * are created and scheduled for append. Some rudimentary consistency
- * checking is done.
- *
- * Rebuilds the m_blocks_to_log_entries map, to make log entries
- * readable.
- *
- * Places all writes on the dirty entries list, which causes them all
- * to be flushed.
- *
- * NOTE(review): most of the work described above happens inside
- * update_entries() and update_sync_points(), which are not visible here;
- * this function itself only walks the ring and links each entry to its
- * pmem slot.
- */
-
-template <typename I>
-void ReplicatedWriteLog<I>::load_existing_entries(DeferredContexts &later) {
-  TOID(struct WriteLogPoolRoot) pool_root;
-  pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
-  struct WriteLogPmemEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries);
-  uint64_t entry_index = m_first_valid_entry;
-  /* The map below allows us to find sync point log entries by sync
-   * gen number, which is necessary so write entries can be linked to
-   * their sync points. */
-  std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> sync_point_entries;
-  /* The map below tracks sync points referred to in writes but not
-   * appearing in the sync_point_entries map.  We'll use this to
-   * determine which sync points are missing and need to be
-   * created. */
-  std::map<uint64_t, bool> missing_sync_points;
-
-  /*
-   * Read the existing log entries. Construct an in-memory log entry
-   * object of the appropriate type for each. Add these to the global
-   * log entries list.
-   *
-   * Write entries will not link to their sync points yet. We'll do
-   * that in the next pass. Here we'll accumulate a map of sync point
-   * gen numbers that are referred to in writes but do not appear in
-   * the log.
-   */
-  while (entry_index != m_first_free_entry) {
-    WriteLogPmemEntry *pmem_entry = &pmem_log_entries[entry_index];
-    std::shared_ptr<GenericLogEntry> log_entry = nullptr;
-    /* Each persisted entry records its own ring index */
-    ceph_assert(pmem_entry->entry_index == entry_index);
-
-    /* Presumably assigns log_entry: it is dereferenced right after this
-     * call -- TODO confirm update_entries() never leaves it null */
-    this->update_entries(log_entry, pmem_entry, missing_sync_points,
-        sync_point_entries, entry_index);
-
-    log_entry->ram_entry = *pmem_entry;
-    log_entry->pmem_entry = pmem_entry;
-    log_entry->log_entry_index = entry_index;
-    log_entry->completed = true;
-
-    m_log_entries.push_back(log_entry);
-    
-    /* Next slot, wrapping at the end of the ring */
-    entry_index = (entry_index + 1) % this->m_total_log_entries;
-  }
-
-  this->update_sync_points(missing_sync_points, sync_point_entries, later);
-}
-
-/*
- * Points ws_entry's pmem_buffer at the data referenced by
- * pmem_entry->write_data. Despite the name, no data is copied here.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::write_data_to_buffer(std::shared_ptr<WriteLogEntry> ws_entry,
-    WriteLogPmemEntry *pmem_entry) {
-  ws_entry->pmem_buffer = D_RW(pmem_entry->write_data);
-} 
-
-/**
- * Retire up to frees_per_tx of the oldest log entries that are eligible
- * to be retired. Returns true if anything was retired.
- *
- * Works in three phases:
- *  1. Under m_entry_reader_lock (write) and m_lock, pop retirable entries
- *     off the front of m_log_entries and drop them from the block map.
- *  2. Under m_log_append_lock, one pmemobj transaction advances the
- *     persistent first_valid_entry (and flushed_sync_gen if it grew) and
- *     frees the data buffers of the retired write entries.
- *  3. Under m_lock, update the runtime ring pointer and the byte/entry
- *     accounting, then wake up the worker.
- *
- * m_log_retire_lock is held for the whole call.
- */
-template <typename I>
-bool ReplicatedWriteLog<I>::retire_entries(const unsigned long int frees_per_tx) {
-  CephContext *cct = m_image_ctx.cct;
-  GenericLogEntriesVector retiring_entries;
-  uint32_t initial_first_valid_entry;
-  uint32_t first_valid_entry;
-
-  std::lock_guard retire_locker(this->m_log_retire_lock);
-  ldout(cct, 20) << "Look for entries to retire" << dendl;
-  {
-    /* Entry readers can't be added while we hold m_entry_reader_lock */
-    RWLock::WLocker entry_reader_locker(this->m_entry_reader_lock);
-    std::lock_guard locker(m_lock);
-    initial_first_valid_entry = this->m_first_valid_entry;
-    first_valid_entry = this->m_first_valid_entry;
-    while (!m_log_entries.empty() &&
-           retiring_entries.size() < frees_per_tx) {
-      /* Only look at front() once the list is known to be non-empty;
-       * calling it on an empty container is undefined behaviour. */
-      auto entry = m_log_entries.front();
-      if (!this->can_retire_entry(entry)) {
-        break;
-      }
-      if (entry->log_entry_index != first_valid_entry) {
-        lderr(cct) << "Retiring entry index (" << entry->log_entry_index
-                   << ") and first valid log entry index (" << first_valid_entry
-                   << ") must be ==." << dendl;
-      }
-      ceph_assert(entry->log_entry_index == first_valid_entry);
-      first_valid_entry = (first_valid_entry + 1) % this->m_total_log_entries;
-      m_log_entries.pop_front();
-      retiring_entries.push_back(entry);
-      /* Remove entry from map so there will be no more readers */
-      if ((entry->write_bytes() > 0) || (entry->bytes_dirty() > 0)) {
-        auto gen_write_entry = static_pointer_cast<GenericWriteLogEntry>(entry);
-        if (gen_write_entry) {
-          this->m_blocks_to_log_entries.remove_log_entry(gen_write_entry);
-        }
-      }
-    }
-  }
-
-  if (retiring_entries.size()) {
-    ldout(cct, 20) << "Retiring " << retiring_entries.size() << " entries" << dendl;
-    TOID(struct WriteLogPoolRoot) pool_root;
-    pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
-
-    utime_t tx_start;
-    utime_t tx_end;
-    /* Advance first valid entry and release buffers */
-    {
-      uint64_t flushed_sync_gen;
-      std::lock_guard append_locker(this->m_log_append_lock);
-      {
-        /* Snapshot under m_lock; the transaction below runs without it */
-        std::lock_guard locker(m_lock);
-        flushed_sync_gen = this->m_flushed_sync_gen;
-      }
-
-      tx_start = ceph_clock_now();
-      TX_BEGIN(m_log_pool) {
-        if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
-          ldout(m_image_ctx.cct, 20) << "flushed_sync_gen in log updated from "
-                                     << D_RO(pool_root)->flushed_sync_gen << " to "
-                                     << flushed_sync_gen << dendl;
-          D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
-        }
-        D_RW(pool_root)->first_valid_entry = first_valid_entry;
-        for (auto &entry: retiring_entries) {
-          if (entry->write_bytes()) {
-            ldout(cct, 20) << "Freeing " << entry->ram_entry.write_data.oid.pool_uuid_lo
-                           << "." << entry->ram_entry.write_data.oid.off << dendl;
-            TX_FREE(entry->ram_entry.write_data);
-          } else {
-            ldout(cct, 20) << "Retiring non-write: " << *entry << dendl;
-          }
-        }
-      } TX_ONCOMMIT {
-      } TX_ONABORT {
-        lderr(cct) << "failed to commit free of " << retiring_entries.size()
-                   << " log entries (" << this->m_log_pool_name << ")" << dendl;
-        ceph_assert(false);
-      } TX_FINALLY {
-      } TX_END;
-      tx_end = ceph_clock_now();
-    }
-    m_perfcounter->tinc(l_librbd_pwl_retire_tx_t, tx_end - tx_start);
-    m_perfcounter->hinc(l_librbd_pwl_retire_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(),
-        retiring_entries.size());
-
-    /* Update runtime copy of first_valid, and free entries counts */
-    {
-      std::lock_guard locker(m_lock);
-
-      /* m_log_retire_lock is held throughout, so nobody else may have
-       * moved first_valid in the meantime */
-      ceph_assert(this->m_first_valid_entry == initial_first_valid_entry);
-      this->m_first_valid_entry = first_valid_entry;
-      this->m_free_log_entries += retiring_entries.size();
-      for (auto &entry: retiring_entries) {
-        if (entry->write_bytes()) {
-          ceph_assert(this->m_bytes_cached >= entry->write_bytes());
-          this->m_bytes_cached -= entry->write_bytes();
-          /* Allocation accounting is rounded up to the minimum size */
-          uint64_t entry_allocation_size = entry->write_bytes();
-          if (entry_allocation_size < MIN_WRITE_ALLOC_SIZE) {
-            entry_allocation_size = MIN_WRITE_ALLOC_SIZE;
-          }
-          ceph_assert(this->m_bytes_allocated >= entry_allocation_size);
-          this->m_bytes_allocated -= entry_allocation_size;
-        }
-      }
-      this->m_alloc_failed_since_retire = false;
-      this->wake_up();
-    }
-  } else {
-    ldout(cct, 20) << "Nothing to retire" << dendl;
-    return false;
-  }
-  return true;
-}
-
-/*
- * Build the context that flushes one log entry.
- *
- * When the cache is invalidating, the context from construct_flush_entry()
- * is returned as is. Otherwise it is wrapped: completing the returned
- * context queues a work item on the image's op_work_queue, and that item
- * calls log_entry->writeback() with the inner context.
- */
-template <typename I>
-Context* ReplicatedWriteLog<I>::construct_flush_entry_ctx(
-    std::shared_ptr<GenericLogEntry> log_entry) {
-  bool invalidating = this->m_invalidating; // snapshot so we behave consistently
-  Context *ctx = this->construct_flush_entry(log_entry, invalidating);
-  
-  if (invalidating) {
-    return ctx;
-  }
-  return new LambdaContext(
-    [this, log_entry, ctx](int r) {
-      /* The result code r is ignored by both lambdas */
-      m_image_ctx.op_work_queue->queue(new LambdaContext(
-        [this, log_entry, ctx](int r) {
-          ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
-                                     << " " << *log_entry << dendl;
-          log_entry->writeback(this->m_image_writeback, ctx);
-        }), 0);
-    }); 
-}
-
-/* Upper bound on the number of ops taken from m_ops_to_flush per batch */
-const unsigned long int ops_flushed_together = 4;
-/*
- * Performs the pmem buffer flush on all scheduled ops, then schedules
- * the log event append operation for all of them.
- *
- * Ops are taken from m_ops_to_flush in batches of at most
- * ops_flushed_together while holding m_lock. If more ops remain after a
- * batch is taken, another flusher is enlisted in addition to this loop
- * continuing. Once the queue is drained, append_scheduled_ops() is called.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::flush_then_append_scheduled_ops(void)
-{
-  GenericLogOperations ops;
-  bool ops_remain = false;
-  ldout(m_image_ctx.cct, 20) << dendl;
-  do {
-    {
-      ops.clear();
-      std::lock_guard locker(m_lock);
-      if (m_ops_to_flush.size()) {
-        auto last_in_batch = m_ops_to_flush.begin();
-        unsigned int ops_to_flush = m_ops_to_flush.size();
-        if (ops_to_flush > ops_flushed_together) {
-          ops_to_flush = ops_flushed_together;
-        }
-        ldout(m_image_ctx.cct, 20) << "should flush " << ops_to_flush << dendl;
-        /* Move the first ops_to_flush ops out of the shared queue */
-        std::advance(last_in_batch, ops_to_flush);
-        ops.splice(ops.end(), m_ops_to_flush, m_ops_to_flush.begin(), last_in_batch);
-        ops_remain = !m_ops_to_flush.empty();
-        ldout(m_image_ctx.cct, 20) << "flushing " << ops.size() << ", "
-                                   << m_ops_to_flush.size() << " remain" << dendl;
-      } else {
-        ops_remain = false;
-      }
-    } 
-    if (ops_remain) {
-      enlist_op_flusher();
-    }
-
-    /* Ops subsequently scheduled for flush may finish before these,
-     * which is fine. We're unconcerned with completion order until we
-     * get to the log message append step. */
-    if (ops.size()) {
-      flush_pmem_buffer(ops);
-      schedule_append_ops(ops);
-    } 
-  } while (ops_remain);
-  append_scheduled_ops();
-}
-
-/*
- * Performs the log event append operation for all of the scheduled
- * events.
- *
- * Each pass obtains a batch through append_scheduled() (presumably it
- * fills ops and sets ops_remain -- defined outside this view), allocates
- * and appends the batch's log entries under m_log_append_lock, and then
- * completes the batch with the append result after that lock is released.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::append_scheduled_ops(void) {
-  GenericLogOperations ops;
-  int append_result = 0;
-  bool ops_remain = false;
-  bool appending = false; /* true if we set m_appending */
-  ldout(m_image_ctx.cct, 20) << dendl;
-  do {
-    ops.clear();
-    this->append_scheduled(ops, ops_remain, appending, true);
-    
-    if (ops.size()) {
-      /* Lock scope ends with this block, before completion runs */
-      std::lock_guard locker(this->m_log_append_lock);
-      alloc_op_log_entries(ops);
-      append_result = append_op_log_entries(ops);
-    } 
-    
-    int num_ops = ops.size();
-    if (num_ops) {
-      /* New entries may be flushable. Completion will wake up flusher. */
-      this->complete_op_log_entries(std::move(ops), append_result);
-    } 
-  } while (ops_remain);
-}
-
-/*
- * Queue a work item on m_work_queue that runs
- * flush_then_append_scheduled_ops(). The pending work is counted in
- * m_async_flush_ops and registered with m_async_op_tracker before
- * queueing; both are released when the work item has run.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::enlist_op_flusher()
-{
-  this->m_async_flush_ops++;
-  this->m_async_op_tracker.start_op();
-  Context *flush_ctx = new LambdaContext([this](int r) {
-      flush_then_append_scheduled_ops();
-      this->m_async_flush_ops--;
-      this->m_async_op_tracker.finish_op();
-    });
-  this->m_work_queue.queue(flush_ctx);
-}
-
-/*
- * Hand a set of ops over for buffer flush and log append. With
- * do_early_flush the buffers are flushed synchronously on the calling
- * thread before the append is scheduled; otherwise both steps are deferred.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::setup_schedule_append(
-    pwl::GenericLogOperationsVector &ops, bool do_early_flush) {
-  if (!do_early_flush) {
-    /* This is probably not still the caller's thread, so do the payload
-     * flushing/replicating later. */
-    schedule_flush_and_append(ops);
-    return;
-  }
-  /* This caller is waiting for persist, so we'll use their thread to
-   * expedite it */
-  flush_pmem_buffer(ops);
-  this->schedule_append(ops);
-}
-
-/*
- * Takes custody of ops. They'll all get their log entries appended,
- * and have their on_write_persist contexts completed once they and
- * all prior log entries are persisted everywhere.
- *
- * The ops are moved onto m_ops_to_append under m_lock, which leaves the
- * caller's list empty. An appender is enlisted only if the queue was empty
- * and no append was in progress. After the lock is released, appending()
- * is called on every op that was handed over.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::schedule_append_ops(GenericLogOperations &ops)
-{
-  bool need_finisher;
-  GenericLogOperationsVector appending;
-  
-  /* Keep references to the ops: the splice below empties `ops` */
-  std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending));
-  {
-    std::lock_guard locker(m_lock);
-  
-    need_finisher = this->m_ops_to_append.empty() && !this->m_appending;
-    this->m_ops_to_append.splice(this->m_ops_to_append.end(), ops);
-  } 
-  
-  if (need_finisher) {
-    this->enlist_op_appender();
-  }
-  
-  for (auto &op : appending) {
-    op->appending();
-  }
-}
-
-/*
- * Takes custody of ops. They'll all get their pmem blocks flushed,
- * then get their log entries appended.
- *
- * The ops are copied into a list and spliced onto m_ops_to_flush under
- * m_lock. A flusher is enlisted only when the queue was empty beforehand.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::schedule_flush_and_append(GenericLogOperationsVector &ops)
-{
-  GenericLogOperations to_flush(ops.begin(), ops.end());
-  bool need_finisher;
-  ldout(m_image_ctx.cct, 20) << dendl;
-  {
-    std::lock_guard locker(m_lock);
-    
-    need_finisher = m_ops_to_flush.empty();
-    m_ops_to_flush.splice(m_ops_to_flush.end(), to_flush);
-  } 
-  
-  if (need_finisher) {
-    enlist_op_flusher();
-  } 
-}
-
-template <typename I>
-void ReplicatedWriteLog<I>::process_work() {
-  CephContext *cct = m_image_ctx.cct;
-  int max_iterations = 4;
-  bool wake_up_requested = false;
-  uint64_t aggressive_high_water_bytes = this->m_bytes_allocated_cap * AGGRESSIVE_RETIRE_HIGH_WATER;
-  uint64_t high_water_bytes = this->m_bytes_allocated_cap * RETIRE_HIGH_WATER;
-  uint64_t low_water_bytes = this->m_bytes_allocated_cap * RETIRE_LOW_WATER;
-  uint64_t aggressive_high_water_entries = this->m_total_log_entries * AGGRESSIVE_RETIRE_HIGH_WATER;
-  uint64_t high_water_entries = this->m_total_log_entries * RETIRE_HIGH_WATER;
-  uint64_t low_water_entries = this->m_total_log_entries * RETIRE_LOW_WATER;
-
-  ldout(cct, 20) << dendl;
-
-  do {
-    {
-      std::lock_guard locker(m_lock);
-      this->m_wake_up_requested = false;
-    }
-    if (this->m_alloc_failed_since_retire || this->m_invalidating ||
-        this->m_bytes_allocated > high_water_bytes ||
-        (m_log_entries.size() > high_water_entries)) {
-      int retired = 0;
-      utime_t started = ceph_clock_now();
-      ldout(m_image_ctx.cct, 10) << "alloc_fail=" << this->m_alloc_failed_since_retire
-                                 << ", allocated > high_water="
-                                 << (this->m_bytes_allocated > high_water_bytes)
-                                 << ", allocated_entries > high_water="
-                                 << (m_log_entries.size() > high_water_entries)
-                                 << dendl;
-      while (this->m_alloc_failed_since_retire || this->m_invalidating ||
-            (this->m_bytes_allocated > high_water_bytes) ||
-            (m_log_entries.size() > high_water_entries) ||
-            (((this->m_bytes_allocated > low_water_bytes) ||
-              (m_log_entries.size() > low_water_entries)) &&
-            (utime_t(ceph_clock_now() - started).to_msec() < RETIRE_BATCH_TIME_LIMIT_MS))) {
-        if (!retire_entries((this->m_shutting_down || this->m_invalidating ||
-           (this->m_bytes_allocated > aggressive_high_water_bytes) ||
-           (m_log_entries.size() > aggressive_high_water_entries))
-            ? MAX_ALLOC_PER_TRANSACTION
-            : MAX_FREE_PER_TRANSACTION)) {
-          break;
-        }
-        retired++;
-        this->dispatch_deferred_writes();
-        this->process_writeback_dirty_entries();
-      }
-      ldout(m_image_ctx.cct, 10) << "Retired " << retired << " times" << dendl;
-    }
-    this->dispatch_deferred_writes();
-    this->process_writeback_dirty_entries();
-
-    {
-      std::lock_guard locker(m_lock);
-      wake_up_requested = this->m_wake_up_requested;
-    }
-  } while (wake_up_requested && --max_iterations > 0);
-
-  {
-    std::lock_guard locker(m_lock);
-    this->m_wake_up_scheduled = false;
-    /* Reschedule if it's still requested */
-    if (this->m_wake_up_requested) {
-      this->wake_up();
-    } 
-  }
-}
-
-/*
- * Flush the pmem regions for the data blocks of a set of operations
- *
- * V is expected to be GenericLogOperations<I>, or GenericLogOperationsVector<I>
- */
-template <typename I>
-template <typename V>
-void ReplicatedWriteLog<I>::flush_pmem_buffer(V& ops)
-{
-  for (auto &operation : ops) {
-    operation->flush_pmem_buf_to_cache(m_log_pool);
-  } 
-  
-  /* Drain once for all */
-  pmemobj_drain(m_log_pool);
-  
-  utime_t now = ceph_clock_now();
-  for (auto &operation : ops) {
-    if (operation->reserved_allocated()) {
-      operation->buf_persist_comp_time = now;
-    } else {
-      ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
-    } 
-  } 
-}
-
-/**
- * Update/persist the last flushed sync point in the log
- */
-template <typename I>
-void ReplicatedWriteLog<I>::persist_last_flushed_sync_gen()
-{
-  TOID(struct WriteLogPoolRoot) pool_root;
-  pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
-  uint64_t flushed_sync_gen;
-
-  std::lock_guard append_locker(this->m_log_append_lock);
-  {
-    std::lock_guard locker(m_lock);
-    flushed_sync_gen = this->m_flushed_sync_gen;
-  }
-  
-  if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
-    ldout(m_image_ctx.cct, 15) << "flushed_sync_gen in log updated from "
-                               << D_RO(pool_root)->flushed_sync_gen << " to "
-                               << flushed_sync_gen << dendl;
-    TX_BEGIN(m_log_pool) {     
-      D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
-    } TX_ONCOMMIT {
-    } TX_ONABORT {
-      lderr(m_image_ctx.cct) << "failed to commit update of flushed sync point" << dendl;
-      ceph_assert(false);
-    } TX_FINALLY {
-    } TX_END;
-  }
-}
-
-template <typename I>
-void ReplicatedWriteLog<I>::reserve_pmem(C_BlockIORequestT *req,
-                                         bool &alloc_succeeds, bool &no_space) {
-  std::vector<WriteBufferAllocation>& buffers = req->get_resources_buffers();
-  for (auto &buffer : buffers) {
-    utime_t before_reserve = ceph_clock_now();
-    buffer.buffer_oid = pmemobj_reserve(m_log_pool,
-                                        &buffer.buffer_alloc_action,
-                                        buffer.allocation_size,
-                                        0 /* Object type */);
-    buffer.allocation_lat = ceph_clock_now() - before_reserve;
-    if (TOID_IS_NULL(buffer.buffer_oid)) {
-      if (!req->has_io_waited_for_buffers()) {
-        req->set_io_waited_for_entries(true);
-      } 
-      ldout(m_image_ctx.cct, 5) << "can't allocate all data buffers: "
-                                << pmemobj_errormsg() << ". "
-                                << *req << dendl;
-      alloc_succeeds = false;   
-      no_space = true; /* Entries need to be retired */
-      break;
-    } else {
-      buffer.allocated = true;
-    } 
-    ldout(m_image_ctx.cct, 20) << "Allocated " << buffer.buffer_oid.oid.pool_uuid_lo
-                               << "." << buffer.buffer_oid.oid.off
-                               << ", size=" << buffer.allocation_size << dendl;
-  }                            
-}
-
-template <typename I>
-void ReplicatedWriteLog<I>::copy_pmem(C_BlockIORequestT *req) {
-  req->copy_pmem();
-}
-
-template <typename I>
-bool ReplicatedWriteLog<I>::alloc_resources(C_BlockIORequestT *req) {
-  bool alloc_succeeds = true;
-  uint64_t bytes_allocated = 0;
-  uint64_t bytes_cached = 0;
-  uint64_t bytes_dirtied = 0;
-  uint64_t num_lanes = 0;
-  uint64_t num_unpublished_reserves = 0;
-  uint64_t num_log_entries = 0;
-  
-  ldout(m_image_ctx.cct, 20) << dendl;
-  // Setup buffer, and get all the number of required resources
-  req->setup_buffer_resources(bytes_cached, bytes_dirtied, bytes_allocated,
-                              num_lanes, num_log_entries, num_unpublished_reserves);
-                              
-  alloc_succeeds = this->check_allocation(req, bytes_cached, bytes_dirtied, bytes_allocated,
-                              num_lanes, num_log_entries, num_unpublished_reserves,
-                              this->m_bytes_allocated_cap);
-                              
-  std::vector<WriteBufferAllocation>& buffers = req->get_resources_buffers();
-  if (!alloc_succeeds) {
-    /* On alloc failure, free any buffers we did allocate */
-    for (auto &buffer : buffers) {
-      if (buffer.allocated) {
-        pmemobj_cancel(m_log_pool, &buffer.buffer_alloc_action, 1);
-      }
-    } 
-  } 
-  
-  req->set_allocated(alloc_succeeds);
-  return alloc_succeeds;
-}
-
-} // namespace pwl
-} // namespace cache
-} // namespace librbd
-
-template class librbd::cache::pwl::ReplicatedWriteLog<librbd::ImageCtx>;
diff --git a/src/librbd/cache/pwl/ReplicatedWriteLog.h b/src/librbd/cache/pwl/ReplicatedWriteLog.h

deleted file mode 100644 (file)

index bf4b0be..0000000
--- a/src/librbd/cache/pwl/ReplicatedWriteLog.h
+++ /dev/null
@@ -1,101 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
-#define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
-
-#include <libpmemobj.h>
-#include "common/RWLock.h"
-#include "common/WorkQueue.h"
-#include "common/AsyncOpTracker.h"
-#include "librbd/cache/ImageWriteback.h"
-#include "librbd/Utils.h"
-#include "librbd/BlockGuard.h"
-#include "librbd/cache/Types.h"
-#include "librbd/cache/pwl/LogOperation.h"
-#include "librbd/cache/pwl/Request.h"
-#include "librbd/cache/pwl/LogMap.h"
-#include "AbstractWriteLog.h"
-#include <functional>
-#include <list>
-
-class Context;
-class SafeTimer;
-
-namespace librbd {
-
-struct ImageCtx;
-
-namespace cache {
-
-namespace pwl {
-
-template <typename ImageCtxT>
-class ReplicatedWriteLog : public AbstractWriteLog<ImageCtxT> {
-public:
-  ReplicatedWriteLog(
-      ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
-      ImageWritebackInterface& image_writeback,
-      plugin::Api<ImageCtxT>& plugin_api);
-  ~ReplicatedWriteLog();
-  ReplicatedWriteLog(const ReplicatedWriteLog&) = delete;
-  ReplicatedWriteLog &operator=(const ReplicatedWriteLog&) = delete;
-
-private:
-  using This = AbstractWriteLog<ImageCtxT>;
-  using C_WriteRequestT = pwl::C_WriteRequest<This>;
-  using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
-  using C_FlushRequestT = pwl::C_FlushRequest<This>;
-  using C_DiscardRequestT = pwl::C_DiscardRequest<This>;
-  using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
-  using C_CompAndWriteRequestT = pwl::C_CompAndWriteRequest<This>;
-
-  PMEMobjpool *m_log_pool = nullptr;
-  const char* m_pwl_pool_layout_name;
-
-  void remove_pool_file();
-  void load_existing_entries(pwl::DeferredContexts &later);
-  void alloc_op_log_entries(pwl::GenericLogOperations &ops);
-  int append_op_log_entries(pwl::GenericLogOperations &ops);
-  void flush_then_append_scheduled_ops(void);
-  void enlist_op_flusher();
-  void flush_op_log_entries(pwl::GenericLogOperationsVector &ops);
-  template <typename V>
-  void flush_pmem_buffer(V& ops);
-
-protected:
-  using AbstractWriteLog<ImageCtxT>::m_lock;
-  using AbstractWriteLog<ImageCtxT>::m_log_entries;
-  using AbstractWriteLog<ImageCtxT>::m_image_ctx;
-  using AbstractWriteLog<ImageCtxT>::m_perfcounter;
-  using AbstractWriteLog<ImageCtxT>::m_ops_to_flush;
-  using AbstractWriteLog<ImageCtxT>::m_cache_state;
-  using AbstractWriteLog<ImageCtxT>::m_first_free_entry;
-  using AbstractWriteLog<ImageCtxT>::m_first_valid_entry;
-
-  void process_work() override;
-  void copy_pmem(C_BlockIORequestT *req) override;
-  void schedule_append_ops(pwl::GenericLogOperations &ops) override;
-  void append_scheduled_ops(void) override;
-  void reserve_pmem(C_BlockIORequestT *req, bool &alloc_succeeds, bool &no_space) override;
-  bool retire_entries(const unsigned long int frees_per_tx) override;
-  void persist_last_flushed_sync_gen() override;
-  bool alloc_resources(C_BlockIORequestT *req) override;
-  void schedule_flush_and_append(pwl::GenericLogOperationsVector &ops) override;
-  void setup_schedule_append(
-      pwl::GenericLogOperationsVector &ops, bool do_early_flush) override;
-  Context *construct_flush_entry_ctx(
-        const std::shared_ptr<pwl::GenericLogEntry> log_entry) override;
-  void initialize_pool(Context *on_finish, pwl::DeferredContexts &later) override;
-  void write_data_to_buffer(
-      std::shared_ptr<pwl::WriteLogEntry> ws_entry,
-      pwl::WriteLogPmemEntry *pmem_entry) override;
-};
-
-} // namespace pwl
-} // namespace cache
-} // namespace librbd
-
-extern template class librbd::cache::pwl::ReplicatedWriteLog<librbd::ImageCtx>;
-
-#endif // CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
diff --git a/src/librbd/cache/pwl/Request.cc b/src/librbd/cache/pwl/Request.cc

index 85ba30cab66e5b5172142751941619e407185186..28e15bb11fd4f6daab995b9b239f85583265072f 100644 (file)
--- a/src/librbd/cache/pwl/Request.cc
+++ b/src/librbd/cache/pwl/Request.cc
@@ -78,9 +78,7 @@ void C_BlockIORequest<T>::complete_user_request(int r) {
    if (m_user_req_completed.compare_exchange_strong(initial, true)) {
      ldout(pwl.get_context(), 15) << this << " completing user req" << dendl;
      m_user_req_completed_time = ceph_clock_now();
-    user_req->complete(r);
-    // Set user_req as null as it is deleted
-    user_req = nullptr;
+    pwl.complete_user_request(user_req, r);
    } else {
      ldout(pwl.get_context(), 20) << this << " user req already completed" << dendl;
    }
@@ -118,6 +116,18 @@ C_WriteRequest<T>::C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&i
    ldout(pwl.get_context(), 99) << this << dendl;
  }
  
+template <typename T>
+C_WriteRequest<T>::C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+                                  bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+                                  int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
+                                  Context *user_req)
+  : C_BlockIORequest<T>(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, user_req),
+  mismatch_offset(mismatch_offset), cmp_bl(std::move(cmp_bl)),
+  m_perfcounter(perfcounter), m_lock(lock) {
+  is_comp_and_write = true;
+  ldout(pwl.get_context(), 20) << dendl;
+}
+
  template <typename T>
  C_WriteRequest<T>::~C_WriteRequest() {
    ldout(pwl.get_context(), 99) << this << dendl;
@@ -150,6 +160,10 @@ void C_WriteRequest<T>::finish_req(int r) {
  
    /* Completed to caller by here (in finish(), which calls this) */
    utime_t now = ceph_clock_now();
+  if(is_comp_and_write && !compare_succeeded) {
+    update_req_stats(now);
+    return;
+  }
    pwl.release_write_lanes(this);
    ceph_assert(m_resources.allocated);
    m_resources.allocated = false;
@@ -158,39 +172,11 @@ void C_WriteRequest<T>::finish_req(int r) {
  }
  
  template <typename T>
-void C_WriteRequest<T>::setup_buffer_resources(
-    uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
-    uint64_t &number_lanes, uint64_t &number_log_entries,
-    uint64_t &number_unpublished_reserves) {
-
-  ceph_assert(!m_resources.allocated);
-
-  auto image_extents_size = this->image_extents.size();
-  m_resources.buffers.reserve(image_extents_size);
-
-  bytes_cached = 0;
-  bytes_allocated = 0;
-  number_lanes = image_extents_size;
-  number_log_entries = image_extents_size;
-  number_unpublished_reserves = image_extents_size;
-
-  for (auto &extent : this->image_extents) {
-    m_resources.buffers.emplace_back();
-    struct WriteBufferAllocation &buffer = m_resources.buffers.back();
-    buffer.allocation_size = MIN_WRITE_ALLOC_SIZE;
-    buffer.allocated = false;
-    bytes_cached += extent.second;
-    if (extent.second > buffer.allocation_size) {
-      buffer.allocation_size = extent.second;
-    }
-    bytes_allocated += buffer.allocation_size;
-  }
-  bytes_dirtied = bytes_cached;
-}
-
-template <typename T>
-std::shared_ptr<WriteLogOperation> C_WriteRequest<T>::create_operation(uint64_t offset, uint64_t len) {
-  return std::make_shared<WriteLogOperation>(*op_set, offset, len, pwl.get_context());
+std::shared_ptr<WriteLogOperation> C_WriteRequest<T>::create_operation(
+    uint64_t offset, uint64_t len) {
+  return pwl.m_builder->create_write_log_operation(
+      *op_set, offset, len, pwl.get_context(),
+      pwl.m_builder->create_write_log_entry(op_set->sync_point->log_entry, offset, len));
  }
  
  template <typename T>
@@ -254,16 +240,10 @@ void C_WriteRequest<T>::setup_log_operations(DeferredContexts &on_exit) {
    pwl.add_into_log_map(log_entries, this);
  }
  
-#ifdef WITH_RBD_RWL
  template <typename T>
-void C_WriteRequest<T>::copy_pmem() {
-  auto allocation = m_resources.buffers.begin();
-  for (auto &operation : op_set->operations) {
-    operation->copy_bl_to_pmem_buffer(allocation);
-    allocation++;
-  }
+void C_WriteRequest<T>::copy_cache() {
+  pwl.copy_bl_to_buffer(&m_resources, op_set);
  }
-#endif
  
  template <typename T>
  bool C_WriteRequest<T>::append_write_request(std::shared_ptr<SyncPoint> sync_point) {
@@ -395,10 +375,10 @@ void C_FlushRequest<T>::dispatch() {
  
  template <typename T>
  void C_FlushRequest<T>::setup_buffer_resources(
-    uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
-    uint64_t &number_lanes, uint64_t &number_log_entries,
-    uint64_t &number_unpublished_reserves) {
-  number_log_entries = 1;
+    uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+    uint64_t *number_lanes, uint64_t *number_log_entries,
+    uint64_t *number_unpublished_reserves) {
+  *number_log_entries = 1;
  }
  
  template <typename T>
@@ -479,15 +459,15 @@ void C_DiscardRequest<T>::dispatch() {
  
  template <typename T>
  void C_DiscardRequest<T>::setup_buffer_resources(
-    uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
-    uint64_t &number_lanes, uint64_t &number_log_entries,
-    uint64_t &number_unpublished_reserves) {
-  number_log_entries = 1;
+    uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+    uint64_t *number_lanes, uint64_t *number_log_entries,
+    uint64_t *number_unpublished_reserves) {
+  *number_log_entries = 1;
    /* No bytes are allocated for a discard, but we count the discarded bytes
     * as dirty.  This means it's possible to have more bytes dirty than
     * there are bytes cached or allocated. */
    for (auto &extent : this->image_extents) {
-    bytes_dirtied = extent.second;
+    *bytes_dirtied = extent.second;
      break;
    }
  }
@@ -514,10 +494,12 @@ std::ostream &operator<<(std::ostream &os,
  }
  
  template <typename T>
-C_WriteSameRequest<T>::C_WriteSameRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
-                                          bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
-                                          PerfCounters *perfcounter, Context *user_req)
-  : C_WriteRequest<T>(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, lock, perfcounter, user_req) {
+C_WriteSameRequest<T>::C_WriteSameRequest(
+    T &pwl, const utime_t arrived, io::Extents &&image_extents,
+    bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+    PerfCounters *perfcounter, Context *user_req)
+  : C_WriteRequest<T>(pwl, arrived, std::move(image_extents), std::move(bl),
+      fadvise_flags, lock, perfcounter, user_req) {
    ldout(pwl.get_context(), 20) << this << dendl;
  }
  
@@ -536,32 +518,15 @@ void C_WriteSameRequest<T>::update_req_stats(utime_t &now) {
    this->m_perfcounter->tinc(l_librbd_pwl_ws_latency, comp_latency);
  }
  
-/* Write sames will allocate one buffer, the size of the repeating pattern */
  template <typename T>
-void C_WriteSameRequest<T>::setup_buffer_resources(
-    uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
-    uint64_t &number_lanes, uint64_t &number_log_entries,
-    uint64_t &number_unpublished_reserves) {
-  ldout(pwl.get_context(), 20) << this << dendl;
+std::shared_ptr<WriteLogOperation> C_WriteSameRequest<T>::create_operation(
+    uint64_t offset, uint64_t len) {
    ceph_assert(this->image_extents.size() == 1);
-  bytes_dirtied += this->image_extents[0].second;
-  auto pattern_length = this->bl.length();
-  this->m_resources.buffers.emplace_back();
-  struct WriteBufferAllocation &buffer = this->m_resources.buffers.back();
-  buffer.allocation_size = MIN_WRITE_ALLOC_SIZE;
-  buffer.allocated = false;
-  bytes_cached += pattern_length;
-  if (pattern_length > buffer.allocation_size) {
-    buffer.allocation_size = pattern_length;
-  }
-  bytes_allocated += buffer.allocation_size;
-}
-
-template <typename T>
-std::shared_ptr<WriteLogOperation> C_WriteSameRequest<T>::create_operation(uint64_t offset, uint64_t len) {
-  ceph_assert(this->image_extents.size() == 1);
-  return std::make_shared<WriteSameLogOperation>(*this->op_set.get(), offset, len,
-                                                 this->bl.length(), pwl.get_context());
+  WriteLogOperationSet &set = *this->op_set.get();  /* alias reused for both builder calls */
+  return pwl.m_builder->create_write_log_operation(
+      set, offset, len, this->bl.length(), pwl.get_context(),
+      pwl.m_builder->create_writesame_log_entry(set.sync_point->log_entry, offset,
+                                                len, this->bl.length()));
  }
  
  template <typename T>
@@ -572,51 +537,17 @@ std::ostream &operator<<(std::ostream &os,
  }
  
  template <typename T>
-C_CompAndWriteRequest<T>::C_CompAndWriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
-                                                bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
-                                                int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
-                                                Context *user_req)
-  : C_WriteRequest<T>(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, lock, perfcounter, user_req),
-  mismatch_offset(mismatch_offset), cmp_bl(std::move(cmp_bl)) {
-  ldout(pwl.get_context(), 20) << dendl;
-}
-
-template <typename T>
-C_CompAndWriteRequest<T>::~C_CompAndWriteRequest() {
-  ldout(pwl.get_context(), 20) << dendl;
-}
-
-template <typename T>
-void C_CompAndWriteRequest<T>::finish_req(int r) {
-  if (compare_succeeded) {
-    C_WriteRequest<T>::finish_req(r);
-  } else {
-    utime_t now = ceph_clock_now();
-    update_req_stats(now);
-  }
-}
-
-template <typename T>
-void C_CompAndWriteRequest<T>::update_req_stats(utime_t &now) {
+void C_WriteRequest<T>::update_req_stats(utime_t &now) {
    /* Compare-and-write stats. Compare-and-write excluded from most write
     * stats because the read phase will make them look like slow writes in
     * those histograms. */
-  if (!compare_succeeded) {
-    this->m_perfcounter->inc(l_librbd_pwl_cmp_fails, 1);
+  if(is_comp_and_write) {
+    if (!compare_succeeded) {
+      this->m_perfcounter->inc(l_librbd_pwl_cmp_fails, 1);
+    }
+    utime_t comp_latency = now - this->m_arrived_time;
+    this->m_perfcounter->tinc(l_librbd_pwl_cmp_latency, comp_latency);
    }
-  utime_t comp_latency = now - this->m_arrived_time;
-  this->m_perfcounter->tinc(l_librbd_pwl_cmp_latency, comp_latency);
-}
-
-template <typename T>
-std::ostream &operator<<(std::ostream &os,
-                         const C_CompAndWriteRequest<T> &req) {
-  os << (C_WriteRequest<T>&)req
-     << "cmp_bl=" << req.cmp_bl << ", "
-     << "read_bl=" << req.read_bl << ", "
-     << "compare_succeeded=" << req.compare_succeeded << ", "
-     << "mismatch_offset=" << req.mismatch_offset;
-  return os;
  }
  
  } // namespace pwl
@@ -628,4 +559,3 @@ template class librbd::cache::pwl::C_WriteRequest<librbd::cache::pwl::AbstractWr
  template class librbd::cache::pwl::C_FlushRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
  template class librbd::cache::pwl::C_DiscardRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
  template class librbd::cache::pwl::C_WriteSameRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
-template class librbd::cache::pwl::C_CompAndWriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
diff --git a/src/librbd/cache/pwl/Request.h b/src/librbd/cache/pwl/Request.h

index fc7aecb24c18e8957b38723c7eeee4021148eb96..8d5a0b4742a722d55ac56d8dbaaa592c8dc49290 100644 (file)
--- a/src/librbd/cache/pwl/Request.h
+++ b/src/librbd/cache/pwl/Request.h
@@ -1,8 +1,8 @@
  // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
  // vim: ts=8 sw=2 smarttab
  
-#ifndef CEPH_LIBRBD_CACHE_RWL_REQUEST_H 
-#define CEPH_LIBRBD_CACHE_RWL_REQUEST_H 
+#ifndef CEPH_LIBRBD_CACHE_PWL_REQUEST_H
+#define CEPH_LIBRBD_CACHE_PWL_REQUEST_H
  
  #include "include/Context.h"
  #include "librbd/cache/pwl/Types.h"
@@ -66,7 +66,7 @@ public:
  
    virtual void dispatch()  = 0;
  
-  virtual void copy_pmem() {};
+  virtual void copy_cache() {};
  
    virtual const char *get_name() const {
      return "C_BlockIORequest";
@@ -99,9 +99,9 @@ public:
    }
  
    virtual void setup_buffer_resources(
-      uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
-      uint64_t &number_lanes, uint64_t &number_log_entries,
-      uint64_t &number_unpublished_reserves) {};
+      uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+      uint64_t *number_lanes, uint64_t *number_log_entries,
+      uint64_t *number_unpublished_reserves) {};
  
  protected:
    utime_t m_arrived_time;
@@ -131,12 +131,22 @@ template <typename T>
  class C_WriteRequest : public C_BlockIORequest<T> {
  public:
    using C_BlockIORequest<T>::pwl;
+  bool compare_succeeded = false;       /* read by finish_req() and update_req_stats() */
+  uint64_t *mismatch_offset = nullptr;  /* only supplied by the compare-and-write constructor */
+  bufferlist cmp_bl;                    /* moved in by the compare-and-write constructor */
+  bufferlist read_bl;
+  bool is_comp_and_write = false;       /* set to true by the compare-and-write constructor */
    unique_ptr<WriteLogOperationSet> op_set = nullptr;
  
    C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
                   bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
                   PerfCounters *perfcounter, Context *user_req);
  
+  C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+                 bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+                 int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
+                 Context *user_req);
+
    ~C_WriteRequest() override;
  
    void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx);
@@ -145,20 +155,18 @@ public:
    void finish_req(int r) override;
  
    /* Compare and write will override this */
-  virtual void update_req_stats(utime_t &now) {
-    // TODO: Add in later PRs
-  }
+  virtual void update_req_stats(utime_t &now);
+
    bool alloc_resources() override;
  
    void deferred_handler() override { }
  
    void dispatch() override;
  
-  #ifdef WITH_RBD_RWL
-  void copy_pmem() override;
-  #endif
+  void copy_cache() override;
  
-  virtual std::shared_ptr<WriteLogOperation> create_operation(uint64_t offset, uint64_t len);
+  virtual std::shared_ptr<WriteLogOperation> create_operation(uint64_t offset,
+                                                              uint64_t len);
  
    virtual void setup_log_operations(DeferredContexts &on_exit);
  
@@ -173,11 +181,6 @@ public:
  protected:
    using C_BlockIORequest<T>::m_resources;
    PerfCounters *m_perfcounter = nullptr;
-  /* Plain writes will allocate one buffer per request extent */
-  void setup_buffer_resources(
-      uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
-      uint64_t &number_lanes, uint64_t &number_log_entries,
-      uint64_t &number_unpublished_reserves) override;
  
  private:
    bool m_do_early_flush = false;
@@ -220,9 +223,10 @@ public:
    }
  
    void setup_buffer_resources(
-      uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
-      uint64_t &number_lanes, uint64_t &number_log_entries,
-      uint64_t &number_unpublished_reserves) override;
+      uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+      uint64_t *bytes_allocated, uint64_t *number_lanes,
+      uint64_t *number_log_entries,
+      uint64_t *number_unpublished_reserves) override;
  private:
    std::shared_ptr<SyncPointLogOperation> op;
    ceph::mutex &m_lock;
@@ -270,9 +274,9 @@ public:
      return "C_DiscardRequest";
    }
    void setup_buffer_resources(
-      uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
-      uint64_t &number_lanes, uint64_t &number_log_entries,
-      uint64_t &number_unpublished_reserves) override;
+      uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+      uint64_t *number_lanes, uint64_t *number_log_entries,
+      uint64_t *number_unpublished_reserves) override;
  private:
    uint32_t m_discard_granularity_bytes;
    ceph::mutex &m_lock;
@@ -300,11 +304,6 @@ public:
  
    void update_req_stats(utime_t &now) override;
  
-  void setup_buffer_resources(
-      uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
-      uint64_t &number_lanes, uint64_t &number_log_entries,
-      uint64_t &number_unpublished_reserves) override;
-
    std::shared_ptr<WriteLogOperation> create_operation(uint64_t offset, uint64_t len) override;
  
    const char *get_name() const override {
@@ -316,44 +315,6 @@ public:
                                    const C_WriteSameRequest<U> &req);
  };
  
-/**
- * This is the custodian of the BlockGuard cell for this compare and write. The
- * block guard is acquired before the read begins to guarantee atomicity of this
- * operation.  If this results in a write, the block guard will be released
- * when the write completes to all replicas.
- */
-template <typename T>
-class C_CompAndWriteRequest : public C_WriteRequest<T> {
-public:
-  using C_BlockIORequest<T>::pwl;
-  bool compare_succeeded = false;
-  uint64_t *mismatch_offset;
-  bufferlist cmp_bl;
-  bufferlist read_bl;
-  C_CompAndWriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
-                        bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
-                        int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
-                        Context *user_req);
-  ~C_CompAndWriteRequest();
-
-  void finish_req(int r) override;
-
-  void update_req_stats(utime_t &now) override;
-
-  /*
-   * Compare and write doesn't implement alloc_resources(), deferred_handler(),
-   * or dispatch(). We use the implementation in C_WriteRequest(), and only if the
-   * compare phase succeeds and a write is actually performed.
-   */
-
-  const char *get_name() const override {
-    return "C_CompAndWriteRequest";
-  }
-  template <typename U>
-  friend std::ostream &operator<<(std::ostream &os,
-                                  const C_CompAndWriteRequest<U> &req);
-};
-
  struct BlockGuardReqState {
    bool barrier = false; /* This is a barrier request */
    bool current_barrier = false; /* This is the currently active barrier */
@@ -410,4 +371,4 @@ public:
  } // namespace cache
  } // namespace librbd
  
-#endif // CEPH_LIBRBD_CACHE_RWL_REQUEST_H
+#endif // CEPH_LIBRBD_CACHE_PWL_REQUEST_H
diff --git a/src/librbd/cache/pwl/SSDTypes.h b/src/librbd/cache/pwl/SSDTypes.h

deleted file mode 100644 (file)

index 7e6f2df..0000000
--- a/src/librbd/cache/pwl/SSDTypes.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-      
-#ifndef CEPH_LIBRBD_CACHE_SSD_TYPES_H
-#define CEPH_LIBRBD_CACHE_SSD_TYPES_H
-  
-#include "acconfig.h"
-    
-#include "librbd/io/Types.h"
-#include "Types.h" //generic type = to be renamed
-
-namespace librbd {
-namespace cache {
-namespace pwl {
-
-struct SuperBlock{
-  WriteLogPoolRoot root;
-
-  DENC(SuperBlock, v, p) {
-    DENC_START(1, 1, p);
-    denc(v.root, p);
-    DENC_FINISH(p);
-  }
-
-  void dump(Formatter *f) const {
-    f->dump_object("super", root);
-  }
-
-  static void generate_test_instances(list<SuperBlock*>& ls) {
-    ls.push_back(new SuperBlock);
-    ls.push_back(new SuperBlock);
-    ls.back()->root.first_valid_entry = 2;
-  }
-};
-
-} // namespace pwl
-} // namespace cache
-} // namespace librbd
-
-WRITE_CLASS_DENC(librbd::cache::pwl::SuperBlock)
-
-#endif // CEPH_LIBRBD_CACHE_SSD_TYPES_H
diff --git a/src/librbd/cache/pwl/SSDWriteLog.cc b/src/librbd/cache/pwl/SSDWriteLog.cc

deleted file mode 100644 (file)

index 34c1a53..0000000
--- a/src/librbd/cache/pwl/SSDWriteLog.cc
+++ /dev/null
@@ -1,160 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "SSDWriteLog.h"
-#include "include/buffer.h"
-#include "include/Context.h"
-#include "include/ceph_assert.h"
-#include "common/deleter.h"
-#include "common/dout.h"
-#include "common/environment.h"
-#include "common/errno.h"
-#include "common/WorkQueue.h"
-#include "common/Timer.h"
-#include "common/perf_counters.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/asio/ContextWQ.h"
-#include "librbd/cache/pwl/ImageCacheState.h"
-#include "librbd/cache/pwl/LogEntry.h"
-#include <map>
-#include <vector>
-
-#undef dout_subsys
-#define dout_subsys ceph_subsys_rbd_pwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::pwl::SSDWriteLog: " << this << " " \
-                           <<  __func__ << ": "
-
-namespace librbd {
-namespace cache {
-namespace pwl {
-
-using namespace librbd::cache::pwl;
-
-// SSD: this number can be updated later
-const unsigned long int ops_appended_together = MAX_WRITES_PER_SYNC_POINT;
-
-template <typename I>
-SSDWriteLog<I>::SSDWriteLog(
-    I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
-    cache::ImageWritebackInterface& image_writeback,
-    plugin::Api<I>& plugin_api)
-  : AbstractWriteLog<I>(image_ctx, cache_state, image_writeback, plugin_api)
-{
-}
-
-template <typename I>
-void SSDWriteLog<I>::initialize_pool(Context *on_finish, pwl::DeferredContexts &later) {
-  CephContext *cct = m_image_ctx.cct;
-  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
-  if (access(this->m_log_pool_name.c_str(), F_OK) != 0) {
-    int fd = ::open(this->m_log_pool_name.c_str(), O_RDWR|O_CREAT, 0644);
-    bool succeed = true;
-    if (fd >= 0) {
-      if (truncate(this->m_log_pool_name.c_str(), this->m_log_pool_config_size) != 0) {
-        succeed = false;
-      }
-      ::close(fd);
-    } else {
-      succeed = false;
-    }
-    if (!succeed) {
-      m_cache_state->present = false;
-      m_cache_state->clean = true;
-      m_cache_state->empty = true;
-      /* TODO: filter/replace errnos that are meaningless to the caller */
-      on_finish->complete(-errno);
-      return;
-    }
-
-    bdev = BlockDevice::create(cct, this->m_log_pool_name, aio_cache_cb,
-                               nullptr, nullptr, nullptr);
-    int r = bdev->open(this->m_log_pool_name);
-    if (r < 0) {
-      delete bdev;
-      on_finish->complete(-1);
-      return;
-    }
-    m_cache_state->present = true;
-    m_cache_state->clean = true;
-    m_cache_state->empty = true;
-    /* new pool, calculate and store metadata */
-    size_t small_write_size = MIN_WRITE_ALLOC_SIZE + sizeof(struct WriteLogPmemEntry);
-
-    uint64_t num_small_writes = (uint64_t)(this->m_log_pool_config_size / small_write_size);
-    if (num_small_writes > MAX_LOG_ENTRIES) {
-      num_small_writes = MAX_LOG_ENTRIES;
-    }
-    assert(num_small_writes > 2);
-    m_log_pool_ring_buffer_size = this->m_log_pool_config_size - DATA_RING_BUFFER_OFFSET;
-    /* Log ring empty */
-    m_first_free_entry = DATA_RING_BUFFER_OFFSET;
-    m_first_valid_entry = DATA_RING_BUFFER_OFFSET;
-
-    pool_size = this->m_log_pool_config_size;
-    auto new_root = std::make_shared<WriteLogPoolRoot>(pool_root);
-    new_root->pool_size = this->m_log_pool_config_size;
-    new_root->flushed_sync_gen = this->m_flushed_sync_gen;
-    new_root->block_size = MIN_WRITE_ALLOC_SIZE;
-    new_root->first_free_entry = m_first_free_entry;
-    new_root->first_valid_entry = m_first_valid_entry;
-    new_root->num_log_entries = num_small_writes;
-    pool_root = *new_root;
-
-    r = update_pool_root_sync(new_root);
-    if (r != 0) {
-      this->m_total_log_entries = 0;
-      this->m_free_log_entries = 0;
-      lderr(m_image_ctx.cct) << "failed to initialize pool ("
-                             << this->m_log_pool_name << ")" << dendl;
-      on_finish->complete(r);
-    }
-    this->m_total_log_entries = new_root->num_log_entries;
-    this->m_free_log_entries = new_root->num_log_entries - 1;
-   } else {
-     m_cache_state->present = true;
-     bdev = BlockDevice::create(
-         cct, this->m_log_pool_name, aio_cache_cb,
-         static_cast<void*>(this), nullptr, static_cast<void*>(this));
-     int r = bdev->open(this->m_log_pool_name);
-     if (r < 0) {
-       delete bdev;
-       on_finish->complete(r);
-       return;
-     }
-     //load_existing_entries(later); #TODO: Implement and uncomment in later PR
-     if (m_first_free_entry < m_first_valid_entry) {
-      /* Valid entries wrap around the end of the ring, so first_free is lower
-       * than first_valid.  If first_valid was == first_free+1, the entry at
-       * first_free would be empty. The last entry is never used, so in
-       * that case there would be zero free log entries. */
-       this->m_free_log_entries = this->m_total_log_entries -
-         (m_first_valid_entry - m_first_free_entry) - 1;
-     } else {
-      /* first_valid is <= first_free. If they are == we have zero valid log
-       * entries, and n-1 free log entries */
-       this->m_free_log_entries = this->m_total_log_entries -
-         (m_first_free_entry - m_first_valid_entry) - 1;
-     }
-     m_cache_state->clean = this->m_dirty_log_entries.empty();
-     m_cache_state->empty = m_log_entries.empty();
-  }
-}
-
-template <typename I>
-int SSDWriteLog<I>::update_pool_root_sync(
-    std::shared_ptr<WriteLogPoolRoot> root) {
-  bufferlist bl;
-  SuperBlock superblock;
-  superblock.root = *root;
-  encode(superblock, bl);
-  bl.append_zero(MIN_WRITE_ALLOC_SIZE - bl.length());
-  ceph_assert(bl.length() % MIN_WRITE_ALLOC_SIZE == 0);
-  return bdev->write(0, bl, false);
-}
-
-} // namespace pwl
-} // namespace cache
-} // namespace librbd
-
-template class librbd::cache::pwl::SSDWriteLog<librbd::ImageCtx>;
diff --git a/src/librbd/cache/pwl/SSDWriteLog.h b/src/librbd/cache/pwl/SSDWriteLog.h

deleted file mode 100644 (file)

index ff9330e..0000000
--- a/src/librbd/cache/pwl/SSDWriteLog.h
+++ /dev/null
@@ -1,104 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
-#define CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
-
-#include "AbstractWriteLog.h"
-#include "blk/BlockDevice.h"
-#include "common/AsyncOpTracker.h"
-#include "common/Checksummer.h"
-#include "common/environment.h"
-#include "common/RWLock.h"
-#include "common/WorkQueue.h"
-#include "librbd/BlockGuard.h"
-#include "librbd/Utils.h"
-#include "librbd/cache/ImageWriteback.h"
-#include "librbd/cache/Types.h"
-#include "librbd/cache/pwl/LogMap.h"
-#include "librbd/cache/pwl/LogOperation.h"
-#include "librbd/cache/pwl/Request.h"
-#include "librbd/cache/pwl/SSDTypes.h"
-#include <functional>
-#include <list>
-
-namespace librbd {
-
-struct ImageCtx;
-
-namespace cache {
-
-namespace pwl {
-
-template <typename ImageCtxT>
-class SSDWriteLog : public AbstractWriteLog<ImageCtxT> {
-public:
-  SSDWriteLog(ImageCtxT &image_ctx,
-              librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
-              cache::ImageWritebackInterface& image_writeback,
-              plugin::Api<ImageCtxT>& plugin_api);
-  ~SSDWriteLog() {}
-  SSDWriteLog(const SSDWriteLog&) = delete;
-  SSDWriteLog &operator=(const SSDWriteLog&) = delete;
-
-  using This = AbstractWriteLog<ImageCtxT>;
-  using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
-
-  //TODO: Implement below functions in later PR
-  bool alloc_resources(C_BlockIORequestT *req) override { return false; }
-  void setup_schedule_append(
-      pwl::GenericLogOperationsVector &ops, bool do_early_flush) override {}
-
-protected:
-  using AbstractWriteLog<ImageCtxT>::m_lock;
-  using AbstractWriteLog<ImageCtxT>::m_log_entries;
-  using AbstractWriteLog<ImageCtxT>::m_image_ctx;
-  using AbstractWriteLog<ImageCtxT>::m_cache_state;
-  using AbstractWriteLog<ImageCtxT>::m_first_free_entry;
-  using AbstractWriteLog<ImageCtxT>::m_first_valid_entry;
-
-  void initialize_pool(Context *on_finish, pwl::DeferredContexts &later) override;
-  //TODO: Implement below functions in later PR
-  void process_work() override {}
-  void append_scheduled_ops(void) override {}
-  void schedule_append_ops(pwl::GenericLogOperations &ops) override {}
-  void remove_pool_file() override {}
-
-private:
-  uint64_t m_log_pool_ring_buffer_size; /* Size of ring buffer */
-
-  //classes and functions to faciliate block device operations
-  class AioTransContext {
-  public:
-    Context *on_finish;
-    ::IOContext ioc;
-    explicit AioTransContext(CephContext* cct, Context *cb)
-      :on_finish(cb), ioc(cct, this) {
-    }
-    ~AioTransContext(){}
-
-    void aio_finish() {
-      on_finish->complete(ioc.get_return_value());
-      delete this;
-    }
-  }; //class AioTransContext
-
-  BlockDevice *bdev = nullptr;
-  uint64_t pool_size;
-  pwl::WriteLogPoolRoot pool_root;
-
-  int update_pool_root_sync(std::shared_ptr<pwl::WriteLogPoolRoot> root);
-
-  static void aio_cache_cb(void *priv, void *priv2) {
-    AioTransContext *c = static_cast<AioTransContext*>(priv2);
-    c->aio_finish();
-  }
-};//class SSDWriteLog
-
-} // namespace pwl
-} // namespace cache
-} // namespace librbd
-
-extern template class librbd::cache::pwl::SSDWriteLog<librbd::ImageCtx>;
-
-#endif // CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
diff --git a/src/librbd/cache/pwl/ShutdownRequest.h b/src/librbd/cache/pwl/ShutdownRequest.h

index dd2385b7ec889e233a99971bc4396e18d3dcabec..dafac9e9cc32a4b579afd17052dfa35b73fb526b 100644 (file)
--- a/src/librbd/cache/pwl/ShutdownRequest.h
+++ b/src/librbd/cache/pwl/ShutdownRequest.h
@@ -13,7 +13,6 @@ class ImageCtx;
  namespace plugin { template <typename> struct Api; }
  
  namespace cache {
-
  namespace pwl {
  
  template<typename>
diff --git a/src/librbd/cache/pwl/Types.cc b/src/librbd/cache/pwl/Types.cc

index 9962d35df29610da15f243a78bf1d6c904e9eb10..827125c31f8ff005fa494bf86ca15e0dac70a64d 100644 (file)
--- a/src/librbd/cache/pwl/Types.cc
+++ b/src/librbd/cache/pwl/Types.cc
@@ -35,26 +35,26 @@ void DeferredContexts::add(Context* ctx) {
   * convert between image and block extents here using a "block size"
   * of 1.
   */
-BlockExtent convert_to_block_extent(const uint64_t offset_bytes, const uint64_t length_bytes)
+BlockExtent convert_to_block_extent(uint64_t offset_bytes, uint64_t length_bytes)
  {
    return BlockExtent(offset_bytes,
                       offset_bytes + length_bytes);
  }
  
-BlockExtent WriteLogPmemEntry::block_extent() {
+BlockExtent WriteLogCacheEntry::block_extent() {
    return convert_to_block_extent(image_offset_bytes, write_bytes);
  }
  
-uint64_t WriteLogPmemEntry::get_offset_bytes() {
+uint64_t WriteLogCacheEntry::get_offset_bytes() {
    return image_offset_bytes;
  }
  
-uint64_t WriteLogPmemEntry::get_write_bytes() {
+uint64_t WriteLogCacheEntry::get_write_bytes() {
    return write_bytes;
  }
  
  #ifdef WITH_RBD_SSD_CACHE
-void WriteLogPmemEntry::dump(Formatter *f) const {
+void WriteLogCacheEntry::dump(Formatter *f) const {
    f->dump_unsigned("sync_gen_number", sync_gen_number);
    f->dump_unsigned("write_sequence_number", write_sequence_number);
    f->dump_unsigned("image_offset_bytes", image_offset_bytes);
@@ -70,9 +70,9 @@ void WriteLogPmemEntry::dump(Formatter *f) const {
    f->dump_unsigned("entry_index", entry_index);
  }
  
-void WriteLogPmemEntry::generate_test_instances(list<WriteLogPmemEntry*>& ls) {
-  ls.push_back(new WriteLogPmemEntry);
-  ls.push_back(new WriteLogPmemEntry);
+void WriteLogCacheEntry::generate_test_instances(list<WriteLogCacheEntry*>& ls) {
+  ls.push_back(new WriteLogCacheEntry);
+  ls.push_back(new WriteLogCacheEntry);
    ls.back()->sync_gen_number = 1;
    ls.back()->write_sequence_number = 1;
    ls.back()->image_offset_bytes = 1;
@@ -113,7 +113,7 @@ void WriteLogPoolRoot::generate_test_instances(list<WriteLogPoolRoot*>& ls) {
  #endif
  
  std::ostream& operator<<(std::ostream& os,
-                         const WriteLogPmemEntry &entry) {
+                         const WriteLogCacheEntry &entry) {
    os << "entry_valid=" << (bool)entry.entry_valid << ", "
       << "sync_point=" << (bool)entry.sync_point << ", "
       << "sequenced=" << (bool)entry.sequenced << ", "
diff --git a/src/librbd/cache/pwl/Types.h b/src/librbd/cache/pwl/Types.h

index 4bb810b38f3cebbf9948c297daa0f7555059527d..d8bdcfa7ddbd34f5933ed7abc639be2d9a4c4e5e 100644 (file)
--- a/src/librbd/cache/pwl/Types.h
+++ b/src/librbd/cache/pwl/Types.h
@@ -203,11 +203,11 @@ public:
  POBJ_LAYOUT_BEGIN(rbd_pwl);
  POBJ_LAYOUT_ROOT(rbd_pwl, struct WriteLogPoolRoot);
  POBJ_LAYOUT_TOID(rbd_pwl, uint8_t);
-POBJ_LAYOUT_TOID(rbd_pwl, struct WriteLogPmemEntry);
+POBJ_LAYOUT_TOID(rbd_pwl, struct WriteLogCacheEntry);
  POBJ_LAYOUT_END(rbd_pwl);
  #endif
  
-struct WriteLogPmemEntry {
+struct WriteLogCacheEntry {
    uint64_t sync_gen_number = 0;
    uint64_t write_sequence_number = 0;
    uint64_t image_offset_bytes;
@@ -216,7 +216,7 @@ struct WriteLogPmemEntry {
    TOID(uint8_t) write_data;
    #endif
    #ifdef WITH_RBD_SSD_CACHE
-  uint64_t write_data_pos; /* SSD data offset */
+  uint64_t write_data_pos = 0; /* SSD data offset */
    #endif
    union {
      uint8_t flags;
@@ -233,7 +233,7 @@ struct WriteLogPmemEntry {
    uint32_t ws_datalen = 0;  /* Length of data buffer (writesame only) */
    uint32_t entry_index = 0; /* For debug consistency check. Can be removed if
                               * we need the space */
-  WriteLogPmemEntry(const uint64_t image_offset_bytes=0, const uint64_t write_bytes=0)
+  WriteLogCacheEntry(uint64_t image_offset_bytes=0, uint64_t write_bytes=0)
      : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes),
        entry_valid(0), sync_point(0), sequenced(0), has_data(0), discard(0), writesame(0) {
    }
@@ -258,9 +258,9 @@ struct WriteLogPmemEntry {
      return is_write() || is_discard() || is_writesame();
    }
    friend std::ostream& operator<<(std::ostream& os,
-                                  const WriteLogPmemEntry &entry);
+                                  const WriteLogCacheEntry &entry);
    #ifdef WITH_RBD_SSD_CACHE
-  DENC(WriteLogPmemEntry, v, p) {
+  DENC(WriteLogCacheEntry, v, p) {
      DENC_START(1, 1, p);
      denc(v.sync_gen_number, p);
      denc(v.write_sequence_number, p);
@@ -274,7 +274,7 @@ struct WriteLogPmemEntry {
    }
    #endif
    void dump(ceph::Formatter *f) const;
-  static void generate_test_instances(list<WriteLogPmemEntry*>& ls);
+  static void generate_test_instances(list<WriteLogCacheEntry*>& ls);
  };
  
  struct WriteLogPoolRoot {
@@ -285,7 +285,7 @@ struct WriteLogPoolRoot {
      };
      uint64_t _u64;
    } header;
-  TOID(struct WriteLogPmemEntry) log_entries;   /* contiguous array of log entries */
+  TOID(struct WriteLogCacheEntry) log_entries;   /* contiguous array of log entries */
    #endif
    #ifdef WITH_RBD_SSD_CACHE
    uint64_t layout_version = 0;
@@ -377,7 +377,7 @@ std::string unique_lock_name(const std::string &name, void *address);
  } // namespace librbd
  
  #ifdef WITH_RBD_SSD_CACHE
-WRITE_CLASS_DENC(librbd::cache::pwl::WriteLogPmemEntry)
+WRITE_CLASS_DENC(librbd::cache::pwl::WriteLogCacheEntry)
  WRITE_CLASS_DENC(librbd::cache::pwl::WriteLogPoolRoot)
  #endif
  
diff --git a/src/librbd/cache/pwl/rwl/Builder.h b/src/librbd/cache/pwl/rwl/Builder.h

new file mode 100644 (file)

index 0000000..9665a83
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/Builder.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_BUILDER_H
+#define CEPH_LIBRBD_CACHE_PWL_RWL_BUILDER_H
+
+#include <iostream>
+#include "LogEntry.h"
+#include "Request.h"
+#include "LogOperation.h"
+
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/pwl/Builder.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+template <typename T>
+class Builder : public pwl::Builder<T> {  // implements the pwl::Builder<T> creation hooks from inside namespace rwl
+public:
+  std::shared_ptr<pwl::WriteLogEntry> create_write_log_entry(  // write entry built without a sync point entry
+      uint64_t image_offset_bytes, uint64_t write_bytes) override {
+    return std::make_shared<WriteLogEntry>(image_offset_bytes, write_bytes);
+  }
+  std::shared_ptr<pwl::WriteLogEntry> create_write_log_entry(  // write entry built with sync_point_entry
+      std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+      uint64_t image_offset_bytes, uint64_t write_bytes) override {
+    return std::make_shared<WriteLogEntry>(
+        sync_point_entry, image_offset_bytes, write_bytes);
+  }
+  std::shared_ptr<pwl::WriteLogEntry> create_writesame_log_entry(  // writesame entry; data_length is forwarded to the entry
+      uint64_t image_offset_bytes, uint64_t write_bytes,
+      uint32_t data_length) override {
+    return std::make_shared<WriteSameLogEntry>(
+        image_offset_bytes, write_bytes, data_length);
+  }
+  std::shared_ptr<pwl::WriteLogEntry> create_writesame_log_entry(  // writesame entry built with sync_point_entry
+      std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+      uint64_t image_offset_bytes, uint64_t write_bytes,
+      uint32_t data_length) override {
+    return std::make_shared<WriteSameLogEntry>(
+        sync_point_entry, image_offset_bytes, write_bytes, data_length);
+  }
+  pwl::C_WriteRequest<T> *create_write_request(  // allocated with new and returned as a raw pointer
+      T &pwl, utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req) override {
+    return new C_WriteRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(bl),
+        fadvise_flags, lock, perfcounter, user_req);
+  }
+  pwl::C_WriteSameRequest<T> *create_writesame_request(  // allocated with new and returned as a raw pointer
+      T &pwl, utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req) override {
+    return new C_WriteSameRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(bl),
+        fadvise_flags, lock, perfcounter, user_req);
+  }
+  pwl::C_WriteRequest<T> *create_comp_and_write_request(  // returns the compare-and-write request via the write-request base type
+      T &pwl, utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+      const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req) override {
+    return new rwl::C_CompAndWriteRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+        std::move(bl), mismatch_offset, fadvise_flags,
+        lock, perfcounter, user_req);
+  }
+  std::shared_ptr<pwl::WriteLogOperation> create_write_log_operation(  // log operation for a plain write entry
+      WriteLogOperationSet &set, uint64_t image_offset_bytes,
+      uint64_t write_bytes, CephContext *cct,
+      std::shared_ptr<pwl::WriteLogEntry> write_log_entry) {  // NOTE(review): not marked override, unlike the methods above - confirm against pwl::Builder
+    return std::make_shared<WriteLogOperation>(
+        set, image_offset_bytes, write_bytes, cct, write_log_entry);
+  }
+  std::shared_ptr<pwl::WriteLogOperation> create_write_log_operation(  // log operation for a writesame entry (takes data_len)
+      WriteLogOperationSet &set, uint64_t image_offset_bytes,
+      uint64_t write_bytes, uint32_t data_len, CephContext *cct,
+      std::shared_ptr<pwl::WriteLogEntry> writesame_log_entry) {  // NOTE(review): not marked override, unlike the methods above - confirm against pwl::Builder
+    return std::make_shared<WriteLogOperation>(
+        set, image_offset_bytes, write_bytes, data_len, cct,
+        writesame_log_entry);
+  }
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_RWL_BUILDER_H
diff --git a/src/librbd/cache/pwl/rwl/LogEntry.cc b/src/librbd/cache/pwl/rwl/LogEntry.cc

new file mode 100644 (file)

index 0000000..7325bef
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/LogEntry.cc
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/ImageWriteback.h"
+#include "LogEntry.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::WriteLogEntry: " \
+                           << this << " " <<  __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+void WriteLogEntry::writeback(
+    librbd::cache::ImageWritebackInterface &image_writeback, Context *ctx) {
+  /* ImageWriteback may keep its bufferlist after flush(), so hand it an
+   * independent copy of the cached data instead of the cache buffer. */
+  buffer::list cloned_bl;
+  copy_cache_bl(&cloned_bl);
+  bufferlist payload;
+  cloned_bl.begin(0).copy(write_bytes(), payload);
+  image_writeback.aio_write(
+      {{ram_entry.image_offset_bytes, ram_entry.write_bytes}},
+      std::move(payload), 0, ctx);
+}
+
+void WriteLogEntry::init_cache_bp() {
+  ceph_assert(!cache_bp.have_raw());  // cache_bp must not wrap a buffer yet
+  auto *data = (char*)cache_buffer;
+  cache_bp = buffer::ptr(buffer::create_static(write_bytes(), data));
+}
+
+void WriteLogEntry::init_bl(buffer::ptr &bp, buffer::list &bl) {
+  if (is_writesame) {
+    /* Repeat bp per whole ws_datalen in write_bytes, then a partial tail. */
+    const uint64_t whole_copies = ram_entry.write_bytes / ram_entry.ws_datalen;
+    for (uint64_t n = whole_copies; n > 0; --n) {
+      bl.append(bp);
+    }
+    int remainder = ram_entry.write_bytes % ram_entry.ws_datalen;
+    if (remainder != 0) bl.append(bp, 0, remainder);
+  } else {
+    bl.append(bp);  // not a writesame: bp is appended exactly once
+  }
+}
+
+void WriteLogEntry::init_cache_buffer(
+    std::vector<WriteBufferAllocation>::iterator allocation) {
+  this->ram_entry.write_data = allocation->buffer_oid;  // record the allocation's object id in the entry
+  ceph_assert(!TOID_IS_NULL(this->ram_entry.write_data));  // the allocation must carry a non-null id
+  cache_buffer = D_RW(this->ram_entry.write_data);  // libpmemobj D_RW: direct read-write pointer to that object
+}
+
+buffer::list& WriteLogEntry::get_cache_bl() {  // returns cache_bl, building it first while bl_refs is still 0
+  if (0 == bl_refs) {  // first check, made without holding m_entry_bl_lock
+    std::lock_guard locker(m_entry_bl_lock);
+    if (0 == bl_refs) {  // checked again now that the lock is held
+      // init pmem bufferlist: rebuild cache_bl from a newly initialized cache_bp
+      cache_bl.clear();
+      init_cache_bp();
+      ceph_assert(cache_bp.have_raw());
+      int before_bl = cache_bp.raw_nref();  // raw reference count before filling cache_bl
+      this->init_bl(cache_bp, cache_bl);
+      int after_bl = cache_bp.raw_nref();  // raw reference count after filling cache_bl
+      bl_refs = after_bl - before_bl;  // references that init_bl() added on the raw buffer
+    }
+    ceph_assert(0 != bl_refs);  // init_bl() is expected to have added at least one reference
+  }
+  return cache_bl;
+}
+
+void WriteLogEntry::copy_cache_bl(bufferlist *out_bl) {
+  get_cache_bl();  // called for its side effect: cache_bp is initialized after it
+  buffer::ptr private_bp(cache_bp.clone());
+  out_bl->clear();
+  // Fill out_bl from the cloned pointer rather than from cache_bp itself.
+  init_bl(private_bp, *out_bl);
+}
+
+void WriteSameLogEntry::writeback(
+    librbd::cache::ImageWritebackInterface &image_writeback, Context *ctx) {
+  buffer::list cloned_bl;
+  copy_cache_bl(&cloned_bl);
+  bufferlist payload;  // independent copy that is moved into the writeback call
+  cloned_bl.begin(0).copy(write_bytes(), payload);
+  image_writeback.aio_writesame(
+      ram_entry.image_offset_bytes, ram_entry.write_bytes,
+      std::move(payload), 0, ctx);
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/rwl/LogEntry.h b/src/librbd/cache/pwl/rwl/LogEntry.h

new file mode 100644 (file)

index 0000000..0eacb5a
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/LogEntry.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_LOG_ENTRY_H
+#define CEPH_LIBRBD_CACHE_PWL_RWL_LOG_ENTRY_H
+
+#include "librbd/cache/pwl/LogEntry.h"
+
+namespace librbd {
+namespace cache {
+class ImageWritebackInterface;
+namespace pwl {
+namespace rwl {
+
+class WriteLogEntry : public pwl::WriteLogEntry {  // rwl write log entry; overrides the buffer and writeback hooks below
+public:
+  WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,  // all four constructors only forward to pwl::WriteLogEntry
+                uint64_t image_offset_bytes, uint64_t write_bytes)
+    : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {}
+  WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes)
+    : pwl::WriteLogEntry(image_offset_bytes, write_bytes) {}
+  WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+                uint64_t image_offset_bytes, uint64_t write_bytes,
+                uint32_t data_length)  // data_length variants are the ones WriteSameLogEntry forwards to
+    : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes,
+                         data_length) {}
+  WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes,
+                uint32_t data_length)
+    : pwl::WriteLogEntry(image_offset_bytes, write_bytes, data_length) {}
+ ~WriteLogEntry() {}
+  WriteLogEntry(const WriteLogEntry&) = delete;  // entries are neither copy-constructible...
+  WriteLogEntry &operator=(const WriteLogEntry&) = delete;  // ...nor copy-assignable
+
+  void writeback(librbd::cache::ImageWritebackInterface &image_writeback,  // submits the entry's data through image_writeback
+                 Context *ctx) override;
+  void init_cache_bp() override;  // wraps cache_buffer in cache_bp
+  void init_bl(buffer::ptr &bp, buffer::list &bl) override;  // appends bp to bl (repeatedly when is_writesame)
+  void init_cache_buffer(  // points the entry at the buffer of the given allocation
+      std::vector<WriteBufferAllocation>::iterator allocation) override;
+  buffer::list &get_cache_bl() override;  // returns cache_bl, building it on first use
+  void copy_cache_bl(bufferlist *out_bl) override;  // fills out_bl from a clone of cache_bp
+};
+
+class WriteSameLogEntry : public WriteLogEntry {  // rwl writesame entry; only writeback() differs from rwl::WriteLogEntry
+public:
+  WriteSameLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,  // forwards to the data_length constructor of WriteLogEntry
+                    uint64_t image_offset_bytes, uint64_t write_bytes,
+                    uint32_t data_length)
+    : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes,
+                    data_length) {}
+  WriteSameLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes,  // same, without a sync point entry
+                   uint32_t data_length)
+    : WriteLogEntry(image_offset_bytes, write_bytes, data_length) {}
+  ~WriteSameLogEntry() {}
+  WriteSameLogEntry(const WriteSameLogEntry&) = delete;  // not copy-constructible
+  WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete;  // not copy-assignable
+
+  void writeback(librbd::cache::ImageWritebackInterface &image_writeback,  // issues aio_writesame instead of aio_write
+                 Context *ctx) override;
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_RWL_LOG_ENTRY_H
diff --git a/src/librbd/cache/pwl/rwl/LogOperation.cc b/src/librbd/cache/pwl/rwl/LogOperation.cc

new file mode 100644 (file)

index 0000000..fdd0cf6
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/LogOperation.cc
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "LogOperation.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::LogOperation: " \
+                           << this << " " <<  __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+void WriteLogOperation::copy_bl_to_cache_buffer(
+    std::vector<WriteBufferAllocation>::iterator allocation) {
+  m_perfcounter->inc(l_librbd_pwl_log_op_bytes, log_entry->write_bytes());
+  ldout(m_cct, 20) << bl << dendl;
+  /* Point the log entry at the allocated buffer, then copy the payload from
+   * the start of bl into it. */
+  log_entry->init_cache_buffer(allocation);
+  bufferlist::iterator src(&bl);
+  src.copy((unsigned)log_entry->write_bytes(), (char*)log_entry->cache_buffer);
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/rwl/LogOperation.h b/src/librbd/cache/pwl/rwl/LogOperation.h

new file mode 100644 (file)

index 0000000..8134c79
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/LogOperation.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_LOG_OPERATION_H
+#define CEPH_LIBRBD_CACHE_PWL_RWL_LOG_OPERATION_H
+
+#include "librbd/cache/pwl/LogOperation.h"
+
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+class WriteLogOperation : public pwl::WriteLogOperation {  // rwl log operation; supplies copy_bl_to_cache_buffer()
+public:
+  WriteLogOperation(  // plain-write operation: forwards every argument to the base
+      WriteLogOperationSet &set, uint64_t image_offset_bytes,
+      uint64_t write_bytes, CephContext *cct,
+      std::shared_ptr<pwl::WriteLogEntry> write_log_entry)
+    : pwl::WriteLogOperation(set, image_offset_bytes, write_bytes, cct,
+                             write_log_entry) {}
+
+  WriteLogOperation(  // writesame operation (takes data_len)
+      WriteLogOperationSet &set, uint64_t image_offset_bytes,
+      uint64_t write_bytes, uint32_t data_len, CephContext *cct,
+      std::shared_ptr<pwl::WriteLogEntry> writesame_log_entry)
+    : pwl::WriteLogOperation(set, image_offset_bytes, write_bytes, cct,  // NOTE(review): data_len is accepted but not forwarded here - confirm this is intended
+                             writesame_log_entry) {}
+
+  void copy_bl_to_cache_buffer(  // copies the operation's bl into the buffer of the given allocation
+          std::vector<WriteBufferAllocation>::iterator allocation) override;
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_RWL_LOG_OPERATION_H
diff --git a/src/librbd/cache/pwl/rwl/Request.cc b/src/librbd/cache/pwl/rwl/Request.cc

new file mode 100644 (file)

index 0000000..2fe1318
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/Request.cc
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Request.h"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::Request: " << this \
+                           << " " <<  __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+/* Plain writes: one buffer, lane, log entry and reserve per image extent. */
+template <typename T>
+void C_WriteRequest<T>::setup_buffer_resources(
+    uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+    uint64_t *number_lanes, uint64_t *number_log_entries,
+    uint64_t *number_unpublished_reserves) {
+  ceph_assert(!this->m_resources.allocated);
+  const auto extent_count = this->image_extents.size();
+  auto &buffers = this->m_resources.buffers;
+  buffers.reserve(extent_count);
+  *bytes_cached = 0;
+  *bytes_allocated = 0;
+  *number_lanes = extent_count;
+  *number_log_entries = extent_count;
+  *number_unpublished_reserves = extent_count;
+
+  for (const auto &extent : this->image_extents) {
+    buffers.emplace_back();
+    auto &alloc = buffers.back();
+    alloc.allocated = false;
+    /* Allocate at least MIN_WRITE_ALLOC_SIZE, more if the extent is larger. */
+    alloc.allocation_size = MIN_WRITE_ALLOC_SIZE;
+    if (alloc.allocation_size < extent.second) {
+      alloc.allocation_size = extent.second;
+    }
+    *bytes_cached += extent.second;
+    *bytes_allocated += alloc.allocation_size;
+  }
+  *bytes_dirtied = *bytes_cached;  // all bytes cached here count as dirtied
+}
+
+/* Print the base request state, then the compare-and-write fields. */
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+                         const C_CompAndWriteRequest<T> &req) {
+  return os << static_cast<const C_WriteRequest<T>&>(req)  // upcast keeps const
+            << "cmp_bl=" << req.cmp_bl << ", "
+            << "read_bl=" << req.read_bl << ", "
+            << "compare_succeeded=" << req.compare_succeeded << ", "
+            << "mismatch_offset=" << req.mismatch_offset;
+}
+
+/* Writesame: one buffer, sized for the pattern in bl rather than the extent. */
+template <typename T>
+void C_WriteSameRequest<T>::setup_buffer_resources(
+    uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+    uint64_t *number_lanes, uint64_t *number_log_entries,
+    uint64_t *number_unpublished_reserves) {  // NOTE(review): number_* are never written here - confirm the caller sets them
+  ceph_assert(this->image_extents.size() == 1);
+  const auto pattern_bytes = this->bl.length();
+  this->m_resources.buffers.emplace_back();
+  auto &alloc = this->m_resources.buffers.back();
+  alloc.allocated = false;
+  alloc.allocation_size = MIN_WRITE_ALLOC_SIZE;
+  if (alloc.allocation_size < pattern_bytes)
+    alloc.allocation_size = pattern_bytes;
+  *bytes_dirtied += this->image_extents[0].second;  // += : presumably the caller zeroes the totals first - TODO confirm
+  *bytes_cached += pattern_bytes;
+  *bytes_allocated += alloc.allocation_size;
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::rwl::C_WriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::rwl::C_WriteSameRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::rwl::C_CompAndWriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
diff --git a/src/librbd/cache/pwl/rwl/Request.h b/src/librbd/cache/pwl/rwl/Request.h

new file mode 100644 (file)

index 0000000..0a5c610
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/Request.h
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_REQUEST_H
+#define CEPH_LIBRBD_CACHE_RWL_REQUEST_H
+
+#include "librbd/cache/pwl/Request.h"
+
+namespace librbd {
+class BlockGuardCell;
+
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+// rwl flavour of the write request. Both constructors forward their
+// arguments unchanged to pwl::C_WriteRequest<T>; the only member declared
+// here is the setup_buffer_resources() override.
+template <typename T>
+class C_WriteRequest : public pwl::C_WriteRequest<T> {
+public:
+  // Variant that also carries a compare buffer and a mismatch offset
+  C_WriteRequest(
+      T &pwl, const utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+      const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req)
+    : pwl::C_WriteRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+        std::move(bl), mismatch_offset, fadvise_flags,
+        lock, perfcounter, user_req) {}
+
+  // Plain write variant (no compare buffer)
+  C_WriteRequest(
+      T &pwl, const utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req)
+    : pwl::C_WriteRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(bl),
+        fadvise_flags, lock, perfcounter, user_req) {}
+protected:
+  //Plain writes will allocate one buffer per request extent
+  void setup_buffer_resources(
+      uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+      uint64_t *bytes_allocated, uint64_t *number_lanes,
+      uint64_t *number_log_entries,
+      uint64_t *number_unpublished_reserves) override;
+};
+
+// Compare-and-write request for rwl. Builds on the rwl C_WriteRequest
+// through its compare-buffer constructor and overrides the request name.
+template <typename T>
+class C_CompAndWriteRequest : public C_WriteRequest<T> {
+public:
+  // Forwards every argument to the compare-capable C_WriteRequest constructor
+  C_CompAndWriteRequest(
+      T &pwl, const utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+      const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req)
+    : C_WriteRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+        std::move(bl), mismatch_offset, fadvise_flags,
+        lock, perfcounter, user_req) {}
+
+  // Name of this request type
+  const char *get_name() const override {
+    return "C_CompAndWriteRequest";
+  }
+  // The stream operator is a friend so it can print the request's members
+  template <typename U>
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const C_CompAndWriteRequest<U> &req);
+};
+
+// Write-same request for rwl. Forwards construction to
+// pwl::C_WriteSameRequest<T> and overrides setup_buffer_resources().
+template <typename T>
+class C_WriteSameRequest : public pwl::C_WriteSameRequest<T> {
+public:
+  // Forwards every argument to the pwl::C_WriteSameRequest constructor
+  C_WriteSameRequest(
+      T &pwl, const utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req)
+    : pwl::C_WriteSameRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags,
+        lock, perfcounter, user_req) {}
+
+  // Resource accounting for the request; defined in the matching .cc file
+  void setup_buffer_resources(
+      uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+      uint64_t *bytes_allocated, uint64_t *number_lanes,
+      uint64_t *number_log_entries,
+      uint64_t *number_unpublished_reserves) override;
+
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_RWL_REQUEST_H
diff --git a/src/librbd/cache/pwl/rwl/WriteLog.cc b/src/librbd/cache/pwl/rwl/WriteLog.cc

new file mode 100644 (file)

index 0000000..f059628
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/WriteLog.cc
@@ -0,0 +1,932 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "WriteLog.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/plugin/Api.h"
+#include <map>
+#include <vector>
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::WriteLog: " << this \
+                           << " " <<  __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+using namespace librbd::cache::pwl;
+namespace rwl {
+
+const unsigned long int OPS_APPENDED_TOGETHER = MAX_ALLOC_PER_TRANSACTION;
+
+/*
+ * Allocate the builder object, remember it in m_builderobj (it is deleted
+ * in the destructor) and hand it back to the caller.
+ */
+template <typename I>
+Builder<AbstractWriteLog<I>>* WriteLog<I>::create_builder() {
+  auto *new_builder = new Builder<This>();
+  m_builderobj = new_builder;
+  return m_builderobj;
+}
+
+/*
+ * Construct the rwl write log. The base class receives the builder
+ * returned by create_builder(), which also stores it in m_builderobj.
+ * The pool layout name comes from POBJ_LAYOUT_NAME(rbd_pwl).
+ */
+template <typename I>
+WriteLog<I>::WriteLog(
+    I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
+    ImageWritebackInterface& image_writeback,
+    plugin::Api<I>& plugin_api)
+: AbstractWriteLog<I>(image_ctx, cache_state, create_builder(), image_writeback,
+                      plugin_api),
+  m_pwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_pwl))
+{
+}
+
+/*
+ * Drop the pool handle (it is not closed here) and delete the builder
+ * that create_builder() allocated.
+ */
+template <typename I>
+WriteLog<I>::~WriteLog() {
+  m_log_pool = nullptr;
+  delete m_builderobj;
+}
+
+/*
+ * Assign ring slots to the (already reserved) write log entries of a set
+ * of operations and add them to the in-memory log entry list.
+ *
+ * Locking:
+ * Caller must hold m_log_append_lock. Acquires m_lock.
+ */
+template <typename I>
+void WriteLog<I>::alloc_op_log_entries(GenericLogOperations &ops)
+{
+  TOID(struct WriteLogPoolRoot) pool_root;
+  pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+  struct WriteLogCacheEntry *ring = D_RW(D_RW(pool_root)->log_entries);
+
+  ceph_assert(ceph_mutex_is_locked_by_me(this->m_log_append_lock));
+
+  /* Hand out slots under m_lock */
+  std::lock_guard locker(m_lock);
+
+  for (auto &op : ops) {
+    /* Take the next free slot and advance the ring head, wrapping at the end */
+    uint32_t slot = this->m_first_free_entry;
+    this->m_first_free_entry = (this->m_first_free_entry + 1) % this->m_total_log_entries;
+
+    auto &entry = op->get_log_entry();
+    entry->log_entry_index = slot;
+    entry->cache_entry = &ring[slot];
+    entry->ram_entry.entry_index = slot;
+    entry->ram_entry.entry_valid = 1;
+    m_log_entries.push_back(entry);
+    ldout(m_image_ctx.cct, 20) << "operation=[" << *op << "]" << dendl;
+  }
+}
+
+/*
+ * Write and persist the (already allocated) write log entries and
+ * data buffer allocations for a set of ops. The data buffer for each
+ * of these must already have been persisted to its reserved area.
+ *
+ * The ram copy of each entry is copied into its pmem slot, the slots are
+ * flushed in contiguous runs, and a single transaction then stores the
+ * new first_free_entry and publishes the reserved buffer allocations.
+ *
+ * Locking: caller must hold m_log_append_lock.
+ * Returns 0 (also for an empty ops list).
+ */
+template <typename I>
+int WriteLog<I>::append_op_log_entries(GenericLogOperations &ops)
+{
+  CephContext *cct = m_image_ctx.cct;
+  GenericLogOperationsVector entries_to_flush;
+  TOID(struct WriteLogPoolRoot) pool_root;
+  pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+  int ret = 0;
+
+  ceph_assert(ceph_mutex_is_locked_by_me(this->m_log_append_lock));
+
+  if (ops.empty()) {
+    return 0;
+  }
+  entries_to_flush.reserve(OPS_APPENDED_TOGETHER);
+
+  /* Write log entries to ring and persist */
+  utime_t now = ceph_clock_now();
+  for (auto &operation : ops) {
+    if (!entries_to_flush.empty()) {
+      /* Flush these and reset the list if the current entry wraps to the
+       * tail of the ring */
+      if (entries_to_flush.back()->get_log_entry()->log_entry_index >
+          operation->get_log_entry()->log_entry_index) {
+        ldout(m_image_ctx.cct, 20) << "entries to flush wrap around the end of the ring at "
+                                   << "operation=[" << *operation << "]" << dendl;
+        flush_op_log_entries(entries_to_flush);
+        entries_to_flush.clear();
+        now = ceph_clock_now();
+      }
+    }
+    ldout(m_image_ctx.cct, 20) << "Copying entry for operation at index="
+                               << operation->get_log_entry()->log_entry_index << " "
+                               << "from " << &operation->get_log_entry()->ram_entry << " "
+                               << "to " << operation->get_log_entry()->cache_entry << " "
+                               << "operation=[" << *operation << "]" << dendl;
+    ldout(m_image_ctx.cct, 05) << "APPENDING: index="
+                               << operation->get_log_entry()->log_entry_index << " "
+                               << "operation=[" << *operation << "]" << dendl;
+    operation->log_append_time = now;
+    /* Copy the ram entry into its pmem slot; it is flushed later as part
+     * of a contiguous run */
+    *operation->get_log_entry()->cache_entry = operation->get_log_entry()->ram_entry;
+    ldout(m_image_ctx.cct, 20) << "APPENDING: index="
+                               << operation->get_log_entry()->log_entry_index << " "
+                               << "pmem_entry=[" << *operation->get_log_entry()->cache_entry
+                               << "]" << dendl;
+    entries_to_flush.push_back(operation);
+  }
+  /* Flush whatever run is still pending */
+  flush_op_log_entries(entries_to_flush);
+
+  /* Drain once for all */
+  pmemobj_drain(m_log_pool);
+
+  /*
+   * Atomically advance the log head pointer and publish the
+   * allocations for all the data buffers they refer to.
+   */
+  utime_t tx_start = ceph_clock_now();
+  TX_BEGIN(m_log_pool) {
+    D_RW(pool_root)->first_free_entry = this->m_first_free_entry;
+    for (auto &operation : ops) {
+      if (operation->reserved_allocated()) {
+        /* NOTE(review): this C-style cast reinterprets the shared_ptr as a
+         * shared_ptr<WriteLogOperation>; it assumes such ops really are
+         * WriteLogOperations - confirm */
+        auto write_op = (std::shared_ptr<WriteLogOperation>&) operation;
+        pmemobj_tx_publish(&write_op->buffer_alloc->buffer_alloc_action, 1);
+      } else {
+        ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
+      }
+    }
+  } TX_ONCOMMIT {
+  } TX_ONABORT {
+    lderr(cct) << "failed to commit " << ops.size()
+               << " log entries (" << this->m_log_pool_name << ")" << dendl;
+    /* The assert fires first, so the assignment below is not reached */
+    ceph_assert(false);
+    ret = -EIO;
+  } TX_FINALLY {
+  } TX_END;
+
+  /* Record transaction latency and stamp every op with the completion time */
+  utime_t tx_end = ceph_clock_now();
+  m_perfcounter->tinc(l_librbd_pwl_append_tx_t, tx_end - tx_start);
+  m_perfcounter->hinc(
+    l_librbd_pwl_append_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), ops.size());
+  for (auto &operation : ops) {
+    operation->log_append_comp_time = tx_end;
+  }
+
+  return ret;
+}
+
+/*
+ * Flush the pmem copies of the log entries belonging to a set of ops.
+ * The entries must be contiguous in persistent memory, since one flush
+ * covering ops.size() entries is issued from the first entry's address.
+ */
+template <typename I>
+void WriteLog<I>::flush_op_log_entries(GenericLogOperationsVector &ops)
+{
+  if (ops.empty()) {
+    return;
+  }
+
+  auto *first_entry = ops.front()->get_log_entry()->cache_entry;
+  if (ops.size() > 1) {
+    ceph_assert(first_entry < ops.back()->get_log_entry()->cache_entry);
+  }
+
+  auto flush_bytes = ops.size() * sizeof(*first_entry);
+  ldout(m_image_ctx.cct, 20) << "entry count=" << ops.size() << " "
+                             << "start address="
+                             << first_entry << " "
+                             << "bytes="
+                             << flush_bytes
+                             << dendl;
+  pmemobj_flush(m_log_pool, first_entry, flush_bytes);
+}
+
+/*
+ * Close the pmem pool if it is open. If the cache state is clean, also
+ * delete the pool file and mark the cache as empty and no longer
+ * present; otherwise the file is kept.
+ */
+template <typename I>
+void WriteLog<I>::remove_pool_file() {
+  if (m_log_pool) {
+    ldout(m_image_ctx.cct, 6) << "closing pmem pool" << dendl;
+    pmemobj_close(m_log_pool);
+    /* The handle is invalid once closed; clear it so it is never reused */
+    m_log_pool = nullptr;
+  }
+  if (m_cache_state->clean) {
+    ldout(m_image_ctx.cct, 5) << "Removing empty pool file: " << this->m_log_pool_name << dendl;
+    if (remove(this->m_log_pool_name.c_str()) != 0) {
+      /* remove() reports failures through errno, not through libpmemobj.
+       * Save it before logging, which may overwrite errno. */
+      int remove_errno = errno;
+      lderr(m_image_ctx.cct) << "failed to remove empty pool \"" << this->m_log_pool_name << "\": "
+        << cpp_strerror(remove_errno) << dendl;
+    } else {
+      m_cache_state->clean = true;
+      m_cache_state->empty = true;
+      m_cache_state->present = false;
+    }
+  } else {
+    ldout(m_image_ctx.cct, 5) << "Not removing pool file: " << this->m_log_pool_name << dendl;
+  }
+}
+
+/*
+ * Create the pmem pool file if it does not exist, otherwise open and
+ * validate the existing pool, then set up the runtime ring state.
+ *
+ * New pool: sizes the log entry ring, writes the pool root metadata in
+ * one transaction and starts with an empty ring.
+ * Existing pool: checks layout version and block size, restores the
+ * ring indexes, computes the free entry count and loads the entries.
+ *
+ * On any failure on_finish is completed with a negative error code and
+ * the function returns early. On success on_finish is not touched here.
+ *
+ * Locking: caller must hold m_lock.
+ */
+template <typename I>
+void WriteLog<I>::initialize_pool(Context *on_finish, pwl::DeferredContexts &later) {
+  CephContext *cct = m_image_ctx.cct;
+  TOID(struct WriteLogPoolRoot) pool_root;
+  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+  if (access(this->m_log_pool_name.c_str(), F_OK) != 0) {
+    if ((m_log_pool =
+         pmemobj_create(this->m_log_pool_name.c_str(),
+                        this->m_pwl_pool_layout_name,
+                        this->m_log_pool_config_size,
+                        (S_IWUSR | S_IRUSR))) == NULL) {
+      /* Save errno before logging, which may overwrite it */
+      int create_errno = errno;
+      lderr(cct) << "failed to create pool (" << this->m_log_pool_name << "): "
+                 << pmemobj_errormsg() << dendl;
+      m_cache_state->present = false;
+      m_cache_state->clean = true;
+      m_cache_state->empty = true;
+      /* TODO: filter/replace errnos that are meaningless to the caller */
+      on_finish->complete(-create_errno);
+      return;
+    }
+    m_cache_state->present = true;
+    m_cache_state->clean = true;
+    m_cache_state->empty = true;
+    pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+
+    /* new pool, calculate and store metadata */
+    size_t effective_pool_size = (size_t)(this->m_log_pool_config_size * USABLE_SIZE);
+    size_t small_write_size = MIN_WRITE_ALLOC_SIZE + BLOCK_ALLOC_OVERHEAD_BYTES + sizeof(struct WriteLogCacheEntry);
+    uint64_t num_small_writes = (uint64_t)(effective_pool_size / small_write_size);
+    if (num_small_writes > MAX_LOG_ENTRIES) {
+      num_small_writes = MAX_LOG_ENTRIES;
+    }
+    if (num_small_writes <= 2) {
+      lderr(cct) << "num_small_writes needs to > 2" << dendl;
+      on_finish->complete(-EINVAL);
+      return;
+    }
+    this->m_log_pool_actual_size = this->m_log_pool_config_size;
+    this->m_bytes_allocated_cap = effective_pool_size;
+    /* Log ring empty */
+    m_first_free_entry = 0;
+    m_first_valid_entry = 0;
+    /* Set only in the abort stage, which runs after the transaction has
+     * jumped back, and read after TX_END */
+    bool tx_aborted = false;
+    int tx_error = 0;
+    TX_BEGIN(m_log_pool) {
+      TX_ADD(pool_root);
+      D_RW(pool_root)->header.layout_version = RWL_POOL_VERSION;
+      D_RW(pool_root)->log_entries =
+        TX_ZALLOC(struct WriteLogCacheEntry,
+                  sizeof(struct WriteLogCacheEntry) * num_small_writes);
+      D_RW(pool_root)->pool_size = this->m_log_pool_actual_size;
+      D_RW(pool_root)->flushed_sync_gen = this->m_flushed_sync_gen;
+      D_RW(pool_root)->block_size = MIN_WRITE_ALLOC_SIZE;
+      D_RW(pool_root)->num_log_entries = num_small_writes;
+      D_RW(pool_root)->first_free_entry = m_first_free_entry;
+      D_RW(pool_root)->first_valid_entry = m_first_valid_entry;
+    } TX_ONCOMMIT {
+      this->m_total_log_entries = D_RO(pool_root)->num_log_entries;
+      this->m_free_log_entries = D_RO(pool_root)->num_log_entries - 1; // leave one free
+    } TX_ONABORT {
+      this->m_total_log_entries = 0;
+      this->m_free_log_entries = 0;
+      lderr(cct) << "failed to initialize pool (" << this->m_log_pool_name << ")" << dendl;
+      /* Do not return from inside the transaction block: TX_END must run
+       * to close the transaction. Record the error and bail out below. */
+      tx_aborted = true;
+      tx_error = -pmemobj_tx_errno();
+    } TX_FINALLY {
+    } TX_END;
+    if (tx_aborted) {
+      on_finish->complete(tx_error);
+      return;
+    }
+  } else {
+    m_cache_state->present = true;
+    /* Open existing pool */
+    if ((m_log_pool =
+         pmemobj_open(this->m_log_pool_name.c_str(),
+                      this->m_pwl_pool_layout_name)) == NULL) {
+      /* Save errno before logging, which may overwrite it */
+      int open_errno = errno;
+      lderr(cct) << "failed to open pool (" << this->m_log_pool_name << "): "
+                 << pmemobj_errormsg() << dendl;
+      on_finish->complete(-open_errno);
+      return;
+    }
+    pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+    if (D_RO(pool_root)->header.layout_version != RWL_POOL_VERSION) {
+      // TODO: will handle upgrading version in the future
+      lderr(cct) << "Pool layout version is "
+                 << D_RO(pool_root)->header.layout_version
+                 << " expected " << RWL_POOL_VERSION << dendl;
+      on_finish->complete(-EINVAL);
+      return;
+    }
+    if (D_RO(pool_root)->block_size != MIN_WRITE_ALLOC_SIZE) {
+      lderr(cct) << "Pool block size is " << D_RO(pool_root)->block_size
+                 << " expected " << MIN_WRITE_ALLOC_SIZE << dendl;
+      on_finish->complete(-EINVAL);
+      return;
+    }
+    this->m_log_pool_actual_size = D_RO(pool_root)->pool_size;
+    this->m_flushed_sync_gen = D_RO(pool_root)->flushed_sync_gen;
+    this->m_total_log_entries = D_RO(pool_root)->num_log_entries;
+    m_first_free_entry = D_RO(pool_root)->first_free_entry;
+    m_first_valid_entry = D_RO(pool_root)->first_valid_entry;
+    if (m_first_free_entry < m_first_valid_entry) {
+      /* Valid entries wrap around the end of the ring, so first_free is lower
+       * than first_valid. The free slots are exactly those in
+       * [first_free, first_valid), and one slot is never used. If first_valid
+       * was == first_free+1, the entry at first_free would be empty but is
+       * the reserved one, so there would be zero free log entries. */
+      this->m_free_log_entries = m_first_valid_entry - m_first_free_entry - 1;
+    } else {
+      /* first_valid is <= first_free. If they are == we have zero valid log
+       * entries, and n-1 free log entries */
+      this->m_free_log_entries = this->m_total_log_entries - (m_first_free_entry - m_first_valid_entry) -1;
+    }
+    size_t effective_pool_size = (size_t)(this->m_log_pool_config_size * USABLE_SIZE);
+    this->m_bytes_allocated_cap = effective_pool_size;
+    load_existing_entries(later);
+    m_cache_state->clean = this->m_dirty_log_entries.empty();
+    m_cache_state->empty = m_log_entries.empty();
+  }
+}
+
+/*
+ * Loads the log entries from an existing log.
+ *
+ * Walks the ring from first_valid to first_free, builds an in-memory
+ * log entry for every pmem entry via update_entries(), and appends it
+ * to the log entry list. Sync point bookkeeping collected during the
+ * walk is then handed to update_sync_points().
+ */
+
+template <typename I>
+void WriteLog<I>::load_existing_entries(DeferredContexts &later) {
+  TOID(struct WriteLogPoolRoot) pool_root;
+  pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+  struct WriteLogCacheEntry *ring = D_RW(D_RW(pool_root)->log_entries);
+
+  /* Sync point log entries keyed by sync gen number, so write entries
+   * can be linked to their sync points. */
+  std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> sync_point_entries;
+  /* Sync gen numbers referred to by writes but absent from
+   * sync_point_entries; these tell us which sync points are missing
+   * and need to be created. */
+  std::map<uint64_t, bool> missing_sync_points;
+
+  /*
+   * Read the existing log entries. Construct an in-memory log entry
+   * object of the appropriate type for each. Add these to the global
+   * log entries list.
+   *
+   * Write entries will not link to their sync points yet. Here we
+   * accumulate a map of sync point gen numbers that are referred to
+   * in writes but do not appear in the log.
+   */
+  for (uint64_t entry_index = m_first_valid_entry;
+       entry_index != m_first_free_entry;
+       entry_index = (entry_index + 1) % this->m_total_log_entries) {
+    WriteLogCacheEntry *pmem_entry = ring + entry_index;
+    ceph_assert(pmem_entry->entry_index == entry_index);
+
+    std::shared_ptr<GenericLogEntry> log_entry;
+    this->update_entries(log_entry, pmem_entry, missing_sync_points,
+        sync_point_entries, entry_index);
+
+    log_entry->ram_entry = *pmem_entry;
+    log_entry->cache_entry = pmem_entry;
+    log_entry->log_entry_index = entry_index;
+    log_entry->completed = true;
+
+    m_log_entries.push_back(log_entry);
+  }
+
+  this->update_sync_points(missing_sync_points, sync_point_entries, later, MIN_WRITE_ALLOC_SIZE);
+}
+
+/*
+ * Point the entry's cache buffer at the pmem data referenced by the
+ * given pmem log entry. No data is copied.
+ */
+template <typename I>
+void WriteLog<I>::write_data_to_buffer(
+    std::shared_ptr<pwl::WriteLogEntry> ws_entry,
+    WriteLogCacheEntry *pmem_entry) {
+  auto *pmem_data = D_RW(pmem_entry->write_data);
+  ws_entry->cache_buffer = pmem_data;
+}
+
+/**
+ * Retire up to frees_per_tx of the oldest log entries that are eligible
+ * to be retired. Returns true if anything was retired.
+ *
+ * Phase 1 (under m_entry_reader_lock and m_lock) removes the entries
+ * from the in-memory list and the block map. Phase 2 (under
+ * m_log_append_lock) advances first_valid_entry in pmem and frees the
+ * data buffers in one transaction. Phase 3 (under m_lock) updates the
+ * runtime counters and wakes up the worker.
+ */
+template <typename I>
+bool WriteLog<I>::retire_entries(const unsigned long int frees_per_tx) {
+  CephContext *cct = m_image_ctx.cct;
+  GenericLogEntriesVector retiring_entries;
+  uint32_t initial_first_valid_entry;
+  uint32_t first_valid_entry;
+
+  std::lock_guard retire_locker(this->m_log_retire_lock);
+  ldout(cct, 20) << "Look for entries to retire" << dendl;
+  {
+    /* Entry readers can't be added while we hold m_entry_reader_lock */
+    RWLock::WLocker entry_reader_locker(this->m_entry_reader_lock);
+    std::lock_guard locker(m_lock);
+    initial_first_valid_entry = this->m_first_valid_entry;
+    first_valid_entry = this->m_first_valid_entry;
+    while (!m_log_entries.empty() &&
+           retiring_entries.size() < frees_per_tx) {
+      /* Only look at the front entry once the list is known to be
+       * non-empty; front() on an empty list is undefined behaviour */
+      auto entry = m_log_entries.front();
+      if (!this->can_retire_entry(entry)) {
+        break;
+      }
+      if (entry->log_entry_index != first_valid_entry) {
+        lderr(cct) << "Retiring entry index (" << entry->log_entry_index
+                   << ") and first valid log entry index (" << first_valid_entry
+                   << ") must be ==." << dendl;
+      }
+      ceph_assert(entry->log_entry_index == first_valid_entry);
+      first_valid_entry = (first_valid_entry + 1) % this->m_total_log_entries;
+      m_log_entries.pop_front();
+      retiring_entries.push_back(entry);
+      /* Remove entry from map so there will be no more readers */
+      if ((entry->write_bytes() > 0) || (entry->bytes_dirty() > 0)) {
+        auto gen_write_entry = static_pointer_cast<GenericWriteLogEntry>(entry);
+        if (gen_write_entry) {
+          this->m_blocks_to_log_entries.remove_log_entry(gen_write_entry);
+        }
+      }
+    }
+  }
+
+  if (retiring_entries.size()) {
+    ldout(cct, 20) << "Retiring " << retiring_entries.size() << " entries" << dendl;
+    TOID(struct WriteLogPoolRoot) pool_root;
+    pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+
+    utime_t tx_start;
+    utime_t tx_end;
+    /* Advance first valid entry and release buffers */
+    {
+      uint64_t flushed_sync_gen;
+      std::lock_guard append_locker(this->m_log_append_lock);
+      {
+        std::lock_guard locker(m_lock);
+        flushed_sync_gen = this->m_flushed_sync_gen;
+      }
+
+      tx_start = ceph_clock_now();
+      TX_BEGIN(m_log_pool) {
+        if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
+          ldout(m_image_ctx.cct, 20) << "flushed_sync_gen in log updated from "
+                                     << D_RO(pool_root)->flushed_sync_gen << " to "
+                                     << flushed_sync_gen << dendl;
+          D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
+        }
+        D_RW(pool_root)->first_valid_entry = first_valid_entry;
+        for (auto &entry: retiring_entries) {
+          if (entry->write_bytes()) {
+            ldout(cct, 20) << "Freeing " << entry->ram_entry.write_data.oid.pool_uuid_lo
+                           << "." << entry->ram_entry.write_data.oid.off << dendl;
+            TX_FREE(entry->ram_entry.write_data);
+          } else {
+            ldout(cct, 20) << "Retiring non-write: " << *entry << dendl;
+          }
+        }
+      } TX_ONCOMMIT {
+      } TX_ONABORT {
+        lderr(cct) << "failed to commit free of " << retiring_entries.size()
+                   << " log entries (" << this->m_log_pool_name << ")" << dendl;
+        ceph_assert(false);
+      } TX_FINALLY {
+      } TX_END;
+      tx_end = ceph_clock_now();
+    }
+    m_perfcounter->tinc(l_librbd_pwl_retire_tx_t, tx_end - tx_start);
+    m_perfcounter->hinc(l_librbd_pwl_retire_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(),
+        retiring_entries.size());
+
+    /* Update runtime copy of first_valid, and free entries counts */
+    {
+      std::lock_guard locker(m_lock);
+
+      ceph_assert(this->m_first_valid_entry == initial_first_valid_entry);
+      this->m_first_valid_entry = first_valid_entry;
+      this->m_free_log_entries += retiring_entries.size();
+      for (auto &entry: retiring_entries) {
+        if (entry->write_bytes()) {
+          ceph_assert(this->m_bytes_cached >= entry->write_bytes());
+          this->m_bytes_cached -= entry->write_bytes();
+          /* Allocations are never smaller than MIN_WRITE_ALLOC_SIZE */
+          uint64_t entry_allocation_size = entry->write_bytes();
+          if (entry_allocation_size < MIN_WRITE_ALLOC_SIZE) {
+            entry_allocation_size = MIN_WRITE_ALLOC_SIZE;
+          }
+          ceph_assert(this->m_bytes_allocated >= entry_allocation_size);
+          this->m_bytes_allocated -= entry_allocation_size;
+        }
+      }
+      this->m_alloc_failed_since_retire = false;
+      this->wake_up();
+    }
+  } else {
+    ldout(cct, 20) << "Nothing to retire" << dendl;
+    return false;
+  }
+  return true;
+}
+
+/*
+ * Build the context used to flush one log entry.
+ *
+ * While invalidating, the context from construct_flush_entry() is
+ * returned as is. Otherwise it is wrapped: completing the returned
+ * context queues a task on the image op work queue, and that task
+ * calls log_entry->writeback() with the inner context.
+ */
+template <typename I>
+Context* WriteLog<I>::construct_flush_entry_ctx(
+    std::shared_ptr<GenericLogEntry> log_entry) {
+  bool invalidating = this->m_invalidating; // snapshot so we behave consistently
+  Context *ctx = this->construct_flush_entry(log_entry, invalidating);
+
+  if (invalidating) {
+    return ctx;
+  }
+  /* The lambdas hold a copy of log_entry and the raw ctx pointer */
+  return new LambdaContext(
+    [this, log_entry, ctx](int r) {
+      m_image_ctx.op_work_queue->queue(new LambdaContext(
+        [this, log_entry, ctx](int r) {
+          ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
+                                     << " " << *log_entry << dendl;
+          log_entry->writeback(this->m_image_writeback, ctx);
+        }), 0);
+    });
+}
+
+const unsigned long int ops_flushed_together = 4;
+/*
+ * Performs the pmem buffer flush on all scheduled ops, then schedules
+ * the log event append operation for all of them.
+ *
+ * Ops are taken from m_ops_to_flush in batches of at most
+ * ops_flushed_together. When ops remain after a batch has been taken,
+ * another flusher is enlisted before the batch is processed. Ends by
+ * calling append_scheduled_ops().
+ */
+template <typename I>
+void WriteLog<I>::flush_then_append_scheduled_ops(void)
+{
+  GenericLogOperations ops;
+  bool ops_remain = false;
+  ldout(m_image_ctx.cct, 20) << dendl;
+  do {
+    {
+      ops.clear();
+      /* Take the next batch off the shared queue under m_lock */
+      std::lock_guard locker(m_lock);
+      if (m_ops_to_flush.size()) {
+        auto last_in_batch = m_ops_to_flush.begin();
+        unsigned int ops_to_flush = m_ops_to_flush.size();
+        if (ops_to_flush > ops_flushed_together) {
+          ops_to_flush = ops_flushed_together;
+        }
+        ldout(m_image_ctx.cct, 20) << "should flush " << ops_to_flush << dendl;
+        std::advance(last_in_batch, ops_to_flush);
+        ops.splice(ops.end(), m_ops_to_flush, m_ops_to_flush.begin(), last_in_batch);
+        ops_remain = !m_ops_to_flush.empty();
+        ldout(m_image_ctx.cct, 20) << "flushing " << ops.size() << ", "
+                                   << m_ops_to_flush.size() << " remain" << dendl;
+      } else {
+        ops_remain = false;
+      }
+    }
+    /* Queue another flusher for the ops still waiting */
+    if (ops_remain) {
+      enlist_op_flusher();
+    }
+
+    /* Ops subsequently scheduled for flush may finish before these,
+     * which is fine. We're unconcerned with completion order until we
+     * get to the log message append step. */
+    if (ops.size()) {
+      flush_pmem_buffer(ops);
+      schedule_append_ops(ops);
+    }
+  } while (ops_remain);
+  append_scheduled_ops();
+}
+
+/*
+ * Appends the log events of all scheduled ops.
+ *
+ * Each round collects a batch through append_scheduled(), allocates and
+ * appends its log entries while holding m_log_append_lock, then
+ * completes the batch with the append result after the lock is dropped.
+ */
+template <typename I>
+void WriteLog<I>::append_scheduled_ops(void) {
+  GenericLogOperations batch;
+  int append_result = 0;
+  bool ops_remain = false;
+  bool appending = false; /* true if we set m_appending */
+  ldout(m_image_ctx.cct, 20) << dendl;
+  do {
+    batch.clear();
+    this->append_scheduled(batch, ops_remain, appending, true);
+
+    if (batch.empty()) {
+      continue;
+    }
+
+    {
+      std::lock_guard locker(this->m_log_append_lock);
+      alloc_op_log_entries(batch);
+      append_result = append_op_log_entries(batch);
+    }
+
+    /* New entries may be flushable. Completion will wake up flusher. */
+    this->complete_op_log_entries(std::move(batch), append_result);
+  } while (ops_remain);
+}
+
+/*
+ * Queue a task on m_work_queue that runs
+ * flush_then_append_scheduled_ops(). The flush-op counter and the async
+ * op tracker are bumped here and released by the task when it is done.
+ */
+template <typename I>
+void WriteLog<I>::enlist_op_flusher()
+{
+  this->m_async_flush_ops++;
+  this->m_async_op_tracker.start_op();
+  Context *flush_ctx = new LambdaContext([this](int r) {
+      flush_then_append_scheduled_ops();
+      this->m_async_flush_ops--;
+      this->m_async_op_tracker.finish_op();
+    });
+  this->m_work_queue.queue(flush_ctx);
+}
+
+/*
+ * Route ops towards the append step. With do_early_flush the pmem
+ * buffers are flushed right here before the append is scheduled;
+ * otherwise both flush and append are deferred.
+ */
+template <typename I>
+void WriteLog<I>::setup_schedule_append(
+    pwl::GenericLogOperationsVector &ops, bool do_early_flush) {
+  if (!do_early_flush) {
+    /* This is probably not still the caller's thread, so do the payload
+     * flushing/replicating later. */
+    schedule_flush_and_append(ops);
+    return;
+  }
+
+  /* This caller is waiting for persist, so we'll use their thread to
+   * expedite it */
+  flush_pmem_buffer(ops);
+  this->schedule_append(ops);
+}
+
+/*
+ * Takes custody of ops. They'll all get their log entries appended,
+ * and have their on_write_persist contexts completed once they and
+ * all prior log entries are persisted everywhere.
+ *
+ * The ops are moved onto m_ops_to_append under m_lock. An appender task
+ * is queued only when that list was empty and no append was in
+ * progress. Every op is then told it is appending.
+ */
+template <typename I>
+void WriteLog<I>::schedule_append_ops(GenericLogOperations &ops)
+{
+  bool need_finisher;
+  GenericLogOperationsVector appending;
+
+  /* Keep a copy of the ops: the splice below empties the ops list */
+  std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending));
+  {
+    std::lock_guard locker(m_lock);
+
+    need_finisher = this->m_ops_to_append.empty() && !this->m_appending;
+    this->m_ops_to_append.splice(this->m_ops_to_append.end(), ops);
+  }
+
+  if (need_finisher) {
+    //enlist op appender
+    this->m_async_append_ops++;
+    this->m_async_op_tracker.start_op();
+    Context *append_ctx = new LambdaContext([this](int r) {
+        append_scheduled_ops();
+        this->m_async_append_ops--;
+        this->m_async_op_tracker.finish_op();
+        });
+    this->m_work_queue.queue(append_ctx);
+  }
+
+  /* Notify the ops outside of m_lock */
+  for (auto &op : appending) {
+    op->appending();
+  }
+}
+
+/*
+ * Takes custody of ops. They are appended to m_ops_to_flush so their
+ * pmem blocks get flushed and their log entries appended afterwards.
+ * A flusher is enlisted only when the flush queue was empty before.
+ */
+template <typename I>
+void WriteLog<I>::schedule_flush_and_append(GenericLogOperationsVector &ops)
+{
+  GenericLogOperations pending(ops.begin(), ops.end());
+  bool queue_was_empty;
+  ldout(m_image_ctx.cct, 20) << dendl;
+  {
+    std::lock_guard locker(m_lock);
+
+    queue_was_empty = m_ops_to_flush.empty();
+    m_ops_to_flush.splice(m_ops_to_flush.end(), pending);
+  }
+
+  if (!queue_was_empty) {
+    return;
+  }
+  enlist_op_flusher();
+}
+
+/*
+ * Worker pass: retire log entries when usage is above the water marks,
+ * dispatch deferred writes and process writeback of dirty entries.
+ *
+ * The pass repeats while a wake-up was requested in the meantime, at
+ * most max_iterations times in total. On exit m_wake_up_scheduled is
+ * cleared and, if a wake-up is still requested, wake_up() is called.
+ */
+template <typename I>
+void WriteLog<I>::process_work() {
+  CephContext *cct = m_image_ctx.cct;
+  int max_iterations = 4;
+  bool wake_up_requested = false;
+  /* Thresholds are fractions of the byte cap and of the ring size */
+  uint64_t aggressive_high_water_bytes = this->m_bytes_allocated_cap * AGGRESSIVE_RETIRE_HIGH_WATER;
+  uint64_t high_water_bytes = this->m_bytes_allocated_cap * RETIRE_HIGH_WATER;
+  uint64_t low_water_bytes = this->m_bytes_allocated_cap * RETIRE_LOW_WATER;
+  uint64_t aggressive_high_water_entries = this->m_total_log_entries * AGGRESSIVE_RETIRE_HIGH_WATER;
+  uint64_t high_water_entries = this->m_total_log_entries * RETIRE_HIGH_WATER;
+  uint64_t low_water_entries = this->m_total_log_entries * RETIRE_LOW_WATER;
+
+  ldout(cct, 20) << dendl;
+
+  do {
+    {
+      std::lock_guard locker(m_lock);
+      this->m_wake_up_requested = false;
+    }
+    /* Start retiring on allocation failure, invalidation, or when above
+     * a high water mark */
+    if (this->m_alloc_failed_since_retire || this->m_invalidating ||
+        this->m_bytes_allocated > high_water_bytes ||
+        (m_log_entries.size() > high_water_entries)) {
+      int retired = 0;
+      utime_t started = ceph_clock_now();
+      ldout(m_image_ctx.cct, 10) << "alloc_fail=" << this->m_alloc_failed_since_retire
+                                 << ", allocated > high_water="
+                                 << (this->m_bytes_allocated > high_water_bytes)
+                                 << ", allocated_entries > high_water="
+                                 << (m_log_entries.size() > high_water_entries)
+                                 << dendl;
+      /* Keep retiring while a start condition still holds, or while above
+       * a low water mark and within the batch time limit */
+      while (this->m_alloc_failed_since_retire || this->m_invalidating ||
+            (this->m_bytes_allocated > high_water_bytes) ||
+            (m_log_entries.size() > high_water_entries) ||
+            (((this->m_bytes_allocated > low_water_bytes) ||
+              (m_log_entries.size() > low_water_entries)) &&
+            (utime_t(ceph_clock_now() - started).to_msec() < RETIRE_BATCH_TIME_LIMIT_MS))) {
+        /* Use the larger per-transaction limit when shutting down,
+         * invalidating, or above an aggressive high water mark; stop as
+         * soon as nothing could be retired */
+        if (!retire_entries((this->m_shutting_down || this->m_invalidating ||
+           (this->m_bytes_allocated > aggressive_high_water_bytes) ||
+           (m_log_entries.size() > aggressive_high_water_entries))
+            ? MAX_ALLOC_PER_TRANSACTION
+            : MAX_FREE_PER_TRANSACTION)) {
+          break;
+        }
+        retired++;
+        this->dispatch_deferred_writes();
+        this->process_writeback_dirty_entries();
+      }
+      ldout(m_image_ctx.cct, 10) << "Retired " << retired << " times" << dendl;
+    }
+    this->dispatch_deferred_writes();
+    this->process_writeback_dirty_entries();
+
+    {
+      std::lock_guard locker(m_lock);
+      wake_up_requested = this->m_wake_up_requested;
+    }
+  } while (wake_up_requested && --max_iterations > 0);
+
+  {
+    std::lock_guard locker(m_lock);
+    this->m_wake_up_scheduled = false;
+    /* Reschedule if it's still requested */
+    if (this->m_wake_up_requested) {
+      this->wake_up();
+    }
+  }
+}
+
+/*
+ * Flush the pmem regions for the data blocks of a set of operations
+ *
+ * V is expected to be GenericLogOperations<I>, or GenericLogOperationsVector<I>
+ */
+template <typename I>
+template <typename V>
+void WriteLog<I>::flush_pmem_buffer(V& ops)
+{
+  for (auto &operation : ops) {
+    if(operation->is_writing_op()) {
+      auto log_entry = static_pointer_cast<WriteLogEntry>(operation->get_log_entry());
+      pmemobj_flush(m_log_pool, log_entry->cache_buffer, log_entry->write_bytes());
+    }
+  }
+
+  /* Drain once for all */
+  pmemobj_drain(m_log_pool);
+
+  utime_t now = ceph_clock_now();
+  for (auto &operation : ops) {
+    if (operation->reserved_allocated()) {
+      operation->buf_persist_comp_time = now;
+    } else {
+      ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
+    }
+  }
+}
+
+/**
+ * Update/persist the last flushed sync point in the log
+ *
+ * Holds m_log_append_lock for the whole update and takes m_lock only long
+ * enough to snapshot m_flushed_sync_gen. The pool root is rewritten only
+ * when the value stored there is smaller than the snapshot, so the stored
+ * value never moves backwards through this function.
+ */
+template <typename I>
+void WriteLog<I>::persist_last_flushed_sync_gen()
+{
+  TOID(struct WriteLogPoolRoot) pool_root;
+  pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+  uint64_t flushed_sync_gen;
+
+  /* Lock order here: m_log_append_lock first, then m_lock. */
+  std::lock_guard append_locker(this->m_log_append_lock);
+  {
+    std::lock_guard locker(m_lock);
+    flushed_sync_gen = this->m_flushed_sync_gen;
+  }
+
+  if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
+    ldout(m_image_ctx.cct, 15) << "flushed_sync_gen in log updated from "
+                               << D_RO(pool_root)->flushed_sync_gen << " to "
+                               << flushed_sync_gen << dendl;
+    /* Store the new value inside a libpmemobj transaction. */
+    TX_BEGIN(m_log_pool) {
+      D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
+    } TX_ONCOMMIT {
+    } TX_ONABORT {
+      /* An aborted transaction is treated as fatal. */
+      lderr(m_image_ctx.cct) << "failed to commit update of flushed sync point" << dendl;
+      ceph_assert(false);
+    } TX_FINALLY {
+    } TX_END;
+  }
+}
+
+template <typename I>
+void WriteLog<I>::reserve_cache(C_BlockIORequestT *req,
+                                         bool &alloc_succeeds, bool &no_space) {
+  std::vector<WriteBufferAllocation>& buffers = req->get_resources_buffers();
+  for (auto &buffer : buffers) {
+    utime_t before_reserve = ceph_clock_now();
+    buffer.buffer_oid = pmemobj_reserve(m_log_pool,
+                                        &buffer.buffer_alloc_action,
+                                        buffer.allocation_size,
+                                        0 /* Object type */);
+    buffer.allocation_lat = ceph_clock_now() - before_reserve;
+    if (TOID_IS_NULL(buffer.buffer_oid)) {
+      if (!req->has_io_waited_for_buffers()) {
+        req->set_io_waited_for_entries(true);
+      }
+      ldout(m_image_ctx.cct, 5) << "can't allocate all data buffers: "
+                                << pmemobj_errormsg() << ". "
+                                << *req << dendl;
+      alloc_succeeds = false;
+      no_space = true; /* Entries need to be retired */
+      break;
+    } else {
+      buffer.allocated = true;
+    }
+    ldout(m_image_ctx.cct, 20) << "Allocated " << buffer.buffer_oid.oid.pool_uuid_lo
+                               << "." << buffer.buffer_oid.oid.off
+                               << ", size=" << buffer.allocation_size << dendl;
+  }
+}
+
+/*
+ * Pair each operation of the set with the resource buffer at the same
+ * position and let the operation copy its data into that buffer.
+ * No bounds check is made: resources->buffers must hold at least as many
+ * elements as op_set->operations.
+ */
+template<typename I>
+void WriteLog<I>::copy_bl_to_buffer(
+    WriteRequestResources *resources, std::unique_ptr<WriteLogOperationSet> &op_set) {
+  auto next_buffer = resources->buffers.begin();
+  for (auto &op : op_set->operations) {
+    op->copy_bl_to_cache_buffer(next_buffer);
+    ++next_buffer;
+  }
+}
+
+template <typename I>
+bool WriteLog<I>::alloc_resources(C_BlockIORequestT *req) {
+  bool alloc_succeeds = true;
+  uint64_t bytes_allocated = 0;
+  uint64_t bytes_cached = 0;
+  uint64_t bytes_dirtied = 0;
+  uint64_t num_lanes = 0;
+  uint64_t num_unpublished_reserves = 0;
+  uint64_t num_log_entries = 0;
+
+  ldout(m_image_ctx.cct, 20) << dendl;
+  // Setup buffer, and get all the number of required resources
+  req->setup_buffer_resources(&bytes_cached, &bytes_dirtied, &bytes_allocated,
+                              &num_lanes, &num_log_entries, &num_unpublished_reserves);
+
+  alloc_succeeds = this->check_allocation(req, bytes_cached, bytes_dirtied, bytes_allocated,
+                              num_lanes, num_log_entries, num_unpublished_reserves,
+                              this->m_bytes_allocated_cap);
+
+  std::vector<WriteBufferAllocation>& buffers = req->get_resources_buffers();
+  if (!alloc_succeeds) {
+    /* On alloc failure, free any buffers we did allocate */
+    for (auto &buffer : buffers) {
+      if (buffer.allocated) {
+        pmemobj_cancel(m_log_pool, &buffer.buffer_alloc_action, 1);
+      }
+    }
+  }
+
+  req->set_allocated(alloc_succeeds);
+  return alloc_succeeds;
+}
+
+/*
+ * Complete the user's context with result r and null out the caller's
+ * pointer, which must not be used again after completion.
+ */
+template <typename I>
+void WriteLog<I>::complete_user_request(Context *&user_req, int r) {
+  Context *completing = user_req;
+  completing->complete(r);
+  // The context is gone after complete(); leave no dangling pointer behind
+  user_req = nullptr;
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::rwl::WriteLog<librbd::ImageCtx>;
diff --git a/src/librbd/cache/pwl/rwl/WriteLog.h b/src/librbd/cache/pwl/rwl/WriteLog.h

new file mode 100644 (file)

index 0000000..4d65a1d
--- /dev/null
+++ b/src/librbd/cache/pwl/rwl/WriteLog.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
+#define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
+
+#include <functional>
+#include <libpmemobj.h>
+#include <list>
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "common/AsyncOpTracker.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/Utils.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/cache/Types.h"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+#include "librbd/cache/pwl/LogMap.h"
+#include "librbd/cache/pwl/LogOperation.h"
+#include "librbd/cache/pwl/Request.h"
+#include "librbd/cache/pwl/rwl/Builder.h"
+
+class Context;
+class SafeTimer;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+/**
+ * Write log implementation on top of a libpmemobj pool (m_log_pool).
+ *
+ * Derives from AbstractWriteLog<ImageCtxT> and overrides its storage-specific
+ * hooks. Not copyable.
+ */
+template <typename ImageCtxT>
+class WriteLog : public AbstractWriteLog<ImageCtxT> {
+public:
+  WriteLog(
+      ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
+      ImageWritebackInterface& image_writeback,
+      plugin::Api<ImageCtxT>& plugin_api);
+  ~WriteLog();
+  WriteLog(const WriteLog&) = delete;
+  WriteLog &operator=(const WriteLog&) = delete;
+
+  /* "This" names the base class, not WriteLog itself: the request and
+   * builder templates below are instantiated on AbstractWriteLog. */
+  using This = AbstractWriteLog<ImageCtxT>;
+  using C_WriteRequestT = pwl::C_WriteRequest<This>;
+  using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
+
+  /* Copy each operation's data into the matching resource buffer. */
+  void copy_bl_to_buffer(
+      WriteRequestResources *resources, std::unique_ptr<WriteLogOperationSet> &op_set) override;
+  /* Complete user_req with r and reset the caller's pointer to nullptr. */
+  void complete_user_request(Context *&user_req, int r) override;
+private:
+  using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
+  using C_FlushRequestT = pwl::C_FlushRequest<This>;
+  using C_DiscardRequestT = pwl::C_DiscardRequest<This>;
+
+  /* libpmemobj pool handle; nullptr until a pool has been set up. */
+  PMEMobjpool *m_log_pool = nullptr;
+  /* NOTE(review): no default initializer on the next two members --
+   * presumably assigned in create_builder() / the constructor; confirm. */
+  Builder<This> *m_builderobj;
+  const char* m_pwl_pool_layout_name;
+
+  Builder<This>* create_builder();
+  void remove_pool_file();
+  void load_existing_entries(pwl::DeferredContexts &later);
+  void alloc_op_log_entries(pwl::GenericLogOperations &ops);
+  int append_op_log_entries(pwl::GenericLogOperations &ops);
+  void flush_then_append_scheduled_ops(void);
+  void enlist_op_flusher();
+  void flush_op_log_entries(pwl::GenericLogOperationsVector &ops);
+  /* Flush and drain the pmem data buffers of ops; V is a container of
+   * log operations (list or vector form). */
+  template <typename V>
+  void flush_pmem_buffer(V& ops);
+
+protected:
+  /* Base-class members used unqualified in the member definitions; the
+   * using-declarations are needed because the base is a dependent type. */
+  using AbstractWriteLog<ImageCtxT>::m_lock;
+  using AbstractWriteLog<ImageCtxT>::m_log_entries;
+  using AbstractWriteLog<ImageCtxT>::m_image_ctx;
+  using AbstractWriteLog<ImageCtxT>::m_perfcounter;
+  using AbstractWriteLog<ImageCtxT>::m_ops_to_flush;
+  using AbstractWriteLog<ImageCtxT>::m_cache_state;
+  using AbstractWriteLog<ImageCtxT>::m_first_free_entry;
+  using AbstractWriteLog<ImageCtxT>::m_first_valid_entry;
+
+  /* Storage-specific hooks overridden from AbstractWriteLog. */
+  void process_work() override;
+  void schedule_append_ops(pwl::GenericLogOperations &ops) override;
+  void append_scheduled_ops(void) override;
+  /* Reserve pmem buffers for req; reports failure through the out-params. */
+  void reserve_cache(C_BlockIORequestT *req, bool &alloc_succeeds, bool &no_space) override;
+  bool retire_entries(const unsigned long int frees_per_tx) override;
+  /* Store m_flushed_sync_gen in the pool root if it has advanced. */
+  void persist_last_flushed_sync_gen() override;
+  /* Returns true when all resources for req could be allocated. */
+  bool alloc_resources(C_BlockIORequestT *req) override;
+  void schedule_flush_and_append(pwl::GenericLogOperationsVector &ops) override;
+  void setup_schedule_append(
+      pwl::GenericLogOperationsVector &ops, bool do_early_flush) override;
+  Context *construct_flush_entry_ctx(
+        const std::shared_ptr<pwl::GenericLogEntry> log_entry) override;
+  void initialize_pool(Context *on_finish, pwl::DeferredContexts &later) override;
+  void write_data_to_buffer(
+      std::shared_ptr<pwl::WriteLogEntry> ws_entry,
+      pwl::WriteLogCacheEntry *pmem_entry) override;
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::rwl::WriteLog<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
diff --git a/src/librbd/cache/pwl/ssd/Builder.h b/src/librbd/cache/pwl/ssd/Builder.h

new file mode 100644 (file)

index 0000000..f79d685
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/Builder.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_BUILDER_H
+#define CEPH_LIBRBD_CACHE_PWL_SSD_BUILDER_H
+
+#include <iostream>
+#include "LogEntry.h"
+#include "Request.h"
+
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/pwl/Builder.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+/**
+ * Factory for the ssd flavour of log entries, requests and log operations.
+ *
+ * Each method forwards its arguments unchanged to the constructor of the
+ * type it creates and returns the result through the pwl:: base type.
+ * Unqualified type names resolve in this (ssd) namespace first.
+ */
+template <typename T>
+class Builder : public pwl::Builder<T> {
+public:
+  /* Write log entry without a sync point. */
+  std::shared_ptr<pwl::WriteLogEntry> create_write_log_entry(
+      uint64_t image_offset_bytes, uint64_t write_bytes) override {
+    return std::make_shared<WriteLogEntry>(image_offset_bytes, write_bytes);
+  }
+  /* Write log entry attached to sync_point_entry. */
+  std::shared_ptr<pwl::WriteLogEntry> create_write_log_entry(
+      std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+      uint64_t image_offset_bytes, uint64_t write_bytes) override {
+    return std::make_shared<WriteLogEntry>(
+        sync_point_entry, image_offset_bytes, write_bytes);
+  }
+  /* Write-same log entry without a sync point. */
+  std::shared_ptr<pwl::WriteLogEntry> create_writesame_log_entry(
+      uint64_t image_offset_bytes, uint64_t write_bytes,
+      uint32_t data_length) override {
+    return std::make_shared<WriteSameLogEntry>(
+        image_offset_bytes, write_bytes, data_length);
+  }
+  /* Write-same log entry attached to sync_point_entry. */
+  std::shared_ptr<pwl::WriteLogEntry> create_writesame_log_entry(
+      std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+      uint64_t image_offset_bytes, uint64_t write_bytes,
+      uint32_t data_length) override {
+    return std::make_shared<WriteSameLogEntry>(
+        sync_point_entry, image_offset_bytes, write_bytes, data_length);
+  }
+  /* The request factories below return raw pointers created with new;
+   * ownership passes to the caller. */
+  pwl::C_WriteRequest<T> *create_write_request(
+      T &pwl, utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req) override {
+    return new C_WriteRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(bl),
+        fadvise_flags, lock, perfcounter, user_req);
+  }
+  pwl::C_WriteSameRequest<T> *create_writesame_request(
+      T &pwl, utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req) override {
+    return new C_WriteSameRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(bl),
+        fadvise_flags, lock, perfcounter, user_req);
+  }
+  /* Returned as pwl::C_WriteRequest<T>*, the base of C_CompAndWriteRequest. */
+  pwl::C_WriteRequest<T> *create_comp_and_write_request(
+      T &pwl, utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+      const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req) override {
+    return new C_CompAndWriteRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+        std::move(bl), mismatch_offset, fadvise_flags,
+        lock, perfcounter, user_req);
+  }
+  /* NOTE(review): unlike the factories above, the two
+   * create_write_log_operation() overloads are not marked override --
+   * confirm whether pwl::Builder declares them virtual. */
+  std::shared_ptr<pwl::WriteLogOperation> create_write_log_operation(
+      WriteLogOperationSet &set, uint64_t image_offset_bytes,
+      uint64_t write_bytes, CephContext *cct,
+      std::shared_ptr<pwl::WriteLogEntry> write_log_entry) {
+    return std::make_shared<WriteLogOperation>(
+        set, image_offset_bytes, write_bytes, cct, write_log_entry);
+  }
+  /* Overload taking data_len, for a write-same log entry. */
+  std::shared_ptr<pwl::WriteLogOperation> create_write_log_operation(
+      WriteLogOperationSet &set, uint64_t image_offset_bytes,
+      uint64_t write_bytes, uint32_t data_len, CephContext *cct,
+      std::shared_ptr<pwl::WriteLogEntry> writesame_log_entry) {
+    return std::make_shared<WriteLogOperation>(
+        set, image_offset_bytes, write_bytes, data_len, cct,
+        writesame_log_entry);
+  }
+};
+
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_BUILDER_H
diff --git a/src/librbd/cache/pwl/ssd/LogEntry.cc b/src/librbd/cache/pwl/ssd/LogEntry.cc

new file mode 100644 (file)

index 0000000..02bc2be
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/LogEntry.cc
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/pwl/ssd/LogEntry.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::WriteLogEntry: " \
+                           << this << " " <<  __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+// Replace cache_bl with the len bytes of src_bl starting at offset off.
+// NOTE(review): unlike get_cache_bl()/remove_cache_bl(), m_entry_bl_lock is
+// not taken here -- confirm callers cannot race with those.
+void WriteLogEntry::init_cache_bl(bufferlist &src_bl,
+                                  uint64_t off, uint64_t len) {
+  cache_bl.clear();
+  cache_bl.substr_of(src_bl, off, len);
+}
+
+// Return a reference to the entry's cached data.
+// m_entry_bl_lock is held only until this function returns, so the
+// reference handed back is not protected by it.
+buffer::list& WriteLogEntry::get_cache_bl() {
+  std::lock_guard bl_guard(m_entry_bl_lock);
+  return cache_bl;
+}
+
+// Drop the entry's cached data while holding m_entry_bl_lock.
+void WriteLogEntry::remove_cache_bl() {
+  std::lock_guard bl_guard(m_entry_bl_lock);
+  cache_bl.clear();
+}
+
+// Size of the entry's data rounded up to a multiple of
+// MIN_WRITE_ALLOC_SSD_SIZE. The length of cache_bl is used when it is
+// non-empty, otherwise write_bytes().
+unsigned int WriteLogEntry::get_aligned_data_size() const {
+  if (cache_bl.length() == 0) {
+    return round_up_to(write_bytes(), MIN_WRITE_ALLOC_SSD_SIZE);
+  }
+  return round_up_to(cache_bl.length(), MIN_WRITE_ALLOC_SSD_SIZE);
+}
+
+// Write bl back to the image as a single extent taken from ram_entry
+// (image_offset_bytes, write_bytes). The flags argument is 0 and ctx is
+// passed through as the completion context.
+void WriteLogEntry::writeback_bl(
+    librbd::cache::ImageWritebackInterface &image_writeback,
+    Context *ctx, ceph::bufferlist&& bl) {
+  const auto extent_offset = ram_entry.image_offset_bytes;
+  const auto extent_length = ram_entry.write_bytes;
+  image_writeback.aio_write({{extent_offset, extent_length}},
+                            std::move(bl), 0, ctx);
+}
+
+// Write-same variant: bl is handed to aio_writesame() together with the
+// image range taken from ram_entry. The flags argument is 0 and ctx is
+// passed through as the completion context.
+void WriteSameLogEntry::writeback_bl(
+    librbd::cache::ImageWritebackInterface &image_writeback,
+    Context *ctx, ceph::bufferlist &&bl) {
+  const auto range_offset = ram_entry.image_offset_bytes;
+  const auto range_length = ram_entry.write_bytes;
+  image_writeback.aio_writesame(range_offset, range_length,
+                                std::move(bl), 0, ctx);
+}
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/ssd/LogEntry.h b/src/librbd/cache/pwl/ssd/LogEntry.h

new file mode 100644 (file)

index 0000000..6663984
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/LogEntry.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_LOG_ENTRY_H
+#define CEPH_LIBRBD_CACHE_PWL_SSD_LOG_ENTRY_H
+
+#include "librbd/cache/pwl/LogEntry.h"
+
+namespace librbd {
+namespace cache {
+class ImageWritebackInterface;
+namespace pwl {
+namespace ssd {
+
+/**
+ * ssd flavour of pwl::WriteLogEntry.
+ *
+ * The constructors only forward to the matching base constructor. The class
+ * overrides writeback and the cache_bl accessors. Not copyable.
+ */
+class WriteLogEntry : public pwl::WriteLogEntry {
+public:
+  WriteLogEntry(
+      std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+      uint64_t image_offset_bytes, uint64_t write_bytes)
+    : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {}
+  WriteLogEntry(
+      uint64_t image_offset_bytes, uint64_t write_bytes)
+    : pwl::WriteLogEntry(image_offset_bytes, write_bytes) {}
+  /* The two data_length constructors are the ones WriteSameLogEntry uses. */
+  WriteLogEntry(
+      std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+      uint64_t image_offset_bytes, uint64_t write_bytes,
+      uint32_t data_length)
+    : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes,
+                         write_bytes, data_length) {}
+  WriteLogEntry(
+      uint64_t image_offset_bytes, uint64_t write_bytes,
+      uint32_t data_length)
+    : pwl::WriteLogEntry(image_offset_bytes, write_bytes, data_length) {}
+  ~WriteLogEntry() {}
+  WriteLogEntry(const WriteLogEntry&) = delete;
+  WriteLogEntry &operator=(const WriteLogEntry&) = delete;
+  /* Issue an aio_write of bl for this entry's image extent. */
+  void writeback_bl(librbd::cache::ImageWritebackInterface &image_writeback,
+                 Context *ctx, ceph::bufferlist &&bl) override;
+  /* Set cache_bl to the [off, off + len) sub-range of src_bl. */
+  void init_cache_bl(bufferlist &src_bl, uint64_t off, uint64_t len) override;
+  /* Return a reference to cache_bl. */
+  buffer::list &get_cache_bl() override;
+  /* Clear cache_bl. */
+  void remove_cache_bl() override;
+  /* Data size rounded up to MIN_WRITE_ALLOC_SSD_SIZE. */
+  unsigned int get_aligned_data_size() const override;
+};
+
+/**
+ * ssd write-same log entry.
+ *
+ * Derives from the ssd WriteLogEntry declared above (the unqualified base
+ * name resolves in this namespace) and only replaces writeback_bl().
+ * Not copyable.
+ */
+class WriteSameLogEntry : public WriteLogEntry {
+public:
+  WriteSameLogEntry(
+      std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+      uint64_t image_offset_bytes, uint64_t write_bytes,
+      uint32_t data_length)
+    : WriteLogEntry(sync_point_entry, image_offset_bytes,
+                        write_bytes, data_length) {}
+  WriteSameLogEntry(
+      uint64_t image_offset_bytes, uint64_t write_bytes,
+      uint32_t data_length)
+    : WriteLogEntry(image_offset_bytes, write_bytes, data_length) {}
+  ~WriteSameLogEntry() {}
+  WriteSameLogEntry(const WriteSameLogEntry&) = delete;
+  WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete;
+  /* Issue an aio_writesame of bl for this entry's image range. */
+  void writeback_bl(librbd::cache::ImageWritebackInterface &image_writeback,
+                 Context *ctx, ceph::bufferlist &&bl) override;
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_LOG_ENTRY_H
diff --git a/src/librbd/cache/pwl/ssd/Request.cc b/src/librbd/cache/pwl/ssd/Request.cc

new file mode 100644 (file)

index 0000000..69951e7
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/Request.cc
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Request.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::Request: " << this << " " \
+                           <<  __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+template <typename T>
+void C_WriteRequest<T>::setup_buffer_resources(
+    uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+    uint64_t *number_lanes, uint64_t *number_log_entries,
+    uint64_t *number_unpublished_reserves) {
+
+  auto image_extents_size = this->image_extents.size();
+  *bytes_cached = 0;
+  *bytes_allocated = 0;
+  *number_lanes = image_extents_size;
+  *number_log_entries = image_extents_size;
+
+  for (auto &extent : this->image_extents) {
+    *bytes_cached += extent.second;
+    *bytes_allocated += round_up_to(extent.second, MIN_WRITE_ALLOC_SSD_SIZE);
+  }
+  *bytes_dirtied = *bytes_cached;
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+                         const C_CompAndWriteRequest<T> &req) {
+  os << (C_WriteRequest<T>&)req
+     << "cmp_bl=" << req.cmp_bl << ", "
+     << "read_bl=" << req.read_bl << ", "
+     << "compare_succeeded=" << req.compare_succeeded << ", "
+     << "mismatch_offset=" << req.mismatch_offset;
+  return os;
+}
+
+/*
+ * Report the resources a write-same request needs.
+ *
+ * A write-same request carries exactly one image extent (asserted), so it
+ * needs exactly one log entry. bytes_dirtied is the length of that extent;
+ * bytes_cached is the length of the pattern buffer (bl), and bytes_allocated
+ * is that length rounded up to MIN_WRITE_ALLOC_SSD_SIZE.
+ *
+ * number_log_entries was previously left unwritten, so the request reported
+ * whatever the caller had initialised it to instead of the one entry it
+ * actually needs.
+ *
+ * NOTE(review): number_lanes and number_unpublished_reserves are still left
+ * untouched, whereas C_WriteRequest::setup_buffer_resources() sets
+ * number_lanes to the extent count -- confirm whether write-same should
+ * reserve a lane as well.
+ */
+template <typename T>
+void C_WriteSameRequest<T>::setup_buffer_resources(
+    uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+    uint64_t *number_lanes, uint64_t *number_log_entries,
+    uint64_t *number_unpublished_reserves) {
+  ceph_assert(this->image_extents.size() == 1);
+  *number_log_entries = 1;
+  *bytes_dirtied = this->image_extents[0].second;
+  *bytes_cached = this->bl.length();
+  *bytes_allocated = round_up_to(*bytes_cached, MIN_WRITE_ALLOC_SSD_SIZE);
+}
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ssd::C_WriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::ssd::C_WriteSameRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::ssd::C_CompAndWriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
diff --git a/src/librbd/cache/pwl/ssd/Request.h b/src/librbd/cache/pwl/ssd/Request.h

new file mode 100644 (file)

index 0000000..9bb3e85
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/Request.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_SSD_REQUEST_H
+#define CEPH_LIBRBD_CACHE_SSD_REQUEST_H
+
+#include "librbd/cache/pwl/Request.h"
+
+namespace librbd {
+class BlockGuardCell;
+
+namespace cache {
+namespace pwl {
+
+template<typename T>
+class AbstractWriteLog;
+
+namespace ssd {
+
+/**
+ * ssd flavour of pwl::C_WriteRequest.
+ *
+ * Both constructors forward all arguments to the matching base constructor;
+ * the only behaviour this class declares itself is setup_buffer_resources().
+ */
+template <typename T>
+class C_WriteRequest : public pwl::C_WriteRequest<T> {
+public:
+  /* Compare-and-write form (takes cmp_bl and mismatch_offset); used by
+   * C_CompAndWriteRequest. */
+  C_WriteRequest(
+      T &pwl, const utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+      const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req)
+    : pwl::C_WriteRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+        std::move(bl), mismatch_offset, fadvise_flags,
+        lock, perfcounter, user_req) {}
+
+  /* Plain write form. */
+  C_WriteRequest(
+      T &pwl, const utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req)
+    : pwl::C_WriteRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(bl),
+        fadvise_flags, lock, perfcounter, user_req) {}
+protected:
+  /* Fill the out-params with the resources this request needs (byte
+   * counts, lanes, log entries). */
+  void setup_buffer_resources(
+      uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+      uint64_t *bytes_allocated, uint64_t *number_lanes,
+      uint64_t *number_log_entries,
+      uint64_t *number_unpublished_reserves) override;
+};
+
+/**
+ * ssd compare-and-write request.
+ *
+ * Derives from the ssd C_WriteRequest above and uses its compare-and-write
+ * constructor. It adds only its own name and a stream operator.
+ */
+template <typename T>
+class C_CompAndWriteRequest : public C_WriteRequest<T> {
+public:
+  C_CompAndWriteRequest(
+      T &pwl, const utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+      const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req)
+    : C_WriteRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+        std::move(bl), mismatch_offset,fadvise_flags,
+        lock, perfcounter, user_req) {}
+
+  const char *get_name() const override {
+    return "C_CompAndWriteRequest";
+  }
+  /* Friend so the operator can print the request's compare-related
+   * members. */
+  template <typename U>
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const C_CompAndWriteRequest<U> &req);
+};
+
+/**
+ * ssd flavour of pwl::C_WriteSameRequest.
+ *
+ * The constructor forwards to the base; the class overrides only
+ * setup_buffer_resources().
+ */
+template <typename T>
+class C_WriteSameRequest : public pwl::C_WriteSameRequest<T> {
+public:
+  C_WriteSameRequest(
+      T &pwl, const utime_t arrived, io::Extents &&image_extents,
+      bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+      PerfCounters *perfcounter, Context *user_req)
+    : pwl::C_WriteSameRequest<T>(
+        pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags,
+        lock, perfcounter, user_req) {}
+
+  /* NOTE(review): declared public here but protected in the ssd
+   * C_WriteRequest above -- confirm which access level is intended. */
+  void setup_buffer_resources(
+      uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+      uint64_t *bytes_allocated, uint64_t *number_lanes,
+      uint64_t *number_log_entries,
+      uint64_t *number_unpublished_reserves) override;
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_SSD_REQUEST_H
diff --git a/src/librbd/cache/pwl/ssd/Types.h b/src/librbd/cache/pwl/ssd/Types.h

new file mode 100644 (file)

index 0000000..e34b751
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/Types.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+      
+#ifndef CEPH_LIBRBD_CACHE_SSD_TYPES_H
+#define CEPH_LIBRBD_CACHE_SSD_TYPES_H
+  
+#include <list>
+
+#include "acconfig.h"
+    
+#include "librbd/io/Types.h"
+#include "librbd/cache/pwl/Types.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+/**
+ * Encodable wrapper around a WriteLogPoolRoot.
+ *
+ * Encoding is delegated to the root via denc; dump() emits the root as an
+ * object named "super".
+ */
+struct SuperBlock{
+  WriteLogPoolRoot root;
+
+  DENC(SuperBlock, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.root, p);
+    DENC_FINISH(p);
+  }
+
+  void dump(Formatter *f) const {
+    f->dump_object("super", root);
+  }
+
+  /* Appends two heap-allocated instances to ls: a default-constructed one
+   * and one with root.first_valid_entry set to 2. Whoever holds ls owns the
+   * pointers.
+   * The container is spelled std::list so this header no longer depends on
+   * a using-declaration leaking in from another header. */
+  static void generate_test_instances(std::list<SuperBlock*>& ls) {
+    ls.push_back(new SuperBlock);
+    ls.push_back(new SuperBlock);
+    ls.back()->root.first_valid_entry = 2;
+  }
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+WRITE_CLASS_DENC(librbd::cache::pwl::ssd::SuperBlock)
+
+#endif // CEPH_LIBRBD_CACHE_SSD_TYPES_H
diff --git a/src/librbd/cache/pwl/ssd/WriteLog.cc b/src/librbd/cache/pwl/ssd/WriteLog.cc

new file mode 100644 (file)

index 0000000..6efd85b
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/WriteLog.cc
@@ -0,0 +1,812 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "WriteLog.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include <map>
+#include <vector>
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::WriteLog: " \
+                           << this << " " <<  __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+using namespace librbd::cache::pwl;
+
+// SSD: this number can be updated later. For now it is simply set to
+// MAX_WRITES_PER_SYNC_POINT.
+const unsigned long int ops_appended_together = MAX_WRITES_PER_SYNC_POINT;
+
+/*
+ * Allocate the builder used by this write log, remember it in m_builderobj
+ * and return it.
+ */
+template <typename I>
+Builder<AbstractWriteLog<I>>* WriteLog<I>::create_builder() {
+  return m_builderobj = new Builder<This>();
+}
+
+/*
+ * Construct the SSD write log. All arguments are forwarded to
+ * AbstractWriteLog, together with a builder obtained from create_builder().
+ */
+template <typename I>
+WriteLog<I>::WriteLog(
+    I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
+    cache::ImageWritebackInterface& image_writeback,
+    plugin::Api<I>& plugin_api)
+  : AbstractWriteLog<I>(image_ctx, cache_state, create_builder(),
+                        image_writeback, plugin_api)
+{
+}
+
+/* Release the builder that create_builder() stored in m_builderobj. */
+template <typename I>
+WriteLog<I>::~WriteLog() {
+  delete m_builderobj;
+}
+
+template <typename I>
+void WriteLog<I>::initialize_pool(Context *on_finish,
+                                  pwl::DeferredContexts &later) {
+  CephContext *cct = m_image_ctx.cct;
+  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+  if (access(this->m_log_pool_name.c_str(), F_OK) != 0) {
+    int fd = ::open(this->m_log_pool_name.c_str(), O_RDWR|O_CREAT, 0644);
+    bool succeed = true;
+    if (fd >= 0) {
+      if (truncate(this->m_log_pool_name.c_str(),
+                   this->m_log_pool_config_size) != 0) {
+        succeed = false;
+      }
+      ::close(fd);
+    } else {
+      succeed = false;
+    }
+    if (!succeed) {
+      m_cache_state->present = false;
+      m_cache_state->clean = true;
+      m_cache_state->empty = true;
+      /* TODO: filter/replace errnos that are meaningless to the caller */
+      on_finish->complete(-errno);
+      return;
+    }
+
+    bdev = BlockDevice::create(cct, this->m_log_pool_name, aio_cache_cb,
+                               nullptr, nullptr, nullptr);
+    int r = bdev->open(this->m_log_pool_name);
+    if (r < 0) {
+      delete bdev;
+      on_finish->complete(-1);
+      return;
+    }
+    m_cache_state->present = true;
+    m_cache_state->clean = true;
+    m_cache_state->empty = true;
+    /* new pool, calculate and store metadata */
+    size_t small_write_size = MIN_WRITE_ALLOC_SSD_SIZE + sizeof(struct WriteLogCacheEntry);
+
+    uint64_t num_small_writes = (uint64_t)(this->m_log_pool_config_size / small_write_size);
+    if (num_small_writes > MAX_LOG_ENTRIES) {
+      num_small_writes = MAX_LOG_ENTRIES;
+    }
+    assert(num_small_writes > 2);
+    m_log_pool_ring_buffer_size = this->m_log_pool_config_size - DATA_RING_BUFFER_OFFSET;
+    /* Log ring empty */
+    m_first_free_entry = DATA_RING_BUFFER_OFFSET;
+    m_first_valid_entry = DATA_RING_BUFFER_OFFSET;
+
+    pool_size = this->m_log_pool_config_size;
+    auto new_root = std::make_shared<WriteLogPoolRoot>(pool_root);
+    new_root->pool_size = this->m_log_pool_config_size;
+    new_root->flushed_sync_gen = this->m_flushed_sync_gen;
+    new_root->block_size = MIN_WRITE_ALLOC_SSD_SIZE;
+    new_root->first_free_entry = m_first_free_entry;
+    new_root->first_valid_entry = m_first_valid_entry;
+    new_root->num_log_entries = num_small_writes;
+    pool_root = *new_root;
+
+    r = update_pool_root_sync(new_root);
+    if (r != 0) {
+      this->m_total_log_entries = 0;
+      this->m_free_log_entries = 0;
+      lderr(m_image_ctx.cct) << "failed to initialize pool ("
+                             << this->m_log_pool_name << ")" << dendl;
+      on_finish->complete(r);
+    }
+    this->m_total_log_entries = new_root->num_log_entries;
+    this->m_free_log_entries = new_root->num_log_entries - 1;
+   } else {
+     m_cache_state->present = true;
+     bdev = BlockDevice::create(
+         cct, this->m_log_pool_name, aio_cache_cb,
+         static_cast<void*>(this), nullptr, static_cast<void*>(this));
+     int r = bdev->open(this->m_log_pool_name);
+     if (r < 0) {
+       delete bdev;
+       on_finish->complete(r);
+       return;
+     }
+     load_existing_entries(later);
+     if (m_first_free_entry < m_first_valid_entry) {
+      /* Valid entries wrap around the end of the ring, so first_free is lower
+       * than first_valid.  If first_valid was == first_free+1, the entry at
+       * first_free would be empty. The last entry is never used, so in
+       * that case there would be zero free log entries. */
+       this->m_free_log_entries = this->m_total_log_entries -
+         (m_first_valid_entry - m_first_free_entry) - 1;
+     } else {
+      /* first_valid is <= first_free. If they are == we have zero valid log
+       * entries, and n-1 free log entries */
+       this->m_free_log_entries = this->m_total_log_entries -
+         (m_first_free_entry - m_first_valid_entry) - 1;
+     }
+     m_cache_state->clean = this->m_dirty_log_entries.empty();
+     m_cache_state->empty = m_log_entries.empty();
+  }
+}
+
+template <typename I>
+void WriteLog<I>::remove_pool_file() {
+  ceph_assert(bdev);
+  bdev->close();
+  delete bdev;
+  bdev = nullptr;
+  ldout(m_image_ctx.cct, 5) << "block device is closed" << dendl;
+
+  if (m_cache_state->clean) {
+    ldout(m_image_ctx.cct, 5) << "Removing empty pool file: "
+                              << this->m_log_pool_name << dendl;
+    if (remove(this->m_log_pool_name.c_str()) != 0) {
+      lderr(m_image_ctx.cct) << "failed to remove empty pool \""
+                             << this->m_log_pool_name << "\": " << dendl;
+    } else {
+      m_cache_state->clean = true;
+      m_cache_state->empty = true;
+      m_cache_state->present = false;
+    }
+  } else {
+    ldout(m_image_ctx.cct, 5) << "Not removing pool file: "
+                              << this->m_log_pool_name << dendl;
+  }
+}
+
+template <typename I>
+void WriteLog<I>::load_existing_entries(pwl::DeferredContexts &later) {
+  bufferlist bl;
+  CephContext *cct = m_image_ctx.cct;
+  ::IOContext ioctx(cct, nullptr);
+  bdev->read(0, MIN_WRITE_ALLOC_SSD_SIZE, &bl, &ioctx, false);
+  SuperBlock superblock;
+
+  auto p = bl.cbegin();
+  decode(superblock, p);
+  ldout(cct,5) << "Decoded superblock" << dendl;
+
+  WriteLogPoolRoot current_pool_root = superblock.root;
+  uint64_t next_log_pos = pool_root.first_valid_entry;
+  uint64_t first_free_entry =  pool_root.first_free_entry;
+  uint64_t curr_log_pos;
+
+  pool_root = current_pool_root;
+  m_first_free_entry = first_free_entry;
+  m_first_valid_entry = next_log_pos;
+  this->m_total_log_entries = current_pool_root.num_log_entries;
+  this->m_flushed_sync_gen = current_pool_root.flushed_sync_gen;
+  this->m_log_pool_actual_size = current_pool_root.pool_size;
+
+  std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> sync_point_entries;
+
+  std::map<uint64_t, bool> missing_sync_points;
+
+  // Iterate through the log_entries and append all the write_bytes
+  // of each entry to fetch the pos of next 4k of log_entries. Iterate
+  // through the log entries and append them to the in-memory vector
+  while (next_log_pos != first_free_entry) {
+    // read the entries from SSD cache and decode
+    bufferlist bl_entries;
+    ::IOContext ioctx_entry(cct, nullptr);
+    bdev->read(next_log_pos, MIN_WRITE_ALLOC_SSD_SIZE, &bl_entries,
+               &ioctx_entry, false);
+    std::vector<WriteLogCacheEntry> ssd_log_entries;
+    auto pl = bl_entries.cbegin();
+    decode(ssd_log_entries, pl);
+    ldout(cct, 5) << "decoded ssd log entries" << dendl;
+    curr_log_pos = next_log_pos;
+    std::shared_ptr<GenericLogEntry> log_entry = nullptr;
+
+    for (auto it = ssd_log_entries.begin(); it != ssd_log_entries.end(); ++it) {
+      this->update_entries(log_entry, &*it, missing_sync_points,
+                           sync_point_entries, curr_log_pos);
+      log_entry->ram_entry = *it;
+      log_entry->log_entry_index = curr_log_pos;
+      log_entry->completed = true;
+      m_log_entries.push_back(log_entry);
+      next_log_pos += round_up_to(it->write_bytes, MIN_WRITE_ALLOC_SSD_SIZE);
+    }
+    // along with the write_bytes, add control block size too
+    next_log_pos += MIN_WRITE_ALLOC_SSD_SIZE;
+    if (next_log_pos >= this->m_log_pool_actual_size) {
+      next_log_pos = next_log_pos % this->m_log_pool_actual_size + DATA_RING_BUFFER_OFFSET;
+    }
+ }
+  this->update_sync_points(missing_sync_points, sync_point_entries, later,
+                           MIN_WRITE_ALLOC_SSD_SIZE);
+}
+
+template <typename I>
+bool WriteLog<I>::alloc_resources(C_BlockIORequestT *req) {
+  bool alloc_succeeds = true;
+  uint64_t bytes_allocated = 0;
+  uint64_t bytes_cached = 0;
+  uint64_t bytes_dirtied = 0;
+  uint64_t num_lanes = 0;
+  uint64_t num_unpublished_reserves = 0;
+  uint64_t num_log_entries = 0;
+
+  // Setup buffer, and get all the number of required resources
+  req->setup_buffer_resources(&bytes_cached, &bytes_dirtied, &bytes_allocated,
+                              &num_lanes, &num_log_entries,
+                              &num_unpublished_reserves);
+
+  bytes_allocated += num_log_entries * MIN_WRITE_ALLOC_SSD_SIZE;
+
+  alloc_succeeds = this->check_allocation(req, bytes_cached, bytes_dirtied,
+                                          bytes_allocated, num_lanes,
+                                          num_log_entries,
+                                          num_unpublished_reserves,
+                                          m_log_pool_ring_buffer_size);
+  req->set_allocated(alloc_succeeds);
+  return alloc_succeeds;
+}
+
+/*
+ * Tell whether any operation in ops carries a sync point log entry.
+ *
+ * @param ops operations to inspect (not modified)
+ * @return true as soon as one sync point entry is found, false otherwise
+ */
+template <typename I>
+bool WriteLog<I>::has_sync_point_logs(GenericLogOperations &ops) {
+  for (auto &op : ops) {
+    if (op->get_log_entry()->is_sync_point()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/*
+ * Queue one run of append_scheduled_ops() on the work queue. The pending
+ * append counter is bumped and an async op is started before queueing.
+ */
+template<typename I>
+void WriteLog<I>::enlist_op_appender() {
+  this->m_async_append_ops++;
+  this->m_async_op_tracker.start_op();
+  this->m_work_queue.queue(new LambdaContext([this](int r) {
+      append_scheduled_ops();
+      }));
+}
+/*
+ * Takes custody of ops. They'll all get their log entries appended,
+ * and have their on_write_persist contexts completed once they and
+ * all prior log entries are persisted everywhere.
+ *
+ * The ops are spliced onto m_ops_to_append (leaving the caller's list
+ * empty). An appender is enlisted when no append is in progress and either
+ * enough ops were already queued or persist-on-flush is off, or when the new
+ * ops contain a sync point. Finally appending() is called on every op.
+ */
+template<typename I>
+void WriteLog<I>::schedule_append_ops(GenericLogOperations &ops) {
+  bool need_finisher = false;
+  GenericLogOperationsVector appending;
+
+  // Keep our own copy of the ops: the list itself is emptied by the splice.
+  std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending));
+  {
+    std::lock_guard locker(m_lock);
+
+    bool persist_on_flush = this->get_persist_on_flush();
+    // Evaluated on the queue size before the new ops are spliced in.
+    need_finisher = !this->m_appending &&
+       ((this->m_ops_to_append.size() >= CONTROL_BLOCK_MAX_LOG_ENTRIES) ||
+        !persist_on_flush);
+
+    // Only flush logs into SSD when there is internal/external flush request
+    if (!need_finisher) {
+      need_finisher = has_sync_point_logs(ops);
+    }
+    this->m_ops_to_append.splice(this->m_ops_to_append.end(), ops);
+  }
+
+  // Enlist outside of m_lock.
+  if (need_finisher) {
+    this->enlist_op_appender();
+  }
+
+  for (auto &op : appending) {
+    op->appending();
+  }
+}
+
+/*
+ * Hand ops over to schedule_append().
+ *
+ * @param ops            operations to schedule
+ * @param do_early_flush not used by this implementation
+ */
+template <typename I>
+void WriteLog<I>::setup_schedule_append(pwl::GenericLogOperationsVector &ops,
+                                        bool do_early_flush) {
+  this->schedule_append(ops);
+}
+
+/*
+ * Collect the scheduled ops through append_scheduled() and, if there are
+ * any, allocate their log entries and append them. When nothing was
+ * collected, the pending append started by enlist_op_appender() is
+ * accounted as finished right here.
+ */
+template <typename I>
+void WriteLog<I>::append_scheduled_ops() {
+  GenericLogOperations ops;
+  ldout(m_image_ctx.cct, 20) << dendl;
+
+  // Both flags are required by append_scheduled() but unused for SSD.
+  bool ops_remain = false;
+  bool appending = false;
+  this->append_scheduled(ops, ops_remain, appending);
+
+  if (ops.empty()) {
+    this->m_async_append_ops--;
+    this->m_async_op_tracker.finish_op();
+    return;
+  }
+
+  alloc_op_log_entries(ops);
+  append_op_log_entries(ops);
+}
+
+/*
+ * Write and persist the (already allocated) write log entries and
+ * data buffer allocations for a set of ops. The data buffer for each
+ * of these must already have been persisted to its reserved area.
+ *
+ * Two completions are chained: append_ctx runs when the appended blocks
+ * have been written and schedules a pool root update; ctx runs when that
+ * root update has finished and completes the ops.
+ *
+ * @param ops operations to append; must not be empty
+ */
+template <typename I>
+void WriteLog<I>::append_op_log_entries(GenericLogOperations &ops) {
+  ceph_assert(!ops.empty());
+  ldout(m_image_ctx.cct, 20) << dendl;
+  // Second stage: runs after the pool root update.
+  // NOTE(review): ops is captured by copy in a non-mutable lambda, so the
+  // std::move(ops) below presumably still copies -- confirm.
+  Context *ctx = new LambdaContext([this, ops](int r) {
+    assert(r == 0);
+    ldout(m_image_ctx.cct, 20) << "Finished root update " << dendl;
+    this->m_async_update_superblock--;
+    this->m_async_op_tracker.finish_op();
+
+    auto captured_ops = std::move(ops);
+    this->complete_op_log_entries(std::move(captured_ops), r);
+
+    // Same criteria as schedule_append_ops(), minus the m_appending check.
+    bool need_finisher = false;
+    {
+      std::lock_guard locker1(m_lock);
+      bool persist_on_flush = this->get_persist_on_flush();
+      need_finisher = ((this->m_ops_to_append.size() >= CONTROL_BLOCK_MAX_LOG_ENTRIES) ||
+                       !persist_on_flush);
+
+      if (!need_finisher) {
+        need_finisher = has_sync_point_logs(this->m_ops_to_append);
+      }
+    }
+
+    if (need_finisher) {
+      this->enlist_op_appender();
+    }
+  });
+  // Heap-allocated so the first stage can read the value filled in by
+  // append_ops(); the first stage deletes it.
+  uint64_t *new_first_free_entry = new(uint64_t);
+  // First stage: runs when the appended blocks have been written.
+  Context *append_ctx = new LambdaContext(
+      [this, new_first_free_entry, ops, ctx](int r) {
+      std::shared_ptr<WriteLogPoolRoot> new_root;
+      {
+        ldout(m_image_ctx.cct, 20) << "Finished appending at "
+                                   << *new_first_free_entry << dendl;
+        utime_t now = ceph_clock_now();
+        for (auto &operation : ops) {
+          operation->log_append_comp_time = now;
+        }
+        this->m_async_append_ops--;
+        this->m_async_op_tracker.finish_op();
+
+        std::lock_guard locker(this->m_log_append_lock);
+        std::lock_guard locker1(m_lock);
+        assert(this->m_appending);
+        this->m_appending = false;
+        new_root = std::make_shared<WriteLogPoolRoot>(pool_root);
+        pool_root.first_free_entry = *new_first_free_entry;
+        new_root->first_free_entry = *new_first_free_entry;
+        delete new_first_free_entry;
+        schedule_update_root(new_root, ctx);
+      }
+  });
+  // Append logs and update first_free_update
+  uint64_t bytes_allocated_updated;
+  append_ops(ops, append_ctx, new_first_free_entry, bytes_allocated_updated);
+
+  // NOTE(review): new_first_free_entry is deleted by append_ctx; this read
+  // presumably cannot race with that completion -- confirm.
+  {
+    std::lock_guard locker1(m_lock);
+    m_first_free_entry = *new_first_free_entry;
+    m_bytes_allocated -= bytes_allocated_updated;
+  }
+
+  if (ops.size()) {
+    this->dispatch_deferred_writes();
+  }
+}
+
+/* Drop the cached bufferlist of log_entry via remove_cache_bl(). */
+template <typename I>
+void WriteLog<I>::release_ram(std::shared_ptr<GenericLogEntry> log_entry) {
+  log_entry->remove_cache_bl();
+}
+
+/*
+ * Under m_lock, mark the log entry of every op as valid and append it to
+ * the in-memory list m_log_entries.
+ */
+template <typename I>
+void WriteLog<I>::alloc_op_log_entries(GenericLogOperations &ops) {
+  std::lock_guard locker(m_lock);
+
+  for (auto &operation : ops) {
+    auto &entry = operation->get_log_entry();
+    entry->ram_entry.entry_valid = 1;
+    m_log_entries.push_back(entry);
+    ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl;
+  }
+}
+
+template <typename I>
+Context* WriteLog<I>::construct_flush_entry_ctx(
+    std::shared_ptr<GenericLogEntry> log_entry) {
+  // snapshot so we behave consistently
+  bool invalidating = this->m_invalidating;
+
+  Context *ctx = this->construct_flush_entry(log_entry, invalidating);
+
+  if (invalidating) {
+    return ctx;
+  }
+  if(log_entry->is_write_entry()) {
+      bufferlist *read_bl_ptr = new bufferlist;
+      ctx = new LambdaContext(
+          [this, log_entry, read_bl_ptr, ctx](int r) {
+            bufferlist captured_entry_bl;
+            captured_entry_bl.claim_append(*read_bl_ptr);
+            free(read_bl_ptr);
+            m_image_ctx.op_work_queue->queue(new LambdaContext(
+              [this, log_entry, entry_bl=move(captured_entry_bl), ctx](int r) {
+               auto captured_entry_bl = std::move(entry_bl);
+               ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
+                                          << " " << *log_entry << dendl;
+               log_entry->writeback_bl(this->m_image_writeback, ctx,
+                                       std::move(captured_entry_bl));
+              }), 0);
+      });
+      ctx = new LambdaContext(
+        [this, log_entry, read_bl_ptr, ctx](int r) {
+          aio_read_data_block(&log_entry->ram_entry, read_bl_ptr, ctx);
+      });
+    return ctx;
+  } else {
+    return new LambdaContext(
+      [this, log_entry, ctx](int r) {
+        m_image_ctx.op_work_queue->queue(new LambdaContext(
+          [this, log_entry, ctx](int r) {
+            ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
+                                       << " " << *log_entry << dendl;
+            log_entry->writeback(this->m_image_writeback, ctx);
+          }), 0);
+      });
+  }
+}
+
+/*
+ * Background work loop: dispatch deferred writes and write back dirty
+ * entries, repeating while another wake-up was requested in the meantime
+ * (at most max_iterations passes). Retiring entries is not implemented yet;
+ * the retire condition is only logged.
+ */
+template <typename I>
+void WriteLog<I>::process_work() {
+  CephContext *cct = m_image_ctx.cct;
+  int max_iterations = 4;
+  bool wake_up_requested = false;
+  // Thresholds above which the retire condition below triggers.
+  uint64_t high_water_bytes = m_log_pool_ring_buffer_size * RETIRE_HIGH_WATER;
+  uint64_t high_water_entries = this->m_total_log_entries * RETIRE_HIGH_WATER;
+
+  ldout(cct, 20) << dendl;
+
+  do {
+    {
+      // Clear the request flag; it is re-read after this pass.
+      std::lock_guard locker(m_lock);
+      this->m_wake_up_requested = false;
+    }
+    if (this->m_alloc_failed_since_retire || (this->m_shutting_down) ||
+        this->m_invalidating || m_bytes_allocated > high_water_bytes ||
+        (m_log_entries.size() > high_water_entries)) {
+      ldout(m_image_ctx.cct, 10) << "alloc_fail=" << this->m_alloc_failed_since_retire
+                                 << ", allocated > high_water="
+                                 << (m_bytes_allocated > high_water_bytes)
+                                 << ", allocated_entries > high_water="
+                                 << (m_log_entries.size() > high_water_entries)
+                                 << dendl;
+      //TODO: Implement and uncomment this in next PR
+      /*retire_entries((this->m_shutting_down || this->m_invalidating ||
+                    (m_bytes_allocated > aggressive_high_water_bytes) ||
+                    (m_log_entries.size() > aggressive_high_water_entries))
+                    ? MAX_ALLOC_PER_TRANSACTION : MAX_FREE_PER_TRANSACTION);*/
+    }
+    this->dispatch_deferred_writes();
+    this->process_writeback_dirty_entries();
+    {
+      std::lock_guard locker(m_lock);
+      wake_up_requested = this->m_wake_up_requested;
+    }
+  } while (wake_up_requested && --max_iterations > 0);
+
+  {
+    std::lock_guard locker(m_lock);
+    this->m_wake_up_scheduled = false;
+    // Reschedule if it's still requested
+    if (this->m_wake_up_requested) {
+      this->wake_up();
+    }
+  }
+}
+
+/*
+ * Group the log entries of ops into spans and queue one write per span,
+ * then submit all queued writes as a single aio transaction.
+ *
+ * A span is closed once it holds CONTROL_BLOCK_MAX_LOG_ENTRIES entries or
+ * its payload reaches SPAN_MAX_DATA_LEN.
+ *
+ * @param ops                  operations whose log entries are appended
+ * @param ctx                  completion attached to the aio transaction
+ * @param new_first_free_entry out: pool_root.first_free_entry after all spans
+ *                             were queued
+ * @param bytes_allocated      out: (entries - 1) * MIN_WRITE_ALLOC_SSD_SIZE
+ *                             summed over the spans; the caller subtracts
+ *                             this from m_bytes_allocated
+ */
+template <typename I>
+void WriteLog<I>::append_ops(GenericLogOperations &ops, Context *ctx,
+                             uint64_t* new_first_free_entry,
+                             uint64_t &bytes_allocated) {
+  GenericLogEntriesVector log_entries;
+  CephContext *cct = m_image_ctx.cct;
+  uint64_t span_payload_len = 0;
+  bytes_allocated = 0;
+  ldout(cct, 20) << "Appending " << ops.size() << " log entries." << dendl;
+
+  AioTransContext* aio = new AioTransContext(cct, ctx);
+
+  utime_t now = ceph_clock_now();
+  for (auto &operation : ops) {
+    operation->log_append_time = now;
+    auto log_entry = operation->get_log_entry();
+
+    // Current span is full: queue it and start a new one.
+    if (log_entries.size() == CONTROL_BLOCK_MAX_LOG_ENTRIES ||
+        span_payload_len >= SPAN_MAX_DATA_LEN) {
+      if (log_entries.size() > 1) {
+        bytes_allocated += (log_entries.size() - 1) * MIN_WRITE_ALLOC_SSD_SIZE;
+      }
+      write_log_entries(log_entries, aio);
+      log_entries.clear();
+      span_payload_len = 0;
+    }
+    log_entries.push_back(log_entry);
+    span_payload_len += log_entry->write_bytes();
+  }
+  // Queue the last span. NOTE(review): because of the !span_payload_len
+  // term this also runs when log_entries is empty (ops empty) -- confirm
+  // that callers never pass an empty list.
+  if (!span_payload_len || !log_entries.empty()) {
+    if (log_entries.size() > 1) {
+      bytes_allocated += (log_entries.size() - 1) * MIN_WRITE_ALLOC_SSD_SIZE;
+    }
+    write_log_entries(log_entries, aio);
+  }
+  bdev->aio_submit(&aio->ioc);
+  *new_first_free_entry = pool_root.first_free_entry;
+}
+
+template <typename I>
+void WriteLog<I>::write_log_entries(GenericLogEntriesVector log_entries,
+                                    AioTransContext *aio) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(m_image_ctx.cct, 20) << dendl;
+  bufferlist data_bl;
+  // The first block is for log entries
+  uint64_t data_pos = pool_root.first_free_entry + MIN_WRITE_ALLOC_SSD_SIZE;
+  ldout(m_image_ctx.cct, 20) << "data_pos: " << data_pos << dendl;
+  if (data_pos == pool_root.pool_size ) {
+    data_pos = data_pos % pool_root.pool_size + DATA_RING_BUFFER_OFFSET;
+  }
+
+  std::vector<WriteLogCacheEntry> persist_log_entries;
+  for (auto &log_entry : log_entries) {
+    log_entry->log_entry_index = pool_root.first_free_entry;
+    // Append data buffer for write operations
+    persist_log_entries.push_back(log_entry->ram_entry);
+    if (log_entry->is_write_entry()) {
+      auto write_entry = static_pointer_cast<WriteLogEntry>(log_entry);
+      auto cache_bl = write_entry->get_cache_bl();
+      auto align_size = write_entry->get_aligned_data_size();
+      data_bl.append(cache_bl);
+      data_bl.append_zero(align_size - cache_bl.length());
+
+      write_entry->ram_entry.write_data_pos = data_pos;
+      data_pos += align_size;
+      if (data_pos >= pool_root.pool_size) {
+        data_pos = data_pos % pool_root.pool_size + DATA_RING_BUFFER_OFFSET;
+      }
+    }
+  }
+
+  //aio write
+  bufferlist bl;
+  encode(persist_log_entries, bl);
+  ceph_assert(bl.length() <= MIN_WRITE_ALLOC_SSD_SIZE);
+  bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - bl.length());
+  bl.append(data_bl);
+  ceph_assert(bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+  if (pool_root.first_free_entry + bl.length() > pool_root.pool_size) {
+    //exceeds border, need to split
+    uint64_t size = bl.length();
+    auto end = pool_root.pool_size - pool_root.first_free_entry;
+    bufferlist bl1;
+    bl.splice(0, end, &bl1);
+    ceph_assert(bl.length() == (size - bl1.length()));
+    ldout(cct, 20) << "The write on " << pool_root.first_free_entry
+                   << " with length " << size << " is split into two: "
+                   << "pos=" << pool_root.first_free_entry << ", "
+                   << "length=" << bl1.length() << "; "
+                   << "pos=" << DATA_RING_BUFFER_OFFSET << ", "
+                   << "length=" << bl.length() << dendl;
+
+    bdev->aio_write(pool_root.first_free_entry, bl1, &aio->ioc, false,
+                    WRITE_LIFE_NOT_SET);
+    bdev->aio_write(DATA_RING_BUFFER_OFFSET, bl, &aio->ioc, false,
+                    WRITE_LIFE_NOT_SET);
+  } else {
+    ldout(cct, 20) << "first_free_entry: " << pool_root.first_free_entry
+                   << " bl length: " << bl.length() << dendl;
+    bdev->aio_write(pool_root.first_free_entry, bl, &aio->ioc, false,
+                    WRITE_LIFE_NOT_SET);
+    ldout(cct, 20) << "finished aio_write log entries" << dendl;
+  }
+  // New first free entry
+  pool_root.first_free_entry = data_pos;
+}
+
+/*
+ * Queue a pool root update and, if no update was queued or in progress
+ * before, enlist the work that performs it.
+ *
+ * @param root root to persist
+ * @param ctx  context stored alongside the queued update
+ *
+ * The caller must hold m_lock.
+ */
+template <typename I>
+void WriteLog<I>::schedule_update_root(
+    std::shared_ptr<WriteLogPoolRoot> root, Context *ctx) {
+  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+  // Decide before queueing: only the first pending update enlists work.
+  const bool first_pending =
+    m_poolroot_to_update.empty() && !m_updating_pool_root;
+
+  auto update = std::make_shared<WriteLogPoolRootUpdate>(root, ctx);
+  this->m_async_update_superblock++;
+  this->m_async_op_tracker.start_op();
+  m_poolroot_to_update.emplace_back(update);
+
+  if (first_pending) {
+    enlist_op_update_root();
+  }
+}
+
+/* Queue one run of update_root_scheduled_ops() on the work queue. */
+template <typename I>
+void WriteLog<I>::enlist_op_update_root() {
+  this->m_work_queue.queue(new LambdaContext([this](int r) {
+    update_root_scheduled_ops();
+  }));
+}
+
+/*
+ * Persist the most recent queued pool root and complete the contexts of
+ * all queued updates with the result of that single write.
+ *
+ * Returns without doing anything if another update is already in progress.
+ * Asserts if no update was queued.
+ */
+template <typename I>
+void WriteLog<I>::update_root_scheduled_ops() {
+  ldout(m_image_ctx.cct, 20) << dendl;
+
+  std::shared_ptr<WriteLogPoolRoot> root;
+  WriteLogPoolRootUpdateList root_updates;
+  Context *ctx = nullptr;
+  {
+    std::lock_guard locker(m_lock);
+    if (m_updating_pool_root) {
+      /* Another thread is appending */
+      ldout(m_image_ctx.cct, 15) << "Another thread is updating pool root"
+                                 << dendl;
+      return;
+    }
+    // Take over every queued update in one go.
+    if (m_poolroot_to_update.size()) {
+      m_updating_pool_root = true;
+      root_updates.swap(m_poolroot_to_update);
+    }
+  }
+  ceph_assert(!root_updates.empty());
+  ldout(m_image_ctx.cct, 15) << "Update root number: " << root_updates.size()
+                             << dendl;
+  // We just update the last one, and call all the completions.
+  auto entry = root_updates.back();
+  root = entry->root;
+
+  // Completes the context of every update that was taken over.
+  ctx = new LambdaContext([this, updates = std::move(root_updates)](int r) {
+    ldout(m_image_ctx.cct, 15) << "Start to callback." << dendl;
+    for (auto it = updates.begin(); it != updates.end(); it++) {
+      Context *it_ctx = (*it)->ctx;
+      it_ctx->complete(r);
+    }
+  });
+  // Runs when the root write finished: clear the in-progress flag, enlist
+  // another run if more updates were queued meanwhile, then complete ctx.
+  Context *append_ctx = new LambdaContext([this, ctx](int r) {
+    ldout(m_image_ctx.cct, 15) << "Finish the update of pool root." << dendl;
+    bool need_finisher = false;;
+    assert(r == 0);
+    {
+      std::lock_guard locker(m_lock);
+      m_updating_pool_root = false;
+      need_finisher = !m_poolroot_to_update.empty();
+    }
+    if (need_finisher) {
+      enlist_op_update_root();
+    }
+    ctx->complete(r);
+  });
+  AioTransContext* aio = new AioTransContext(m_image_ctx.cct, append_ctx);
+  update_pool_root(root, aio);
+}
+
+/*
+ * Asynchronously write root as the superblock at offset 0 of the device.
+ * The encoded superblock is zero-padded to MIN_WRITE_ALLOC_SSD_SIZE and the
+ * write is submitted on the given transaction.
+ *
+ * @param root pool root to persist
+ * @param aio  transaction carrying the write and its completion
+ */
+template <typename I>
+void WriteLog<I>::update_pool_root(std::shared_ptr<WriteLogPoolRoot> root,
+                                   AioTransContext *aio) {
+  SuperBlock superblock;
+  superblock.root = *root;
+
+  bufferlist superblock_bl;
+  encode(superblock, superblock_bl);
+  // Pad up to one full block.
+  superblock_bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - superblock_bl.length());
+  ceph_assert(superblock_bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+
+  bdev->aio_write(0, superblock_bl, &aio->ioc, false, WRITE_LIFE_NOT_SET);
+  bdev->aio_submit(&aio->ioc);
+}
+
+/*
+ * Synchronously write root as the superblock at offset 0 of the device.
+ * The encoded superblock is zero-padded to MIN_WRITE_ALLOC_SSD_SIZE.
+ *
+ * @param root pool root to persist
+ * @return the result of the block device write
+ */
+template <typename I>
+int WriteLog<I>::update_pool_root_sync(
+    std::shared_ptr<WriteLogPoolRoot> root) {
+  SuperBlock superblock;
+  superblock.root = *root;
+
+  bufferlist superblock_bl;
+  encode(superblock, superblock_bl);
+  // Pad up to one full block.
+  superblock_bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - superblock_bl.length());
+  ceph_assert(superblock_bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+
+  return bdev->write(0, superblock_bl, false);
+}
+
+template <typename I>
+void WriteLog<I>::pre_io_check(WriteLogCacheEntry *log_entry,
+                               uint64_t &length) {
+  assert(log_entry->is_write() || log_entry->is_writesame());
+  ceph_assert(log_entry->write_data_pos <= pool_size);
+
+  length = log_entry->is_write() ? log_entry->write_bytes :
+                                   log_entry->ws_datalen;
+  length = round_up_to(length, MIN_WRITE_ALLOC_SSD_SIZE);
+  ceph_assert(length != 0 && log_entry->write_data_pos + length <= pool_size);
+}
+
+/*
+ * Single-entry convenience overload: wraps the entry and the bufferlist in
+ * one-element vectors and forwards to the vector overload.
+ *
+ * @param log_entry entry whose data block is read
+ * @param bl        receives the data
+ * @param ctx       completed by the vector overload
+ */
+template <typename I>
+void WriteLog<I>::aio_read_data_block(
+  WriteLogCacheEntry *log_entry, bufferlist *bl, Context *ctx) {
+  std::vector<WriteLogCacheEntry*> log_entries {log_entry};
+  std::vector<bufferlist *> bls {bl};
+  aio_read_data_block(log_entries, bls, ctx);
+}
+
+/*
+ * Read the data blocks of several entries in one aio transaction.
+ *
+ * Each read covers the block-aligned length computed by pre_io_check().
+ * When the transaction completes, every bufferlist is trimmed to the
+ * entry's real data length before ctx is completed with the result.
+ *
+ * @param log_entries entries to read; same size as bls
+ * @param bls         bls[i] receives the data of log_entries[i]
+ * @param ctx         completed with the transaction result
+ */
+template <typename I>
+void WriteLog<I>::aio_read_data_block(
+    std::vector<WriteLogCacheEntry*> &log_entries,
+    std::vector<bufferlist *> &bls, Context *ctx) {
+  ceph_assert(log_entries.size() == bls.size());
+
+  //get the valid part
+  // The vectors are captured by copy, but they hold raw pointers.
+  // NOTE(review): presumably the pointed-to entries and bufferlists outlive
+  // the read -- confirm against callers.
+  Context *read_ctx = new LambdaContext(
+    [this, log_entries, bls, ctx](int r) {
+      for (unsigned int i = 0; i < log_entries.size(); i++) {
+        bufferlist valid_data_bl;
+        auto length = log_entries[i]->is_write() ? log_entries[i]->write_bytes :
+                                                   log_entries[i]->ws_datalen;
+        // Keep only the first `length` bytes, dropping the alignment padding.
+        valid_data_bl.substr_of(*bls[i], 0, length);
+        bls[i]->clear();
+        bls[i]->append(valid_data_bl);
+      }
+     ctx->complete(r);
+    });
+
+  CephContext *cct = m_image_ctx.cct;
+  AioTransContext *aio = new AioTransContext(cct, read_ctx);
+  for (unsigned int i = 0; i < log_entries.size(); i++) {
+    auto log_entry = log_entries[i];
+
+    uint64_t length;
+    pre_io_check(log_entry, length);
+    ldout(cct, 20) << "Read at " << log_entry->write_data_pos
+                   << ", length " << length << dendl;
+
+    bdev->aio_read(log_entry->write_data_pos, length, bls[i], &aio->ioc);
+  }
+  bdev->aio_submit(&aio->ioc);
+}
+
+/*
+ * Queue user_req on the image's op work queue with result r, instead of
+ * completing it inline.
+ */
+template <typename I>
+void WriteLog<I>::complete_user_request(Context *&user_req, int r) {
+  m_image_ctx.op_work_queue->queue(user_req, r);
+}
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ssd::WriteLog<librbd::ImageCtx>;
diff --git a/src/librbd/cache/pwl/ssd/WriteLog.h b/src/librbd/cache/pwl/ssd/WriteLog.h

new file mode 100644 (file)

index 0000000..3bc72bb
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/WriteLog.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
+#define CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
+
+#include "blk/BlockDevice.h"
+#include "common/AsyncOpTracker.h"
+#include "common/Checksummer.h"
+#include "common/environment.h"
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/Utils.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/Types.h"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+#include "librbd/cache/pwl/LogMap.h"
+#include "librbd/cache/pwl/LogOperation.h"
+#include "librbd/cache/pwl/Request.h"
+#include "librbd/cache/pwl/ssd/Builder.h"
+#include "librbd/cache/pwl/ssd/Types.h"
+#include <functional>
+#include <list>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+template <typename ImageCtxT>
+class WriteLog : public AbstractWriteLog<ImageCtxT> { /* pwl write log whose data-block reads go through a BlockDevice (bdev) */
+public:
+  WriteLog(ImageCtxT &image_ctx,
+           librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
+           cache::ImageWritebackInterface& image_writeback,
+           plugin::Api<ImageCtxT>& plugin_api);
+  ~WriteLog();
+  WriteLog(const WriteLog&) = delete; /* non-copyable */
+  WriteLog &operator=(const WriteLog&) = delete;
+
+  using This = AbstractWriteLog<ImageCtxT>; /* names the base class, not WriteLog itself */
+  using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
+  using C_WriteRequestT = pwl::C_WriteRequest<This>;
+  using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
+
+  bool alloc_resources(C_BlockIORequestT *req) override;
+  void setup_schedule_append(
+      pwl::GenericLogOperationsVector &ops, bool do_early_flush) override;
+  void complete_user_request(Context *&user_req, int r) override; /* queues user_req on m_image_ctx.op_work_queue */
+
+protected:
+  using AbstractWriteLog<ImageCtxT>::m_lock; /* base-class members made usable unqualified in this template */
+  using AbstractWriteLog<ImageCtxT>::m_log_entries;
+  using AbstractWriteLog<ImageCtxT>::m_image_ctx;
+  using AbstractWriteLog<ImageCtxT>::m_cache_state;
+  using AbstractWriteLog<ImageCtxT>::m_first_free_entry;
+  using AbstractWriteLog<ImageCtxT>::m_first_valid_entry;
+  using AbstractWriteLog<ImageCtxT>::m_bytes_allocated;
+
+  void initialize_pool(Context *on_finish,
+                       pwl::DeferredContexts &later) override;
+  void process_work() override;
+  void append_scheduled_ops(void) override;
+  void schedule_append_ops(pwl::GenericLogOperations &ops) override;
+  void remove_pool_file() override;
+  void release_ram(std::shared_ptr<GenericLogEntry> log_entry) override;
+
+private:
+ class AioTransContext { /* pairs an ::IOContext with the Context to complete when that I/O finishes */
+   public:
+     Context *on_finish; /* completed with ioc.get_return_value() in aio_finish() */
+     ::IOContext ioc; /* constructed with (cct, this) */
+     explicit AioTransContext(CephContext* cct, Context *cb)
+       : on_finish(cb), ioc(cct, this) {}
+
+     ~AioTransContext(){}
+
+     void aio_finish() { /* completes on_finish, then deletes this object: instances must come from new and not be touched afterwards */
+       on_finish->complete(ioc.get_return_value());
+       delete this;
+     }
+ }; //class AioTransContext
+
+ struct WriteLogPoolRootUpdate { /* a pool root together with the Context stored alongside it */
+    std::shared_ptr<pwl::WriteLogPoolRoot> root;
+    Context *ctx;
+    WriteLogPoolRootUpdate(std::shared_ptr<pwl::WriteLogPoolRoot> r,
+                           Context* c)
+      : root(r), ctx(c) {}
+  };
+
+  using WriteLogPoolRootUpdateList = std::list<std::shared_ptr<WriteLogPoolRootUpdate>>;
+  WriteLogPoolRootUpdateList m_poolroot_to_update; /* pool root list to update to SSD */
+  bool m_updating_pool_root = false;
+
+  uint64_t m_log_pool_ring_buffer_size; /* Size of ring buffer; NOTE(review): no initializer here - confirm it is assigned before first read */
+  std::atomic<int> m_async_update_superblock = {0};
+  BlockDevice *bdev = nullptr; /* aio_read_data_block() issues aio_read()/aio_submit() on it */
+  uint64_t pool_size; /* NOTE(review): no initializer here - confirm it is assigned before first read */
+  pwl::WriteLogPoolRoot pool_root;
+  Builder<This> *m_builderobj; /* NOTE(review): no initializer, unlike bdev - confirm every path assigns it before use */
+
+  Builder<This>* create_builder();
+  void load_existing_entries(pwl::DeferredContexts &later);
+  void enlist_op_appender();
+  bool has_sync_point_logs(GenericLogOperations &ops);
+  void append_op_log_entries(GenericLogOperations &ops);
+  void alloc_op_log_entries(GenericLogOperations &ops);
+  Context* construct_flush_entry_ctx(
+      std::shared_ptr<GenericLogEntry> log_entry);
+  void append_ops(GenericLogOperations &ops, Context *ctx,
+                  uint64_t* new_first_free_entry,
+                  uint64_t &bytes_allocated);
+  void write_log_entries(GenericLogEntriesVector log_entries,
+                         AioTransContext *aio);
+  void schedule_update_root(std::shared_ptr<WriteLogPoolRoot> root,
+                            Context *ctx);
+  void enlist_op_update_root();
+  void update_root_scheduled_ops();
+  int update_pool_root_sync(std::shared_ptr<pwl::WriteLogPoolRoot> root);
+  void update_pool_root(std::shared_ptr<WriteLogPoolRoot> root,
+                                          AioTransContext *aio);
+  void pre_io_check(WriteLogCacheEntry *log_entry, uint64_t &length); /* reports through length how many bytes aio_read_data_block() reads for log_entry */
+  void aio_read_data_block(WriteLogCacheEntry *log_entry, bufferlist *bl, /* single-entry wrapper around the vector overload */
+                           Context *ctx);
+  void aio_read_data_block(std::vector<WriteLogCacheEntry*> &log_entries, /* log_entries and bls must have equal size; ctx completes after each bl is trimmed */
+                           std::vector<bufferlist *> &bls, Context *ctx);
+  static void aio_cache_cb(void *priv, void *priv2) { /* priv2 must point at the AioTransContext to finish; priv is unused */
+    AioTransContext *c = static_cast<AioTransContext*>(priv2);
+    c->aio_finish();
+  }
+};//class WriteLog
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::ssd::WriteLog<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
diff --git a/src/test/librbd/CMakeLists.txt b/src/test/librbd/CMakeLists.txt

index 790d74fc1295dacfbd547fc2036fc08d69f4790d..425ccc370351132b1f3186e01f03d734ddf3626b 100644 (file)
--- a/src/test/librbd/CMakeLists.txt
+++ b/src/test/librbd/CMakeLists.txt
@@ -128,12 +128,21 @@ set(unittest_librbd_srcs
    watcher/test_mock_RewatchRequest.cc
    )
  
-if(WITH_RBD_RWL)
+if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE)
     set(unittest_librbd_srcs
       ${unittest_librbd_srcs}
-     cache/pwl/test_mock_ReplicatedWriteLog.cc
       cache/pwl/test_WriteLogMap.cc)
-endif(WITH_RBD_RWL)
+   if(WITH_RBD_RWL)
+     set(unittest_librbd_srcs
+       ${unittest_librbd_srcs}
+       cache/pwl/test_mock_ReplicatedWriteLog.cc)
+   endif()
+   if(WITH_RBD_SSD_CACHE)
+     set(unittest_librbd_srcs
+       ${unittest_librbd_srcs}
+       cache/pwl/test_mock_SSDWriteLog.cc)
+   endif()
+endif()
  
  if(LINUX AND HAVE_LIBCRYPTSETUP)
    list(APPEND unittest_librbd_srcs
diff --git a/src/test/librbd/cache/pwl/test_mock_ReplicatedWriteLog.cc b/src/test/librbd/cache/pwl/test_mock_ReplicatedWriteLog.cc

index 5160b189d630cb5658c992057843684fea957ceb..77faee08f3b10747c091f8751244cb4306a176c8 100644 (file)
--- a/src/test/librbd/cache/pwl/test_mock_ReplicatedWriteLog.cc
+++ b/src/test/librbd/cache/pwl/test_mock_ReplicatedWriteLog.cc
@@ -36,13 +36,14 @@ inline ImageCtx *get_image_ctx(MockImageCtx *image_ctx) {
  } // namespace librbd
  
  #include "librbd/cache/pwl/AbstractWriteLog.cc"
-#include "librbd/cache/pwl/ReplicatedWriteLog.cc"
-template class librbd::cache::pwl::ReplicatedWriteLog<librbd::MockImageCtx>;
+#include "librbd/cache/pwl/rwl/WriteLog.cc"
+template class librbd::cache::pwl::rwl::WriteLog<librbd::MockImageCtx>;
  
  // template definitions
  #include "librbd/cache/ImageWriteback.cc"
  #include "librbd/cache/pwl/ImageCacheState.cc"
  #include "librbd/cache/pwl/Request.cc"
+#include "librbd/cache/pwl/rwl/Request.cc"
  #include "librbd/plugin/Api.cc"
  
  namespace librbd {
@@ -58,7 +59,7 @@ typedef io::Extent Extent;
  typedef io::Extents Extents;
  
  struct TestMockCacheReplicatedWriteLog : public TestMockFixture {
-  typedef librbd::cache::pwl::ReplicatedWriteLog<librbd::MockImageCtx> MockReplicatedWriteLog;
+  typedef librbd::cache::pwl::rwl::WriteLog<librbd::MockImageCtx> MockReplicatedWriteLog;
    typedef librbd::cache::pwl::ImageCacheState<librbd::MockImageCtx> MockImageCacheStateRWL;
    typedef librbd::cache::ImageWriteback<librbd::MockImageCtx> MockImageWriteback;
    typedef librbd::plugin::Api<librbd::MockImageCtx> MockApi;
diff --git a/src/test/librbd/cache/pwl/test_mock_SSDWriteLog.cc b/src/test/librbd/cache/pwl/test_mock_SSDWriteLog.cc

new file mode 100644 (file)

index 0000000..4eac15e
--- /dev/null
+++ b/src/test/librbd/cache/pwl/test_mock_SSDWriteLog.cc
@@ -0,0 +1,238 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "common/hostname.h"
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "include/rbd/librbd.hpp"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/Types.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/plugin/Api.h"
+
+namespace librbd {
+namespace {
+
+struct MockContextSSD : public C_SaferCond  { /* C_SaferCond whose complete()/finish() are gmock methods */
+  MOCK_METHOD1(complete, void(int));
+  MOCK_METHOD1(finish, void(int));
+
+  void do_complete(int r) { /* calls the real C_SaferCond::complete(), bypassing the mocked one */
+    C_SaferCond::complete(r);
+  }
+};
+
+} // anonymous namespace
+
+namespace util {
+
+// Returns the ImageCtx pointer stored in the given MockImageCtx.
+inline ImageCtx *get_image_ctx(MockImageCtx *image_ctx) {
+  ImageCtx *wrapped_ctx = image_ctx->image_ctx;
+  return wrapped_ctx;
+}
+
+} // namespace util
+} // namespace librbd
+
+#include "librbd/cache/pwl/AbstractWriteLog.cc"
+#include "librbd/cache/pwl/ssd/WriteLog.cc"
+template class librbd::cache::pwl::ssd::WriteLog<librbd::MockImageCtx>;
+
+// template definitions
+#include "librbd/cache/ImageWriteback.cc"
+#include "librbd/cache/pwl/ImageCacheState.cc"
+#include "librbd/cache/pwl/Request.cc"
+#include "librbd/plugin/Api.cc"
+#include "librbd/cache/pwl/ssd/Request.cc"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+using ::testing::_;
+using ::testing::DoDefault;
+using ::testing::InSequence;
+using ::testing::Invoke;
+
+typedef io::Extent Extent;
+typedef io::Extents Extents;
+
+// Fixture for the SSD write log tests: aliases for the MockImageCtx
+// instantiations under test, plus helpers that check a cache state and
+// install repeatable gmock expectations.
+struct TestMockCacheSSDWriteLog : public TestMockFixture {
+  using MockSSDWriteLog = librbd::cache::pwl::ssd::WriteLog<librbd::MockImageCtx>;
+  using MockImageCacheStateSSD = librbd::cache::pwl::ImageCacheState<librbd::MockImageCtx>;
+  using MockImageWriteback = librbd::cache::ImageWriteback<librbd::MockImageCtx>;
+  using MockApi = librbd::plugin::Api<librbd::MockImageCtx>;
+
+  // Allocates a cache state with new; nothing in this helper frees it.
+  MockImageCacheStateSSD *get_cache_state(
+      MockImageCtx& mock_image_ctx, MockApi& mock_api) {
+    return new MockImageCacheStateSSD(&mock_image_ctx, mock_api);
+  }
+
+  // Asserts each field of state against the expected value; the expected
+  // log_periodic_stats comes from the image's rbd_rwl_log_periodic_stats
+  // setting.
+  void validate_cache_state(librbd::ImageCtx *image_ctx,
+                            MockImageCacheStateSSD &state,
+                            bool present, bool empty, bool clean,
+                            string host, string path,
+                            uint64_t size) {
+    ASSERT_EQ(present, state.present);
+    ASSERT_EQ(empty, state.empty);
+    ASSERT_EQ(clean, state.clean);
+
+    ASSERT_EQ(host, state.host);
+    ASSERT_EQ(path, state.path);
+    ASSERT_EQ(size, state.size);
+
+    ConfigProxy &image_config = image_ctx->config;
+    ASSERT_EQ(image_config.get_val<bool>("rbd_rwl_log_periodic_stats"),
+             state.log_periodic_stats);
+  }
+
+  // Any queue() call on the op work queue completes the context right away.
+  void expect_op_work_queue(MockImageCtx& mock_image_ctx) {
+    auto complete_now = [](Context* queued_ctx, int result) {
+      queued_ctx->complete(result);
+    };
+    EXPECT_CALL(*mock_image_ctx.op_work_queue, queue(_, _))
+      .WillRepeatedly(Invoke(complete_now));
+  }
+
+  // complete(r) on the mock context is forwarded to the real completion.
+  void expect_context_complete(MockContextSSD& mock_context, int r) {
+    auto forward_to_real = [&mock_context](int result) {
+      mock_context.do_complete(result);
+    };
+    EXPECT_CALL(mock_context, complete(r))
+      .WillRepeatedly(Invoke(forward_to_real));
+  }
+
+  // Every execute_metadata_set() call completes its context with 0.
+  void expect_metadata_set(MockImageCtx& mock_image_ctx) {
+    auto succeed = [](std::string meta_key, std::string meta_val,
+                      Context* on_done) {
+      on_done->complete(0);
+    };
+    EXPECT_CALL(*mock_image_ctx.operations, execute_metadata_set(_, _, _))
+      .WillRepeatedly(Invoke(succeed));
+  }
+
+  // Every execute_metadata_remove() call completes its context with 0.
+  void expect_metadata_remove(MockImageCtx& mock_image_ctx) {
+    auto succeed = [](std::string meta_key, Context* on_done) {
+      on_done->complete(0);
+    };
+    EXPECT_CALL(*mock_image_ctx.operations, execute_metadata_remove(_, _))
+      .WillRepeatedly(Invoke(succeed));
+  }
+};
+
+// A default-constructed cache state is absent, empty and clean with no host,
+// path or size; after clearing empty and clean it is written out and the
+// write is expected to finish with 0.
+TEST_F(TestMockCacheSSDWriteLog, init_state_write) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockApi mock_api;
+  MockImageCacheStateSSD cache_state(&mock_image_ctx, mock_api);
+
+  validate_cache_state(ictx, cache_state,
+                       /*present=*/false, /*empty=*/true, /*clean=*/true,
+                       "", "", 0);
+
+  cache_state.empty = false;
+  cache_state.clean = false;
+
+  MockContextSSD on_written;
+  expect_metadata_set(mock_image_ctx);
+  expect_context_complete(on_written, 0);
+  cache_state.write_image_cache_state(&on_written);
+  ASSERT_EQ(0, on_written.wait());
+}
+
+// Parses s as JSON and decodes the result into *f. If parsing or decoding
+// fails, a fatal gtest failure carrying the message is recorded and the
+// helper returns early; being a void helper, it does not abort the calling
+// test by itself.
+static void get_jf(const string& s, JSONFormattable *f)
+{
+  JSONParser p;
+  // Stream the message into the assertion so it is reported with the failure
+  // instead of being printed separately on stdout.
+  ASSERT_TRUE(p.parse(s.c_str(), s.size()))
+    << "Failed to parse: '" << s << "'";
+  try {
+    decode_json_obj(*f, &p);
+  } catch (const JSONDecoder::err&) {
+    // FAIL() states the intent directly; comparing 0 against a string literal
+    // only fails because a literal's address is never null.
+    FAIL() << "Failed to decode JSON object";
+  }
+}
+
+// A cache state built from a JSON document exposes the values given in that
+// document; clearing it afterwards is expected to finish with 0.
+TEST_F(TestMockCacheSSDWriteLog, init_state_json_write) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  JSONFormattable json_state;
+  string state_json = "{ \"present\": \"1\", \"empty\": \"0\", \"clean\": \"0\", \
+                   \"pwl_host\": \"testhost\", \
+                   \"pwl_path\": \"/tmp\", \
+                   \"pwl_size\": \"1024\" }";
+  get_jf(state_json, &json_state);
+
+  MockApi mock_api;
+  MockImageCacheStateSSD cache_state(&mock_image_ctx, json_state, mock_api);
+
+  validate_cache_state(ictx, cache_state,
+                       /*present=*/true, /*empty=*/false, /*clean=*/false,
+                       "testhost", "/tmp", 1024);
+
+  MockContextSSD on_cleared;
+  expect_metadata_remove(mock_image_ctx);
+  expect_context_complete(on_cleared, 0);
+  cache_state.clear_image_cache_state(&on_cleared);
+  ASSERT_EQ(0, on_cleared.wait());
+}
+
+// init() followed directly by shut_down(); both are expected to finish with 0.
+TEST_F(TestMockCacheSSDWriteLog, init_shutdown) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockImageWriteback mock_image_writeback(mock_image_ctx);
+  MockApi mock_api;
+  MockSSDWriteLog ssd_log(
+      mock_image_ctx, get_cache_state(mock_image_ctx, mock_api),
+      mock_image_writeback, mock_api);
+
+  // Bring the log up.
+  MockContextSSD on_init;
+  expect_op_work_queue(mock_image_ctx);
+  expect_metadata_set(mock_image_ctx);
+  expect_context_complete(on_init, 0);
+  ssd_log.init(&on_init);
+  ASSERT_EQ(0, on_init.wait());
+
+  // Tear it down again.
+  MockContextSSD on_shut_down;
+  expect_context_complete(on_shut_down, 0);
+  ssd_log.shut_down(&on_shut_down);
+  ASSERT_EQ(0, on_shut_down.wait());
+}
+
+// Between init() and shut_down(), writes a single 4096-byte extent at offset 0
+// filled with '1' characters; every step is expected to finish with 0.
+TEST_F(TestMockCacheSSDWriteLog, write) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockImageWriteback mock_image_writeback(mock_image_ctx);
+  MockApi mock_api;
+  MockSSDWriteLog ssd_log(
+      mock_image_ctx, get_cache_state(mock_image_ctx, mock_api),
+      mock_image_writeback, mock_api);
+
+  // Bring the log up.
+  MockContextSSD on_init;
+  expect_op_work_queue(mock_image_ctx);
+  expect_metadata_set(mock_image_ctx);
+  expect_context_complete(on_init, 0);
+  ssd_log.init(&on_init);
+  ASSERT_EQ(0, on_init.wait());
+
+  // Issue the write.
+  MockContextSSD on_write;
+  expect_context_complete(on_write, 0);
+  Extents image_extents{{0, 4096}};
+  bufferlist payload;
+  payload.append(std::string(4096, '1'));
+  int fadvise_flags = 0;
+  ssd_log.write(std::move(image_extents), std::move(payload), fadvise_flags,
+                &on_write);
+  ASSERT_EQ(0, on_write.wait());
+
+  // Tear it down again.
+  MockContextSSD on_shut_down;
+  expect_context_complete(on_shut_down, 0);
+  ssd_log.shut_down(&on_shut_down);
+  ASSERT_EQ(0, on_shut_down.wait());
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/test/librbd/test_fixture.cc b/src/test/librbd/test_fixture.cc

index eccf085ea27db3f9241744bb08a2035572693a14..ba55e0a20568d51c9dc38932482e9778dc167ca9 100644 (file)
--- a/src/test/librbd/test_fixture.cc
+++ b/src/test/librbd/test_fixture.cc
@@ -61,6 +61,8 @@ std::string TestFixture::get_temp_image_name() {
  void TestFixture::SetUp() {
    ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), m_ioctx));
    m_cct = reinterpret_cast<CephContext*>(m_ioctx.cct());
+  librados::Rados rados(m_ioctx);
+  rados.conf_set("rbd_rwl_path", ".");
  
    m_image_name = get_temp_image_name();
    m_image_size = 2 << 20;
@@ -73,7 +75,6 @@ void TestFixture::TearDown() {
         iter != m_ictxs.end(); ++iter) {
      (*iter)->state->close();
    }
-
    m_ioctx.close();
  }
  
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc

index 0330a0e2810baab8c2f06377090ca76f3df22c7f..e1f74bc0099366a3f0bf123cb6e9d00b4d6779ac 100644 (file)
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -4148,7 +4148,7 @@ TYPED_TEST(DiffIterateTest, DiffIterateDiscard)
  
  TYPED_TEST(DiffIterateTest, DiffIterateStress)
  {
-  REQUIRE(!is_rbd_rwl_enabled((CephContext *)this->_rados.cct()));
+  REQUIRE(!is_rbd_pwl_enabled((CephContext *)this->_rados.cct()));
    librados::IoCtx ioctx;
    ASSERT_EQ(0, this->_rados.ioctx_create(this->m_pool_name.c_str(), ioctx));
  
@@ -6876,7 +6876,7 @@ TEST_F(TestLibRBD, ExclusiveLock)
  TEST_F(TestLibRBD, BreakLock)
  {
    REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
-  REQUIRE(!is_rbd_rwl_enabled((CephContext *)_rados.cct()));
+  REQUIRE(!is_rbd_pwl_enabled((CephContext *)_rados.cct()));
  
    static char buf[10];
  
diff --git a/src/test/librbd/test_mock_ExclusiveLock.cc b/src/test/librbd/test_mock_ExclusiveLock.cc

index 8d1f35c3b6bf1538cd3d6f494a00711feeb3dae5..6feb54ec66146c1f5a910a36e3bc4b9fd19e73d5 100644 (file)
--- a/src/test/librbd/test_mock_ExclusiveLock.cc
+++ b/src/test/librbd/test_mock_ExclusiveLock.cc
@@ -265,7 +265,7 @@ public:
                                 bool init_shutdown) {
      if (mock_image_ctx.clone_copy_on_read ||
          (mock_image_ctx.features & RBD_FEATURE_JOURNALING) != 0 ||
-        is_rbd_rwl_enabled(mock_image_ctx.cct)) {
+        is_rbd_pwl_enabled(mock_image_ctx.cct)) {
        expect_set_require_lock(mock_image_dispatch, init_shutdown,
                                io::DIRECTION_BOTH);
      } else {
diff --git a/src/test/librbd/test_support.cc b/src/test/librbd/test_support.cc

index 25b34436bb18ece02bfd3ddd4960700e983d0d22..bc9d2543d35a9110a825a18043d82f2055684ac4 100644 (file)
--- a/src/test/librbd/test_support.cc
+++ b/src/test/librbd/test_support.cc
@@ -127,9 +127,10 @@ bool is_librados_test_stub(librados::Rados &rados) {
    return fsid == "00000000-1111-2222-3333-444444444444";
  }
  
-bool is_rbd_rwl_enabled(ceph::common::CephContext *cct) {
-#if defined(WITH_RBD_RWL)
-  return cct->_conf.get_val<bool>("rbd_rwl_enabled");
+bool is_rbd_pwl_enabled(ceph::common::CephContext *cct) {
+#if defined(WITH_RBD_RWL) || defined(WITH_RBD_SSD_CACHE)
+  auto value = cct->_conf.get_val<std::string>("rbd_persistent_cache_mode");
+  return value == "disabled" ? false : true;
  #else
    return false;
  #endif
diff --git a/src/test/librbd/test_support.h b/src/test/librbd/test_support.h

index a428d32b393bd44edf80850272a23f7026fdcec9..2d2de175ba79001324745fd0a095c17b5ca54954 100644 (file)
--- a/src/test/librbd/test_support.h
+++ b/src/test/librbd/test_support.h
@@ -26,7 +26,7 @@ int create_image_data_pool(librados::Rados &rados, std::string &data_pool, bool
  
  bool is_librados_test_stub(librados::Rados &rados);
  
-bool is_rbd_rwl_enabled(ceph::common::CephContext *ctx);
+bool is_rbd_pwl_enabled(ceph::common::CephContext *ctx);
  
  #define REQUIRE(x) {                     \
    if (!(x)) {                            \
diff --git a/src/tools/ceph-dencoder/rbd_types.h b/src/tools/ceph-dencoder/rbd_types.h

index 5c7c15fe98c7edc5f07dffcc936fb0ad57d81a13..6fb84dea66ba121008101582643907650557daf6 100644 (file)
--- a/src/tools/ceph-dencoder/rbd_types.h
+++ b/src/tools/ceph-dencoder/rbd_types.h
@@ -21,10 +21,10 @@ TYPE(rbd::mirror::image_map::PolicyData)
  
  #if defined(WITH_RBD) && defined(WITH_RBD_SSD_CACHE)
  #include "librbd/cache/pwl/Types.h"
-#include "librbd/cache/pwl/SSDTypes.h"
-TYPE(librbd::cache::pwl::WriteLogPmemEntry)
+#include "librbd/cache/pwl/ssd/Types.h"
+TYPE(librbd::cache::pwl::WriteLogCacheEntry)
  TYPE(librbd::cache::pwl::WriteLogPoolRoot)
-TYPE(librbd::cache::pwl::SuperBlock)
+TYPE(librbd::cache::pwl::ssd::SuperBlock)
  #endif
  
  #ifdef WITH_RBD
author	Mahati Chamarthy <mahati.chamarthy@intel.com>
	Fri, 6 Nov 2020 12:09:55 +0000 (17:39 +0530)
committer	Mahati Chamarthy <mahati.chamarthy@intel.com>
	Wed, 6 Jan 2021 13:36:32 +0000 (19:06 +0530)
src/common/options.cc		patch \| blob \| history
src/librbd/CMakeLists.txt		patch \| blob \| history
src/librbd/cache/Types.h		patch \| blob \| history
src/librbd/cache/Utils.h		patch \| blob \| history
src/librbd/cache/pwl/AbstractWriteLog.cc		patch \| blob \| history
src/librbd/cache/pwl/AbstractWriteLog.h		patch \| blob \| history
src/librbd/cache/pwl/Builder.h	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/ImageCacheState.cc		patch \| blob \| history
src/librbd/cache/pwl/ImageCacheState.h		patch \| blob \| history
src/librbd/cache/pwl/InitRequest.cc		patch \| blob \| history
src/librbd/cache/pwl/LogEntry.cc		patch \| blob \| history
src/librbd/cache/pwl/LogEntry.h		patch \| blob \| history
src/librbd/cache/pwl/LogOperation.cc		patch \| blob \| history
src/librbd/cache/pwl/LogOperation.h		patch \| blob \| history
src/librbd/cache/pwl/ReplicatedWriteLog.cc	[deleted file]	patch \| blob \| history
src/librbd/cache/pwl/ReplicatedWriteLog.h	[deleted file]	patch \| blob \| history
src/librbd/cache/pwl/Request.cc		patch \| blob \| history
src/librbd/cache/pwl/Request.h		patch \| blob \| history
src/librbd/cache/pwl/SSDTypes.h	[deleted file]	patch \| blob \| history
src/librbd/cache/pwl/SSDWriteLog.cc	[deleted file]	patch \| blob \| history
src/librbd/cache/pwl/SSDWriteLog.h	[deleted file]	patch \| blob \| history
src/librbd/cache/pwl/ShutdownRequest.h		patch \| blob \| history
src/librbd/cache/pwl/Types.cc		patch \| blob \| history
src/librbd/cache/pwl/Types.h		patch \| blob \| history
src/librbd/cache/pwl/rwl/Builder.h	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/rwl/LogEntry.cc	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/rwl/LogEntry.h	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/rwl/LogOperation.cc	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/rwl/LogOperation.h	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/rwl/Request.cc	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/rwl/Request.h	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/rwl/WriteLog.cc	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/rwl/WriteLog.h	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/ssd/Builder.h	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/ssd/LogEntry.cc	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/ssd/LogEntry.h	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/ssd/Request.cc	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/ssd/Request.h	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/ssd/Types.h	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/ssd/WriteLog.cc	[new file with mode: 0644]	patch \| blob
src/librbd/cache/pwl/ssd/WriteLog.h	[new file with mode: 0644]	patch \| blob
src/test/librbd/CMakeLists.txt		patch \| blob \| history
src/test/librbd/cache/pwl/test_mock_ReplicatedWriteLog.cc		patch \| blob \| history
src/test/librbd/cache/pwl/test_mock_SSDWriteLog.cc	[new file with mode: 0644]	patch \| blob
src/test/librbd/test_fixture.cc		patch \| blob \| history
src/test/librbd/test_librbd.cc		patch \| blob \| history
src/test/librbd/test_mock_ExclusiveLock.cc		patch \| blob \| history
src/test/librbd/test_support.cc		patch \| blob \| history
src/test/librbd/test_support.h		patch \| blob \| history
src/tools/ceph-dencoder/rbd_types.h		patch \| blob \| history