librbd: add the framework for RWL writeback cache
author    lixiaoy1 <xiaoyan.li@intel.com>
          Thu, 31 Oct 2019 13:40:36 +0000 (09:40 -0400)
committer lixiaoy1 <xiaoyan.li@intel.com>
          Thu, 21 Nov 2019 09:02:50 +0000 (04:02 -0500)
Signed-off-by: Peterson, Scott <scott.d.peterson@intel.com>
Signed-off-by: Li, Xiaoyan <xiaoyan.li@intel.com>
Signed-off-by: Lu, Yuan <yuan.y.lu@intel.com>
Signed-off-by: Chamarthy, Mahati <mahati.chamarthy@intel.com>
15 files changed:
CMakeLists.txt
src/CMakeLists.txt
src/common/options.cc
src/include/config-h.in.cmake
src/librbd/CMakeLists.txt
src/librbd/cache/ReplicatedWriteLog.cc [new file with mode: 0644]
src/librbd/cache/ReplicatedWriteLog.h [new file with mode: 0644]
src/librbd/cache/rwl/ImageCacheState.cc [new file with mode: 0644]
src/librbd/cache/rwl/ImageCacheState.h [new file with mode: 0644]
src/librbd/cache/rwl/LogEntry.cc [new file with mode: 0644]
src/librbd/cache/rwl/LogEntry.h [new file with mode: 0644]
src/librbd/cache/rwl/Types.cc [new file with mode: 0644]
src/librbd/cache/rwl/Types.h [new file with mode: 0644]
src/test/librbd/CMakeLists.txt
src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc [new file with mode: 0644]

index 48ff49df83ab3a82fb33d83dc702dde7bf20fe9f..21c97e0182d87333f27c40c647488bf8e0a66a94 100644 (file)
@@ -189,8 +189,11 @@ include(CMakeDependentOption)
 CMAKE_DEPENDENT_OPTION(WITH_BLUESTORE_PMEM "Enable PMDK libraries" OFF
   "WITH_BLUESTORE" OFF)
 
+CMAKE_DEPENDENT_OPTION(WITH_RBD_RWL "Enable librbd persistent write back cache" OFF
+  "WITH_RBD" OFF)
+
 CMAKE_DEPENDENT_OPTION(WITH_SYSTEM_PMDK "Require and build with system PMDK" OFF
-  "WITH_BLUESTORE_PMEM" OFF)
+  "WITH_RBD_RWL OR WITH_BLUESTORE_PMEM" OFF)
 
 if(WITH_BLUESTORE_PMEM)
   set(HAVE_BLUESTORE_PMEM ON)
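
The write log is compile-time optional and defaults to OFF. A build that enables it would be configured roughly as follows (illustrative invocation; the source path and the choice of system PMDK are assumptions, not part of this change):

    cmake -DWITH_RBD_RWL=ON -DWITH_SYSTEM_PMDK=ON /path/to/ceph

With WITH_SYSTEM_PMDK=OFF the bundled PMDK is built via the Buildpmem module instead, as in the src/CMakeLists.txt hunk below.
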
index e0fed2abe6dbd7a51e2b3b1288611789b010bcfb..c2ac16de2a972ec2300ecbf56c25de2568fc6c04 100644 (file)
@@ -413,9 +413,14 @@ if(WITH_DPDK)
   list(APPEND ceph_common_deps common_async_dpdk)
 endif()
 
-if(WITH_BLUESTORE_PMEM)
+if(WITH_BLUESTORE_PMEM OR WITH_RBD_RWL)
   if(WITH_SYSTEM_PMDK)
-    find_package(pmem REQUIRED)
+    if(WITH_BLUESTORE_PMEM)
+      find_package(pmem REQUIRED COMPONENTS pmem)
+    endif()
+    if(WITH_RBD_RWL)
+      find_package(pmem REQUIRED COMPONENTS pmemobj)
+    endif()
   else()
     include(Buildpmem)
     build_pmem()
index c5cad0169cfa71d2484c37d7890e27b3625212a6..beedd0181bfba20e6896b4c5c25de377413e6cba 100644 (file)
@@ -7297,6 +7297,23 @@ static std::vector<Option> get_rbd_options() {
     .set_default(0)
     .set_min(0)
     .set_description("maximum io delay (in milliseconds) for simple io scheduler (if set to 0 dalay is calculated based on latency stats)"),
+
+    Option("rbd_rwl_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("enable persistent write back cache for this volume"),
+
+    Option("rbd_rwl_log_periodic_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("emit periodic perf stats to debug log"),
+
+    Option("rbd_rwl_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1073741824)
+    .set_min(1073741824)
+    .set_description("size of the persistent write back cache for this volume"),
+
+    Option("rbd_rwl_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/tmp")
+    .set_description("location of the persistent write back cache in a DAX-enabled filesystem on persistent memory"),
   });
 }
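
These options are read per image, so the cache can be turned on only for selected clients or images. One way to enable it is a client-side ceph.conf section like the following (a sketch; the DAX mount point /mnt/pmem is an assumption, and rbd_rwl_size / rbd_rwl_log_periodic_stats are shown at their defaults):

    [client]
        rbd_rwl_enabled = true
        rbd_rwl_path = /mnt/pmem
        rbd_rwl_size = 1073741824
        rbd_rwl_log_periodic_stats = false
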
 
index 4bd92e5db9281d37218ea4c50c35aeb2045c8a38..3a010ed2b6e691f44d12d0e9d8b08ec5aae45ad8 100644 (file)
 /* Define if unit tests are built. */
 #cmakedefine UNIT_TESTS_BUILT
 
+/* Define if RWL is enabled */
+#cmakedefine WITH_RBD_RWL
+
 #endif /* CONFIG_H */
index 6fa139f194c4a9e37d85f362383548b34d5def54..8b95fe1e30d19c04eb1a7985acf88cf8b380e049 100644 (file)
@@ -146,6 +146,14 @@ if(WITH_EVENTTRACE)
   list(APPEND librbd_internal_srcs ../common/EventTrace.cc)
 endif()
 
+if(WITH_RBD_RWL)
+  set(librbd_internal_srcs
+    ${librbd_internal_srcs}
+    cache/rwl/Types.cc
+    cache/rwl/LogEntry.cc
+    cache/rwl/ImageCacheState.cc
+    cache/ReplicatedWriteLog.cc)
+endif()
 
 add_library(rbd_api STATIC librbd.cc)
 add_library(rbd_internal STATIC
@@ -164,6 +172,12 @@ target_link_libraries(rbd_internal PRIVATE
   ceph_immutable_object_cache_lib
   osdc)
 
+if(WITH_RBD_RWL)
+  target_link_libraries(rbd_internal PRIVATE
+    pmem::pmemobj
+    pmem::pmem)
+endif()
+
 add_library(librbd ${CEPH_SHARED}
   librbd.cc)
 if(WITH_LTTNG)
diff --git a/src/librbd/cache/ReplicatedWriteLog.cc b/src/librbd/cache/ReplicatedWriteLog.cc
new file mode 100644 (file)
index 0000000..4fd3854
--- /dev/null
@@ -0,0 +1,521 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <libpmemobj.h>
+#include "ReplicatedWriteLog.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/rwl/ImageCacheState.h"
+#include "librbd/cache/rwl/LogEntry.h"
+#include "librbd/cache/rwl/Types.h"
+#include <map>
+#include <vector>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::ReplicatedWriteLog: " << this << " " \
+                           <<  __func__ << ": "
+
+const uint32_t MIN_WRITE_ALLOC_SIZE = 512;
+const uint32_t LOG_STATS_INTERVAL_SECONDS = 5;
+
+/**** Write log entries ****/
+const uint64_t DEFAULT_POOL_SIZE = 1u<<30;
+const uint64_t MIN_POOL_SIZE = DEFAULT_POOL_SIZE;
+constexpr double USABLE_SIZE = (7.0 / 10);
+const uint64_t BLOCK_ALLOC_OVERHEAD_BYTES = 16;
+const uint8_t RWL_POOL_VERSION = 1;
+const uint64_t MAX_LOG_ENTRIES = (1024 * 1024);
+
+namespace librbd {
+namespace cache {
+
+using namespace librbd::cache::rwl;
+
+typedef ReplicatedWriteLog<ImageCtx>::Extents Extents;
+
+template <typename I>
+ReplicatedWriteLog<I>::ReplicatedWriteLog(I &image_ctx, librbd::cache::rwl::ImageCacheState<I>* cache_state)
+  : m_cache_state(cache_state),
+    m_rwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_rwl)),
+    m_image_ctx(image_ctx),
+    m_log_pool_config_size(DEFAULT_POOL_SIZE),
+    m_image_writeback(image_ctx),
+    m_lock("librbd::cache::ReplicatedWriteLog::m_lock",
+           true, true),
+    m_thread_pool(image_ctx.cct, "librbd::cache::ReplicatedWriteLog::thread_pool", "tp_rwl",
+                  4,
+                  ""),
+    m_work_queue("librbd::cache::ReplicatedWriteLog::work_queue",
+                 image_ctx.config.template get_val<uint64_t>("rbd_op_thread_timeout"),
+                 &m_thread_pool)
+{
+  CephContext *cct = m_image_ctx.cct;
+  ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock);
+}
+
+template <typename I>
+ReplicatedWriteLog<I>::~ReplicatedWriteLog() {
+  ldout(m_image_ctx.cct, 15) << "enter" << dendl;
+  {
+    std::lock_guard timer_locker(*m_timer_lock);
+    std::lock_guard locker(m_lock);
+    m_timer->cancel_event(m_timer_ctx);
+    m_thread_pool.stop();
+    m_log_pool = nullptr;
+    delete m_cache_state;
+    m_cache_state = nullptr;
+  }
+  ldout(m_image_ctx.cct, 15) << "exit" << dendl;
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::perf_start(std::string name) {
+  PerfCountersBuilder plb(m_image_ctx.cct, name, l_librbd_rwl_first, l_librbd_rwl_last);
+
+  // Latency axis configuration for op histograms, values are in nanoseconds
+  PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
+    "Latency (nsec)",
+    PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
+    0,                               ///< Start at 0
+    5000,                            ///< Quantization unit is 5usec
+    16,                              ///< Ranges into the mS
+  };
+
+  // Op size axis configuration for op histogram y axis, values are in bytes
+  PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
+    "Request size (bytes)",
+    PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
+    0,                               ///< Start at 0
+    512,                             ///< Quantization unit is 512 bytes
+    16,                              ///< Writes up to >32k
+  };
+
+  // Num items configuration for op histogram y axis, values are in items
+  PerfHistogramCommon::axis_config_d op_hist_y_axis_count_config{
+    "Number of items",
+    PerfHistogramCommon::SCALE_LINEAR, ///< Number of items in linear scale
+    0,                                 ///< Start at 0
+    1,                                 ///< Quantization unit is 1 item
+    32,                                ///< Up to 32 items
+  };
+
+  plb.add_u64_counter(l_librbd_rwl_rd_req, "rd", "Reads");
+  plb.add_u64_counter(l_librbd_rwl_rd_bytes, "rd_bytes", "Data size in reads");
+  plb.add_time_avg(l_librbd_rwl_rd_latency, "rd_latency", "Latency of reads");
+
+  plb.add_u64_counter(l_librbd_rwl_rd_hit_req, "hit_rd", "Reads completely hitting RWL");
+  plb.add_u64_counter(l_librbd_rwl_rd_hit_bytes, "rd_hit_bytes", "Bytes read from RWL");
+  plb.add_time_avg(l_librbd_rwl_rd_hit_latency, "hit_rd_latency", "Latency of read hits");
+
+  plb.add_u64_counter(l_librbd_rwl_rd_part_hit_req, "part_hit_rd", "reads partially hitting RWL");
+
+  plb.add_u64_counter(l_librbd_rwl_wr_req, "wr", "Writes");
+  plb.add_u64_counter(l_librbd_rwl_wr_req_def, "wr_def", "Writes deferred for resources");
+  plb.add_u64_counter(l_librbd_rwl_wr_req_def_lanes, "wr_def_lanes", "Writes deferred for lanes");
+  plb.add_u64_counter(l_librbd_rwl_wr_req_def_log, "wr_def_log", "Writes deferred for log entries");
+  plb.add_u64_counter(l_librbd_rwl_wr_req_def_buf, "wr_def_buf", "Writes deferred for buffers");
+  plb.add_u64_counter(l_librbd_rwl_wr_req_overlap, "wr_overlap", "Writes overlapping with prior in-progress writes");
+  plb.add_u64_counter(l_librbd_rwl_wr_req_queued, "wr_q_barrier", "Writes queued for prior barriers (aio_flush)");
+  plb.add_u64_counter(l_librbd_rwl_wr_bytes, "wr_bytes", "Data size in writes");
+
+  plb.add_u64_counter(l_librbd_rwl_log_ops, "log_ops", "Log appends");
+  plb.add_u64_avg(l_librbd_rwl_log_op_bytes, "log_op_bytes", "Average log append bytes");
+
+  plb.add_time_avg(
+      l_librbd_rwl_req_arr_to_all_t, "req_arr_to_all_t",
+      "Average arrival to allocation time (time deferred for overlap)");
+  plb.add_time_avg(
+      l_librbd_rwl_req_arr_to_dis_t, "req_arr_to_dis_t",
+      "Average arrival to dispatch time (includes time deferred for overlaps and allocation)");
+  plb.add_time_avg(
+      l_librbd_rwl_req_all_to_dis_t, "req_all_to_dis_t",
+      "Average allocation to dispatch time (time deferred for log resources)");
+  plb.add_time_avg(
+      l_librbd_rwl_wr_latency, "wr_latency",
+      "Latency of writes (persistent completion)");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_wr_latency_hist, "wr_latency_bytes_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_config,
+    "Histogram of write request latency (nanoseconds) vs. bytes written");
+  plb.add_time_avg(
+      l_librbd_rwl_wr_caller_latency, "caller_wr_latency",
+      "Latency of write completion to caller");
+  plb.add_time_avg(
+      l_librbd_rwl_nowait_req_arr_to_all_t, "req_arr_to_all_nw_t",
+      "Average arrival to allocation time (time deferred for overlap)");
+  plb.add_time_avg(
+      l_librbd_rwl_nowait_req_arr_to_dis_t, "req_arr_to_dis_nw_t",
+      "Average arrival to dispatch time (includes time deferred for overlaps and allocation)");
+  plb.add_time_avg(
+      l_librbd_rwl_nowait_req_all_to_dis_t, "req_all_to_dis_nw_t",
+      "Average allocation to dispatch time (time deferred for log resources)");
+  plb.add_time_avg(
+      l_librbd_rwl_nowait_wr_latency, "wr_latency_nw",
+      "Latency of writes (persistent completion) not deferred for free space");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_nowait_wr_latency_hist, "wr_latency_nw_bytes_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_config,
+    "Histogram of write request latency (nanoseconds) vs. bytes written for writes not deferred for free space");
+  plb.add_time_avg(
+      l_librbd_rwl_nowait_wr_caller_latency, "caller_wr_latency_nw",
+      "Latency of write completion to callerfor writes not deferred for free space");
+  plb.add_time_avg(l_librbd_rwl_log_op_alloc_t, "op_alloc_t", "Average buffer pmemobj_reserve() time");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_log_op_alloc_t_hist, "op_alloc_t_bytes_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_config,
+    "Histogram of buffer pmemobj_reserve() time (nanoseconds) vs. bytes written");
+  plb.add_time_avg(l_librbd_rwl_log_op_dis_to_buf_t, "op_dis_to_buf_t", "Average dispatch to buffer persist time");
+  plb.add_time_avg(l_librbd_rwl_log_op_dis_to_app_t, "op_dis_to_app_t", "Average dispatch to log append time");
+  plb.add_time_avg(l_librbd_rwl_log_op_dis_to_cmp_t, "op_dis_to_cmp_t", "Average dispatch to persist completion time");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_log_op_dis_to_cmp_t_hist, "op_dis_to_cmp_t_bytes_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_config,
+    "Histogram of op dispatch to persist complete time (nanoseconds) vs. bytes written");
+
+  plb.add_time_avg(
+      l_librbd_rwl_log_op_buf_to_app_t, "op_buf_to_app_t",
+      "Average buffer persist to log append time (write data persist/replicate + wait for append time)");
+  plb.add_time_avg(
+      l_librbd_rwl_log_op_buf_to_bufc_t, "op_buf_to_bufc_t",
+      "Average buffer persist time (write data persist/replicate time)");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_log_op_buf_to_bufc_t_hist, "op_buf_to_bufc_t_bytes_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_config,
+    "Histogram of write buffer persist time (nanoseconds) vs. bytes written");
+  plb.add_time_avg(
+      l_librbd_rwl_log_op_app_to_cmp_t, "op_app_to_cmp_t",
+      "Average log append to persist complete time (log entry append/replicate + wait for complete time)");
+  plb.add_time_avg(
+      l_librbd_rwl_log_op_app_to_appc_t, "op_app_to_appc_t",
+      "Average log append to persist complete time (log entry append/replicate time)");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_log_op_app_to_appc_t_hist, "op_app_to_appc_t_bytes_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_config,
+    "Histogram of log append persist time (nanoseconds) (vs. op bytes)");
+
+  plb.add_u64_counter(l_librbd_rwl_discard, "discard", "Discards");
+  plb.add_u64_counter(l_librbd_rwl_discard_bytes, "discard_bytes", "Bytes discarded");
+  plb.add_time_avg(l_librbd_rwl_discard_latency, "discard_lat", "Discard latency");
+
+  plb.add_u64_counter(l_librbd_rwl_aio_flush, "aio_flush", "AIO flush (flush to RWL)");
+  plb.add_u64_counter(l_librbd_rwl_aio_flush_def, "aio_flush_def", "AIO flushes deferred for resources");
+  plb.add_time_avg(l_librbd_rwl_aio_flush_latency, "aio_flush_lat", "AIO flush latency");
+
+  plb.add_u64_counter(l_librbd_rwl_ws,"ws", "Write Sames");
+  plb.add_u64_counter(l_librbd_rwl_ws_bytes, "ws_bytes", "Write Same bytes to image");
+  plb.add_time_avg(l_librbd_rwl_ws_latency, "ws_lat", "Write Same latency");
+
+  plb.add_u64_counter(l_librbd_rwl_cmp, "cmp", "Compare and Write requests");
+  plb.add_u64_counter(l_librbd_rwl_cmp_bytes, "cmp_bytes", "Compare and Write bytes compared/written");
+  plb.add_time_avg(l_librbd_rwl_cmp_latency, "cmp_lat", "Compare and Write latency");
+  plb.add_u64_counter(l_librbd_rwl_cmp_fails, "cmp_fails", "Compare and Write compare fails");
+
+  plb.add_u64_counter(l_librbd_rwl_flush, "flush", "Flush (flush RWL)");
+  plb.add_u64_counter(l_librbd_rwl_invalidate_cache, "invalidate", "Invalidate RWL");
+  plb.add_u64_counter(l_librbd_rwl_invalidate_discard_cache, "discard_invalidate", "Discard and invalidate RWL");
+
+  plb.add_time_avg(l_librbd_rwl_append_tx_t, "append_tx_lat", "Log append transaction latency");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_append_tx_t_hist, "append_tx_lat_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_count_config,
+    "Histogram of log append transaction time (nanoseconds) vs. entries appended");
+  plb.add_time_avg(l_librbd_rwl_retire_tx_t, "retire_tx_lat", "Log retire transaction latency");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_retire_tx_t_hist, "retire_tx_lat_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_count_config,
+    "Histogram of log retire transaction time (nanoseconds) vs. entries retired");
+
+  m_perfcounter = plb.create_perf_counters();
+  m_image_ctx.cct->get_perfcounters_collection()->add(m_perfcounter);
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::perf_stop() {
+  ceph_assert(m_perfcounter);
+  m_image_ctx.cct->get_perfcounters_collection()->remove(m_perfcounter);
+  delete m_perfcounter;
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::log_perf() {
+  bufferlist bl;
+  Formatter *f = Formatter::create("json-pretty");
+  bl.append("Perf dump follows\n--- Begin perf dump ---\n");
+  bl.append("{\n");
+  stringstream ss;
+  utime_t now = ceph_clock_now();
+  ss << "\"test_time\": \"" << now << "\",";
+  ss << "\"image\": \"" << m_image_ctx.name << "\",";
+  bl.append(ss);
+  bl.append("\"stats\": ");
+  m_image_ctx.cct->get_perfcounters_collection()->dump_formatted(f, 0);
+  f->flush(bl);
+  bl.append(",\n\"histograms\": ");
+  m_image_ctx.cct->get_perfcounters_collection()->dump_formatted_histograms(f, 0);
+  f->flush(bl);
+  delete f;
+  bl.append("}\n--- End perf dump ---\n");
+  bl.append('\0');
+  ldout(m_image_ctx.cct, 1) << bl.c_str() << dendl;
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::periodic_stats() {
+  std::lock_guard locker(m_lock);
+  ldout(m_image_ctx.cct, 1) << "STATS: "
+                            << "m_free_log_entries=" << m_free_log_entries << ", "
+                            << "m_log_entries=" << m_log_entries.size() << ", "
+                            << "m_dirty_log_entries=" << m_dirty_log_entries.size() << ", "
+                            << "m_bytes_allocated=" << m_bytes_allocated << ", "
+                            << "m_bytes_cached=" << m_bytes_cached << ", "
+                            << "m_bytes_dirty=" << m_bytes_dirty << ", "
+                            << "bytes available=" << m_bytes_allocated_cap - m_bytes_allocated << ", "
+                            << "m_current_sync_gen=" << m_current_sync_gen << ", "
+                            << "m_flushed_sync_gen=" << m_flushed_sync_gen << ", "
+                            << dendl;
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::arm_periodic_stats() {
+  ceph_assert(m_timer_lock->is_locked());
+  if (m_periodic_stats_enabled) {
+    m_timer_ctx = new LambdaContext(
+      [this](int r) {
+        /* m_timer_lock is held */
+        periodic_stats();
+        arm_periodic_stats();
+      });
+    m_timer->add_event_after(LOG_STATS_INTERVAL_SECONDS, m_timer_ctx);
+  }
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::rwl_init(Context *on_finish, DeferredContexts &later) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+  TOID(struct WriteLogPoolRoot) pool_root;
+  ceph_assert(m_cache_state);
+  std::lock_guard locker(m_lock);
+  ceph_assert(!m_initialized);
+  ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl;
+  ldout(cct,5) << "rwl_size: " << m_cache_state->size << dendl;
+  std::string rwl_path = m_cache_state->path;
+  ldout(cct,5) << "rwl_path: " << rwl_path << dendl;
+
+  std::string pool_name = m_image_ctx.md_ctx.get_pool_name();
+  std::string log_pool_name = rwl_path + "/rbd-rwl." + pool_name + "." + m_image_ctx.id + ".pool";
+  std::string log_poolset_name = rwl_path + "/rbd-rwl." + pool_name + "." + m_image_ctx.id + ".poolset";
+  m_log_pool_config_size = max(m_cache_state->size, MIN_POOL_SIZE);
+
+  if (access(log_poolset_name.c_str(), F_OK) == 0) {
+    m_log_pool_name = log_poolset_name;
+    m_log_is_poolset = true;
+  } else {
+    m_log_pool_name = log_pool_name;
+    ldout(cct, 5) << "Poolset file " << log_poolset_name
+                  << " not present (or can't open). Using unreplicated pool" << dendl;
+  }
+
+  if ((!m_cache_state->present) &&
+      (access(m_log_pool_name.c_str(), F_OK) == 0)) {
+    ldout(cct, 5) << "There's an existing pool/poolset file " << m_log_pool_name
+                  << ", While there's no cache in the image metatata." << dendl;
+    if (remove(m_log_pool_name.c_str()) != 0) {
+      lderr(cct) << "Failed to remove the pool/poolset file " << m_log_pool_name
+                 << dendl;
+      on_finish->complete(-errno);
+      return;
+    } else {
+      ldout(cct, 5) << "Removed the existing pool/poolset file." << dendl;
+    }
+  }
+
+  if (access(m_log_pool_name.c_str(), F_OK) != 0) {
+    if ((m_log_pool =
+         pmemobj_create(m_log_pool_name.c_str(),
+                        m_rwl_pool_layout_name,
+                        m_log_pool_config_size,
+                        (S_IWUSR | S_IRUSR))) == NULL) {
+      lderr(cct) << "failed to create pool (" << m_log_pool_name << ")"
+                   << pmemobj_errormsg() << dendl;
+      m_cache_state->present = false;
+      m_cache_state->clean = true;
+      m_cache_state->empty = true;
+      /* TODO: filter/replace errnos that are meaningless to the caller */
+      on_finish->complete(-errno);
+      return;
+    }
+    m_cache_state->present = true;
+    m_cache_state->clean = true;
+    m_cache_state->empty = true;
+    pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+
+    /* new pool, calculate and store metadata */
+    size_t effective_pool_size = (size_t)(m_log_pool_config_size * USABLE_SIZE);
+    size_t small_write_size = MIN_WRITE_ALLOC_SIZE + BLOCK_ALLOC_OVERHEAD_BYTES + sizeof(struct WriteLogPmemEntry);
+    uint64_t num_small_writes = (uint64_t)(effective_pool_size / small_write_size);
+    if (num_small_writes > MAX_LOG_ENTRIES) {
+      num_small_writes = MAX_LOG_ENTRIES;
+    }
+    if (num_small_writes <= 2) {
+      lderr(cct) << "num_small_writes needs to > 2" << dendl;
+      on_finish->complete(-EINVAL);
+      return;
+    }
+    m_log_pool_actual_size = m_log_pool_config_size;
+    m_bytes_allocated_cap = effective_pool_size;
+    /* Log ring empty */
+    m_first_free_entry = 0;
+    m_first_valid_entry = 0;
+    TX_BEGIN(m_log_pool) {
+      TX_ADD(pool_root);
+      D_RW(pool_root)->header.layout_version = RWL_POOL_VERSION;
+      D_RW(pool_root)->log_entries =
+        TX_ZALLOC(struct WriteLogPmemEntry,
+                  sizeof(struct WriteLogPmemEntry) * num_small_writes);
+      D_RW(pool_root)->pool_size = m_log_pool_actual_size;
+      D_RW(pool_root)->flushed_sync_gen = m_flushed_sync_gen;
+      D_RW(pool_root)->block_size = MIN_WRITE_ALLOC_SIZE;
+      D_RW(pool_root)->num_log_entries = num_small_writes;
+      D_RW(pool_root)->first_free_entry = m_first_free_entry;
+      D_RW(pool_root)->first_valid_entry = m_first_valid_entry;
+    } TX_ONCOMMIT {
+      m_total_log_entries = D_RO(pool_root)->num_log_entries;
+      m_free_log_entries = D_RO(pool_root)->num_log_entries - 1; // leave one free
+    } TX_ONABORT {
+      m_total_log_entries = 0;
+      m_free_log_entries = 0;
+      lderr(cct) << "failed to initialize pool (" << m_log_pool_name << ")" << dendl;
+      on_finish->complete(-pmemobj_tx_errno());
+      return;
+    } TX_FINALLY {
+    } TX_END;
+  } else {
+    // TODO: load the existing cache. This will be covered in a later PR.
+  }
+
+  ldout(cct,1) << "pool " << m_log_pool_name << " has " << m_total_log_entries
+               << " log entries, " << m_free_log_entries << " of which are free."
+               << " first_valid=" << m_first_valid_entry
+               << ", first_free=" << m_first_free_entry
+               << ", flushed_sync_gen=" << m_flushed_sync_gen
+               << ", current_sync_gen=" << m_current_sync_gen << dendl;
+  if (m_first_free_entry == m_first_valid_entry) {
+    ldout(cct,1) << "write log is empty" << dendl;
+    m_cache_state->empty = true;
+  }
+
+  // TODO: init the sync point; this will be covered in a later PR.
+  //  init_flush_new_sync_point(later);
+
+  m_initialized = true;
+  // Start the thread
+  m_thread_pool.start();
+
+  m_periodic_stats_enabled = m_cache_state->log_periodic_stats;
+  /* Do these after we drop m_lock */
+  later.add(new LambdaContext([this](int r) {
+        if (m_periodic_stats_enabled) {
+          /* Log stats for the first time */
+          periodic_stats();
+          /* Arm periodic stats logging for the first time */
+          std::lock_guard timer_locker(*m_timer_lock);
+          arm_periodic_stats();
+        }
+      }));
+  m_image_ctx.op_work_queue->queue(on_finish, 0);
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::update_image_cache_state(Context *on_finish) {
+  m_cache_state->write_image_cache_state(on_finish);
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::init(Context *on_finish) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+  perf_start(m_image_ctx.id);
+
+  ceph_assert(!m_initialized);
+
+  Context *ctx = new LambdaContext(
+    [this, on_finish](int r) {
+      if (r >= 0) {
+        update_image_cache_state(on_finish);
+      } else {
+        on_finish->complete(r);
+      }
+    });
+
+  DeferredContexts later;
+  rwl_init(ctx, later);
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::shut_down(Context *on_finish) {
+  // TODO: This is covered in a later PR.
+  on_finish->complete(0);
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::aio_read(Extents&& image_extents,
+                                     ceph::bufferlist* bl,
+                                     int fadvise_flags, Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::aio_write(Extents &&image_extents,
+                                      bufferlist&& bl,
+                                      int fadvise_flags,
+                                      Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::aio_discard(uint64_t offset, uint64_t length,
+                                        uint32_t discard_granularity_bytes, Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::aio_flush(Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::aio_writesame(uint64_t offset, uint64_t length,
+                                          bufferlist&& bl, int fadvise_flags,
+                                          Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::aio_compare_and_write(Extents &&image_extents,
+                                                  bufferlist&& cmp_bl,
+                                                  bufferlist&& bl,
+                                                  uint64_t *mismatch_offset,
+                                                  int fadvise_flags,
+                                                  Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::flush(Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::invalidate(Context *on_finish) {
+}
+
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::ReplicatedWriteLog<librbd::ImageCtx>;
+template class librbd::cache::ImageCache<librbd::ImageCtx>;
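
rwl_init() above derives the log capacity from the pool size: USABLE_SIZE of the pool is divided by the worst-case cost of a small write (one MIN_WRITE_ALLOC_SIZE data block, the allocator overhead, and one log entry), then capped at MAX_LOG_ENTRIES, with one entry always kept free. A standalone sketch of the same arithmetic for the default 1 GiB pool (assumes the 64-byte WriteLogPmemEntry size asserted in rwl/Types.h):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    int main() {
      const uint64_t pool_size = 1ull << 30;              // DEFAULT_POOL_SIZE
      const uint64_t entry_size = 64;                     // sizeof(WriteLogPmemEntry)
      const uint64_t small_write = 512 + 16 + entry_size; // MIN_WRITE_ALLOC_SIZE +
                                                          // BLOCK_ALLOC_OVERHEAD_BYTES + entry
      const uint64_t effective = uint64_t(pool_size * (7.0 / 10)); // USABLE_SIZE
      uint64_t num_entries = std::min<uint64_t>(effective / small_write,
                                                1024 * 1024 /* MAX_LOG_ENTRIES */);
      // 1 GiB pool: effective ~= 751619276, small_write = 592, so the raw
      // count (~1.27M) is capped at MAX_LOG_ENTRIES and one entry stays free.
      std::cout << "total=" << num_entries
                << " free=" << (num_entries - 1) << std::endl;
    }
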
+
diff --git a/src/librbd/cache/ReplicatedWriteLog.h b/src/librbd/cache/ReplicatedWriteLog.h
new file mode 100644 (file)
index 0000000..248a5e2
--- /dev/null
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
+#define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
+
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "common/AsyncOpTracker.h"
+#include "librbd/cache/ImageCache.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/Utils.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/cache/Types.h"
+#include <functional>
+#include <list>
+
+class Context;
+class SafeTimer;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+
+namespace rwl {
+
+class GenericLogEntry;
+typedef std::list<std::shared_ptr<GenericLogEntry>> GenericLogEntries;
+
+class DeferredContexts;
+template <typename> class ImageCacheState;
+} // namespace rwl
+
+
+template <typename ImageCtxT>
+class ReplicatedWriteLog : public ImageCache<ImageCtxT> {
+public:
+  using typename ImageCache<ImageCtxT>::Extents;
+
+  ReplicatedWriteLog(ImageCtxT &image_ctx, librbd::cache::rwl::ImageCacheState<ImageCtxT>* cache_state);
+  ~ReplicatedWriteLog();
+  ReplicatedWriteLog(const ReplicatedWriteLog&) = delete;
+  ReplicatedWriteLog &operator=(const ReplicatedWriteLog&) = delete;
+
+  /// client AIO methods
+  void aio_read(Extents&& image_extents, ceph::bufferlist *bl,
+                int fadvise_flags, Context *on_finish) override;
+  void aio_write(Extents&& image_extents, ceph::bufferlist&& bl,
+                 int fadvise_flags, Context *on_finish) override;
+  void aio_discard(uint64_t offset, uint64_t length,
+                   uint32_t discard_granularity_bytes,
+                   Context *on_finish) override;
+  void aio_flush(Context *on_finish) override;
+  void aio_writesame(uint64_t offset, uint64_t length,
+                     ceph::bufferlist&& bl,
+                     int fadvise_flags, Context *on_finish) override;
+  void aio_compare_and_write(Extents&& image_extents,
+                             ceph::bufferlist&& cmp_bl, ceph::bufferlist&& bl,
+                             uint64_t *mismatch_offset,int fadvise_flags,
+                             Context *on_finish) override;
+
+  /// internal state methods
+  void init(Context *on_finish) override;
+  void shut_down(Context *on_finish) override;
+  void invalidate(Context *on_finish);
+  void flush(Context *on_finish) override;
+
+private:
+  librbd::cache::rwl::ImageCacheState<ImageCtxT>* m_cache_state = nullptr;
+
+  std::atomic<bool> m_initialized = {false};
+  PMEMobjpool *m_log_pool = nullptr;
+  const char* m_rwl_pool_layout_name;
+
+  ImageCtxT &m_image_ctx;
+
+  std::string m_log_pool_name;
+  bool m_log_is_poolset = false;
+  uint64_t m_log_pool_config_size; /* Configured size of RWL */
+  uint64_t m_log_pool_actual_size = 0; /* Actual size of RWL pool */
+
+  uint32_t m_total_log_entries = 0;
+  uint32_t m_free_log_entries = 0;
+
+  std::atomic<uint64_t> m_bytes_allocated = {0}; /* Total bytes allocated in write buffers */
+  uint64_t m_bytes_cached = 0;    /* Total bytes used in write buffers */
+  uint64_t m_bytes_dirty = 0;     /* Total bytes yet to flush to RBD */
+  uint64_t m_bytes_allocated_cap = 0;
+
+  ImageWriteback<ImageCtxT> m_image_writeback;
+
+  /*
+   * When m_first_free_entry == m_first_valid_entry, the log is
+   * empty. There is always at least one free entry, which can't be
+   * used.
+   */
+  uint64_t m_first_free_entry = 0;  /* Entries from here to m_first_valid_entry-1 are free */
+  uint64_t m_first_valid_entry = 0; /* Entries from here to m_first_free_entry-1 are valid */
+
+  /* Starts at 0 for a new write log. Incremented on every flush. */
+  uint64_t m_current_sync_gen = 0;
+  /* Starts at 0 on each sync gen increase. Incremented before applied
+     to an operation */
+  uint64_t m_last_op_sequence_num = 0;
+  /* All writes bearing this and all prior sync gen numbers are flushed */
+  uint64_t m_flushed_sync_gen = 0;
+
+  /* Acquire locks in order declared here */
+
+  /* Used for most synchronization */
+  mutable ceph::mutex m_lock;
+
+  librbd::cache::Contexts m_flush_complete_contexts;
+
+  /* New entries are at the back. Oldest at the front */
+  rwl::GenericLogEntries m_log_entries;
+  rwl::GenericLogEntries m_dirty_log_entries;
+
+  PerfCounters *m_perfcounter = nullptr;
+
+  /* Initialized from config, then set false during shutdown */
+  std::atomic<bool> m_periodic_stats_enabled = {false};
+  SafeTimer *m_timer = nullptr; /* Used with m_timer_lock */
+  mutable ceph::mutex *m_timer_lock = nullptr; /* Used with and by m_timer */
+  Context *m_timer_ctx = nullptr;
+
+  ThreadPool m_thread_pool;
+  ContextWQ m_work_queue;
+
+  void perf_start(const std::string name);
+  void perf_stop();
+  void log_perf();
+  void periodic_stats();
+  void arm_periodic_stats();
+
+  void rwl_init(Context *on_finish, rwl::DeferredContexts &later);
+  void update_image_cache_state(Context *on_finish);
+};
+
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::ReplicatedWriteLog<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
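
The ring described by m_first_free_entry/m_first_valid_entry keeps one slot permanently unusable so a full log never looks the same as an empty one (which is also why rwl_init() sets m_free_log_entries to num_log_entries - 1). A minimal sketch of that invariant, using hypothetical helper names:

    #include <cstdint>

    // first_free == first_valid means the log is empty; one slot is always
    // left unused so "full" and "empty" stay distinguishable.
    uint64_t entries_in_use(uint64_t first_valid, uint64_t first_free,
                            uint64_t total_entries) {
      return (first_free + total_entries - first_valid) % total_entries;
    }

    uint64_t entries_free(uint64_t first_valid, uint64_t first_free,
                          uint64_t total_entries) {
      return total_entries - 1 -
             entries_in_use(first_valid, first_free, total_entries);
    }
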
diff --git a/src/librbd/cache/rwl/ImageCacheState.cc b/src/librbd/cache/rwl/ImageCacheState.cc
new file mode 100644 (file)
index 0000000..cc5819a
--- /dev/null
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/rwl/ImageCacheState.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Operations.h"
+#include "common/environment.h"
+#include "common/hostname.h"
+#include "common/config_proxy.h"
+#include "common/ceph_json.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::rwl::ImageCacheState: " << this << " " \
+                           <<  __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace rwl {
+
+template <typename I>
+const std::string ImageCacheState<I>::image_cache_state = ".librbd/image_cache_state";
+
+template <typename I>
+ImageCacheState<I>::ImageCacheState(I *image_ctx) : m_image_ctx(image_ctx) {
+  ldout(image_ctx->cct, 20) << "Initialize RWL cache state with config data. " << dendl;
+
+  ConfigProxy &config = image_ctx->config;
+  host = ceph_get_short_hostname();
+  path = config.get_val<std::string>("rbd_rwl_path");
+  size = config.get_val<uint64_t>("rbd_rwl_size");
+  log_periodic_stats = config.get_val<bool>("rbd_rwl_log_periodic_stats");
+}
+
+template <typename I>
+ImageCacheState<I>::ImageCacheState(I *image_ctx, JSONFormattable &f) : m_image_ctx(image_ctx) {
+  ldout(image_ctx->cct, 20) << "Initialize RWL cache state with data from server side" << dendl;
+
+  present = (bool)f["present"];
+  empty = (bool)f["empty"];
+  clean = (bool)f["clean"];
+  host = (string)f["rwl_host"];
+  path = (string)f["rwl_path"];
+  uint64_t rwl_size;
+  std::istringstream iss(f["rwl_size"]);
+  iss >> rwl_size;
+  size = rwl_size;
+
+  // Others from config
+  ConfigProxy &config = image_ctx->config;
+  log_periodic_stats = config.get_val<bool>("rbd_rwl_log_periodic_stats");
+}
+
+template <typename I>
+void ImageCacheState<I>::write_image_cache_state(Context *on_finish) {
+  std::shared_lock owner_lock{m_image_ctx->owner_lock};
+  JSONFormattable f;
+  ::encode_json(image_cache_state.c_str(), *this, &f);
+  std::ostringstream oss;
+  f.flush(oss);
+  std::string image_state_json = oss.str();
+
+  ldout(m_image_ctx->cct, 20) << __func__ << " Store state: " << image_state_json << dendl;
+  m_image_ctx->operations->execute_metadata_set(image_cache_state.c_str(), image_state_json, on_finish);
+}
+
+template <typename I>
+void ImageCacheState<I>::clear_image_cache_state(Context *on_finish) {
+  std::shared_lock owner_lock{m_image_ctx->owner_lock};
+  ldout(m_image_ctx->cct, 20) << __func__ << " Remove state: " << dendl;
+  m_image_ctx->operations->execute_metadata_remove(image_cache_state.c_str(), on_finish);
+}
+
+template <typename I>
+void ImageCacheState<I>::dump(ceph::Formatter *f) const {
+  ::encode_json("present", present, f);
+  ::encode_json("empty", empty, f);
+  ::encode_json("clean", clean, f);
+  ::encode_json("cache_type", (int)get_image_cache_type(), f);
+  ::encode_json("rwl_host", host, f);
+  ::encode_json("rwl_path", path, f);
+  ::encode_json("rwl_size", size, f);
+}
+
+} // namespace rwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::rwl::ImageCacheState<librbd::ImageCtx>;
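
write_image_cache_state() stores this object as JSON in the image metadata under the ".librbd/image_cache_state" key, and the JSONFormattable constructor parses the same fields back on image open. The value the constructor expects has roughly this shape (field names from dump(); the host and size values are illustrative, matching the unit test further below):

    { "present": "1", "empty": "0", "clean": "0",
      "rwl_host": "testhost",
      "rwl_path": "/tmp",
      "rwl_size": "1073741824" }
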
diff --git a/src/librbd/cache/rwl/ImageCacheState.h b/src/librbd/cache/rwl/ImageCacheState.h
new file mode 100644 (file)
index 0000000..0bab76c
--- /dev/null
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
+#define CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H 
+
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/Types.h"
+#include <string>
+
+class JSONFormattable;
+namespace ceph {
+  class Formatter;
+}
+
+namespace librbd {
+namespace cache {
+namespace rwl {
+
+template <typename ImageCtxT = ImageCtx>
+class ImageCacheState {
+private:
+  ImageCtxT* m_image_ctx;
+public:
+  bool present = true;
+  bool empty = true;
+  bool clean = true;
+  static const std::string image_cache_state;
+  std::string host;
+  std::string path;
+  uint64_t size;
+  bool log_periodic_stats;
+
+  ImageCacheState(ImageCtxT* image_ctx);
+
+  ImageCacheState(ImageCtxT* image_ctx, JSONFormattable& f);
+
+  ~ImageCacheState() {}
+
+  ImageCacheType get_image_cache_type() const {
+    return IMAGE_CACHE_TYPE_RWL;
+  }
+
+
+  void write_image_cache_state(Context *on_finish);
+
+  void clear_image_cache_state(Context *on_finish);
+
+  void dump(ceph::Formatter *f) const;
+};
+
+} // namespace rwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::rwl::ImageCacheState<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
diff --git a/src/librbd/cache/rwl/LogEntry.cc b/src/librbd/cache/rwl/LogEntry.cc
new file mode 100644 (file)
index 0000000..a582236
--- /dev/null
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "LogEntry.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::rwl::LogEntry: " << this << " " \
+                           <<  __func__ << ": "
+
+namespace librbd {
+
+namespace cache {
+
+namespace rwl {
+
+bool GenericLogEntry::is_sync_point() {
+  return ram_entry.is_sync_point();
+}
+
+bool GenericLogEntry::is_discard() {
+  return ram_entry.is_discard();
+}
+
+bool GenericLogEntry::is_writesame() {
+  return ram_entry.is_writesame();
+}
+
+bool GenericLogEntry::is_write() {
+  return ram_entry.is_write();
+}
+
+bool GenericLogEntry::is_writer() {
+  return ram_entry.is_writer();
+}
+
+std::ostream &format(std::ostream &os, const GenericLogEntry &entry) {
+  os << "ram_entry=[" << entry.ram_entry << "], "
+     << "pmem_entry=" << (void*)entry.pmem_entry << ", "
+     << "log_entry_index=" << entry.log_entry_index << ", "
+     << "completed=" << entry.completed;
+  return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+                         const GenericLogEntry &entry) {
+  return entry.format(os, entry);
+}
+
+} // namespace rwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/rwl/LogEntry.h b/src/librbd/cache/rwl/LogEntry.h
new file mode 100644 (file)
index 0000000..9cff360
--- /dev/null
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
+#define CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
+
+#include "librbd/cache/rwl/Types.h"
+
+namespace librbd {
+namespace cache {
+namespace rwl {
+
+class GenericLogEntry {
+public:
+  WriteLogPmemEntry ram_entry;
+  WriteLogPmemEntry *pmem_entry = nullptr;
+  uint32_t log_entry_index = 0;
+  bool completed = false;
+  GenericLogEntry(const uint64_t image_offset_bytes = 0, const uint64_t write_bytes = 0)
+    : ram_entry(image_offset_bytes, write_bytes) {
+  };
+  virtual ~GenericLogEntry() { };
+  GenericLogEntry(const GenericLogEntry&) = delete;
+  GenericLogEntry &operator=(const GenericLogEntry&) = delete;
+  virtual unsigned int write_bytes() = 0;
+  bool is_sync_point();
+  bool is_discard();
+  bool is_writesame();
+  bool is_write();
+  bool is_writer();
+  virtual std::ostream &format(std::ostream &os, const GenericLogEntry &entry) const;
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const GenericLogEntry &entry);
+};
+
+} // namespace rwl 
+} // namespace cache 
+} // namespace librbd 
+
+#endif // CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
diff --git a/src/librbd/cache/rwl/Types.cc b/src/librbd/cache/rwl/Types.cc
new file mode 100644 (file)
index 0000000..3201b83
--- /dev/null
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "Types.h"
+#include "common/ceph_context.h"
+#include "include/Context.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::rwl::Types: " << this << " " \
+                           <<  __func__ << ": "
+
+namespace librbd {
+
+namespace cache {
+
+namespace rwl {
+
+DeferredContexts::~DeferredContexts() {
+  finish_contexts(nullptr, contexts, 0);
+}
+
+void DeferredContexts::add(Context* ctx) {
+    contexts.push_back(ctx);
+}
+
+bool WriteLogPmemEntry::is_sync_point() {
+  return sync_point;
+}
+
+bool WriteLogPmemEntry::is_discard() {
+  return discard;
+}
+
+bool WriteLogPmemEntry::is_writesame() {
+  return writesame;
+}
+
+bool WriteLogPmemEntry::is_write() {
+  /* Log entry is a basic write */
+  return !is_sync_point() && !is_discard() && !is_writesame();
+}
+
+bool WriteLogPmemEntry::is_writer() {
+  /* Log entry is any type that writes data */
+  return is_write() || is_discard() || is_writesame();
+}
+
+const uint64_t WriteLogPmemEntry::get_offset_bytes() {
+  return image_offset_bytes;
+}
+
+const uint64_t WriteLogPmemEntry::get_write_bytes() {
+  return write_bytes;
+}
+
+std::ostream& operator<<(std::ostream& os,
+                         const WriteLogPmemEntry &entry) {
+  os << "entry_valid=" << (bool)entry.entry_valid << ", "
+     << "sync_point=" << (bool)entry.sync_point << ", "
+     << "sequenced=" << (bool)entry.sequenced << ", "
+     << "has_data=" << (bool)entry.has_data << ", "
+     << "discard=" << (bool)entry.discard << ", "
+     << "writesame=" << (bool)entry.writesame << ", "
+     << "sync_gen_number=" << entry.sync_gen_number << ", "
+     << "write_sequence_number=" << entry.write_sequence_number << ", "
+     << "image_offset_bytes=" << entry.image_offset_bytes << ", "
+     << "write_bytes=" << entry.write_bytes << ", "
+     << "ws_datalen=" << entry.ws_datalen << ", "
+     << "entry_index=" << entry.entry_index;
+  return os;
+}
+
+} // namespace rwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/rwl/Types.h b/src/librbd/cache/rwl/Types.h
new file mode 100644 (file)
index 0000000..2a21a81
--- /dev/null
@@ -0,0 +1,215 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_TYPES_H
+#define CEPH_LIBRBD_CACHE_RWL_TYPES_H
+
+#include <vector>
+#include <libpmemobj.h>
+
+class Context;
+
+enum {
+  l_librbd_rwl_first = 26500,
+
+  // All read requests
+  l_librbd_rwl_rd_req,           // read requests
+  l_librbd_rwl_rd_bytes,         // bytes read
+  l_librbd_rwl_rd_latency,       // average req completion latency
+
+  // Read requests completed from RWL (no misses)
+  l_librbd_rwl_rd_hit_req,       // read requests
+  l_librbd_rwl_rd_hit_bytes,     // bytes read
+  l_librbd_rwl_rd_hit_latency,   // average req completion latency
+
+  // Read requests with hit and miss extents
+  l_librbd_rwl_rd_part_hit_req,  // read ops
+
+  // All write requests
+  l_librbd_rwl_wr_req,             // write requests
+  l_librbd_rwl_wr_req_def,         // write requests deferred for resources
+  l_librbd_rwl_wr_req_def_lanes,   // write requests deferred for lanes
+  l_librbd_rwl_wr_req_def_log,     // write requests deferred for log entries
+  l_librbd_rwl_wr_req_def_buf,     // write requests deferred for buffer space
+  l_librbd_rwl_wr_req_overlap,     // write requests detained for overlap
+  l_librbd_rwl_wr_req_queued,      // write requests queued for prior barrier
+  l_librbd_rwl_wr_bytes,           // bytes written
+
+  // Write log operations (1 .. n per request that appends to the log)
+  l_librbd_rwl_log_ops,            // log append ops
+  l_librbd_rwl_log_op_bytes,       // average bytes written per log op
+
+  /*
+
+   Req and op average latencies to the beginning of and over various phases:
+
+   +------------------------------+------+-------------------------------+
+   | Phase                        | Name | Description                   |
+   +------------------------------+------+-------------------------------+
+   | Arrive at RWL                | arr  |Arrives as a request           |
+   +------------------------------+------+-------------------------------+
+   | Allocate resources           | all  |time spent in block guard for  |
+   |                              |      |overlap sequencing occurs      |
+   |                              |      |before this point              |
+   +------------------------------+------+-------------------------------+
+   | Dispatch                     | dis  |Op lifetime begins here. time  |
+   |                              |      |spent in allocation waiting for|
+   |                              |      |resources occurs before this   |
+   |                              |      |point                          |
+   +------------------------------+------+-------------------------------+
+   | Payload buffer persist and   | buf  |time spent queued for          |
+   |replicate                     |      |replication occurs before here |
+   +------------------------------+------+-------------------------------+
+   | Payload buffer persist       | bufc |bufc - buf is just the persist |
+   |complete                      |      |time                           |
+   +------------------------------+------+-------------------------------+
+   | Log append                   | app  |time spent queued for append   |
+   |                              |      |occurs before here             |
+   +------------------------------+------+-------------------------------+
+   | Append complete              | appc |appc - app is just the time    |
+   |                              |      |spent in the append operation  |
+   +------------------------------+------+-------------------------------+
+   | Complete                     | cmp  |write persisted, replicated,   |
+   |                              |      |and globally visible           |
+   +------------------------------+------+-------------------------------+
+
+  */
+
+  /* Request times */
+  l_librbd_rwl_req_arr_to_all_t,   // arrival to allocation elapsed time - same as time deferred in block guard
+  l_librbd_rwl_req_arr_to_dis_t,   // arrival to dispatch elapsed time
+  l_librbd_rwl_req_all_to_dis_t,   // Time spent allocating or waiting to allocate resources
+  l_librbd_rwl_wr_latency,         // average req (persist) completion latency
+  l_librbd_rwl_wr_latency_hist,    // Histogram of write req (persist) completion latency vs. bytes written
+  l_librbd_rwl_wr_caller_latency,  // average req completion (to caller) latency
+
+  /* Request times for requests that never waited for space*/
+  l_librbd_rwl_nowait_req_arr_to_all_t,   // arrival to allocation elapsed time - same as time deferred in block guard
+  l_librbd_rwl_nowait_req_arr_to_dis_t,   // arrival to dispatch elapsed time
+  l_librbd_rwl_nowait_req_all_to_dis_t,   // Time spent allocating or waiting to allocate resources
+  l_librbd_rwl_nowait_wr_latency,         // average req (persist) completion latency
+  l_librbd_rwl_nowait_wr_latency_hist,    // Histogram of write req (persist) completion latency vs. bytes written
+  l_librbd_rwl_nowait_wr_caller_latency,  // average req completion (to caller) latency
+
+  /* Log operation times */
+  l_librbd_rwl_log_op_alloc_t,      // elapsed time of pmemobj_reserve()
+  l_librbd_rwl_log_op_alloc_t_hist, // Histogram of elapsed time of pmemobj_reserve()
+
+  l_librbd_rwl_log_op_dis_to_buf_t, // dispatch to buffer persist elapsed time
+  l_librbd_rwl_log_op_dis_to_app_t, // dispatch to log append elapsed time
+  l_librbd_rwl_log_op_dis_to_cmp_t, // dispatch to persist completion elapsed time
+  l_librbd_rwl_log_op_dis_to_cmp_t_hist, // Histogram of dispatch to persist completion elapsed time
+
+  l_librbd_rwl_log_op_buf_to_app_t, // data buf persist + append wait time
+  l_librbd_rwl_log_op_buf_to_bufc_t,// data buf persist / replicate elapsed time
+  l_librbd_rwl_log_op_buf_to_bufc_t_hist,// data buf persist time vs bytes histogram
+  l_librbd_rwl_log_op_app_to_cmp_t, // log entry append + completion wait time
+  l_librbd_rwl_log_op_app_to_appc_t, // log entry append / replicate elapsed time
+  l_librbd_rwl_log_op_app_to_appc_t_hist, // log entry append time (vs. op bytes) histogram
+
+  l_librbd_rwl_discard,
+  l_librbd_rwl_discard_bytes,
+  l_librbd_rwl_discard_latency,
+
+  l_librbd_rwl_aio_flush,
+  l_librbd_rwl_aio_flush_def,
+  l_librbd_rwl_aio_flush_latency,
+  l_librbd_rwl_ws,
+  l_librbd_rwl_ws_bytes, // Bytes modified by write same, probably much larger than WS payload bytes
+  l_librbd_rwl_ws_latency,
+
+  l_librbd_rwl_cmp,
+  l_librbd_rwl_cmp_bytes,
+  l_librbd_rwl_cmp_latency,
+  l_librbd_rwl_cmp_fails,
+
+  l_librbd_rwl_flush,
+  l_librbd_rwl_invalidate_cache,
+  l_librbd_rwl_invalidate_discard_cache,
+
+  l_librbd_rwl_append_tx_t,
+  l_librbd_rwl_retire_tx_t,
+  l_librbd_rwl_append_tx_t_hist,
+  l_librbd_rwl_retire_tx_t_hist,
+
+  l_librbd_rwl_last,
+};
+
+namespace librbd {
+namespace cache {
+namespace rwl {
+
+/* Defer a set of Contexts until destruct/exit. Used for deferring
+ * work on a given thread until a required lock is dropped. */
+class DeferredContexts {
+private:
+  std::vector<Context*> contexts;
+public:
+  ~DeferredContexts();
+  void add(Context* ctx);
+};
+
+/* Pmem structures */
+POBJ_LAYOUT_BEGIN(rbd_rwl);
+POBJ_LAYOUT_ROOT(rbd_rwl, struct WriteLogPoolRoot);
+POBJ_LAYOUT_TOID(rbd_rwl, uint8_t);
+POBJ_LAYOUT_TOID(rbd_rwl, struct WriteLogPmemEntry);
+POBJ_LAYOUT_END(rbd_rwl);
+
+struct WriteLogPmemEntry {
+  uint64_t sync_gen_number = 0;
+  uint64_t write_sequence_number = 0;
+  uint64_t image_offset_bytes;
+  uint64_t write_bytes;
+  TOID(uint8_t) write_data;
+  struct {
+    uint8_t entry_valid :1; /* if 0, this entry is free */
+    uint8_t sync_point :1;  /* No data. No write sequence number. Marks sync
+                               point for this sync gen number */
+    uint8_t sequenced :1;   /* write sequence number is valid */
+    uint8_t has_data :1;    /* write_data field is valid (else ignore) */
+    uint8_t discard :1;     /* has_data will be 0 if this is a discard */
+    uint8_t writesame :1;   /* ws_datalen indicates length of data at write_bytes */
+  };
+  uint32_t ws_datalen = 0;  /* Length of data buffer (writesame only) */
+  uint32_t entry_index = 0; /* For debug consistency check. Can be removed if
+                             * we need the space */
+  WriteLogPmemEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+    : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes),
+      entry_valid(0), sync_point(0), sequenced(0), has_data(0), discard(0), writesame(0) {
+  }
+  bool is_sync_point();
+  bool is_discard();
+  bool is_writesame();
+  bool is_write();
+  bool is_writer();
+  const uint64_t get_offset_bytes();
+  const uint64_t get_write_bytes();
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const WriteLogPmemEntry &entry);
+};
+
+static_assert(sizeof(WriteLogPmemEntry) == 64);
+
+struct WriteLogPoolRoot {
+  union {
+    struct {
+      uint8_t layout_version;    /* Version of this structure (RWL_POOL_VERSION) */
+    };
+    uint64_t _u64;
+  } header;
+  TOID(struct WriteLogPmemEntry) log_entries;   /* contiguous array of log entries */
+  uint64_t pool_size;
+  uint64_t flushed_sync_gen;     /* All writing entries with this or a lower
+                                  * sync gen number are flushed. */
+  uint32_t block_size;           /* block size */
+  uint32_t num_log_entries;
+  uint32_t first_free_entry;     /* Entry following the newest valid entry */
+  uint32_t first_valid_entry;    /* Index of the oldest valid entry in the log */
+};
+
+} // namespace rwl 
+} // namespace cache 
+} // namespace librbd 
+
+#endif // CEPH_LIBRBD_CACHE_RWL_TYPES_H
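
The static_assert above pins WriteLogPmemEntry to a single 64-byte cache line. A rough accounting of where those bytes go (padding is compiler-dependent, so treat this as a sketch rather than a guaranteed layout):

    #include <cstdint>
    #include <libpmemobj.h>

    //   4 x uint64_t header fields           32 bytes
    //   TOID(uint8_t) write_data (PMEMoid)   16 bytes
    //   flag bitfield                         1 byte  (+3 bytes padding)
    //   uint32_t ws_datalen                   4 bytes
    //   uint32_t entry_index                  4 bytes (+4 bytes tail padding)
    //                                        --------
    //                                        64 bytes
    static_assert(sizeof(PMEMoid) == 16, "persistent pointers are two 64-bit words");
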
index b7eec1d7b3428d7e7b66319fb67cce36940d4b75..64e67f9485ff32bb1e298e384427f30ad0169d48 100644 (file)
@@ -109,6 +109,13 @@ set(unittest_librbd_srcs
   trash/test_mock_RemoveRequest.cc
   watcher/test_mock_RewatchRequest.cc
   )
+
+if(WITH_RBD_RWL)
+   set(unittest_librbd_srcs
+     ${unittest_librbd_srcs}
+     cache/test_mock_ReplicatedWriteLog.cc)
+endif(WITH_RBD_RWL)
+
 add_executable(unittest_librbd
   ${unittest_librbd_srcs}
   $<TARGET_OBJECTS:common_texttable_obj>)
diff --git a/src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc b/src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc
new file mode 100644 (file)
index 0000000..43d4a82
--- /dev/null
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "common/hostname.h"
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "include/rbd/librbd.hpp"
+#include "librbd/cache/rwl/ImageCacheState.h"
+#include "librbd/cache/rwl/Types.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/ReplicatedWriteLog.h"
+
+
+namespace librbd {
+namespace {
+
+struct MockContextRWL : public C_SaferCond  {
+  MOCK_METHOD1(complete, void(int));
+  MOCK_METHOD1(finish, void(int));
+
+  void do_complete(int r) {
+    C_SaferCond::complete(r);
+  }
+};
+
+} // anonymous namespace
+
+namespace util {
+
+inline ImageCtx *get_image_ctx(MockImageCtx *image_ctx) {
+  return image_ctx->image_ctx;
+}
+
+} // namespace util
+} // namespace librbd
+
+#include "librbd/cache/ReplicatedWriteLog.cc"
+
+// template definitions
+#include "librbd/cache/ImageWriteback.cc"
+#include "librbd/cache/rwl/ImageCacheState.cc"
+
+template class librbd::cache::ImageWriteback<librbd::MockImageCtx>;
+template class librbd::cache::rwl::ImageCacheState<librbd::MockImageCtx>;
+
+namespace librbd {
+namespace cache {
+
+using ::testing::_;
+using ::testing::DoDefault;
+using ::testing::InSequence;
+using ::testing::Invoke;
+
+struct TestMockCacheReplicatedWriteLog : public TestMockFixture {
+  typedef ReplicatedWriteLog<librbd::MockImageCtx> MockReplicatedWriteLog;
+  typedef librbd::cache::rwl::ImageCacheState<librbd::MockImageCtx> MockImageCacheStateRWL;
+
+  MockImageCacheStateRWL *get_cache_state(MockImageCtx& mock_image_ctx) {
+    MockImageCacheStateRWL *rwl_state = new MockImageCacheStateRWL(&mock_image_ctx);
+    return rwl_state;
+  }
+
+  void validate_cache_state(librbd::ImageCtx *image_ctx,
+                            MockImageCacheStateRWL &state,
+                            bool present, bool empty, bool clean,
+                           string host="", string path="",
+                           uint64_t size=0) {
+    ConfigProxy &config = image_ctx->config;
+    ASSERT_EQ(present, state.present);
+    ASSERT_EQ(empty, state.empty);
+    ASSERT_EQ(clean, state.clean);
+    
+    if (host.empty())
+      host = ceph_get_short_hostname();
+    if (path.empty())
+      path = config.get_val<string>("rbd_rwl_path");
+    if (!size)
+      size = config.get_val<uint64_t>("rbd_rwl_size");
+    
+    ASSERT_EQ(host, state.host);
+    ASSERT_EQ(path, state.path);
+    ASSERT_EQ(size, state.size);
+    ASSERT_EQ(config.get_val<bool>("rbd_rwl_log_periodic_stats"),
+             state.log_periodic_stats);
+  }
+
+  void expect_op_work_queue(MockImageCtx& mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.op_work_queue, queue(_, _))
+      .WillRepeatedly(Invoke([](Context* ctx, int r) {
+                        ctx->complete(r);
+                      }));
+  }
+
+  void expect_context_complete(MockContextRWL& mock_context, int r) {
+    EXPECT_CALL(mock_context, complete(r))
+      .WillRepeatedly(Invoke([&mock_context](int r) {
+                  mock_context.do_complete(r);
+                }));
+  }
+
+  void expect_metadata_set(MockImageCtx& mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.operations, execute_metadata_set(_, _, _))
+      .WillRepeatedly(Invoke([](std::string key, std::string val, Context* ctx) {
+                        ctx->complete(0);
+                      }));
+  }
+
+  void expect_metadata_remove(MockImageCtx& mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.operations, execute_metadata_remove(_, _))
+      .WillRepeatedly(Invoke([](std::string key, Context* ctx) {
+                        ctx->complete(0);
+                      }));
+  }
+};
+
+TEST_F(TestMockCacheReplicatedWriteLog, init_state_write) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockImageCacheStateRWL image_cache_state(&mock_image_ctx);
+
+  validate_cache_state(ictx, image_cache_state, true, true, true);
+  
+  image_cache_state.empty = false;
+  image_cache_state.clean = false;
+  MockContextRWL finish_ctx;
+  expect_metadata_set(mock_image_ctx);
+  expect_context_complete(finish_ctx, 0);
+  image_cache_state.write_image_cache_state(&finish_ctx);
+  ASSERT_EQ(0, finish_ctx.wait());
+}
+
+static void get_jf(const string& s, JSONFormattable *f)
+{
+  JSONParser p;
+  bool result = p.parse(s.c_str(), s.size());
+  if (!result) {
+    cout << "Failed to parse: '" << s << "'" << std::endl;
+  }
+  ASSERT_EQ(true, result);
+  try {
+    decode_json_obj(*f, &p);
+  } catch (JSONDecoder::err& e) {
+    ASSERT_TRUE(0 == "Failed to decode JSON object");
+  }
+}
+
+TEST_F(TestMockCacheReplicatedWriteLog, init_state_json_write) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  JSONFormattable f;
+  string strf = "{ \"present\": \"1\", \"empty\": \"0\", \"clean\": \"0\", \
+                   \"rwl_host\": \"testhost\", \
+                   \"rwl_path\": \"/tmp\", \
+                   \"rwl_size\": \"1024\" }";
+  get_jf(strf, &f);
+  MockImageCacheStateRWL image_cache_state(&mock_image_ctx, f);
+
+  validate_cache_state(ictx, image_cache_state, true, false, false,
+                       "testhost", "/tmp", 1024);
+
+  MockContextRWL finish_ctx;
+  expect_metadata_remove(mock_image_ctx);
+  expect_context_complete(finish_ctx, 0);
+  image_cache_state.clear_image_cache_state(&finish_ctx);
+  ASSERT_EQ(0, finish_ctx.wait());
+}
+
+TEST_F(TestMockCacheReplicatedWriteLog, init_shutdown) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockReplicatedWriteLog rwl(mock_image_ctx, get_cache_state(mock_image_ctx));
+  MockContextRWL finish_ctx1;
+  expect_op_work_queue(mock_image_ctx);
+  expect_metadata_set(mock_image_ctx);
+
+  expect_context_complete(finish_ctx1, 0);
+  rwl.init(&finish_ctx1);
+  ASSERT_EQ(0, finish_ctx1.wait());
+
+  MockContextRWL finish_ctx2;
+  expect_context_complete(finish_ctx2, 0);
+  rwl.shut_down(&finish_ctx2);
+  ASSERT_EQ(0, finish_ctx2.wait());
+}
+
+} // namespace cache
+} // namespace librbd
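
With WITH_RBD_RWL enabled, these tests build into the existing unittest_librbd binary and can be run in isolation from the build directory against a test cluster, as with the other librbd unit tests (path and filter shown are illustrative):

    ./bin/unittest_librbd --gtest_filter='TestMockCacheReplicatedWriteLog.*'
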