CMAKE_DEPENDENT_OPTION(WITH_BLUESTORE_PMEM "Enable PMDK libraries" OFF
  "WITH_BLUESTORE" OFF)
+# RWL (replicated write log): librbd's persistent write-back cache.
+# Only offered when librbd itself (WITH_RBD) is being built.
+CMAKE_DEPENDENT_OPTION(WITH_RBD_RWL "Enable librbd persistent write back cache" OFF
+  "WITH_RBD" OFF)
+
CMAKE_DEPENDENT_OPTION(WITH_SYSTEM_PMDK "Require and build with system PMDK" OFF
-  "WITH_BLUESTORE_PMEM" OFF)
+  "WITH_RBD_RWL OR WITH_BLUESTORE_PMEM" OFF)
if(WITH_BLUESTORE_PMEM)
  set(HAVE_BLUESTORE_PMEM ON)
  list(APPEND ceph_common_deps common_async_dpdk)
endif()
-if(WITH_BLUESTORE_PMEM)
+if(WITH_BLUESTORE_PMEM OR WITH_RBD_RWL)
  if(WITH_SYSTEM_PMDK)
-    find_package(pmem REQUIRED)
+    # Each consumer requests only the PMDK component it links against:
+    # bluestore uses libpmem, the RWL uses libpmemobj.
+    if(WITH_BLUESTORE_PMEM)
+      find_package(pmem REQUIRED COMPONENTS pmem)
+    endif()
+    if(WITH_RBD_RWL)
+      find_package(pmem REQUIRED COMPONENTS pmemobj)
+    endif()
  else()
    include(Buildpmem)
    build_pmem()
.set_default(0)
.set_min(0)
.set_description("maximum io delay (in milliseconds) for simple io scheduler (if set to 0 dalay is calculated based on latency stats)"),
+
+    Option("rbd_rwl_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("enable persistent write back cache for this volume"),
+
+    Option("rbd_rwl_log_periodic_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("emit periodic perf stats to debug log"),
+
+    // Default and minimum are both 1 GiB; smaller cache sizes are rejected
+    // by the option's set_min constraint.
+    Option("rbd_rwl_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1073741824)
+    .set_min(1073741824)
+    .set_description("size of the persistent write back cache for this volume"),
+
+    Option("rbd_rwl_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/tmp")
+    .set_description("location of the persistent write back cache in a DAX-enabled filesystem on persistent memory"),
});
}
/* Define if unit tests are built. */
#cmakedefine UNIT_TESTS_BUILT
+/* Define if the librbd replicated write log (persistent write-back cache,
+ * cmake option WITH_RBD_RWL) is enabled. */
+#cmakedefine WITH_RBD_RWL
+
#endif /* CONFIG_H */
list(APPEND librbd_internal_srcs ../common/EventTrace.cc)
endif()
+# RWL sources are compiled only when the persistent write-back cache is
+# enabled.  Use list(APPEND ...) rather than the set(var ${var} ...) idiom:
+# it is the conventional, quoting-safe way to extend a CMake list.
+if(WITH_RBD_RWL)
+  list(APPEND librbd_internal_srcs
+    cache/rwl/Types.cc
+    cache/rwl/LogEntry.cc
+    cache/rwl/ImageCacheState.cc
+    cache/ReplicatedWriteLog.cc)
+endif()
add_library(rbd_api STATIC librbd.cc)
add_library(rbd_internal STATIC
  ceph_immutable_object_cache_lib
  osdc)
+# The RWL persists its log with PMDK; link the imported pmem targets
+# PRIVATE-ly so non-RWL builds carry no PMDK dependency.
+if(WITH_RBD_RWL)
+  target_link_libraries(rbd_internal PRIVATE
+    pmem::pmemobj
+    pmem::pmem)
+endif()
+
add_library(librbd ${CEPH_SHARED}
  librbd.cc)
if(WITH_LTTNG)
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <libpmemobj.h>
+#include "ReplicatedWriteLog.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/rwl/ImageCacheState.h"
+#include "librbd/cache/rwl/LogEntry.h"
+#include "librbd/cache/rwl/Types.h"
+#include <map>
+#include <vector>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::ReplicatedWriteLog: " << this << " " \
+ << __func__ << ": "
+
+/* Minimum unit of write buffer allocation in the pmem pool. */
+const uint32_t MIN_WRITE_ALLOC_SIZE = 512;
+/* Interval between periodic_stats() log lines when enabled. */
+const uint32_t LOG_STATS_INTERVAL_SECONDS = 5;
+
+/**** Log pool sizing constants ****/
+const uint64_t DEFAULT_POOL_SIZE = 1u<<30;
+const uint64_t MIN_POOL_SIZE = DEFAULT_POOL_SIZE;
+/* Fraction of the pool considered usable for data; the remainder is
+ * left as allocator overhead / headroom. */
+constexpr double USABLE_SIZE = (7.0 / 10);
+/* Per-allocation overhead assumed for each write buffer. */
+const uint64_t BLOCK_ALLOC_OVERHEAD_BYTES = 16;
+const uint8_t RWL_POOL_VERSION = 1;
+/* Hard cap on the number of log entries regardless of pool size. */
+const uint64_t MAX_LOG_ENTRIES = (1024 * 1024);
+
+namespace librbd {
+namespace cache {
+
+using namespace librbd::cache::rwl;
+
+typedef ReplicatedWriteLog<ImageCtx>::Extents Extents;
+
+/* Construct the cache for an image.  Takes ownership of cache_state
+ * (deleted in the destructor).  Heavy initialization -- opening or
+ * creating the pmem pool -- is deferred to init()/rwl_init(). */
+template <typename I>
+ReplicatedWriteLog<I>::ReplicatedWriteLog(I &image_ctx, librbd::cache::rwl::ImageCacheState<I>* cache_state)
+  : m_cache_state(cache_state),
+    m_rwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_rwl)),
+    m_image_ctx(image_ctx),
+    m_log_pool_config_size(DEFAULT_POOL_SIZE),
+    m_image_writeback(image_ctx),
+    m_lock("librbd::cache::ReplicatedWriteLog::m_lock",
+           true, true),
+    m_thread_pool(image_ctx.cct, "librbd::cache::ReplicatedWriteLog::thread_pool", "tp_rwl",
+                  4,
+                  ""),
+    m_work_queue("librbd::cache::ReplicatedWriteLog::work_queue",
+                 image_ctx.config.template get_val<uint64_t>("rbd_op_thread_timeout"),
+                 &m_thread_pool)
+{
+  CephContext *cct = m_image_ctx.cct;
+  /* Timer and its lock are shared per-CephContext; we do not own them. */
+  ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock);
+}
+
+/* Tear down: cancel the stats timer, stop worker threads, release the
+ * cache state.  Both the timer lock and m_lock are held so no stats
+ * callback or work item can race with destruction. */
+template <typename I>
+ReplicatedWriteLog<I>::~ReplicatedWriteLog() {
+  ldout(m_image_ctx.cct, 15) << "enter" << dendl;
+  {
+    std::lock_guard timer_locker(*m_timer_lock);
+    std::lock_guard locker(m_lock);
+    /* m_timer_ctx is nullptr if periodic stats were never armed --
+     * assumed SafeTimer::cancel_event tolerates that; TODO confirm. */
+    m_timer->cancel_event(m_timer_ctx);
+    m_thread_pool.stop();
+    /* NOTE(review): the pool opened/created in rwl_init() is only
+     * forgotten here, never pmemobj_close()d -- shut_down() is still a
+     * stub in this PR; confirm the close lands there. */
+    m_log_pool = nullptr;
+    /* We own m_cache_state (taken in the constructor). */
+    delete m_cache_state;
+    m_cache_state = nullptr;
+  }
+  ldout(m_image_ctx.cct, 15) << "exit" << dendl;
+}
+
+/* Create and register this cache's perf counters under the given name
+ * (the image id).  Counter indexes span l_librbd_rwl_first..last.
+ * Fixes vs. prior revision: corrected copy-pasted axis comments on the
+ * count histogram config, typos in two counter descriptions
+ * ("callerfor", "latecy"), and renamed the invalidate+discard counter,
+ * which previously reused the name "discard" already taken by the
+ * plain discard counter (duplicate keys in perf dumps). */
+template <typename I>
+void ReplicatedWriteLog<I>::perf_start(std::string name) {
+  PerfCountersBuilder plb(m_image_ctx.cct, name, l_librbd_rwl_first, l_librbd_rwl_last);
+
+  // Latency axis configuration for op histograms, values are in nanoseconds
+  PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
+    "Latency (nsec)",
+    PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
+    0,                               ///< Start at 0
+    5000,                            ///< Quantization unit is 5usec
+    16,                              ///< Ranges into the mS
+  };
+
+  // Op size axis configuration for op histogram y axis, values are in bytes
+  PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
+    "Request size (bytes)",
+    PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
+    0,                               ///< Start at 0
+    512,                             ///< Quantization unit is 512 bytes
+    16,                              ///< Writes up to >32k
+  };
+
+  // Num items configuration for op histogram y axis, values are in items
+  PerfHistogramCommon::axis_config_d op_hist_y_axis_count_config{
+    "Number of items",
+    PerfHistogramCommon::SCALE_LINEAR, ///< Item count in linear scale
+    0,                                 ///< Start at 0
+    1,                                 ///< Quantization unit is 1 item
+    32,                                ///< Up to 32 items
+  };
+
+  plb.add_u64_counter(l_librbd_rwl_rd_req, "rd", "Reads");
+  plb.add_u64_counter(l_librbd_rwl_rd_bytes, "rd_bytes", "Data size in reads");
+  plb.add_time_avg(l_librbd_rwl_rd_latency, "rd_latency", "Latency of reads");
+
+  plb.add_u64_counter(l_librbd_rwl_rd_hit_req, "hit_rd", "Reads completely hitting RWL");
+  plb.add_u64_counter(l_librbd_rwl_rd_hit_bytes, "rd_hit_bytes", "Bytes read from RWL");
+  plb.add_time_avg(l_librbd_rwl_rd_hit_latency, "hit_rd_latency", "Latency of read hits");
+
+  plb.add_u64_counter(l_librbd_rwl_rd_part_hit_req, "part_hit_rd", "reads partially hitting RWL");
+
+  plb.add_u64_counter(l_librbd_rwl_wr_req, "wr", "Writes");
+  plb.add_u64_counter(l_librbd_rwl_wr_req_def, "wr_def", "Writes deferred for resources");
+  plb.add_u64_counter(l_librbd_rwl_wr_req_def_lanes, "wr_def_lanes", "Writes deferred for lanes");
+  plb.add_u64_counter(l_librbd_rwl_wr_req_def_log, "wr_def_log", "Writes deferred for log entries");
+  plb.add_u64_counter(l_librbd_rwl_wr_req_def_buf, "wr_def_buf", "Writes deferred for buffers");
+  plb.add_u64_counter(l_librbd_rwl_wr_req_overlap, "wr_overlap", "Writes overlapping with prior in-progress writes");
+  plb.add_u64_counter(l_librbd_rwl_wr_req_queued, "wr_q_barrier", "Writes queued for prior barriers (aio_flush)");
+  plb.add_u64_counter(l_librbd_rwl_wr_bytes, "wr_bytes", "Data size in writes");
+
+  plb.add_u64_counter(l_librbd_rwl_log_ops, "log_ops", "Log appends");
+  plb.add_u64_avg(l_librbd_rwl_log_op_bytes, "log_op_bytes", "Average log append bytes");
+
+  plb.add_time_avg(
+    l_librbd_rwl_req_arr_to_all_t, "req_arr_to_all_t",
+    "Average arrival to allocation time (time deferred for overlap)");
+  plb.add_time_avg(
+    l_librbd_rwl_req_arr_to_dis_t, "req_arr_to_dis_t",
+    "Average arrival to dispatch time (includes time deferred for overlaps and allocation)");
+  plb.add_time_avg(
+    l_librbd_rwl_req_all_to_dis_t, "req_all_to_dis_t",
+    "Average allocation to dispatch time (time deferred for log resources)");
+  plb.add_time_avg(
+    l_librbd_rwl_wr_latency, "wr_latency",
+    "Latency of writes (persistent completion)");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_wr_latency_hist, "wr_latency_bytes_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_config,
+    "Histogram of write request latency (nanoseconds) vs. bytes written");
+  plb.add_time_avg(
+    l_librbd_rwl_wr_caller_latency, "caller_wr_latency",
+    "Latency of write completion to caller");
+  plb.add_time_avg(
+    l_librbd_rwl_nowait_req_arr_to_all_t, "req_arr_to_all_nw_t",
+    "Average arrival to allocation time (time deferred for overlap)");
+  plb.add_time_avg(
+    l_librbd_rwl_nowait_req_arr_to_dis_t, "req_arr_to_dis_nw_t",
+    "Average arrival to dispatch time (includes time deferred for overlaps and allocation)");
+  plb.add_time_avg(
+    l_librbd_rwl_nowait_req_all_to_dis_t, "req_all_to_dis_nw_t",
+    "Average allocation to dispatch time (time deferred for log resources)");
+  plb.add_time_avg(
+    l_librbd_rwl_nowait_wr_latency, "wr_latency_nw",
+    "Latency of writes (persistent completion) not deferred for free space");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_nowait_wr_latency_hist, "wr_latency_nw_bytes_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_config,
+    "Histogram of write request latency (nanoseconds) vs. bytes written for writes not deferred for free space");
+  plb.add_time_avg(
+    l_librbd_rwl_nowait_wr_caller_latency, "caller_wr_latency_nw",
+    "Latency of write completion to caller for writes not deferred for free space");
+  plb.add_time_avg(l_librbd_rwl_log_op_alloc_t, "op_alloc_t", "Average buffer pmemobj_reserve() time");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_log_op_alloc_t_hist, "op_alloc_t_bytes_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_config,
+    "Histogram of buffer pmemobj_reserve() time (nanoseconds) vs. bytes written");
+  plb.add_time_avg(l_librbd_rwl_log_op_dis_to_buf_t, "op_dis_to_buf_t", "Average dispatch to buffer persist time");
+  plb.add_time_avg(l_librbd_rwl_log_op_dis_to_app_t, "op_dis_to_app_t", "Average dispatch to log append time");
+  plb.add_time_avg(l_librbd_rwl_log_op_dis_to_cmp_t, "op_dis_to_cmp_t", "Average dispatch to persist completion time");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_log_op_dis_to_cmp_t_hist, "op_dis_to_cmp_t_bytes_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_config,
+    "Histogram of op dispatch to persist complete time (nanoseconds) vs. bytes written");
+
+  plb.add_time_avg(
+    l_librbd_rwl_log_op_buf_to_app_t, "op_buf_to_app_t",
+    "Average buffer persist to log append time (write data persist/replicate + wait for append time)");
+  plb.add_time_avg(
+    l_librbd_rwl_log_op_buf_to_bufc_t, "op_buf_to_bufc_t",
+    "Average buffer persist time (write data persist/replicate time)");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_log_op_buf_to_bufc_t_hist, "op_buf_to_bufc_t_bytes_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_config,
+    "Histogram of write buffer persist time (nanoseconds) vs. bytes written");
+  plb.add_time_avg(
+    l_librbd_rwl_log_op_app_to_cmp_t, "op_app_to_cmp_t",
+    "Average log append to persist complete time (log entry append/replicate + wait for complete time)");
+  plb.add_time_avg(
+    l_librbd_rwl_log_op_app_to_appc_t, "op_app_to_appc_t",
+    "Average log append to persist complete time (log entry append/replicate time)");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_log_op_app_to_appc_t_hist, "op_app_to_appc_t_bytes_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_config,
+    "Histogram of log append persist time (nanoseconds) (vs. op bytes)");
+
+  plb.add_u64_counter(l_librbd_rwl_discard, "discard", "Discards");
+  plb.add_u64_counter(l_librbd_rwl_discard_bytes, "discard_bytes", "Bytes discarded");
+  plb.add_time_avg(l_librbd_rwl_discard_latency, "discard_lat", "Discard latency");
+
+  plb.add_u64_counter(l_librbd_rwl_aio_flush, "aio_flush", "AIO flush (flush to RWL)");
+  plb.add_u64_counter(l_librbd_rwl_aio_flush_def, "aio_flush_def", "AIO flushes deferred for resources");
+  plb.add_time_avg(l_librbd_rwl_aio_flush_latency, "aio_flush_lat", "AIO flush latency");
+
+  plb.add_u64_counter(l_librbd_rwl_ws, "ws", "Write Sames");
+  plb.add_u64_counter(l_librbd_rwl_ws_bytes, "ws_bytes", "Write Same bytes to image");
+  plb.add_time_avg(l_librbd_rwl_ws_latency, "ws_lat", "Write Same latency");
+
+  plb.add_u64_counter(l_librbd_rwl_cmp, "cmp", "Compare and Write requests");
+  plb.add_u64_counter(l_librbd_rwl_cmp_bytes, "cmp_bytes", "Compare and Write bytes compared/written");
+  plb.add_time_avg(l_librbd_rwl_cmp_latency, "cmp_lat", "Compare and Write latency");
+  plb.add_u64_counter(l_librbd_rwl_cmp_fails, "cmp_fails", "Compare and Write compare fails");
+
+  plb.add_u64_counter(l_librbd_rwl_flush, "flush", "Flush (flush RWL)");
+  plb.add_u64_counter(l_librbd_rwl_invalidate_cache, "invalidate", "Invalidate RWL");
+  // Must not reuse the name "discard" (taken by l_librbd_rwl_discard above).
+  plb.add_u64_counter(l_librbd_rwl_invalidate_discard_cache, "invalidate_discard", "Discard and invalidate RWL");
+
+  plb.add_time_avg(l_librbd_rwl_append_tx_t, "append_tx_lat", "Log append transaction latency");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_append_tx_t_hist, "append_tx_lat_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_count_config,
+    "Histogram of log append transaction time (nanoseconds) vs. entries appended");
+  plb.add_time_avg(l_librbd_rwl_retire_tx_t, "retire_tx_lat", "Log retire transaction latency");
+  plb.add_u64_counter_histogram(
+    l_librbd_rwl_retire_tx_t_hist, "retire_tx_lat_histogram",
+    op_hist_x_axis_config, op_hist_y_axis_count_config,
+    "Histogram of log retire transaction time (nanoseconds) vs. entries retired");
+
+  m_perfcounter = plb.create_perf_counters();
+  m_image_ctx.cct->get_perfcounters_collection()->add(m_perfcounter);
+}
+
+/* Unregister and free the perf counters created by perf_start(). */
+template <typename I>
+void ReplicatedWriteLog<I>::perf_stop() {
+  ceph_assert(m_perfcounter);
+  m_image_ctx.cct->get_perfcounters_collection()->remove(m_perfcounter);
+  delete m_perfcounter;
+  // Null the pointer so any double perf_stop()/later use trips the
+  // assert above instead of touching freed memory.
+  m_perfcounter = nullptr;
+}
+
+/* Dump the full perf counter state (stats + histograms) as a single
+ * JSON blob into the debug log at level 1.  The JSON wrapper is
+ * assembled by hand around the Formatter output of the two dump calls. */
+template <typename I>
+void ReplicatedWriteLog<I>::log_perf() {
+  bufferlist bl;
+  Formatter *f = Formatter::create("json-pretty");
+  bl.append("Perf dump follows\n--- Begin perf dump ---\n");
+  bl.append("{\n");
+  stringstream ss;
+  utime_t now = ceph_clock_now();
+  ss << "\"test_time\": \"" << now << "\",";
+  ss << "\"image\": \"" << m_image_ctx.name << "\",";
+  bl.append(ss);
+  bl.append("\"stats\": ");
+  m_image_ctx.cct->get_perfcounters_collection()->dump_formatted(f, 0);
+  f->flush(bl);
+  bl.append(",\n\"histograms\": ");
+  m_image_ctx.cct->get_perfcounters_collection()->dump_formatted_histograms(f, 0);
+  f->flush(bl);
+  delete f;
+  bl.append("}\n--- End perf dump ---\n");
+  /* NUL-terminate so bl.c_str() below is a valid C string. */
+  bl.append('\0');
+  ldout(m_image_ctx.cct, 1) << bl.c_str() << dendl;
+}
+
+/* Log a one-line snapshot of cache occupancy counters (under m_lock). */
+template <typename I>
+void ReplicatedWriteLog<I>::periodic_stats() {
+  std::lock_guard locker(m_lock);
+  ldout(m_image_ctx.cct, 1) << "STATS: "
+                            << "m_free_log_entries=" << m_free_log_entries << ", "
+                            << "m_log_entries=" << m_log_entries.size() << ", "
+                            << "m_dirty_log_entries=" << m_dirty_log_entries.size() << ", "
+                            << "m_bytes_allocated=" << m_bytes_allocated << ", "
+                            << "m_bytes_cached=" << m_bytes_cached << ", "
+                            << "m_bytes_dirty=" << m_bytes_dirty << ", "
+                            << "bytes available=" << m_bytes_allocated_cap - m_bytes_allocated << ", "
+                            << "m_current_sync_gen=" << m_current_sync_gen << ", "
+                            << "m_flushed_sync_gen=" << m_flushed_sync_gen << ", "
+                            << dendl;
+}
+
+/* (Re-)arm the stats timer; caller must hold *m_timer_lock.  The timer
+ * callback logs stats and re-arms itself, so stats repeat every
+ * LOG_STATS_INTERVAL_SECONDS until m_periodic_stats_enabled is cleared. */
+template <typename I>
+void ReplicatedWriteLog<I>::arm_periodic_stats() {
+  ceph_assert(m_timer_lock->is_locked());
+  if (m_periodic_stats_enabled) {
+    m_timer_ctx = new LambdaContext(
+      [this](int r) {
+        /* m_timer_lock is held */
+        periodic_stats();
+        arm_periodic_stats();
+      });
+    m_timer->add_event_after(LOG_STATS_INTERVAL_SECONDS, m_timer_ctx);
+  }
+}
+
+/* Open or create the pmem log pool and populate the in-memory view of
+ * it.  Runs with m_lock held for the whole body; contexts queued on
+ * 'later' execute after the caller drops that lock.  Completes
+ * on_finish with 0 on success or a negative errno on failure.
+ * NOTE(review): pmemobj_create() does file I/O while m_lock is held --
+ * acceptable at init time, but worth confirming. */
+template <typename I>
+void ReplicatedWriteLog<I>::rwl_init(Context *on_finish, DeferredContexts &later) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+  TOID(struct WriteLogPoolRoot) pool_root;
+  ceph_assert(m_cache_state);
+  std::lock_guard locker(m_lock);
+  ceph_assert(!m_initialized);
+  ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl;
+  ldout(cct,5) << "rwl_size: " << m_cache_state->size << dendl;
+  std::string rwl_path = m_cache_state->path;
+  ldout(cct,5) << "rwl_path: " << rwl_path << dendl;
+
+  /* A ".poolset" file (replicated pool description) takes precedence
+   * over a plain ".pool" file of the same stem. */
+  std::string pool_name = m_image_ctx.md_ctx.get_pool_name();
+  std::string log_pool_name = rwl_path + "/rbd-rwl." + pool_name + "." + m_image_ctx.id + ".pool";
+  std::string log_poolset_name = rwl_path + "/rbd-rwl." + pool_name + "." + m_image_ctx.id + ".poolset";
+  m_log_pool_config_size = max(m_cache_state->size, MIN_POOL_SIZE);
+
+  if (access(log_poolset_name.c_str(), F_OK) == 0) {
+    m_log_pool_name = log_poolset_name;
+    m_log_is_poolset = true;
+  } else {
+    m_log_pool_name = log_pool_name;
+    ldout(cct, 5) << "Poolset file " << log_poolset_name
+                  << " not present (or can't open). Using unreplicated pool" << dendl;
+  }
+
+  /* A pool file with no matching image-metadata record is stale
+   * (e.g. left behind by a crashed client) -- remove it.
+   * NOTE(review): "metatata" typo in the log message below. */
+  if ((!m_cache_state->present) &&
+      (access(m_log_pool_name.c_str(), F_OK) == 0)) {
+    ldout(cct, 5) << "There's an existing pool/poolset file " << m_log_pool_name
+                  << ", While there's no cache in the image metatata." << dendl;
+    if (remove(m_log_pool_name.c_str()) != 0) {
+      lderr(cct) << "Failed to remove the pool/poolset file " << m_log_pool_name
+                 << dendl;
+      on_finish->complete(-errno);
+      return;
+    } else {
+      ldout(cct, 5) << "Removed the existing pool/poolset file." << dendl;
+    }
+  }
+
+  if (access(m_log_pool_name.c_str(), F_OK) != 0) {
+    /* No pool yet: create one.  pmemobj_create sets errno on failure. */
+    if ((m_log_pool =
+         pmemobj_create(m_log_pool_name.c_str(),
+                        m_rwl_pool_layout_name,
+                        m_log_pool_config_size,
+                        (S_IWUSR | S_IRUSR))) == NULL) {
+      lderr(cct) << "failed to create pool (" << m_log_pool_name << ")"
+                 << pmemobj_errormsg() << dendl;
+      m_cache_state->present = false;
+      m_cache_state->clean = true;
+      m_cache_state->empty = true;
+      /* TODO: filter/replace errnos that are meaningless to the caller */
+      on_finish->complete(-errno);
+      return;
+    }
+    m_cache_state->present = true;
+    m_cache_state->clean = true;
+    m_cache_state->empty = true;
+    pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+
+    /* new pool, calculate and store metadata */
+    /* Entry count = usable bytes / worst-case bytes per smallest write,
+     * capped at MAX_LOG_ENTRIES. */
+    size_t effective_pool_size = (size_t)(m_log_pool_config_size * USABLE_SIZE);
+    size_t small_write_size = MIN_WRITE_ALLOC_SIZE + BLOCK_ALLOC_OVERHEAD_BYTES + sizeof(struct WriteLogPmemEntry);
+    uint64_t num_small_writes = (uint64_t)(effective_pool_size / small_write_size);
+    if (num_small_writes > MAX_LOG_ENTRIES) {
+      num_small_writes = MAX_LOG_ENTRIES;
+    }
+    if (num_small_writes <= 2) {
+      lderr(cct) << "num_small_writes needs to > 2" << dendl;
+      on_finish->complete(-EINVAL);
+      return;
+    }
+    m_log_pool_actual_size = m_log_pool_config_size;
+    m_bytes_allocated_cap = effective_pool_size;
+    /* Log ring empty */
+    m_first_free_entry = 0;
+    m_first_valid_entry = 0;
+    /* Persist the pool root metadata atomically in one transaction. */
+    TX_BEGIN(m_log_pool) {
+      TX_ADD(pool_root);
+      D_RW(pool_root)->header.layout_version = RWL_POOL_VERSION;
+      D_RW(pool_root)->log_entries =
+        TX_ZALLOC(struct WriteLogPmemEntry,
+                  sizeof(struct WriteLogPmemEntry) * num_small_writes);
+      D_RW(pool_root)->pool_size = m_log_pool_actual_size;
+      D_RW(pool_root)->flushed_sync_gen = m_flushed_sync_gen;
+      D_RW(pool_root)->block_size = MIN_WRITE_ALLOC_SIZE;
+      D_RW(pool_root)->num_log_entries = num_small_writes;
+      D_RW(pool_root)->first_free_entry = m_first_free_entry;
+      D_RW(pool_root)->first_valid_entry = m_first_valid_entry;
+    } TX_ONCOMMIT {
+      m_total_log_entries = D_RO(pool_root)->num_log_entries;
+      m_free_log_entries = D_RO(pool_root)->num_log_entries - 1; // leave one free
+    } TX_ONABORT {
+      m_total_log_entries = 0;
+      m_free_log_entries = 0;
+      lderr(cct) << "failed to initialize pool (" << m_log_pool_name << ")" << dendl;
+      on_finish->complete(-pmemobj_tx_errno());
+      /* NOTE(review): returning from inside a TX_* section skips
+       * TX_FINALLY/TX_END; confirm libpmemobj permits leaving the
+       * transaction block this way. */
+      return;
+    } TX_FINALLY {
+    } TX_END;
+  } else {
+    // TODO: load existed cache. This will be covered in later PR.
+  }
+
+  ldout(cct,1) << "pool " << m_log_pool_name << " has " << m_total_log_entries
+               << " log entries, " << m_free_log_entries << " of which are free."
+               << " first_valid=" << m_first_valid_entry
+               << ", first_free=" << m_first_free_entry
+               << ", flushed_sync_gen=" << m_flushed_sync_gen
+               << ", current_sync_gen=" << m_current_sync_gen << dendl;
+  if (m_first_free_entry == m_first_valid_entry) {
+    ldout(cct,1) << "write log is empty" << dendl;
+    m_cache_state->empty = true;
+  }
+
+  // TODO: Will init sync point, this will be covered in later PR.
+  // init_flush_new_sync_point(later);
+
+  m_initialized = true;
+  // Start the thread
+  m_thread_pool.start();
+
+  m_periodic_stats_enabled = m_cache_state->log_periodic_stats;
+  /* Do these after we drop m_lock */
+  later.add(new LambdaContext([this](int r) {
+    if (m_periodic_stats_enabled) {
+      /* Log stats for the first time */
+      periodic_stats();
+      /* Arm periodic stats logging for the first time */
+      std::lock_guard timer_locker(*m_timer_lock);
+      arm_periodic_stats();
+    }
+  }));
+  m_image_ctx.op_work_queue->queue(on_finish, 0);
+}
+
+/* Persist the current cache state into the image metadata. */
+template <typename I>
+void ReplicatedWriteLog<I>::update_image_cache_state(Context *on_finish) {
+  m_cache_state->write_image_cache_state(on_finish);
+}
+
+/* Public init entry point: register perf counters, then run rwl_init().
+ * On rwl_init success the updated cache state is written back to image
+ * metadata before on_finish completes. */
+template <typename I>
+void ReplicatedWriteLog<I>::init(Context *on_finish) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+  /* Perf counters are keyed by image id. */
+  perf_start(m_image_ctx.id);
+
+  ceph_assert(!m_initialized);
+
+  Context *ctx = new LambdaContext(
+    [this, on_finish](int r) {
+      if (r >= 0) {
+        update_image_cache_state(on_finish);
+      } else {
+        on_finish->complete(r);
+      }
+    });
+
+  /* Contexts queued on 'later' run when it goes out of scope here,
+   * i.e. after rwl_init() has released m_lock. */
+  DeferredContexts later;
+  rwl_init(ctx, later);
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::shut_down(Context *on_finish) {
+  // TODO: Covered in a later PR.
+  on_finish->complete(0);
+}
+
+/* The AIO entry points below are intentionally empty stubs in this PR;
+ * their implementations land in follow-up PRs. */
+template <typename I>
+void ReplicatedWriteLog<I>::aio_read(Extents&& image_extents,
+                                     ceph::bufferlist* bl,
+                                     int fadvise_flags, Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::aio_write(Extents &&image_extents,
+                                      bufferlist&& bl,
+                                      int fadvise_flags,
+                                      Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::aio_discard(uint64_t offset, uint64_t length,
+                                        uint32_t discard_granularity_bytes, Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::aio_flush(Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::aio_writesame(uint64_t offset, uint64_t length,
+                                          bufferlist&& bl, int fadvise_flags,
+                                          Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::aio_compare_and_write(Extents &&image_extents,
+                                                  bufferlist&& cmp_bl,
+                                                  bufferlist&& bl,
+                                                  uint64_t *mismatch_offset,
+                                                  int fadvise_flags,
+                                                  Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::flush(Context *on_finish) {
+}
+
+template <typename I>
+void ReplicatedWriteLog<I>::invalidate(Context *on_finish) {
+}
+
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::ReplicatedWriteLog<librbd::ImageCtx>;
+// NOTE(review): explicitly instantiating the abstract ImageCache
+// interface below looks unnecessary -- confirm it is actually required.
+template class librbd::cache::ImageCache<librbd::ImageCtx>;
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
+#define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
+
+#include <libpmemobj.h>
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "common/AsyncOpTracker.h"
+#include "librbd/cache/ImageCache.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/Utils.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/cache/Types.h"
+#include <atomic>
+#include <functional>
+#include <list>
+#include <string>
+
+class Context;
+class SafeTimer;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+
+namespace rwl {
+
+class GenericLogEntry;
+typedef std::list<std::shared_ptr<GenericLogEntry>> GenericLogEntries;
+
+class DeferredContexts;
+template <typename> class ImageCacheState;
+} // namespace rwl
+
+
+/**
+ * ReplicatedWriteLog: librbd's persistent write-back cache, backed by a
+ * libpmemobj pool on DAX-capable persistent memory.  Implements the
+ * ImageCache interface; writes are logged to pmem and flushed back to
+ * the image via m_image_writeback.  Takes ownership of the
+ * ImageCacheState passed to the constructor.
+ */
+template <typename ImageCtxT>
+class ReplicatedWriteLog : public ImageCache<ImageCtxT> {
+public:
+  using typename ImageCache<ImageCtxT>::Extents;
+
+  ReplicatedWriteLog(ImageCtxT &image_ctx, librbd::cache::rwl::ImageCacheState<ImageCtxT>* cache_state);
+  // NOTE(review): relies on ImageCache declaring a virtual destructor
+  // for deletion through the base pointer -- confirm.
+  ~ReplicatedWriteLog();
+  ReplicatedWriteLog(const ReplicatedWriteLog&) = delete;
+  ReplicatedWriteLog &operator=(const ReplicatedWriteLog&) = delete;
+
+  /// client AIO methods
+  void aio_read(Extents&& image_extents, ceph::bufferlist *bl,
+                int fadvise_flags, Context *on_finish) override;
+  void aio_write(Extents&& image_extents, ceph::bufferlist&& bl,
+                 int fadvise_flags, Context *on_finish) override;
+  void aio_discard(uint64_t offset, uint64_t length,
+                   uint32_t discard_granularity_bytes,
+                   Context *on_finish) override;
+  void aio_flush(Context *on_finish) override;
+  void aio_writesame(uint64_t offset, uint64_t length,
+                     ceph::bufferlist&& bl,
+                     int fadvise_flags, Context *on_finish) override;
+  void aio_compare_and_write(Extents&& image_extents,
+                             ceph::bufferlist&& cmp_bl, ceph::bufferlist&& bl,
+                             uint64_t *mismatch_offset,int fadvise_flags,
+                             Context *on_finish) override;
+
+  /// internal state methods
+  void init(Context *on_finish) override;
+  void shut_down(Context *on_finish) override;
+  void invalidate(Context *on_finish);
+  void flush(Context *on_finish) override;
+
+private:
+  // Owned; allocated by the caller, deleted in our destructor.
+  librbd::cache::rwl::ImageCacheState<ImageCtxT>* m_cache_state = nullptr;
+
+  std::atomic<bool> m_initialized = {false};
+  // Opened/created in rwl_init(); see .cc for lifetime notes.
+  PMEMobjpool *m_log_pool = nullptr;
+  const char* m_rwl_pool_layout_name;
+
+  ImageCtxT &m_image_ctx;
+
+  std::string m_log_pool_name;
+  bool m_log_is_poolset = false;
+  uint64_t m_log_pool_config_size; /* Configured size of RWL */
+  uint64_t m_log_pool_actual_size = 0; /* Actual size of RWL pool */
+
+  uint32_t m_total_log_entries = 0;
+  uint32_t m_free_log_entries = 0;
+
+  std::atomic<uint64_t> m_bytes_allocated = {0}; /* Total bytes allocated in write buffers */
+  uint64_t m_bytes_cached = 0;    /* Total bytes used in write buffers */
+  uint64_t m_bytes_dirty = 0;     /* Total bytes yet to flush to RBD */
+  uint64_t m_bytes_allocated_cap = 0;
+
+  ImageWriteback<ImageCtxT> m_image_writeback;
+
+  /*
+   * When m_first_free_entry == m_first_valid_entry, the log is
+   * empty. There is always at least one free entry, which can't be
+   * used.
+   */
+  uint64_t m_first_free_entry = 0;  /* Entries from here to m_first_valid_entry-1 are free */
+  uint64_t m_first_valid_entry = 0; /* Entries from here to m_first_free_entry-1 are valid */
+
+  /* Starts at 0 for a new write log. Incremented on every flush. */
+  uint64_t m_current_sync_gen = 0;
+  /* Starts at 0 on each sync gen increase. Incremented before applied
+     to an operation */
+  uint64_t m_last_op_sequence_num = 0;
+  /* All writes bearing this and all prior sync gen numbers are flushed */
+  uint64_t m_flushed_sync_gen = 0;
+
+  /* Acquire locks in order declared here */
+
+  /* Used for most synchronization */
+  mutable ceph::mutex m_lock;
+
+  librbd::cache::Contexts m_flush_complete_contexts;
+
+  /* New entries are at the back. Oldest at the front */
+  rwl::GenericLogEntries m_log_entries;
+  rwl::GenericLogEntries m_dirty_log_entries;
+
+  PerfCounters *m_perfcounter = nullptr;
+
+  /* Initialized from config, then set false during shutdown */
+  std::atomic<bool> m_periodic_stats_enabled = {false};
+  SafeTimer *m_timer = nullptr; /* Used with m_timer_lock */
+  mutable ceph::mutex *m_timer_lock = nullptr; /* Used with and by m_timer */
+  Context *m_timer_ctx = nullptr;
+
+  ThreadPool m_thread_pool;
+  ContextWQ m_work_queue;
+
+  void perf_start(const std::string name);
+  void perf_stop();
+  void log_perf();
+  void periodic_stats();
+  void arm_periodic_stats();
+
+  void rwl_init(Context *on_finish, rwl::DeferredContexts &later);
+  void update_image_cache_state(Context *on_finish);
+};
+
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::ReplicatedWriteLog<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/rwl/ImageCacheState.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Operations.h"
+#include "common/environment.h"
+#include "common/hostname.h"
+#include "common/config_proxy.h"
+#include "common/ceph_json.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::rwl::ImageCacheState: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace rwl {
+
+// Image-metadata key under which the cache state JSON is stored.
+template <typename I>
+const std::string ImageCacheState<I>::image_cache_state = ".librbd/image_cache_state";
+
+/* Build a fresh cache state from this image's configuration values. */
+template <typename I>
+ImageCacheState<I>::ImageCacheState(I *image_ctx) : m_image_ctx(image_ctx) {
+  ldout(image_ctx->cct, 20) << "Initialize RWL cache state with config data. " << dendl;
+
+  ConfigProxy &config = image_ctx->config;
+  host = ceph_get_short_hostname();
+  path = config.get_val<std::string>("rbd_rwl_path");
+  size = config.get_val<uint64_t>("rbd_rwl_size");
+  log_periodic_stats = config.get_val<bool>("rbd_rwl_log_periodic_stats");
+}
+
+/* Rebuild cache state from the JSON previously stored in image
+ * metadata (see write_image_cache_state()).  JSONFormattable values
+ * are stored as strings, hence the istringstream parse of rwl_size.
+ * NOTE(review): a missing/non-numeric "rwl_size" leaves rwl_size
+ * unparsed -- confirm desired behavior for malformed metadata. */
+template <typename I>
+ImageCacheState<I>::ImageCacheState(I *image_ctx, JSONFormattable &f) : m_image_ctx(image_ctx) {
+  ldout(image_ctx->cct, 20) << "Initialize RWL cache state with data from server side" << dendl;
+
+  present = (bool)f["present"];
+  empty = (bool)f["empty"];
+  clean = (bool)f["clean"];
+  host = (string)f["rwl_host"];
+  path = (string)f["rwl_path"];
+  uint64_t rwl_size;
+  std::istringstream iss(f["rwl_size"]);
+  iss >> rwl_size;
+  size = rwl_size;
+
+  // Others from config
+  ConfigProxy &config = image_ctx->config;
+  log_periodic_stats = config.get_val<bool>("rbd_rwl_log_periodic_stats");
+}
+
+/* Serialize this state (via dump()) and store it under the
+ * image_cache_state metadata key; on_finish completes when the
+ * metadata update does. */
+template <typename I>
+void ImageCacheState<I>::write_image_cache_state(Context *on_finish) {
+  std::shared_lock owner_lock{m_image_ctx->owner_lock};
+  JSONFormattable f;
+  ::encode_json(image_cache_state.c_str(), *this, &f);
+  std::ostringstream oss;
+  f.flush(oss);
+  std::string image_state_json = oss.str();
+
+  ldout(m_image_ctx->cct, 20) << __func__ << " Store state: " << image_state_json << dendl;
+  m_image_ctx->operations->execute_metadata_set(image_cache_state.c_str(), image_state_json, on_finish);
+}
+
+/* Remove the stored cache-state metadata key from the image. */
+template <typename I>
+void ImageCacheState<I>::clear_image_cache_state(Context *on_finish) {
+  std::shared_lock owner_lock{m_image_ctx->owner_lock};
+  ldout(m_image_ctx->cct, 20) << __func__ << " Remove state: " << dendl;
+  m_image_ctx->operations->execute_metadata_remove(image_cache_state.c_str(), on_finish);
+}
+
+/* encode_json hook: emit all persisted fields.  Keys here must match
+ * what the JSONFormattable constructor above reads back. */
+template <typename I>
+void ImageCacheState<I>::dump(ceph::Formatter *f) const {
+  ::encode_json("present", present, f);
+  ::encode_json("empty", empty, f);
+  ::encode_json("clean", clean, f);
+  ::encode_json("cache_type", (int)get_image_cache_type(), f);
+  ::encode_json("rwl_host", host, f);
+  ::encode_json("rwl_path", path, f);
+  ::encode_json("rwl_size", size, f);
+}
+
+} // namespace rwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::rwl::ImageCacheState<librbd::ImageCtx>;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
+#define CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
+
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/Types.h"
+#include <string>
+
+class JSONFormattable;
+namespace ceph {
+ class Formatter;
+}
+
+namespace librbd {
+namespace cache {
+namespace rwl {
+
+/**
+ * Per-image RWL cache state, persisted as JSON in image metadata.
+ * Constructed either from configuration (new cache) or from the JSON
+ * previously stored on the image.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ImageCacheState {
+private:
+  ImageCtxT* m_image_ctx;
+public:
+  // Pool presence/cleanliness flags, as recorded in image metadata.
+  bool present = true;
+  bool empty = true;
+  bool clean = true;
+  // Image-metadata key under which this state is stored.
+  static const std::string image_cache_state;
+  std::string host;
+  std::string path;
+  // Default-initialized so a state whose source (config or JSON) does
+  // not supply these fields never exposes indeterminate values.
+  uint64_t size = 0;
+  bool log_periodic_stats = false;
+
+  // Build state from this image's configuration.
+  ImageCacheState(ImageCtxT* image_ctx);
+
+  // Rebuild state from JSON stored in image metadata.
+  ImageCacheState(ImageCtxT* image_ctx, JSONFormattable& f);
+
+  ~ImageCacheState() {}
+
+  ImageCacheType get_image_cache_type() const {
+    return IMAGE_CACHE_TYPE_RWL;
+  }
+
+  // Persist this state to / remove it from image metadata; both
+  // complete on_finish when the metadata operation finishes.
+  void write_image_cache_state(Context *on_finish);
+
+  void clear_image_cache_state(Context *on_finish);
+
+  // encode_json hook; keys mirror the JSONFormattable constructor.
+  void dump(ceph::Formatter *f) const;
+};
+
+} // namespace rwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::rwl::ImageCacheState<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "LogEntry.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::rwl::LogEntry: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+
+namespace cache {
+
+namespace rwl {
+
+// The entry-type predicates simply delegate to the in-RAM copy of the
+// pmem log entry; see WriteLogPmemEntry in Types.cc for the semantics.
+bool GenericLogEntry::is_sync_point() {
+  return ram_entry.is_sync_point();
+}
+
+bool GenericLogEntry::is_discard() {
+  return ram_entry.is_discard();
+}
+
+bool GenericLogEntry::is_writesame() {
+  return ram_entry.is_writesame();
+}
+
+bool GenericLogEntry::is_write() {
+  return ram_entry.is_write();
+}
+
+bool GenericLogEntry::is_writer() {
+  return ram_entry.is_writer();
+}
+
+// Definition of the virtual formatter declared in LogEntry.h. Without the
+// GenericLogEntry:: qualification this was an unrelated free function,
+// leaving the declared virtual member undefined, so operator<< (which calls
+// entry.format()) would fail at link time.
+std::ostream &GenericLogEntry::format(std::ostream &os,
+                                      const GenericLogEntry &entry) const {
+  os << "ram_entry=[" << entry.ram_entry << "], "
+     << "pmem_entry=" << (void*)entry.pmem_entry << ", "
+     << "log_entry_index=" << entry.log_entry_index << ", "
+     << "completed=" << entry.completed;
+  return os;
+}
+
+// Stream insertion dispatches through the virtual format() so derived
+// log entry types can extend the output.
+std::ostream &operator<<(std::ostream &os,
+                         const GenericLogEntry &entry) {
+  return entry.format(os, entry);
+}
+
+} // namespace rwl
+} // namespace cache
+} // namespace librbd
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
+#define CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
+
+#include "librbd/cache/rwl/Types.h"
+
+namespace librbd {
+namespace cache {
+namespace rwl {
+
+// Abstract base (write_bytes() is pure virtual) for in-RAM log entries.
+// Pairs an in-RAM copy of the pmem entry with a pointer to its persistent
+// counterpart. Non-copyable.
+// NOTE(review): std::ostream is used below but neither <ostream> nor
+// <iosfwd> is included here -- presumably pulled in via Types.h; confirm.
+class GenericLogEntry {
+public:
+  WriteLogPmemEntry ram_entry;               // in-RAM (volatile) copy of the entry
+  WriteLogPmemEntry *pmem_entry = nullptr;   // corresponding entry in the pmem pool
+  uint32_t log_entry_index = 0;              // index of this entry in the log
+  bool completed = false;                    // entry has been persisted/completed
+  GenericLogEntry(const uint64_t image_offset_bytes = 0, const uint64_t write_bytes = 0)
+    : ram_entry(image_offset_bytes, write_bytes) {
+  };
+  virtual ~GenericLogEntry() { };
+  GenericLogEntry(const GenericLogEntry&) = delete;
+  GenericLogEntry &operator=(const GenericLogEntry&) = delete;
+  // Number of payload bytes this entry writes; defined by derived types.
+  virtual unsigned int write_bytes() = 0;
+  // Entry-type predicates; delegate to ram_entry (see LogEntry.cc).
+  bool is_sync_point();
+  bool is_discard();
+  bool is_writesame();
+  bool is_write();
+  bool is_writer();
+  virtual std::ostream &format(std::ostream &os, const GenericLogEntry &entry) const;
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const GenericLogEntry &entry);
+};
+
+} // namespace rwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "Types.h"
+#include "common/ceph_context.h"
+#include "include/Context.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::rwl::Types: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+
+namespace cache {
+
+namespace rwl {
+
+// Complete all deferred contexts with r=0 when this object goes out of
+// scope (i.e. after the caller has dropped whatever lock it was holding).
+DeferredContexts::~DeferredContexts() {
+  finish_contexts(nullptr, contexts, 0);
+}
+
+// Queue a context to be completed at destruction time.
+void DeferredContexts::add(Context* ctx) {
+  contexts.push_back(ctx);
+}
+
+// Entry-type predicates over the persisted bitfield flags.
+bool WriteLogPmemEntry::is_sync_point() {
+  return sync_point;
+}
+
+bool WriteLogPmemEntry::is_discard() {
+  return discard;
+}
+
+bool WriteLogPmemEntry::is_writesame() {
+  return writesame;
+}
+
+bool WriteLogPmemEntry::is_write() {
+  /* Log entry is a basic write */
+  return !is_sync_point() && !is_discard() && !is_writesame();
+}
+
+bool WriteLogPmemEntry::is_writer() {
+  /* Log entry is any type that writes data */
+  return is_write() || is_discard() || is_writesame();
+}
+
+// Accessors for the persisted extent fields.
+// NOTE(review): `const` on a by-value return type has no effect; the
+// qualifier matches the declarations in Types.h, so both would need to
+// change together if cleaned up.
+const uint64_t WriteLogPmemEntry::get_offset_bytes() {
+  return image_offset_bytes;
+}
+
+const uint64_t WriteLogPmemEntry::get_write_bytes() {
+  return write_bytes;
+}
+
+// Render every persisted field; the uint8_t bitfields are cast to bool so
+// they print as 0/1 instead of raw characters.
+// (Also drops a stray `;` after the function body, which triggered
+// -Wextra-semi style warnings.)
+std::ostream& operator<<(std::ostream& os,
+                         const WriteLogPmemEntry &entry) {
+  os << "entry_valid=" << (bool)entry.entry_valid << ", "
+     << "sync_point=" << (bool)entry.sync_point << ", "
+     << "sequenced=" << (bool)entry.sequenced << ", "
+     << "has_data=" << (bool)entry.has_data << ", "
+     << "discard=" << (bool)entry.discard << ", "
+     << "writesame=" << (bool)entry.writesame << ", "
+     << "sync_gen_number=" << entry.sync_gen_number << ", "
+     << "write_sequence_number=" << entry.write_sequence_number << ", "
+     << "image_offset_bytes=" << entry.image_offset_bytes << ", "
+     << "write_bytes=" << entry.write_bytes << ", "
+     << "ws_datalen=" << entry.ws_datalen << ", "
+     << "entry_index=" << entry.entry_index;
+  return os;
+}
+
+} // namespace rwl
+} // namespace cache
+} // namespace librbd
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_TYPES_H
+#define CEPH_LIBRBD_CACHE_RWL_TYPES_H
+
+#include <vector>
+#include <libpmemobj.h>
+
+class Context;
+
+// Perf counter indices for the RWL cache; the range [l_librbd_rwl_first,
+// l_librbd_rwl_last) is registered with the perf counter framework.
+enum {
+  l_librbd_rwl_first = 26500,
+
+  // All read requests
+  l_librbd_rwl_rd_req, // read requests
+  l_librbd_rwl_rd_bytes, // bytes read
+  l_librbd_rwl_rd_latency, // average req completion latency
+
+  // Read requests completed from RWL (no misses)
+  l_librbd_rwl_rd_hit_req, // read requests
+  l_librbd_rwl_rd_hit_bytes, // bytes read
+  l_librbd_rwl_rd_hit_latency, // average req completion latency
+
+  // Read requests with hit and miss extents
+  l_librbd_rwl_rd_part_hit_req, // read ops
+
+  // All write requests
+  l_librbd_rwl_wr_req, // write requests
+  l_librbd_rwl_wr_req_def, // write requests deferred for resources
+  l_librbd_rwl_wr_req_def_lanes, // write requests deferred for lanes
+  l_librbd_rwl_wr_req_def_log, // write requests deferred for log entries
+  l_librbd_rwl_wr_req_def_buf, // write requests deferred for buffer space
+  l_librbd_rwl_wr_req_overlap, // write requests detained for overlap
+  l_librbd_rwl_wr_req_queued, // write requests queued for prior barrier
+  l_librbd_rwl_wr_bytes, // bytes written
+
+  // Write log operations (1 .. n per request that appends to the log)
+  l_librbd_rwl_log_ops, // log append ops
+  l_librbd_rwl_log_op_bytes, // average bytes written per log op
+
+  /*
+
+  Req and op average latencies to the beginning of and over various phases:
+
+  +------------------------------+------+-------------------------------+
+  | Phase | Name | Description |
+  +------------------------------+------+-------------------------------+
+  | Arrive at RWL | arr |Arrives as a request |
+  +------------------------------+------+-------------------------------+
+  | Allocate resources | all |time spent in block guard for |
+  | | |overlap sequencing occurs |
+  | | |before this point |
+  +------------------------------+------+-------------------------------+
+  | Dispatch | dis |Op lifetime begins here. time |
+  | | |spent in allocation waiting for|
+  | | |resources occurs before this |
+  | | |point |
+  +------------------------------+------+-------------------------------+
+  | Payload buffer persist and | buf |time spent queued for |
+  |replicate | |replication occurs before here |
+  +------------------------------+------+-------------------------------+
+  | Payload buffer persist | bufc |bufc - buf is just the persist |
+  |complete | |time |
+  +------------------------------+------+-------------------------------+
+  | Log append | app |time spent queued for append |
+  | | |occurs before here |
+  +------------------------------+------+-------------------------------+
+  | Append complete | appc |appc - app is just the time |
+  | | |spent in the append operation |
+  +------------------------------+------+-------------------------------+
+  | Complete | cmp |write persisted, replicated, |
+  | | |and globally visible |
+  +------------------------------+------+-------------------------------+
+
+  */
+
+  /* Request times */
+  l_librbd_rwl_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard
+  l_librbd_rwl_req_arr_to_dis_t, // arrival to dispatch elapsed time
+  l_librbd_rwl_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources
+  l_librbd_rwl_wr_latency, // average req (persist) completion latency
+  l_librbd_rwl_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written
+  l_librbd_rwl_wr_caller_latency, // average req completion (to caller) latency
+
+  /* Request times for requests that never waited for space*/
+  l_librbd_rwl_nowait_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard
+  l_librbd_rwl_nowait_req_arr_to_dis_t, // arrival to dispatch elapsed time
+  l_librbd_rwl_nowait_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources
+  l_librbd_rwl_nowait_wr_latency, // average req (persist) completion latency
+  l_librbd_rwl_nowait_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written
+  l_librbd_rwl_nowait_wr_caller_latency, // average req completion (to caller) latency
+
+  /* Log operation times */
+  l_librbd_rwl_log_op_alloc_t, // elapsed time of pmemobj_reserve()
+  l_librbd_rwl_log_op_alloc_t_hist, // Histogram of elapsed time of pmemobj_reserve()
+
+  l_librbd_rwl_log_op_dis_to_buf_t, // dispatch to buffer persist elapsed time
+  l_librbd_rwl_log_op_dis_to_app_t, // dispatch to log append elapsed time
+  l_librbd_rwl_log_op_dis_to_cmp_t, // dispatch to persist completion elapsed time
+  l_librbd_rwl_log_op_dis_to_cmp_t_hist, // Histogram of dispatch to persist completion elapsed time
+
+  l_librbd_rwl_log_op_buf_to_app_t, // data buf persist + append wait time
+  l_librbd_rwl_log_op_buf_to_bufc_t,// data buf persist / replicate elapsed time
+  l_librbd_rwl_log_op_buf_to_bufc_t_hist,// data buf persist time vs bytes histogram
+  l_librbd_rwl_log_op_app_to_cmp_t, // log entry append + completion wait time
+  l_librbd_rwl_log_op_app_to_appc_t, // log entry append / replicate elapsed time
+  l_librbd_rwl_log_op_app_to_appc_t_hist, // log entry append time (vs. op bytes) histogram
+
+  l_librbd_rwl_discard,
+  l_librbd_rwl_discard_bytes,
+  l_librbd_rwl_discard_latency,
+
+  l_librbd_rwl_aio_flush,
+  l_librbd_rwl_aio_flush_def,
+  l_librbd_rwl_aio_flush_latency,
+  l_librbd_rwl_ws,
+  l_librbd_rwl_ws_bytes, // Bytes modified by write same, probably much larger than WS payload bytes
+  l_librbd_rwl_ws_latency,
+
+  l_librbd_rwl_cmp,
+  l_librbd_rwl_cmp_bytes,
+  l_librbd_rwl_cmp_latency,
+  l_librbd_rwl_cmp_fails,
+
+  l_librbd_rwl_flush,
+  l_librbd_rwl_invalidate_cache,
+  l_librbd_rwl_invalidate_discard_cache,
+
+  l_librbd_rwl_append_tx_t,
+  l_librbd_rwl_retire_tx_t,
+  l_librbd_rwl_append_tx_t_hist,
+  l_librbd_rwl_retire_tx_t_hist,
+
+  l_librbd_rwl_last,
+};
+
+namespace librbd {
+namespace cache {
+namespace rwl {
+
+/* Defer a set of Contexts until destruct/exit. Used for deferring
+ * work on a given thread until a required lock is dropped. The queued
+ * contexts are completed with r=0 by the destructor (see Types.cc). */
+class DeferredContexts {
+private:
+  std::vector<Context*> contexts;
+public:
+  ~DeferredContexts();
+  void add(Context* ctx);
+};
+
+/* Pmem structures */
+POBJ_LAYOUT_BEGIN(rbd_rwl);
+POBJ_LAYOUT_ROOT(rbd_rwl, struct WriteLogPoolRoot);
+POBJ_LAYOUT_TOID(rbd_rwl, uint8_t);
+POBJ_LAYOUT_TOID(rbd_rwl, struct WriteLogPmemEntry);
+POBJ_LAYOUT_END(rbd_rwl);
+
+// On-pmem log entry. This layout is persisted in the pmemobj pool, so any
+// change is an on-media format change; the static_assert below pins the
+// size to 64 bytes.
+struct WriteLogPmemEntry {
+  uint64_t sync_gen_number = 0;        // sync gen number this entry belongs to
+  uint64_t write_sequence_number = 0;  // valid only when `sequenced` is set
+  uint64_t image_offset_bytes;         // extent offset within the image
+  uint64_t write_bytes;                // extent length in bytes
+  TOID(uint8_t) write_data;            // pmem oid of the payload buffer
+  struct {
+    uint8_t entry_valid :1; /* if 0, this entry is free */
+    uint8_t sync_point :1;  /* No data. No write sequence number. Marks sync
+                               point for this sync gen number */
+    uint8_t sequenced :1;   /* write sequence number is valid */
+    uint8_t has_data :1;    /* write_data field is valid (else ignore) */
+    uint8_t discard :1;     /* has_data will be 0 if this is a discard */
+    uint8_t writesame :1;   /* ws_datalen indicates length of data at write_bytes */
+  };
+  uint32_t ws_datalen = 0;  /* Length of data buffer (writesame only) */
+  uint32_t entry_index = 0; /* For debug consistency check. Can be removed if
+                             * we need the space */
+  WriteLogPmemEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+    : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes),
+      entry_valid(0), sync_point(0), sequenced(0), has_data(0), discard(0), writesame(0) {
+  }
+  // Entry-type predicates and accessors; defined in Types.cc.
+  bool is_sync_point();
+  bool is_discard();
+  bool is_writesame();
+  bool is_write();
+  bool is_writer();
+  const uint64_t get_offset_bytes();
+  const uint64_t get_write_bytes();
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const WriteLogPmemEntry &entry);
+};
+
+// Guard the persisted layout: entries must stay exactly 64 bytes.
+static_assert(sizeof(WriteLogPmemEntry) == 64);
+
+// Root object of the pmemobj pool: pool-wide metadata plus the contiguous
+// ring of WriteLogPmemEntry slots. Also persisted on media.
+struct WriteLogPoolRoot {
+  union {
+    struct {
+      uint8_t layout_version; /* Version of this structure (RWL_POOL_VERSION) */
+    };
+    uint64_t _u64;            /* pads/aliases the header to a full 64 bits */
+  } header;
+  TOID(struct WriteLogPmemEntry) log_entries; /* contiguous array of log entries */
+  uint64_t pool_size;
+  uint64_t flushed_sync_gen; /* All writing entries with this or a lower
+                              * sync gen number are flushed. */
+  uint32_t block_size; /* block size */
+  uint32_t num_log_entries;
+  uint32_t first_free_entry; /* Entry following the newest valid entry */
+  uint32_t first_valid_entry; /* Index of the oldest valid entry in the log */
+};
+
+} // namespace rwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_RWL_TYPES_H
trash/test_mock_RemoveRequest.cc
watcher/test_mock_RewatchRequest.cc
)
+
+if(WITH_RBD_RWL)
+  # RWL unit tests are only built when the persistent write back cache is
+  # enabled. Use list(APPEND) rather than set(var ${var} ...), and a bare
+  # endif() per modern CMake convention.
+  list(APPEND unittest_librbd_srcs
+    cache/test_mock_ReplicatedWriteLog.cc)
+endif()
+
add_executable(unittest_librbd
${unittest_librbd_srcs}
$<TARGET_OBJECTS:common_texttable_obj>)
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "common/hostname.h"
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "include/rbd/librbd.hpp"
+#include "librbd/cache/rwl/ImageCacheState.h"
+#include "librbd/cache/rwl/Types.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/ReplicatedWriteLog.h"
+
+
+namespace librbd {
+namespace {
+
+// gmock-instrumented Context: tests set expectations on complete()/finish(),
+// and route the call to do_complete(), which invokes the real
+// C_SaferCond::complete() so that wait() unblocks with the given r.
+struct MockContextRWL : public C_SaferCond {
+  MOCK_METHOD1(complete, void(int));
+  MOCK_METHOD1(finish, void(int));
+
+  void do_complete(int r) {
+    C_SaferCond::complete(r);
+  }
+};
+
+} // anonymous namespace
+
+namespace util {
+
+// Test shim: map a MockImageCtx back to the real ImageCtx it wraps.
+inline ImageCtx *get_image_ctx(MockImageCtx *image_ctx) {
+  return image_ctx->image_ctx;
+}
+
+} // namespace util
+} // namespace librbd
+
+#include "librbd/cache/ReplicatedWriteLog.cc"
+
+// template definitions
+#include "librbd/cache/ImageWriteback.cc"
+#include "librbd/cache/rwl/ImageCacheState.cc"
+
+template class librbd::cache::ImageWriteback<librbd::MockImageCtx>;
+template class librbd::cache::rwl::ImageCacheState<librbd::MockImageCtx>;
+
+namespace librbd {
+namespace cache {
+
+using ::testing::_;
+using ::testing::DoDefault;
+using ::testing::InSequence;
+using ::testing::Invoke;
+
+// Fixture for ReplicatedWriteLog unit tests: helpers to build a cache
+// state, validate it against the image configuration, and stub out the
+// work queue / metadata operations with gmock expectations.
+struct TestMockCacheReplicatedWriteLog : public TestMockFixture {
+  typedef ReplicatedWriteLog<librbd::MockImageCtx> MockReplicatedWriteLog;
+  typedef librbd::cache::rwl::ImageCacheState<librbd::MockImageCtx> MockImageCacheStateRWL;
+
+  // Allocate a fresh default cache state; ownership passes to the caller
+  // (ReplicatedWriteLog takes it in its constructor below).
+  MockImageCacheStateRWL *get_cache_state(MockImageCtx& mock_image_ctx) {
+    MockImageCacheStateRWL *rwl_state = new MockImageCacheStateRWL(&mock_image_ctx);
+    return rwl_state;
+  }
+
+  // Assert the state's flags/host/path/size; empty host/path and zero size
+  // default to the local hostname and the rbd_rwl_* config values.
+  void validate_cache_state(librbd::ImageCtx *image_ctx,
+                            MockImageCacheStateRWL &state,
+                            bool present, bool empty, bool clean,
+                            string host="", string path="",
+                            uint64_t size=0) {
+    ConfigProxy &config = image_ctx->config;
+    ASSERT_EQ(present, state.present);
+    ASSERT_EQ(empty, state.empty);
+    ASSERT_EQ(clean, state.clean);
+
+    if (host.empty())
+      host = ceph_get_short_hostname();
+    if (path.empty())
+      path = config.get_val<string>("rbd_rwl_path");
+    if (!size)
+      size = config.get_val<uint64_t>("rbd_rwl_size");
+
+    ASSERT_EQ(host, state.host);
+    ASSERT_EQ(path, state.path);
+    ASSERT_EQ(size, state.size);
+    ASSERT_EQ(config.get_val<bool>("rbd_rwl_log_periodic_stats"),
+              state.log_periodic_stats);
+  }
+
+  // Run any queued work-queue item inline, immediately.
+  void expect_op_work_queue(MockImageCtx& mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.op_work_queue, queue(_, _))
+      .WillRepeatedly(Invoke([](Context* ctx, int r) {
+                       ctx->complete(r);
+                     }));
+  }
+
+  // Expect completion with exactly r; forward to the real C_SaferCond.
+  void expect_context_complete(MockContextRWL& mock_context, int r) {
+    EXPECT_CALL(mock_context, complete(r))
+      .WillRepeatedly(Invoke([&mock_context](int r) {
+                       mock_context.do_complete(r);
+                     }));
+  }
+
+  // Stub metadata set: succeed immediately without touching the cluster.
+  void expect_metadata_set(MockImageCtx& mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.operations, execute_metadata_set(_, _, _))
+      .WillRepeatedly(Invoke([](std::string key, std::string val, Context* ctx) {
+                       ctx->complete(0);
+                     }));
+  }
+
+  // Stub metadata remove: succeed immediately without touching the cluster.
+  void expect_metadata_remove(MockImageCtx& mock_image_ctx) {
+    EXPECT_CALL(*mock_image_ctx.operations, execute_metadata_remove(_, _))
+      .WillRepeatedly(Invoke([](std::string key, Context* ctx) {
+                       ctx->complete(0);
+                     }));
+  }
+};
+
+// A default-constructed state matches the image config, and writing it
+// persists via execute_metadata_set and completes the finish context.
+TEST_F(TestMockCacheReplicatedWriteLog, init_state_write) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockImageCacheStateRWL image_cache_state(&mock_image_ctx);
+
+  validate_cache_state(ictx, image_cache_state, true, true, true);
+
+  // Dirty the state, then persist it through the stubbed metadata op.
+  image_cache_state.empty = false;
+  image_cache_state.clean = false;
+  MockContextRWL finish_ctx;
+  expect_metadata_set(mock_image_ctx);
+  expect_context_complete(finish_ctx, 0);
+  image_cache_state.write_image_cache_state(&finish_ctx);
+  ASSERT_EQ(0, finish_ctx.wait());
+}
+
+// Parse the JSON text `s` and decode it into `*f`, failing the current
+// test (via gtest ASSERT macros) on a parse or decode error.
+static void get_jf(const string& s, JSONFormattable *f)
+{
+  JSONParser parser;
+  const bool parsed_ok = parser.parse(s.c_str(), s.size());
+  if (!parsed_ok) {
+    cout << "Failed to parse: '" << s << "'" << std::endl;
+  }
+  ASSERT_EQ(true, parsed_ok);
+  try {
+    decode_json_obj(*f, &parser);
+  } catch (JSONDecoder::err& e) {
+    ASSERT_TRUE(0 == "Failed to decode JSON object");
+  }
+}
+
+// A state rebuilt from persisted JSON reflects the JSON fields, and
+// clearing it removes the metadata key and completes the finish context.
+TEST_F(TestMockCacheReplicatedWriteLog, init_state_json_write) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+
+  // JSON shaped like what dump()/write_image_cache_state() persists.
+  JSONFormattable f;
+  string strf = "{ \"present\": \"1\", \"empty\": \"0\", \"clean\": \"0\", \
+                   \"rwl_host\": \"testhost\", \
+                   \"rwl_path\": \"/tmp\", \
+                   \"rwl_size\": \"1024\" }";
+  get_jf(strf, &f);
+  MockImageCacheStateRWL image_cache_state(&mock_image_ctx, f);
+
+  validate_cache_state(ictx, image_cache_state, true, false, false,
+                       "testhost", "/tmp", 1024);
+
+  MockContextRWL finish_ctx;
+  expect_metadata_remove(mock_image_ctx);
+  expect_context_complete(finish_ctx, 0);
+  image_cache_state.clear_image_cache_state(&finish_ctx);
+  ASSERT_EQ(0, finish_ctx.wait());
+}
+
+// ReplicatedWriteLog init() then shut_down() both succeed with the work
+// queue and metadata operations stubbed to complete inline.
+TEST_F(TestMockCacheReplicatedWriteLog, init_shutdown) {
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  MockImageCtx mock_image_ctx(*ictx);
+  MockReplicatedWriteLog rwl(mock_image_ctx, get_cache_state(mock_image_ctx));
+  MockContextRWL finish_ctx1;
+  expect_op_work_queue(mock_image_ctx);
+  expect_metadata_set(mock_image_ctx);
+
+  expect_context_complete(finish_ctx1, 0);
+  rwl.init(&finish_ctx1);
+  ASSERT_EQ(0, finish_ctx1.wait());
+
+  MockContextRWL finish_ctx2;
+  expect_context_complete(finish_ctx2, 0);
+  rwl.shut_down(&finish_ctx2);
+  ASSERT_EQ(0, finish_ctx2.wait());
+}
+
+} // namespace cache
+} // namespace librbd