SUBSYS(rbd, 0, 5)
SUBSYS(rbd_mirror, 0, 5)
SUBSYS(rbd_replay, 0, 5)
-SUBSYS(rbd_rwl, 0, 5)
+SUBSYS(rbd_pwl, 0, 5)
SUBSYS(journaler, 0, 5)
SUBSYS(objectcacher, 0, 5)
SUBSYS(immutable_obj_cache, 0, 5)
cache/ObjectCacherObjectDispatch.cc
cache/ObjectCacherWriteback.cc
cache/PassthroughImageCache.cc
- cache/rwl/InitRequest.cc
- cache/rwl/ShutdownRequest.cc
+ cache/pwl/InitRequest.cc
+ cache/pwl/ShutdownRequest.cc
cache/WriteAroundObjectDispatch.cc
crypto/CryptoObjectDispatch.cc
deep_copy/ImageCopyRequest.cc
if(WITH_RBD_RWL)
set(librbd_internal_srcs
${librbd_internal_srcs}
- cache/rwl/ImageCacheState.cc
- cache/rwl/LogEntry.cc
- cache/rwl/LogMap.cc
- cache/rwl/LogOperation.cc
- cache/rwl/ReadRequest.cc
- cache/rwl/Request.cc
- cache/rwl/SyncPoint.cc
- cache/rwl/Types.cc
- cache/ReplicatedWriteLog.cc
- cache/AbstractWriteLog.cc
+ cache/pwl/ImageCacheState.cc
+ cache/pwl/LogEntry.cc
+ cache/pwl/LogMap.cc
+ cache/pwl/LogOperation.cc
+ cache/pwl/ReadRequest.cc
+ cache/pwl/Request.cc
+ cache/pwl/SyncPoint.cc
+ cache/pwl/Types.cc
+ cache/pwl/ReplicatedWriteLog.cc
+ cache/pwl/AbstractWriteLog.cc
cache/WriteLogCache.cc)
endif()
on_finish->complete(r);
});
- bool rwl_enabled = cache::util::is_rwl_enabled(m_image_ctx);
+ bool rwl_enabled = cache::util::is_pwl_enabled(m_image_ctx);
if (m_image_ctx.clone_copy_on_read ||
(features & RBD_FEATURE_JOURNALING) != 0 ||
rwl_enabled) {
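/* For context, a minimal sketch of what a helper like
 * cache::util::is_pwl_enabled() could look like; the config key
 * "rbd_rwl_enabled" here is an assumption, not necessarily the real
 * option name:
 *
 *   bool is_pwl_enabled(ImageCtx &image_ctx) {
 *     return image_ctx.config.get_val<bool>("rbd_rwl_enabled");
 *   }
 */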
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include <libpmemobj.h>
-#include "AbstractWriteLog.h"
-#include "include/buffer.h"
-#include "include/Context.h"
-#include "include/ceph_assert.h"
-#include "common/deleter.h"
-#include "common/dout.h"
-#include "common/environment.h"
-#include "common/errno.h"
-#include "common/WorkQueue.h"
-#include "common/Timer.h"
-#include "common/perf_counters.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/asio/ContextWQ.h"
-#include "librbd/cache/rwl/ImageCacheState.h"
-#include "librbd/cache/rwl/LogEntry.h"
-#include "librbd/cache/rwl/ReadRequest.h"
-#include "librbd/cache/rwl/Types.h"
-#include <map>
-#include <vector>
-
-#undef dout_subsys
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::AbstractWriteLog: " << this << " " \
- << __func__ << ": "
-
-namespace librbd {
-namespace cache {
-
-using namespace librbd::cache::rwl;
-
-typedef AbstractWriteLog<ImageCtx>::Extent Extent;
-typedef AbstractWriteLog<ImageCtx>::Extents Extents;
-
-const unsigned long int OPS_APPENDED_TOGETHER = MAX_ALLOC_PER_TRANSACTION;
-
-template <typename I>
-AbstractWriteLog<I>::AbstractWriteLog(I &image_ctx, librbd::cache::rwl::ImageCacheState<I>* cache_state)
- : m_cache_state(cache_state),
- m_rwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_rwl)),
- m_image_ctx(image_ctx),
- m_log_pool_config_size(DEFAULT_POOL_SIZE),
- m_image_writeback(image_ctx), m_write_log_guard(image_ctx.cct),
- m_log_retire_lock(ceph::make_mutex(util::unique_lock_name(
- "librbd::cache::AbstractWriteLog::m_log_retire_lock", this))),
- m_entry_reader_lock("librbd::cache::AbstractWriteLog::m_entry_reader_lock"),
- m_deferred_dispatch_lock(ceph::make_mutex(util::unique_lock_name(
- "librbd::cache::AbstractWriteLog::m_deferred_dispatch_lock", this))),
- m_log_append_lock(ceph::make_mutex(util::unique_lock_name(
- "librbd::cache::AbstractWriteLog::m_log_append_lock", this))),
- m_lock(ceph::make_mutex(util::unique_lock_name(
- "librbd::cache::AbstractWriteLog::m_lock", this))),
- m_blockguard_lock(ceph::make_mutex(util::unique_lock_name(
- "librbd::cache::AbstractWriteLog::m_blockguard_lock", this))),
- m_blocks_to_log_entries(image_ctx.cct),
- m_thread_pool(image_ctx.cct, "librbd::cache::AbstractWriteLog::thread_pool", "tp_rwl",
- 4,
- ""),
- m_work_queue("librbd::cache::ReplicatedWriteLog::work_queue",
- ceph::make_timespan(
- image_ctx.config.template get_val<uint64_t>(
- "rbd_op_thread_timeout")),
- &m_thread_pool)
-{
- CephContext *cct = m_image_ctx.cct;
- ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock);
-}
-
-template <typename I>
-AbstractWriteLog<I>::~AbstractWriteLog() {
- ldout(m_image_ctx.cct, 15) << "enter" << dendl;
- {
- std::lock_guard timer_locker(*m_timer_lock);
- std::lock_guard locker(m_lock);
- m_timer->cancel_event(m_timer_ctx);
- m_thread_pool.stop();
- ceph_assert(m_deferred_ios.size() == 0);
- ceph_assert(m_ops_to_flush.size() == 0);
- ceph_assert(m_ops_to_append.size() == 0);
- ceph_assert(m_flush_ops_in_flight == 0);
-
- m_log_pool = nullptr;
- delete m_cache_state;
- m_cache_state = nullptr;
- }
- ldout(m_image_ctx.cct, 15) << "exit" << dendl;
-}
-
-template <typename I>
-void AbstractWriteLog<I>::perf_start(std::string name) {
- PerfCountersBuilder plb(m_image_ctx.cct, name, l_librbd_rwl_first, l_librbd_rwl_last);
-
- // Latency axis configuration for op histograms, values are in nanoseconds
- PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
- "Latency (nsec)",
- PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
- 0, ///< Start at 0
- 5000, ///< Quantization unit is 5usec
- 16, ///< Ranges into the milliseconds
- };
-
- // Syncpoint logentry number x-axis configuration for op histograms
- PerfHistogramCommon::axis_config_d sp_logentry_number_config{
- "logentry number",
- PerfHistogramCommon::SCALE_LINEAR, // log entry number in linear scale
- 0, // Start at 0
- 1, // Quantization unit is 1
- 260, // Up to 260 (> MAX_WRITES_PER_SYNC_POINT)
- };
-
- // Syncpoint bytes number y-axis configuration for op histogram
- PerfHistogramCommon::axis_config_d sp_bytes_number_config{
- "Number of SyncPoint",
- PerfHistogramCommon::SCALE_LOG2, // Request size in logarithmic scale
- 0, // Start at 0
- 512, // Quantization unit is 512
- 17, // Writes up to 8M >= MAX_BYTES_PER_SYNC_POINT
- };
-
- // Op size axis configuration for op histogram y axis, values are in bytes
- PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
- "Request size (bytes)",
- PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
- 0, ///< Start at 0
- 512, ///< Quantization unit is 512 bytes
- 16, ///< Writes up to >32k
- };
-
- // Num items configuration for op histogram y axis, values are in items
- PerfHistogramCommon::axis_config_d op_hist_y_axis_count_config{
- "Number of items",
- PerfHistogramCommon::SCALE_LINEAR, ///< Request size in linear scale
- 0, ///< Start at 0
- 1, ///< Quantization unit is 1
- 32, ///< Up to 32 items
- };
-
- plb.add_u64_counter(l_librbd_rwl_rd_req, "rd", "Reads");
- plb.add_u64_counter(l_librbd_rwl_rd_bytes, "rd_bytes", "Data size in reads");
- plb.add_time_avg(l_librbd_rwl_rd_latency, "rd_latency", "Latency of reads");
-
- plb.add_u64_counter(l_librbd_rwl_rd_hit_req, "hit_rd", "Reads completely hitting RWL");
- plb.add_u64_counter(l_librbd_rwl_rd_hit_bytes, "rd_hit_bytes", "Bytes read from RWL");
- plb.add_time_avg(l_librbd_rwl_rd_hit_latency, "hit_rd_latency", "Latency of read hits");
-
- plb.add_u64_counter(l_librbd_rwl_rd_part_hit_req, "part_hit_rd", "reads partially hitting RWL");
-
- plb.add_u64_counter_histogram(
- l_librbd_rwl_syncpoint_hist, "syncpoint_logentry_bytes_histogram",
- sp_logentry_number_config, sp_bytes_number_config,
- "Histogram of syncpoint's logentry numbers vs bytes number");
-
- plb.add_u64_counter(l_librbd_rwl_wr_req, "wr", "Writes");
- plb.add_u64_counter(l_librbd_rwl_wr_req_def, "wr_def", "Writes deferred for resources");
- plb.add_u64_counter(l_librbd_rwl_wr_req_def_lanes, "wr_def_lanes", "Writes deferred for lanes");
- plb.add_u64_counter(l_librbd_rwl_wr_req_def_log, "wr_def_log", "Writes deferred for log entries");
- plb.add_u64_counter(l_librbd_rwl_wr_req_def_buf, "wr_def_buf", "Writes deferred for buffers");
- plb.add_u64_counter(l_librbd_rwl_wr_req_overlap, "wr_overlap", "Writes overlapping with prior in-progress writes");
- plb.add_u64_counter(l_librbd_rwl_wr_req_queued, "wr_q_barrier", "Writes queued for prior barriers (aio_flush)");
- plb.add_u64_counter(l_librbd_rwl_wr_bytes, "wr_bytes", "Data size in writes");
-
- plb.add_u64_counter(l_librbd_rwl_log_ops, "log_ops", "Log appends");
- plb.add_u64_avg(l_librbd_rwl_log_op_bytes, "log_op_bytes", "Average log append bytes");
-
- plb.add_time_avg(
- l_librbd_rwl_req_arr_to_all_t, "req_arr_to_all_t",
- "Average arrival to allocation time (time deferred for overlap)");
- plb.add_time_avg(
- l_librbd_rwl_req_arr_to_dis_t, "req_arr_to_dis_t",
- "Average arrival to dispatch time (includes time deferred for overlaps and allocation)");
- plb.add_time_avg(
- l_librbd_rwl_req_all_to_dis_t, "req_all_to_dis_t",
- "Average allocation to dispatch time (time deferred for log resources)");
- plb.add_time_avg(
- l_librbd_rwl_wr_latency, "wr_latency",
- "Latency of writes (persistent completion)");
- plb.add_u64_counter_histogram(
- l_librbd_rwl_wr_latency_hist, "wr_latency_bytes_histogram",
- op_hist_x_axis_config, op_hist_y_axis_config,
- "Histogram of write request latency (nanoseconds) vs. bytes written");
- plb.add_time_avg(
- l_librbd_rwl_wr_caller_latency, "caller_wr_latency",
- "Latency of write completion to caller");
- plb.add_time_avg(
- l_librbd_rwl_nowait_req_arr_to_all_t, "req_arr_to_all_nw_t",
- "Average arrival to allocation time (time deferred for overlap)");
- plb.add_time_avg(
- l_librbd_rwl_nowait_req_arr_to_dis_t, "req_arr_to_dis_nw_t",
- "Average arrival to dispatch time (includes time deferred for overlaps and allocation)");
- plb.add_time_avg(
- l_librbd_rwl_nowait_req_all_to_dis_t, "req_all_to_dis_nw_t",
- "Average allocation to dispatch time (time deferred for log resources)");
- plb.add_time_avg(
- l_librbd_rwl_nowait_wr_latency, "wr_latency_nw",
- "Latency of writes (persistent completion) not deferred for free space");
- plb.add_u64_counter_histogram(
- l_librbd_rwl_nowait_wr_latency_hist, "wr_latency_nw_bytes_histogram",
- op_hist_x_axis_config, op_hist_y_axis_config,
- "Histogram of write request latency (nanoseconds) vs. bytes written for writes not deferred for free space");
- plb.add_time_avg(
- l_librbd_rwl_nowait_wr_caller_latency, "caller_wr_latency_nw",
- "Latency of write completion to callerfor writes not deferred for free space");
- plb.add_time_avg(l_librbd_rwl_log_op_alloc_t, "op_alloc_t", "Average buffer pmemobj_reserve() time");
- plb.add_u64_counter_histogram(
- l_librbd_rwl_log_op_alloc_t_hist, "op_alloc_t_bytes_histogram",
- op_hist_x_axis_config, op_hist_y_axis_config,
- "Histogram of buffer pmemobj_reserve() time (nanoseconds) vs. bytes written");
- plb.add_time_avg(l_librbd_rwl_log_op_dis_to_buf_t, "op_dis_to_buf_t", "Average dispatch to buffer persist time");
- plb.add_time_avg(l_librbd_rwl_log_op_dis_to_app_t, "op_dis_to_app_t", "Average dispatch to log append time");
- plb.add_time_avg(l_librbd_rwl_log_op_dis_to_cmp_t, "op_dis_to_cmp_t", "Average dispatch to persist completion time");
- plb.add_u64_counter_histogram(
- l_librbd_rwl_log_op_dis_to_cmp_t_hist, "op_dis_to_cmp_t_bytes_histogram",
- op_hist_x_axis_config, op_hist_y_axis_config,
- "Histogram of op dispatch to persist complete time (nanoseconds) vs. bytes written");
-
- plb.add_time_avg(
- l_librbd_rwl_log_op_buf_to_app_t, "op_buf_to_app_t",
- "Average buffer persist to log append time (write data persist/replicate + wait for append time)");
- plb.add_time_avg(
- l_librbd_rwl_log_op_buf_to_bufc_t, "op_buf_to_bufc_t",
- "Average buffer persist time (write data persist/replicate time)");
- plb.add_u64_counter_histogram(
- l_librbd_rwl_log_op_buf_to_bufc_t_hist, "op_buf_to_bufc_t_bytes_histogram",
- op_hist_x_axis_config, op_hist_y_axis_config,
- "Histogram of write buffer persist time (nanoseconds) vs. bytes written");
- plb.add_time_avg(
- l_librbd_rwl_log_op_app_to_cmp_t, "op_app_to_cmp_t",
- "Average log append to persist complete time (log entry append/replicate + wait for complete time)");
- plb.add_time_avg(
- l_librbd_rwl_log_op_app_to_appc_t, "op_app_to_appc_t",
- "Average log append to persist complete time (log entry append/replicate time)");
- plb.add_u64_counter_histogram(
- l_librbd_rwl_log_op_app_to_appc_t_hist, "op_app_to_appc_t_bytes_histogram",
- op_hist_x_axis_config, op_hist_y_axis_config,
- "Histogram of log append persist time (nanoseconds) (vs. op bytes)");
-
- plb.add_u64_counter(l_librbd_rwl_discard, "discard", "Discards");
- plb.add_u64_counter(l_librbd_rwl_discard_bytes, "discard_bytes", "Bytes discarded");
- plb.add_time_avg(l_librbd_rwl_discard_latency, "discard_lat", "Discard latency");
-
- plb.add_u64_counter(l_librbd_rwl_aio_flush, "aio_flush", "AIO flush (flush to RWL)");
- plb.add_u64_counter(l_librbd_rwl_aio_flush_def, "aio_flush_def", "AIO flushes deferred for resources");
- plb.add_time_avg(l_librbd_rwl_aio_flush_latency, "aio_flush_lat", "AIO flush latency");
-
- plb.add_u64_counter(l_librbd_rwl_ws, "ws", "Write Sames");
- plb.add_u64_counter(l_librbd_rwl_ws_bytes, "ws_bytes", "Write Same bytes to image");
- plb.add_time_avg(l_librbd_rwl_ws_latency, "ws_lat", "Write Same latency");
-
- plb.add_u64_counter(l_librbd_rwl_cmp, "cmp", "Compare and Write requests");
- plb.add_u64_counter(l_librbd_rwl_cmp_bytes, "cmp_bytes", "Compare and Write bytes compared/written");
- plb.add_time_avg(l_librbd_rwl_cmp_latency, "cmp_lat", "Compare and Write latency");
- plb.add_u64_counter(l_librbd_rwl_cmp_fails, "cmp_fails", "Compare and Write compare fails");
-
- plb.add_u64_counter(l_librbd_rwl_flush, "flush", "Flush (flush RWL)");
- plb.add_u64_counter(l_librbd_rwl_invalidate_cache, "invalidate", "Invalidate RWL");
- plb.add_u64_counter(l_librbd_rwl_invalidate_discard_cache, "invalidate_discard", "Discard and invalidate RWL");
-
- plb.add_time_avg(l_librbd_rwl_append_tx_t, "append_tx_lat", "Log append transaction latency");
- plb.add_u64_counter_histogram(
- l_librbd_rwl_append_tx_t_hist, "append_tx_lat_histogram",
- op_hist_x_axis_config, op_hist_y_axis_count_config,
- "Histogram of log append transaction time (nanoseconds) vs. entries appended");
- plb.add_time_avg(l_librbd_rwl_retire_tx_t, "retire_tx_lat", "Log retire transaction latency");
- plb.add_u64_counter_histogram(
- l_librbd_rwl_retire_tx_t_hist, "retire_tx_lat_histogram",
- op_hist_x_axis_config, op_hist_y_axis_count_config,
- "Histogram of log retire transaction time (nanoseconds) vs. entries retired");
-
- m_perfcounter = plb.create_perf_counters();
- m_image_ctx.cct->get_perfcounters_collection()->add(m_perfcounter);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::perf_stop() {
- ceph_assert(m_perfcounter);
- m_image_ctx.cct->get_perfcounters_collection()->remove(m_perfcounter);
- delete m_perfcounter;
-}
-
-template <typename I>
-void AbstractWriteLog<I>::log_perf() {
- bufferlist bl;
- Formatter *f = Formatter::create("json-pretty");
- bl.append("Perf dump follows\n--- Begin perf dump ---\n");
- bl.append("{\n");
- stringstream ss;
- utime_t now = ceph_clock_now();
- ss << "\"test_time\": \"" << now << "\",";
- ss << "\"image\": \"" << m_image_ctx.name << "\",";
- bl.append(ss);
- bl.append("\"stats\": ");
- m_image_ctx.cct->get_perfcounters_collection()->dump_formatted(f, 0);
- f->flush(bl);
- bl.append(",\n\"histograms\": ");
- m_image_ctx.cct->get_perfcounters_collection()->dump_formatted_histograms(f, 0);
- f->flush(bl);
- delete f;
- bl.append("}\n--- End perf dump ---\n");
- bl.append('\0');
- ldout(m_image_ctx.cct, 1) << bl.c_str() << dendl;
-}
-
-template <typename I>
-void AbstractWriteLog<I>::periodic_stats() {
- std::lock_guard locker(m_lock);
- ldout(m_image_ctx.cct, 1) << "STATS: "
- << "m_free_log_entries=" << m_free_log_entries << ", "
- << "m_log_entries=" << m_log_entries.size() << ", "
- << "m_dirty_log_entries=" << m_dirty_log_entries.size() << ", "
- << "m_bytes_allocated=" << m_bytes_allocated << ", "
- << "m_bytes_cached=" << m_bytes_cached << ", "
- << "m_bytes_dirty=" << m_bytes_dirty << ", "
- << "bytes available=" << m_bytes_allocated_cap - m_bytes_allocated << ", "
- << "m_current_sync_gen=" << m_current_sync_gen << ", "
- << "m_flushed_sync_gen=" << m_flushed_sync_gen << ", "
- << dendl;
-}
-
-template <typename I>
-void AbstractWriteLog<I>::arm_periodic_stats() {
- ceph_assert(ceph_mutex_is_locked(*m_timer_lock));
- if (m_periodic_stats_enabled) {
- m_timer_ctx = new LambdaContext(
- [this](int r) {
- /* m_timer_lock is held */
- periodic_stats();
- arm_periodic_stats();
- });
- m_timer->add_event_after(LOG_STATS_INTERVAL_SECONDS, m_timer_ctx);
- }
-}
-
-/*
- * Loads the log entries from an existing log.
- *
- * Creates the in-memory structures to represent the state of the
- * re-opened log.
- *
- * Finds the last appended sync point, and any sync points referred to
- * in log entries, but missing from the log. These missing sync points
- * are created and scheduled for append. Some rudimentary consistency
- * checking is done.
- *
- * Rebuilds the m_blocks_to_log_entries map, to make log entries
- * readable.
- *
- * Places all writes on the dirty entries list, which causes them all
- * to be flushed.
- *
- */
-template <typename I>
-void AbstractWriteLog<I>::load_existing_entries(DeferredContexts &later) {
- TOID(struct WriteLogPoolRoot) pool_root;
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
- struct WriteLogPmemEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries);
- uint64_t entry_index = m_first_valid_entry;
- /* The map below allows us to find sync point log entries by sync
- * gen number, which is necessary so write entries can be linked to
- * their sync points. */
- std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> sync_point_entries;
- /* The map below tracks sync points referred to in writes but not
- * appearing in the sync_point_entries map. We'll use this to
- * determine which sync points are missing and need to be
- * created. */
- std::map<uint64_t, bool> missing_sync_points;
-
- /*
- * Read the existing log entries. Construct an in-memory log entry
- * object of the appropriate type for each. Add these to the global
- * log entries list.
- *
- * Write entries will not link to their sync points yet. We'll do
- * that in the next pass. Here we'll accumulate a map of sync point
- * gen numbers that are referred to in writes but do not appear in
- * the log.
- */
- while (entry_index != m_first_free_entry) {
- WriteLogPmemEntry *pmem_entry = &pmem_log_entries[entry_index];
- std::shared_ptr<GenericLogEntry> log_entry = nullptr;
- bool writer = pmem_entry->is_writer();
-
- ceph_assert(pmem_entry->entry_index == entry_index);
- if (pmem_entry->is_sync_point()) {
- ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
- << " is a sync point. pmem_entry=[" << *pmem_entry << "]" << dendl;
- auto sync_point_entry = std::make_shared<SyncPointLogEntry>(pmem_entry->sync_gen_number);
- log_entry = sync_point_entry;
- sync_point_entries[pmem_entry->sync_gen_number] = sync_point_entry;
- missing_sync_points.erase(pmem_entry->sync_gen_number);
- m_current_sync_gen = pmem_entry->sync_gen_number;
- } else if (pmem_entry->is_write()) {
- ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
- << " is a write. pmem_entry=[" << *pmem_entry << "]" << dendl;
- auto write_entry =
- std::make_shared<WriteLogEntry>(nullptr, pmem_entry->image_offset_bytes, pmem_entry->write_bytes);
- write_entry->pmem_buffer = D_RW(pmem_entry->write_data);
- log_entry = write_entry;
- } else if (pmem_entry->is_writesame()) {
- ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
- << " is a write same. pmem_entry=[" << *pmem_entry << "]" << dendl;
- auto ws_entry =
- std::make_shared<WriteSameLogEntry>(nullptr, pmem_entry->image_offset_bytes,
- pmem_entry->write_bytes, pmem_entry->ws_datalen);
- ws_entry->pmem_buffer = D_RW(pmem_entry->write_data);
- log_entry = ws_entry;
- } else if (pmem_entry->is_discard()) {
- ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
- << " is a discard. pmem_entry=[" << *pmem_entry << "]" << dendl;
- auto discard_entry =
- std::make_shared<DiscardLogEntry>(nullptr, pmem_entry->image_offset_bytes, pmem_entry->write_bytes,
- m_discard_granularity_bytes);
- log_entry = discard_entry;
- } else {
- lderr(m_image_ctx.cct) << "Unexpected entry type in entry " << entry_index
- << ", pmem_entry=[" << *pmem_entry << "]" << dendl;
- }
-
- if (writer) {
- ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
- << " writes. pmem_entry=[" << *pmem_entry << "]" << dendl;
- if (!sync_point_entries[pmem_entry->sync_gen_number]) {
- missing_sync_points[pmem_entry->sync_gen_number] = true;
- }
- }
-
- log_entry->ram_entry = *pmem_entry;
- log_entry->pmem_entry = pmem_entry;
- log_entry->log_entry_index = entry_index;
- log_entry->completed = true;
-
- m_log_entries.push_back(log_entry);
-
- entry_index = (entry_index + 1) % m_total_log_entries;
- }
-
- /* Create missing sync points. These must not be appended until the
- * entry reload is complete and the write map is up to
- * date. Currently this is handled by the deferred contexts object
- * passed to new_sync_point(). These contexts won't be completed
- * until this function returns. */
- for (auto &kv : missing_sync_points) {
- ldout(m_image_ctx.cct, 5) << "Adding sync point " << kv.first << dendl;
- if (0 == m_current_sync_gen) {
- /* The unlikely case where the log contains writing entries, but no sync
- * points (e.g. because they were all retired) */
- m_current_sync_gen = kv.first-1;
- }
- ceph_assert(kv.first == m_current_sync_gen+1);
- init_flush_new_sync_point(later);
- ceph_assert(kv.first == m_current_sync_gen);
- sync_point_entries[kv.first] = m_current_sync_point->log_entry;
- }
-
- /*
- * Iterate over the log entries again (this time via the global
- * entries list), connecting write entries to their sync points and
- * updating the sync point stats.
- *
- * Add writes to the write log map.
- */
- std::shared_ptr<SyncPointLogEntry> previous_sync_point_entry = nullptr;
- for (auto &log_entry : m_log_entries) {
- if ((log_entry->write_bytes() > 0) || (log_entry->bytes_dirty() > 0)) {
- /* This entry is one of the types that write */
- auto gen_write_entry = static_pointer_cast<GenericWriteLogEntry>(log_entry);
- if (gen_write_entry) {
- auto sync_point_entry = sync_point_entries[gen_write_entry->ram_entry.sync_gen_number];
- if (!sync_point_entry) {
- lderr(m_image_ctx.cct) << "Sync point missing for entry=[" << *gen_write_entry << "]" << dendl;
- ceph_assert(false);
- } else {
- gen_write_entry->sync_point_entry = sync_point_entry;
- sync_point_entry->writes++;
- sync_point_entry->bytes += gen_write_entry->ram_entry.write_bytes;
- sync_point_entry->writes_completed++;
- m_blocks_to_log_entries.add_log_entry(gen_write_entry);
- /* This entry is only dirty if its sync gen number is > the flushed
- * sync gen number from the root object. */
- if (gen_write_entry->ram_entry.sync_gen_number > m_flushed_sync_gen) {
- m_dirty_log_entries.push_back(log_entry);
- m_bytes_dirty += gen_write_entry->bytes_dirty();
- } else {
- gen_write_entry->set_flushed(true);
- sync_point_entry->writes_flushed++;
- }
- if (log_entry->write_bytes() == log_entry->bytes_dirty()) {
- /* This entry is a basic write */
- uint64_t bytes_allocated = MIN_WRITE_ALLOC_SIZE;
- if (gen_write_entry->ram_entry.write_bytes > bytes_allocated) {
- bytes_allocated = gen_write_entry->ram_entry.write_bytes;
- }
- m_bytes_allocated += bytes_allocated;
- m_bytes_cached += gen_write_entry->ram_entry.write_bytes;
- }
- }
- }
- } else {
- /* This entry is a sync point entry */
- auto sync_point_entry = static_pointer_cast<SyncPointLogEntry>(log_entry);
- if (sync_point_entry) {
- if (previous_sync_point_entry) {
- previous_sync_point_entry->next_sync_point_entry = sync_point_entry;
- if (previous_sync_point_entry->ram_entry.sync_gen_number > m_flushed_sync_gen) {
- sync_point_entry->prior_sync_point_flushed = false;
- ceph_assert(!previous_sync_point_entry->prior_sync_point_flushed ||
- (0 == previous_sync_point_entry->writes) ||
- (previous_sync_point_entry->writes >= previous_sync_point_entry->writes_flushed));
- } else {
- sync_point_entry->prior_sync_point_flushed = true;
- ceph_assert(previous_sync_point_entry->prior_sync_point_flushed);
- ceph_assert(previous_sync_point_entry->writes == previous_sync_point_entry->writes_flushed);
- }
- previous_sync_point_entry = sync_point_entry;
- } else {
- /* There are no previous sync points, so we'll consider them flushed */
- sync_point_entry->prior_sync_point_flushed = true;
- }
- ldout(m_image_ctx.cct, 10) << "Loaded to sync point=[" << *sync_point_entry << dendl;
- }
- }
- }
- if (0 == m_current_sync_gen) {
- /* If a re-opened log was completely flushed, we'll have found no sync point entries here,
- * and not advanced m_current_sync_gen. Here we ensure it starts past the last flushed sync
- * point recorded in the log. */
- m_current_sync_gen = m_flushed_sync_gen;
- }
-}
-
-template <typename I>
-void AbstractWriteLog<I>::rwl_init(Context *on_finish, DeferredContexts &later) {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << dendl;
- TOID(struct WriteLogPoolRoot) pool_root;
- ceph_assert(m_cache_state);
- std::lock_guard locker(m_lock);
- ceph_assert(!m_initialized);
- ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl;
- ldout(cct,5) << "rwl_size: " << m_cache_state->size << dendl;
- std::string rwl_path = m_cache_state->path;
- ldout(cct,5) << "rwl_path: " << rwl_path << dendl;
-
- std::string pool_name = m_image_ctx.md_ctx.get_pool_name();
- std::string log_pool_name = rwl_path + "/rbd-rwl." + pool_name + "." + m_image_ctx.id + ".pool";
- std::string log_poolset_name = rwl_path + "/rbd-rwl." + pool_name + "." + m_image_ctx.id + ".poolset";
- m_log_pool_config_size = max(m_cache_state->size, MIN_POOL_SIZE);
-
- if (access(log_poolset_name.c_str(), F_OK) == 0) {
- m_log_pool_name = log_poolset_name;
- m_log_is_poolset = true;
- } else {
- m_log_pool_name = log_pool_name;
- ldout(cct, 5) << "Poolset file " << log_poolset_name
- << " not present (or can't open). Using unreplicated pool" << dendl;
- }
-
- if ((!m_cache_state->present) &&
- (access(m_log_pool_name.c_str(), F_OK) == 0)) {
- ldout(cct, 5) << "There's an existing pool/poolset file " << m_log_pool_name
- << ", While there's no cache in the image metatata." << dendl;
- if (remove(m_log_pool_name.c_str()) != 0) {
- lderr(cct) << "Failed to remove the pool/poolset file " << m_log_pool_name
- << dendl;
- on_finish->complete(-errno);
- return;
- } else {
- ldout(cct, 5) << "Removed the existing pool/poolset file." << dendl;
- }
- }
-
- if (access(m_log_pool_name.c_str(), F_OK) != 0) {
- if ((m_log_pool =
- pmemobj_create(m_log_pool_name.c_str(),
- m_rwl_pool_layout_name,
- m_log_pool_config_size,
- (S_IWUSR | S_IRUSR))) == NULL) {
- lderr(cct) << "failed to create pool (" << m_log_pool_name << ")"
- << pmemobj_errormsg() << dendl;
- m_cache_state->present = false;
- m_cache_state->clean = true;
- m_cache_state->empty = true;
- /* TODO: filter/replace errnos that are meaningless to the caller */
- on_finish->complete(-errno);
- return;
- }
- m_cache_state->present = true;
- m_cache_state->clean = true;
- m_cache_state->empty = true;
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
-
- /* new pool, calculate and store metadata */
- size_t effective_pool_size = (size_t)(m_log_pool_config_size * USABLE_SIZE);
- size_t small_write_size = MIN_WRITE_ALLOC_SIZE + BLOCK_ALLOC_OVERHEAD_BYTES + sizeof(struct WriteLogPmemEntry);
- uint64_t num_small_writes = (uint64_t)(effective_pool_size / small_write_size);
- if (num_small_writes > MAX_LOG_ENTRIES) {
- num_small_writes = MAX_LOG_ENTRIES;
- }
- if (num_small_writes <= 2) {
- lderr(cct) << "num_small_writes needs to > 2" << dendl;
- on_finish->complete(-EINVAL);
- return;
- }
- m_log_pool_actual_size = m_log_pool_config_size;
- m_bytes_allocated_cap = effective_pool_size;
- /* Log ring empty */
- m_first_free_entry = 0;
- m_first_valid_entry = 0;
- TX_BEGIN(m_log_pool) {
- TX_ADD(pool_root);
- D_RW(pool_root)->header.layout_version = RWL_POOL_VERSION;
- D_RW(pool_root)->log_entries =
- TX_ZALLOC(struct WriteLogPmemEntry,
- sizeof(struct WriteLogPmemEntry) * num_small_writes);
- D_RW(pool_root)->pool_size = m_log_pool_actual_size;
- D_RW(pool_root)->flushed_sync_gen = m_flushed_sync_gen;
- D_RW(pool_root)->block_size = MIN_WRITE_ALLOC_SIZE;
- D_RW(pool_root)->num_log_entries = num_small_writes;
- D_RW(pool_root)->first_free_entry = m_first_free_entry;
- D_RW(pool_root)->first_valid_entry = m_first_valid_entry;
- } TX_ONCOMMIT {
- m_total_log_entries = D_RO(pool_root)->num_log_entries;
- m_free_log_entries = D_RO(pool_root)->num_log_entries - 1; // leave one free
- } TX_ONABORT {
- m_total_log_entries = 0;
- m_free_log_entries = 0;
- lderr(cct) << "failed to initialize pool (" << m_log_pool_name << ")" << dendl;
- on_finish->complete(-pmemobj_tx_errno());
- return;
- } TX_FINALLY {
- } TX_END;
- } else {
- m_cache_state->present = true;
- /* Open existing pool */
- if ((m_log_pool =
- pmemobj_open(m_log_pool_name.c_str(),
- m_rwl_pool_layout_name)) == NULL) {
- lderr(cct) << "failed to open pool (" << m_log_pool_name << "): "
- << pmemobj_errormsg() << dendl;
- on_finish->complete(-errno);
- return;
- }
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
- if (D_RO(pool_root)->header.layout_version != RWL_POOL_VERSION) {
- // TODO: will handle upgrading version in the future
- lderr(cct) << "Pool layout version is " << D_RO(pool_root)->header.layout_version
- << " expected " << RWL_POOL_VERSION << dendl;
- on_finish->complete(-EINVAL);
- return;
- }
- if (D_RO(pool_root)->block_size != MIN_WRITE_ALLOC_SIZE) {
- lderr(cct) << "Pool block size is " << D_RO(pool_root)->block_size
- << " expected " << MIN_WRITE_ALLOC_SIZE << dendl;
- on_finish->complete(-EINVAL);
- return;
- }
- m_log_pool_actual_size = D_RO(pool_root)->pool_size;
- m_flushed_sync_gen = D_RO(pool_root)->flushed_sync_gen;
- m_total_log_entries = D_RO(pool_root)->num_log_entries;
- m_first_free_entry = D_RO(pool_root)->first_free_entry;
- m_first_valid_entry = D_RO(pool_root)->first_valid_entry;
- if (m_first_free_entry < m_first_valid_entry) {
- /* Valid entries wrap around the end of the ring, so first_free is lower
- * than first_valid. If first_valid was == first_free+1, the entry at
- * first_free would be empty. The last entry is never used, so in
- * that case there would be zero free log entries. */
- m_free_log_entries = m_total_log_entries - (m_first_valid_entry - m_first_free_entry) -1;
- } else {
- /* first_valid is <= first_free. If they are == we have zero valid log
- * entries, and n-1 free log entries */
- m_free_log_entries = m_total_log_entries - (m_first_free_entry - m_first_valid_entry) -1;
- }
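-  /* Worked example of the ring arithmetic above (illustrative values,
-   * assuming m_total_log_entries = 10):
-   *   first_valid=7, first_free=2 (wrapped):  10 - (7 - 2) - 1 = 4 free
-   *   first_valid=2, first_free=7 (no wrap):  10 - (7 - 2) - 1 = 4 free
-   *   first_valid == first_free (empty log):  10 - 0 - 1 = 9 free */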
- size_t effective_pool_size = (size_t)(m_log_pool_config_size * USABLE_SIZE);
- m_bytes_allocated_cap = effective_pool_size;
- load_existing_entries(later);
- m_cache_state->clean = m_dirty_log_entries.empty();
- m_cache_state->empty = m_log_entries.empty();
- }
-
- ldout(cct,1) << "pool " << m_log_pool_name << " has " << m_total_log_entries
- << " log entries, " << m_free_log_entries << " of which are free."
- << " first_valid=" << m_first_valid_entry
- << ", first_free=" << m_first_free_entry
- << ", flushed_sync_gen=" << m_flushed_sync_gen
- << ", m_current_sync_gen=" << m_current_sync_gen << dendl;
- if (m_first_free_entry == m_first_valid_entry) {
- ldout(cct,1) << "write log is empty" << dendl;
- m_cache_state->empty = true;
- }
-
- /* Start the sync point following the last one seen in the
- * log. Flush the last sync point created during the loading of the
- * existing log entries. */
- init_flush_new_sync_point(later);
- ldout(cct,20) << "new sync point = [" << m_current_sync_point << "]" << dendl;
-
- m_initialized = true;
- // Start the thread
- m_thread_pool.start();
-
- m_periodic_stats_enabled = m_cache_state->log_periodic_stats;
- /* Do these after we drop lock */
- later.add(new LambdaContext([this](int r) {
- if (m_periodic_stats_enabled) {
- /* Log stats for the first time */
- periodic_stats();
- /* Arm periodic stats logging for the first time */
- std::lock_guard timer_locker(*m_timer_lock);
- arm_periodic_stats();
- }
- }));
- m_image_ctx.op_work_queue->queue(on_finish, 0);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::update_image_cache_state(Context *on_finish) {
- m_cache_state->write_image_cache_state(on_finish);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::init(Context *on_finish) {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << dendl;
- perf_start(m_image_ctx.id);
-
- ceph_assert(!m_initialized);
-
- Context *ctx = new LambdaContext(
- [this, on_finish](int r) {
- if (r >= 0) {
- update_image_cache_state(on_finish);
- } else {
- on_finish->complete(r);
- }
- });
-
- DeferredContexts later;
- rwl_init(ctx, later);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::shut_down(Context *on_finish) {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << dendl;
-
- ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl;
-
- Context *ctx = new LambdaContext(
- [this, on_finish](int r) {
- ldout(m_image_ctx.cct, 6) << "shutdown complete" << dendl;
- m_image_ctx.op_work_queue->queue(on_finish, r);
- });
- ctx = new LambdaContext(
- [this, ctx](int r) {
- Context *next_ctx = override_ctx(r, ctx);
- bool periodic_stats_enabled = m_periodic_stats_enabled;
- m_periodic_stats_enabled = false;
-
- if (periodic_stats_enabled) {
- /* Log stats one last time if they were enabled */
- periodic_stats();
- }
- {
- std::lock_guard locker(m_lock);
- ceph_assert(m_dirty_log_entries.size() == 0);
- m_wake_up_enabled = false;
- m_cache_state->clean = true;
- m_log_entries.clear();
- if (m_log_pool) {
- ldout(m_image_ctx.cct, 6) << "closing pmem pool" << dendl;
- pmemobj_close(m_log_pool);
- }
- if (m_cache_state->clean) {
- if (m_log_is_poolset) {
- ldout(m_image_ctx.cct, 5) << "Not removing poolset " << m_log_pool_name << dendl;
- } else {
- ldout(m_image_ctx.cct, 5) << "Removing empty pool file: " << m_log_pool_name << dendl;
- if (remove(m_log_pool_name.c_str()) != 0) {
- lderr(m_image_ctx.cct) << "failed to remove empty pool \"" << m_log_pool_name << "\": "
- << pmemobj_errormsg() << dendl;
- } else {
- m_cache_state->clean = true;
- m_cache_state->empty = true;
- m_cache_state->present = false;
- }
- }
- } else {
- if (m_log_is_poolset) {
- ldout(m_image_ctx.cct, 5) << "Not removing poolset " << m_log_pool_name << dendl;
- } else {
- ldout(m_image_ctx.cct, 5) << "Not removing pool file: " << m_log_pool_name << dendl;
- }
- }
- if (m_perfcounter) {
- perf_stop();
- }
- }
- update_image_cache_state(next_ctx);
- });
- ctx = new LambdaContext(
- [this, ctx](int r) {
- Context *next_ctx = override_ctx(r, ctx);
- {
- /* Sync with process_writeback_dirty_entries() */
- RWLock::WLocker entry_reader_wlocker(m_entry_reader_lock);
- m_shutting_down = true;
- /* Flush all writes to OSDs (unless disabled) and wait for all
- in-progress flush writes to complete */
- ldout(m_image_ctx.cct, 6) << "flushing" << dendl;
- if (m_periodic_stats_enabled) {
- periodic_stats();
- }
- }
- flush_dirty_entries(next_ctx);
- });
- ctx = new LambdaContext(
- [this, ctx](int r) {
- Context *next_ctx = override_ctx(r, ctx);
- ldout(m_image_ctx.cct, 6) << "waiting for in flight operations" << dendl;
- // Wait for in progress IOs to complete
- next_ctx = util::create_async_context_callback(m_image_ctx, next_ctx);
- m_async_op_tracker.wait_for_ops(next_ctx);
- });
- ctx = new LambdaContext(
- [this, ctx](int r) {
- ldout(m_image_ctx.cct, 6) << "Done internal_flush in shutdown" << dendl;
- m_work_queue.queue(ctx, r);
- });
- /* Complete all in-flight writes before shutting down */
- ldout(m_image_ctx.cct, 6) << "internal_flush in shutdown" << dendl;
- internal_flush(false, ctx);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::read(Extents&& image_extents,
- ceph::bufferlist* bl,
- int fadvise_flags, Context *on_finish) {
- // TODO: handle writesame and discard case in later PRs
- CephContext *cct = m_image_ctx.cct;
- utime_t now = ceph_clock_now();
- C_ReadRequest *read_ctx = new C_ReadRequest(cct, now, m_perfcounter, bl, on_finish);
- ldout(cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id
- << "image_extents=" << image_extents << ", "
- << "bl=" << bl << ", "
- << "on_finish=" << on_finish << dendl;
-
- ceph_assert(m_initialized);
- bl->clear();
- m_perfcounter->inc(l_librbd_rwl_rd_req, 1);
-
- /*
- * The strategy here is to look up all the WriteLogMapEntries that overlap
- * this read, and iterate through those to separate this read into hits and
- * misses. A new Extents object is produced here with Extents for each miss
- * region. That miss Extents object is then passed on to the read cache below RWL. We
- * also produce an ImageExtentBufs for all the extents (hit or miss) in this
- * read. When the read from the lower cache layer completes, we iterate
- * through the ImageExtentBufs and insert buffers for each cache hit at the
- * appropriate spot in the bufferlist returned from below for the miss
- * read. The buffers we insert here refer directly to regions of various
- * write log entry data buffers.
- *
- * Locking: These buffer objects hold a reference on the write log entries
- * they refer to. Log entries can't be retired until there are no references.
- * The GenericWriteLogEntry references are released by the buffer destructor.
- */
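-  /* Illustrative example (hypothetical offsets): a read extent {0, 4096}
-   * overlapping a single log entry that covers {1024, 1024} splits into
-   *   miss {0, 1024}    -> read from the layer below
-   *   hit  {1024, 1024} -> copied from the log entry's data buffer
-   *   miss {2048, 2048} -> read from the layer below
-   * and the three ImageExtentBufs are reassembled in order when the
-   * lower-layer read completes. */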
- for (auto &extent : image_extents) {
- uint64_t extent_offset = 0;
- RWLock::RLocker entry_reader_locker(m_entry_reader_lock);
- WriteLogMapEntries map_entries = m_blocks_to_log_entries.find_map_entries(block_extent(extent));
- for (auto &map_entry : map_entries) {
- Extent entry_image_extent(rwl::image_extent(map_entry.block_extent));
- /* If this map entry starts after the current image extent offset ... */
- if (entry_image_extent.first > extent.first + extent_offset) {
- /* ... add range before map_entry to miss extents */
- uint64_t miss_extent_start = extent.first + extent_offset;
- uint64_t miss_extent_length = entry_image_extent.first - miss_extent_start;
- Extent miss_extent(miss_extent_start, miss_extent_length);
- read_ctx->miss_extents.push_back(miss_extent);
- /* Add miss range to read extents */
- ImageExtentBuf miss_extent_buf(miss_extent);
- read_ctx->read_extents.push_back(miss_extent_buf);
- extent_offset += miss_extent_length;
- }
- ceph_assert(entry_image_extent.first <= extent.first + extent_offset);
- uint64_t entry_offset = 0;
- /* If this map entry starts before the current image extent offset ... */
- if (entry_image_extent.first < extent.first + extent_offset) {
- /* ... compute offset into log entry for this read extent */
- entry_offset = (extent.first + extent_offset) - entry_image_extent.first;
- }
- /* This read hit ends at the end of the extent or the end of the log
- entry, whichever is less. */
- uint64_t entry_hit_length = min(entry_image_extent.second - entry_offset,
- extent.second - extent_offset);
- Extent hit_extent(entry_image_extent.first, entry_hit_length);
- if (0 == map_entry.log_entry->write_bytes() && 0 < map_entry.log_entry->bytes_dirty()) {
- /* discard log entry */
- auto discard_entry = map_entry.log_entry;
- ldout(cct, 20) << "read hit on discard entry: log_entry=" << *discard_entry << dendl;
- /* Discards read as zero, so we'll construct a bufferlist of zeros */
- bufferlist zero_bl;
- zero_bl.append_zero(entry_hit_length);
- /* Add hit extent to read extents */
- ImageExtentBuf hit_extent_buf(hit_extent, zero_bl);
- read_ctx->read_extents.push_back(hit_extent_buf);
- } else {
- /* write and writesame log entry */
- /* Offset of the map entry into the log entry's buffer */
- uint64_t map_entry_buffer_offset = entry_image_extent.first - map_entry.log_entry->ram_entry.image_offset_bytes;
- /* Offset into the log entry buffer of this read hit */
- uint64_t read_buffer_offset = map_entry_buffer_offset + entry_offset;
- /* Create buffer object referring to pmem pool for this read hit */
- auto write_entry = map_entry.log_entry;
-
- /* Make a bl for this hit extent. This will add references to the write_entry->pmem_bp */
- buffer::list hit_bl;
-
- buffer::list entry_bl_copy;
- write_entry->copy_pmem_bl(&entry_bl_copy);
- entry_bl_copy.begin(read_buffer_offset).copy(entry_hit_length, hit_bl);
-
- ceph_assert(hit_bl.length() == entry_hit_length);
-
- /* Add hit extent to read extents */
- ImageExtentBuf hit_extent_buf(hit_extent, hit_bl);
- read_ctx->read_extents.push_back(hit_extent_buf);
- }
- /* Exclude RWL hit range from buffer and extent */
- extent_offset += entry_hit_length;
- ldout(cct, 20) << map_entry << dendl;
- }
- /* If the last map entry didn't consume the entire image extent ... */
- if (extent.second > extent_offset) {
- /* ... add the rest of this extent to miss extents */
- uint64_t miss_extent_start = extent.first + extent_offset;
- uint64_t miss_extent_length = extent.second - extent_offset;
- Extent miss_extent(miss_extent_start, miss_extent_length);
- read_ctx->miss_extents.push_back(miss_extent);
- /* Add miss range to read extents */
- ImageExtentBuf miss_extent_buf(miss_extent);
- read_ctx->read_extents.push_back(miss_extent_buf);
- extent_offset += miss_extent_length;
- }
- }
-
- ldout(cct, 20) << "miss_extents=" << read_ctx->miss_extents << ", "
- << "miss_bl=" << read_ctx->miss_bl << dendl;
-
- if (read_ctx->miss_extents.empty()) {
- /* All of this read comes from RWL */
- read_ctx->complete(0);
- } else {
- /* Pass the read misses on to the layer below RWL */
- m_image_writeback.aio_read(std::move(read_ctx->miss_extents), &read_ctx->miss_bl, fadvise_flags, read_ctx);
- }
-}
-
-template <typename I>
-void AbstractWriteLog<I>::write(Extents &&image_extents,
- bufferlist&& bl,
- int fadvise_flags,
- Context *on_finish) {
- CephContext *cct = m_image_ctx.cct;
-
- ldout(cct, 20) << "aio_write" << dendl;
-
- utime_t now = ceph_clock_now();
- m_perfcounter->inc(l_librbd_rwl_wr_req, 1);
-
- ceph_assert(m_initialized);
-
- auto *write_req =
- new C_WriteRequestT(*this, now, std::move(image_extents), std::move(bl), fadvise_flags,
- m_lock, m_perfcounter, on_finish);
- m_perfcounter->inc(l_librbd_rwl_wr_bytes, write_req->image_extents_summary.total_bytes);
-
- /* The lambda below will be called when the block guard for all
- * blocks affected by this write is obtained */
- GuardedRequestFunctionContext *guarded_ctx =
- new GuardedRequestFunctionContext([this, write_req](GuardedRequestFunctionContext &guard_ctx) {
- write_req->blockguard_acquired(guard_ctx);
- alloc_and_dispatch_io_req(write_req);
- });
-
- detain_guarded_request(write_req, guarded_ctx, false);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::discard(uint64_t offset, uint64_t length,
- uint32_t discard_granularity_bytes,
- Context *on_finish) {
- CephContext *cct = m_image_ctx.cct;
-
- ldout(cct, 20) << dendl;
-
- utime_t now = ceph_clock_now();
- m_perfcounter->inc(l_librbd_rwl_discard, 1);
- Extents discard_extents = {{offset, length}};
- m_discard_granularity_bytes = discard_granularity_bytes;
-
- ceph_assert(m_initialized);
-
- auto *discard_req =
- new C_DiscardRequestT(*this, now, std::move(discard_extents), discard_granularity_bytes,
- m_lock, m_perfcounter, on_finish);
-
- /* The lambda below will be called when the block guard for all
- * blocks affected by this write is obtained */
- GuardedRequestFunctionContext *guarded_ctx =
- new GuardedRequestFunctionContext([this, discard_req](GuardedRequestFunctionContext &guard_ctx) {
- discard_req->blockguard_acquired(guard_ctx);
- alloc_and_dispatch_io_req(discard_req);
- });
-
- detain_guarded_request(discard_req, guarded_ctx, false);
-}
-
-/**
- * Aio_flush completes when all previously completed writes are
- * flushed to persistent cache. We make a best-effort attempt to also
- * defer until all in-progress writes complete, but we may not know
- * about all of the writes the application considers in-progress yet,
- * due to uncertainty in the IO submission workq (multiple WQ threads
- * may allow out-of-order submission).
- *
- * This flush operation will not wait for writes deferred for overlap
- * in the block guard.
- */
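-/* Example of the resulting semantics (illustrative): if write A has
- * completed back to the caller, aio_flush F is then issued, and write B
- * is still in flight, F guarantees A is persisted; B is covered only on
- * a best-effort basis, since F may have been submitted ahead of B in
- * the IO submission workq. */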
-template <typename I>
-void AbstractWriteLog<I>::flush(io::FlushSource flush_source, Context *on_finish) {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << "on_finish=" << on_finish << " flush_source=" << flush_source << dendl;
-
- if (io::FLUSH_SOURCE_SHUTDOWN == flush_source || io::FLUSH_SOURCE_INTERNAL == flush_source) {
- internal_flush(false, on_finish);
- return;
- }
- m_perfcounter->inc(l_librbd_rwl_aio_flush, 1);
-
- /* May be called even if initialization fails */
- if (!m_initialized) {
- ldout(cct, 5) << "never initialized" << dendl;
- /* Deadlock if completed here */
- m_image_ctx.op_work_queue->queue(on_finish, 0);
- return;
- }
-
- {
- std::shared_lock image_locker(m_image_ctx.image_lock);
- if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only) {
- on_finish->complete(-EROFS);
- return;
- }
- }
-
- auto flush_req = make_flush_req(on_finish);
-
- GuardedRequestFunctionContext *guarded_ctx =
- new GuardedRequestFunctionContext([this, flush_req](GuardedRequestFunctionContext &guard_ctx) {
- ldout(m_image_ctx.cct, 20) << "flush_req=" << flush_req << " cell=" << guard_ctx.cell << dendl;
- ceph_assert(guard_ctx.cell);
- flush_req->detained = guard_ctx.state.detained;
- /* We don't call flush_req->set_cell(), because the block guard will be released here */
- {
- DeferredContexts post_unlock; /* Do these when the lock below is released */
- std::lock_guard locker(m_lock);
-
- if (!m_persist_on_flush && m_persist_on_write_until_flush) {
- m_persist_on_flush = true;
- ldout(m_image_ctx.cct, 5) << "now persisting on flush" << dendl;
- }
-
- /*
- * Create a new sync point if there have been writes since the last
- * one.
- *
- * We do not flush the caches below the RWL here.
- */
- flush_new_sync_point_if_needed(flush_req, post_unlock);
- }
-
- release_guarded_request(guard_ctx.cell);
- });
-
- detain_guarded_request(flush_req, guarded_ctx, true);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::writesame(uint64_t offset, uint64_t length,
- bufferlist&& bl, int fadvise_flags,
- Context *on_finish) {
- CephContext *cct = m_image_ctx.cct;
-
- ldout(cct, 20) << "aio_writesame" << dendl;
-
- utime_t now = ceph_clock_now();
- Extents ws_extents = {{offset, length}};
- m_perfcounter->inc(l_librbd_rwl_ws, 1);
- ceph_assert(m_initialized);
-
- /* A write same request is also a write request. The key difference is the
- * write same data buffer is shorter than the extent of the request. The full
- * extent will be used in the block guard, and appear in
- * m_blocks_to_log_entries_map. The data buffer allocated for the WS is only
- * as long as the length of the bl here, which is the pattern that's repeated
- * in the image for the entire length of this WS. Read hits and flushing of
- * write sames are different than normal writes. */
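-  /* Example (illustrative): writesame(offset=0, length=1048576, bl of 512
-   * bytes) guards and maps the full {0, 1048576} extent, but allocates only
-   * the 512-byte pattern buffer; read hits over any part of the extent
-   * replay the pattern at the appropriate offset. */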
- auto *ws_req =
- new C_WriteSameRequestT(*this, now, std::move(ws_extents), std::move(bl),
- fadvise_flags, m_lock, m_perfcounter, on_finish);
- m_perfcounter->inc(l_librbd_rwl_ws_bytes, ws_req->image_extents_summary.total_bytes);
-
- /* The lambda below will be called when the block guard for all
- * blocks affected by this write is obtained */
- GuardedRequestFunctionContext *guarded_ctx =
- new GuardedRequestFunctionContext([this, ws_req](GuardedRequestFunctionContext &guard_ctx) {
- ws_req->blockguard_acquired(guard_ctx);
- alloc_and_dispatch_io_req(ws_req);
- });
-
- detain_guarded_request(ws_req, guarded_ctx, false);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::compare_and_write(Extents &&image_extents,
- bufferlist&& cmp_bl,
- bufferlist&& bl,
- uint64_t *mismatch_offset,
- int fadvise_flags,
- Context *on_finish) {
- ldout(m_image_ctx.cct, 20) << dendl;
-
- utime_t now = ceph_clock_now();
- m_perfcounter->inc(l_librbd_rwl_cmp, 1);
- ceph_assert(m_initialized);
-
- /* A compare and write request is also a write request. We only allocate
- * resources and dispatch this write request if the compare phase
- * succeeds. */
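-  /* Example (illustrative): compare_and_write over {0, 4096} first reads
-   * {0, 4096} through the cache; the write is dispatched only if the first
-   * cmp_bl.length() bytes of read_bl match cmp_bl, otherwise the request
-   * completes with -EILSEQ and *mismatch_offset set to the first differing
-   * byte. */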
- auto *cw_req =
- new C_CompAndWriteRequestT(*this, now, std::move(image_extents), std::move(cmp_bl), std::move(bl),
- mismatch_offset, fadvise_flags, m_lock, m_perfcounter, on_finish);
- m_perfcounter->inc(l_librbd_rwl_cmp_bytes, cw_req->image_extents_summary.total_bytes);
-
- /* The lambda below will be called when the block guard for all
- * blocks affected by this write is obtained */
- GuardedRequestFunctionContext *guarded_ctx =
- new GuardedRequestFunctionContext([this, cw_req](GuardedRequestFunctionContext &guard_ctx) {
- cw_req->blockguard_acquired(guard_ctx);
-
- auto read_complete_ctx = new LambdaContext(
- [this, cw_req](int r) {
- ldout(m_image_ctx.cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id
- << "cw_req=" << cw_req << dendl;
-
- /* Compare read_bl to cmp_bl to determine if this will produce a write */
- buffer::list aligned_read_bl;
- if (cw_req->cmp_bl.length() < cw_req->read_bl.length()) {
- aligned_read_bl.substr_of(cw_req->read_bl, 0, cw_req->cmp_bl.length());
- }
- if (cw_req->cmp_bl.contents_equal(cw_req->read_bl) ||
- cw_req->cmp_bl.contents_equal(aligned_read_bl)) {
- /* Compare phase succeeds. Begin write */
- ldout(m_image_ctx.cct, 5) << " cw_req=" << cw_req << " compare matched" << dendl;
- cw_req->compare_succeeded = true;
- *cw_req->mismatch_offset = 0;
- /* Continue with this request as a write. Blockguard release and
- * user request completion handled as if this were a plain
- * write. */
- alloc_and_dispatch_io_req(cw_req);
- } else {
- /* Compare phase fails. Compare-and-write ends now. */
- ldout(m_image_ctx.cct, 15) << " cw_req=" << cw_req << " compare failed" << dendl;
- /* Bufferlist doesn't tell us where they differed, so we'll have to determine that here */
- uint64_t bl_index = 0;
- for (bl_index = 0; bl_index < cw_req->cmp_bl.length(); bl_index++) {
- if (cw_req->cmp_bl[bl_index] != cw_req->read_bl[bl_index]) {
- ldout(m_image_ctx.cct, 15) << " cw_req=" << cw_req << " mismatch at " << bl_index << dendl;
- break;
- }
- }
- cw_req->compare_succeeded = false;
- *cw_req->mismatch_offset = bl_index;
- cw_req->complete_user_request(-EILSEQ);
- cw_req->release_cell();
- cw_req->complete(0);
- }
- });
-
- /* Read phase of comp-and-write must read through RWL */
- Extents image_extents_copy = cw_req->image_extents;
- read(std::move(image_extents_copy), &cw_req->read_bl, cw_req->fadvise_flags, read_complete_ctx);
- });
-
- detain_guarded_request(cw_req, guarded_ctx, false);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::flush(Context *on_finish) {
- internal_flush(false, on_finish);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::invalidate(Context *on_finish) {
- internal_flush(true, on_finish);
-}
-
-template <typename I>
-CephContext *AbstractWriteLog<I>::get_context() {
- return m_image_ctx.cct;
-}
-
-template <typename I>
-BlockGuardCell* AbstractWriteLog<I>::detain_guarded_request_helper(GuardedRequest &req)
-{
- CephContext *cct = m_image_ctx.cct;
- BlockGuardCell *cell;
-
- ceph_assert(ceph_mutex_is_locked_by_me(m_blockguard_lock));
- ldout(cct, 20) << dendl;
-
- int r = m_write_log_guard.detain(req.block_extent, &req, &cell);
- ceph_assert(r>=0);
- if (r > 0) {
- ldout(cct, 20) << "detaining guarded request due to in-flight requests: "
- << "req=" << req << dendl;
- return nullptr;
- }
-
- ldout(cct, 20) << "in-flight request cell: " << cell << dendl;
- return cell;
-}
-
-template <typename I>
-BlockGuardCell* AbstractWriteLog<I>::detain_guarded_request_barrier_helper(
- GuardedRequest &req)
-{
- BlockGuardCell *cell = nullptr;
-
- ceph_assert(ceph_mutex_is_locked_by_me(m_blockguard_lock));
- ldout(m_image_ctx.cct, 20) << dendl;
-
- if (m_barrier_in_progress) {
- req.guard_ctx->state.queued = true;
- m_awaiting_barrier.push_back(req);
- } else {
- bool barrier = req.guard_ctx->state.barrier;
- if (barrier) {
- m_barrier_in_progress = true;
- req.guard_ctx->state.current_barrier = true;
- }
- cell = detain_guarded_request_helper(req);
- if (barrier) {
- /* Only non-null if the barrier acquires the guard now */
- m_barrier_cell = cell;
- }
- }
-
- return cell;
-}
-
-template <typename I>
-void AbstractWriteLog<I>::detain_guarded_request(
- C_BlockIORequestT *request,
- GuardedRequestFunctionContext *guarded_ctx,
- bool is_barrier)
-{
- BlockExtent extent;
- if (request) {
- extent = request->image_extents_summary.block_extent();
- } else {
- extent = block_extent(whole_volume_extent());
- }
- auto req = GuardedRequest(extent, guarded_ctx, is_barrier);
- BlockGuardCell *cell = nullptr;
-
- ldout(m_image_ctx.cct, 20) << dendl;
- {
- std::lock_guard locker(m_blockguard_lock);
- cell = detain_guarded_request_barrier_helper(req);
- }
- if (cell) {
- req.guard_ctx->cell = cell;
- req.guard_ctx->complete(0);
- }
-}
-
-template <typename I>
-void AbstractWriteLog<I>::release_guarded_request(BlockGuardCell *released_cell)
-{
- CephContext *cct = m_image_ctx.cct;
- WriteLogGuard::BlockOperations block_reqs;
- ldout(cct, 20) << "released_cell=" << released_cell << dendl;
-
- {
- std::lock_guard locker(m_blockguard_lock);
- m_write_log_guard.release(released_cell, &block_reqs);
-
- for (auto &req : block_reqs) {
- req.guard_ctx->state.detained = true;
- BlockGuardCell *detained_cell = detain_guarded_request_helper(req);
- if (detained_cell) {
- if (req.guard_ctx->state.current_barrier) {
- /* The current barrier is acquiring the block guard, so now we know its cell */
- m_barrier_cell = detained_cell;
- /* detained_cell could be == released_cell here */
- ldout(cct, 20) << "current barrier cell=" << detained_cell << " req=" << req << dendl;
- }
- req.guard_ctx->cell = detained_cell;
- m_work_queue.queue(req.guard_ctx);
- }
- }
-
- if (m_barrier_in_progress && (released_cell == m_barrier_cell)) {
- ldout(cct, 20) << "current barrier released cell=" << released_cell << dendl;
- /* The released cell is the current barrier request */
- m_barrier_in_progress = false;
- m_barrier_cell = nullptr;
- /* Move waiting requests into the blockguard. Stop if there's another barrier */
- while (!m_barrier_in_progress && !m_awaiting_barrier.empty()) {
- auto &req = m_awaiting_barrier.front();
- ldout(cct, 20) << "submitting queued request to blockguard: " << req << dendl;
- BlockGuardCell *detained_cell = detain_guarded_request_barrier_helper(req);
- if (detained_cell) {
- req.guard_ctx->cell = detained_cell;
- m_work_queue.queue(req.guard_ctx);
- }
- m_awaiting_barrier.pop_front();
- }
- }
- }
-
- ldout(cct, 20) << "exit" << dendl;
-}
-
-/*
- * Performs the log event append operation for all of the scheduled
- * events.
- */
-template <typename I>
-void AbstractWriteLog<I>::append_scheduled_ops(void)
-{
- GenericLogOperations ops;
- int append_result = 0;
- bool ops_remain = false;
- bool appending = false; /* true if we set m_appending */
- ldout(m_image_ctx.cct, 20) << dendl;
- do {
- ops.clear();
-
- {
- std::lock_guard locker(m_lock);
- if (!appending && m_appending) {
- /* Another thread is appending */
- ldout(m_image_ctx.cct, 15) << "Another thread is appending" << dendl;
- return;
- }
- if (m_ops_to_append.size()) {
- appending = true;
- m_appending = true;
- auto last_in_batch = m_ops_to_append.begin();
- unsigned int ops_to_append = m_ops_to_append.size();
- if (ops_to_append > OPS_APPENDED_TOGETHER) {
- ops_to_append = OPS_APPENDED_TOGETHER;
- }
- std::advance(last_in_batch, ops_to_append);
- ops.splice(ops.end(), m_ops_to_append, m_ops_to_append.begin(), last_in_batch);
- ops_remain = true; /* Always check again before leaving */
- ldout(m_image_ctx.cct, 20) << "appending " << ops.size() << ", "
- << m_ops_to_append.size() << " remain" << dendl;
- } else {
- ops_remain = false;
- if (appending) {
- appending = false;
- m_appending = false;
- }
- }
- }
-
- if (ops.size()) {
- std::lock_guard locker(m_log_append_lock);
- alloc_op_log_entries(ops);
- append_result = append_op_log_entries(ops);
- }
-
- int num_ops = ops.size();
- if (num_ops) {
- /* New entries may be flushable. Completion will wake up flusher. */
- complete_op_log_entries(std::move(ops), append_result);
- }
- } while (ops_remain);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::enlist_op_appender()
-{
- m_async_append_ops++;
- m_async_op_tracker.start_op();
- Context *append_ctx = new LambdaContext([this](int r) {
- append_scheduled_ops();
- m_async_append_ops--;
- m_async_op_tracker.finish_op();
- });
- m_work_queue.queue(append_ctx);
-}
-
-/*
- * Takes custody of ops. They'll all get their log entries appended,
- * and have their on_write_persist contexts completed once they and
- * all prior log entries are persisted everywhere.
- */
-template <typename I>
-void AbstractWriteLog<I>::schedule_append(GenericLogOperations &ops)
-{
- bool need_finisher;
- GenericLogOperationsVector appending;
-
- std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending));
- {
- std::lock_guard locker(m_lock);
-
- need_finisher = m_ops_to_append.empty() && !m_appending;
- m_ops_to_append.splice(m_ops_to_append.end(), ops);
- }
-
- if (need_finisher) {
- enlist_op_appender();
- }
-
- for (auto &op : appending) {
- op->appending();
- }
-}
-
-template <typename I>
-void AbstractWriteLog<I>::schedule_append(GenericLogOperationsVector &ops)
-{
- GenericLogOperations to_append(ops.begin(), ops.end());
-
- schedule_append(to_append);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::schedule_append(GenericLogOperationSharedPtr op)
-{
- GenericLogOperations to_append { op };
-
- schedule_append(to_append);
-}
-
-const unsigned long int ops_flushed_together = 4;
-/*
- * Performs the pmem buffer flush on all scheduled ops, then schedules
- * the log event append operation for all of them.
- */
-template <typename I>
-void AbstractWriteLog<I>::flush_then_append_scheduled_ops(void)
-{
- GenericLogOperations ops;
- bool ops_remain = false;
- ldout(m_image_ctx.cct, 20) << dendl;
- do {
- {
- ops.clear();
- std::lock_guard locker(m_lock);
- if (m_ops_to_flush.size()) {
- auto last_in_batch = m_ops_to_flush.begin();
- unsigned int ops_to_flush = m_ops_to_flush.size();
- if (ops_to_flush > ops_flushed_together) {
- ops_to_flush = ops_flushed_together;
- }
- ldout(m_image_ctx.cct, 20) << "should flush " << ops_to_flush << dendl;
- std::advance(last_in_batch, ops_to_flush);
- ops.splice(ops.end(), m_ops_to_flush, m_ops_to_flush.begin(), last_in_batch);
- ops_remain = !m_ops_to_flush.empty();
- ldout(m_image_ctx.cct, 20) << "flushing " << ops.size() << ", "
- << m_ops_to_flush.size() << " remain" << dendl;
- } else {
- ops_remain = false;
- }
- }
- if (ops_remain) {
- enlist_op_flusher();
- }
-
- /* Ops subsequently scheduled for flush may finish before these,
- * which is fine. We're unconcerned with completion order until we
- * get to the log message append step. */
- if (ops.size()) {
- flush_pmem_buffer(ops);
- schedule_append(ops);
- }
- } while (ops_remain);
- append_scheduled_ops();
-}
-
-template <typename I>
-void AbstractWriteLog<I>::enlist_op_flusher()
-{
- m_async_flush_ops++;
- m_async_op_tracker.start_op();
- Context *flush_ctx = new LambdaContext([this](int r) {
- flush_then_append_scheduled_ops();
- m_async_flush_ops--;
- m_async_op_tracker.finish_op();
- });
- m_work_queue.queue(flush_ctx);
-}
-
-/*
- * Takes custody of ops. They'll all get their pmem blocks flushed,
- * then get their log entries appended.
- */
-template <typename I>
-void AbstractWriteLog<I>::schedule_flush_and_append(GenericLogOperationsVector &ops)
-{
- GenericLogOperations to_flush(ops.begin(), ops.end());
- bool need_finisher;
- ldout(m_image_ctx.cct, 20) << dendl;
- {
- std::lock_guard locker(m_lock);
-
- need_finisher = m_ops_to_flush.empty();
- m_ops_to_flush.splice(m_ops_to_flush.end(), to_flush);
- }
-
- if (need_finisher) {
- enlist_op_flusher();
- }
-}
-
-/*
- * Flush the pmem regions for the data blocks of a set of operations
- *
- * V is expected to be GenericLogOperations or GenericLogOperationsVector
- */
-template <typename I>
-template <typename V>
-void AbstractWriteLog<I>::flush_pmem_buffer(V& ops)
-{
- for (auto &operation : ops) {
- operation->flush_pmem_buf_to_cache(m_log_pool);
- }
-
- /* Drain once for all */
- pmemobj_drain(m_log_pool);
-
- utime_t now = ceph_clock_now();
- for (auto &operation : ops) {
- if (operation->reserved_allocated()) {
- operation->buf_persist_comp_time = now;
- } else {
- ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
- }
- }
-}
-
-/*
- * Allocate the (already reserved) write log entries for a set of operations.
- *
- * Locking:
- * Acquires lock
- */
-template <typename I>
-void AbstractWriteLog<I>::alloc_op_log_entries(GenericLogOperations &ops)
-{
- TOID(struct WriteLogPoolRoot) pool_root;
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
- struct WriteLogPmemEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries);
-
- ceph_assert(ceph_mutex_is_locked_by_me(m_log_append_lock));
-
- /* Allocate the (already reserved) log entries */
- std::lock_guard locker(m_lock);
-
- for (auto &operation : ops) {
- uint32_t entry_index = m_first_free_entry;
- m_first_free_entry = (m_first_free_entry + 1) % m_total_log_entries;
- auto &log_entry = operation->get_log_entry();
- log_entry->log_entry_index = entry_index;
- log_entry->ram_entry.entry_index = entry_index;
- log_entry->pmem_entry = &pmem_log_entries[entry_index];
- log_entry->ram_entry.entry_valid = 1;
- m_log_entries.push_back(log_entry);
- ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl;
- }
-}
-
-/*
- * Flush the persistent write log entries for a set of ops. The entries must
- * be contiguous in persistent memory.
- */
-template <typename I>
-void AbstractWriteLog<I>::flush_op_log_entries(GenericLogOperationsVector &ops)
-{
- if (ops.empty()) {
- return;
- }
-
- if (ops.size() > 1) {
- ceph_assert(ops.front()->get_log_entry()->pmem_entry < ops.back()->get_log_entry()->pmem_entry);
- }
-
- ldout(m_image_ctx.cct, 20) << "entry count=" << ops.size() << " "
- << "start address="
- << ops.front()->get_log_entry()->pmem_entry << " "
- << "bytes="
- << ops.size() * sizeof(*(ops.front()->get_log_entry()->pmem_entry))
- << dendl;
- pmemobj_flush(m_log_pool,
- ops.front()->get_log_entry()->pmem_entry,
- ops.size() * sizeof(*(ops.front()->get_log_entry()->pmem_entry)));
-}
-
-/*
- * Write and persist the (already allocated) write log entries and
- * data buffer allocations for a set of ops. The data buffer for each
- * of these must already have been persisted to its reserved area.
- */
-template <typename I>
-int AbstractWriteLog<I>::append_op_log_entries(GenericLogOperations &ops)
-{
- CephContext *cct = m_image_ctx.cct;
- GenericLogOperationsVector entries_to_flush;
- TOID(struct WriteLogPoolRoot) pool_root;
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
- int ret = 0;
-
- ceph_assert(ceph_mutex_is_locked_by_me(m_log_append_lock));
-
- if (ops.empty()) {
- return 0;
- }
- entries_to_flush.reserve(OPS_APPENDED_TOGETHER);
-
- /* Write log entries to ring and persist */
- utime_t now = ceph_clock_now();
- for (auto &operation : ops) {
- if (!entries_to_flush.empty()) {
- /* Flush these and reset the list if the current entry wraps to the
- * tail of the ring */
- if (entries_to_flush.back()->get_log_entry()->log_entry_index >
- operation->get_log_entry()->log_entry_index) {
- ldout(m_image_ctx.cct, 20) << "entries to flush wrap around the end of the ring at "
- << "operation=[" << *operation << "]" << dendl;
- flush_op_log_entries(entries_to_flush);
- entries_to_flush.clear();
- now = ceph_clock_now();
- }
- }
- ldout(m_image_ctx.cct, 20) << "Copying entry for operation at index="
- << operation->get_log_entry()->log_entry_index << " "
- << "from " << &operation->get_log_entry()->ram_entry << " "
- << "to " << operation->get_log_entry()->pmem_entry << " "
- << "operation=[" << *operation << "]" << dendl;
- ldout(m_image_ctx.cct, 5) << "APPENDING: index="
- << operation->get_log_entry()->log_entry_index << " "
- << "operation=[" << *operation << "]" << dendl;
- operation->log_append_time = now;
- *operation->get_log_entry()->pmem_entry = operation->get_log_entry()->ram_entry;
- ldout(m_image_ctx.cct, 20) << "APPENDING: index="
- << operation->get_log_entry()->log_entry_index << " "
- << "pmem_entry=[" << *operation->get_log_entry()->pmem_entry
- << "]" << dendl;
- entries_to_flush.push_back(operation);
- }
- flush_op_log_entries(entries_to_flush);
-
- /* Drain once for all */
- pmemobj_drain(m_log_pool);
-
- /*
- * Atomically advance the log head pointer and publish the
- * allocations for all the data buffers they refer to.
- */
- utime_t tx_start = ceph_clock_now();
- TX_BEGIN(m_log_pool) {
- D_RW(pool_root)->first_free_entry = m_first_free_entry;
- for (auto &operation : ops) {
- if (operation->reserved_allocated()) {
- auto write_op = (std::shared_ptr<WriteLogOperation>&) operation;
- pmemobj_tx_publish(&write_op->buffer_alloc->buffer_alloc_action, 1);
- } else {
- ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
- }
- }
- } TX_ONCOMMIT {
- } TX_ONABORT {
- lderr(cct) << "failed to commit " << ops.size()
- << " log entries (" << m_log_pool_name << ")" << dendl;
- ceph_assert(false);
- ret = -EIO;
- } TX_FINALLY {
- } TX_END;
-
- utime_t tx_end = ceph_clock_now();
- m_perfcounter->tinc(l_librbd_rwl_append_tx_t, tx_end - tx_start);
- m_perfcounter->hinc(
- l_librbd_rwl_append_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), ops.size());
- for (auto &operation : ops) {
- operation->log_append_comp_time = tx_end;
- }
-
- return ret;
-}
-
-/*
- * Complete a set of write ops with the result of append_op_log_entries().
- */
-template <typename I>
-void AbstractWriteLog<I>::complete_op_log_entries(GenericLogOperations &&ops,
- const int result)
-{
- GenericLogEntries dirty_entries;
- int published_reserves = 0;
- ldout(m_image_ctx.cct, 20) << __func__ << ": completing" << dendl;
- for (auto &op : ops) {
- utime_t now = ceph_clock_now();
- auto log_entry = op->get_log_entry();
- log_entry->completed = true;
- if (op->is_writing_op()) {
- op->mark_log_entry_completed();
- dirty_entries.push_back(log_entry);
- }
- if (op->reserved_allocated()) {
- published_reserves++;
- }
- op->complete(result);
- m_perfcounter->tinc(l_librbd_rwl_log_op_dis_to_app_t,
- op->log_append_time - op->dispatch_time);
- m_perfcounter->tinc(l_librbd_rwl_log_op_dis_to_cmp_t, now - op->dispatch_time);
- m_perfcounter->hinc(l_librbd_rwl_log_op_dis_to_cmp_t_hist,
- utime_t(now - op->dispatch_time).to_nsec(),
- log_entry->ram_entry.write_bytes);
- utime_t app_lat = op->log_append_comp_time - op->log_append_time;
- m_perfcounter->tinc(l_librbd_rwl_log_op_app_to_appc_t, app_lat);
- m_perfcounter->hinc(l_librbd_rwl_log_op_app_to_appc_t_hist, app_lat.to_nsec(),
- log_entry->ram_entry.write_bytes);
- m_perfcounter->tinc(l_librbd_rwl_log_op_app_to_cmp_t, now - op->log_append_time);
- }
-
- {
- std::lock_guard locker(m_lock);
- m_unpublished_reserves -= published_reserves;
- m_dirty_log_entries.splice(m_dirty_log_entries.end(), dirty_entries);
-
- /* New entries may be flushable */
- wake_up();
- }
-}
-
-/**
- * Dispatch as many deferred writes as possible
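- *
- * Editorial sketch, not in the original source: each pass of the loop
- * first pops the request whose allocation succeeded on the previous pass
- * (handing its dispatch to the work queue), then calls alloc_resources()
- * on the new front of m_deferred_ios. The loop ends when an allocation
- * fails or the list drains; the last successfully allocated request, if
- * any, is dispatched directly at the end.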
- */
-template <typename I>
-void AbstractWriteLog<I>::dispatch_deferred_writes(void)
-{
- C_BlockIORequestT *front_req = nullptr; /* req still on front of deferred list */
- C_BlockIORequestT *allocated_req = nullptr; /* req that was allocated, and is now off the list */
- bool allocated = false; /* front_req allocate succeeded */
- bool cleared_dispatching_flag = false;
-
- /* If we can't become the dispatcher, we'll exit */
- {
- std::lock_guard locker(m_lock);
- if (m_dispatching_deferred_ops ||
- !m_deferred_ios.size()) {
- return;
- }
- m_dispatching_deferred_ops = true;
- }
-
- /* There are ops to dispatch, and this should be the only thread dispatching them */
- {
- std::lock_guard deferred_dispatch(m_deferred_dispatch_lock);
- do {
- {
- std::lock_guard locker(m_lock);
- ceph_assert(m_dispatching_deferred_ops);
- if (allocated) {
- /* If allocated is set, front_req->alloc_resources() succeeded on the
- * previous iteration, so we pop it off the deferred ops list here. */
- ceph_assert(front_req);
- ceph_assert(!allocated_req);
- m_deferred_ios.pop_front();
- allocated_req = front_req;
- front_req = nullptr;
- allocated = false;
- }
- ceph_assert(!allocated);
- if (!allocated && front_req) {
- /* front_req->alloc_resources() failed on the last iteration. We'll stop dispatching. */
- front_req = nullptr;
- ceph_assert(!cleared_dispatching_flag);
- m_dispatching_deferred_ops = false;
- cleared_dispatching_flag = true;
- } else {
- ceph_assert(!front_req);
- if (m_deferred_ios.size()) {
- /* New allocation candidate */
- front_req = m_deferred_ios.front();
- } else {
- ceph_assert(!cleared_dispatching_flag);
- m_dispatching_deferred_ops = false;
- cleared_dispatching_flag = true;
- }
- }
- }
- /* Try allocating for front_req before we decide what to do with allocated_req
- * (if any) */
- if (front_req) {
- ceph_assert(!cleared_dispatching_flag);
- allocated = front_req->alloc_resources();
- }
- if (allocated_req && front_req && allocated) {
- /* Push dispatch of the first allocated req to a wq */
- m_work_queue.queue(new LambdaContext(
- [this, allocated_req](int r) {
- allocated_req->dispatch();
- }), 0);
- allocated_req = nullptr;
- }
- ceph_assert(!(allocated_req && front_req && allocated));
-
- /* Continue while we're still considering the front of the deferred ops list */
- } while (front_req);
- ceph_assert(!allocated);
- }
- ceph_assert(cleared_dispatching_flag);
-
- /* If any deferred requests were allocated, the last one will still be in allocated_req */
- if (allocated_req) {
- allocated_req->dispatch();
- }
-}
-
-/**
- * Returns the lanes used by this write, and attempts to dispatch the next
- * deferred write
- */
-template <typename I>
-void AbstractWriteLog<I>::release_write_lanes(C_BlockIORequestT *req)
-{
- {
- std::lock_guard locker(m_lock);
- m_free_lanes += req->image_extents.size();
- }
- dispatch_deferred_writes();
-}
-
-/**
- * Attempts to allocate log resources for a write. Write is dispatched if
- * resources are available, or queued if they aren't.
- */
-template <typename I>
-void AbstractWriteLog<I>::alloc_and_dispatch_io_req(C_BlockIORequestT *req)
-{
- bool dispatch_here = false;
-
- {
- /* If there are already deferred writes, queue behind them for resources */
- {
- std::lock_guard locker(m_lock);
- dispatch_here = m_deferred_ios.empty();
- }
- if (dispatch_here) {
- dispatch_here = req->alloc_resources();
- }
- if (dispatch_here) {
- ldout(m_image_ctx.cct, 20) << "dispatching" << dendl;
- req->dispatch();
- } else {
- req->deferred();
- {
- std::lock_guard locker(m_lock);
- m_deferred_ios.push_back(req);
- }
- ldout(m_image_ctx.cct, 20) << "deferred IOs: " << m_deferred_ios.size() << dendl;
- dispatch_deferred_writes();
- }
- }
-}
-
-template <typename I>
-bool AbstractWriteLog<I>::alloc_resources(C_BlockIORequestT *req) {
- bool alloc_succeeds = true;
- bool no_space = false;
- uint64_t bytes_allocated = 0;
- uint64_t bytes_cached = 0;
- uint64_t bytes_dirtied = 0;
- uint64_t num_lanes = 0;
- uint64_t num_unpublished_reserves = 0;
- uint64_t num_log_entries = 0;
-
- // Set up buffers and count the resources this request will require
- req->setup_buffer_resources(bytes_cached, bytes_dirtied, bytes_allocated,
- num_lanes, num_log_entries, num_unpublished_reserves);
-
- {
- std::lock_guard locker(m_lock);
- if (m_free_lanes < num_lanes) {
- req->set_io_waited_for_lanes(true);
- ldout(m_image_ctx.cct, 20) << "not enough free lanes (need "
- << num_lanes
- << ", have " << m_free_lanes << ") "
- << *req << dendl;
- alloc_succeeds = false;
- /* This isn't considered a "no space" alloc fail. Lanes are a throttling mechanism. */
- }
- if (m_free_log_entries < num_log_entries) {
- req->set_io_waited_for_entries(true);
- ldout(m_image_ctx.cct, 20) << "not enough free entries (need "
- << num_log_entries
- << ", have " << m_free_log_entries << ") "
- << *req << dendl;
- alloc_succeeds = false;
- no_space = true; /* Entries must be retired */
- }
- /* Don't attempt buffer allocate if we've exceeded the "full" threshold */
- if (m_bytes_allocated + bytes_allocated > m_bytes_allocated_cap) {
- if (!req->has_io_waited_for_buffers()) {
- req->set_io_waited_for_buffers(true);
- ldout(m_image_ctx.cct, 1) << "Waiting for allocation cap (cap="
- << m_bytes_allocated_cap
- << ", allocated=" << m_bytes_allocated
- << ") in write [" << *req << "]" << dendl;
- }
- alloc_succeeds = false;
- no_space = true; /* Entries must be retired */
- }
- }
-
- std::vector<WriteBufferAllocation>& buffers = req->get_resources_buffers();
- if (alloc_succeeds) {
- for (auto &buffer : buffers) {
- utime_t before_reserve = ceph_clock_now();
- buffer.buffer_oid = pmemobj_reserve(m_log_pool,
- &buffer.buffer_alloc_action,
- buffer.allocation_size,
- 0 /* Object type */);
- buffer.allocation_lat = ceph_clock_now() - before_reserve;
- if (TOID_IS_NULL(buffer.buffer_oid)) {
- if (!req->has_io_waited_for_buffers()) {
- req->set_io_waited_for_buffers(true);
- }
- ldout(m_image_ctx.cct, 5) << "can't allocate all data buffers: "
- << pmemobj_errormsg() << ". "
- << *req << dendl;
- alloc_succeeds = false;
- no_space = true; /* Entries need to be retired */
- break;
- } else {
- buffer.allocated = true;
- }
- ldout(m_image_ctx.cct, 20) << "Allocated " << buffer.buffer_oid.oid.pool_uuid_lo
- << "." << buffer.buffer_oid.oid.off
- << ", size=" << buffer.allocation_size << dendl;
- }
- }
-
- if (alloc_succeeds) {
- std::lock_guard locker(m_lock);
- /* We need one free log entry per extent (each is a separate entry), and
- * one free "lane" for remote replication. */
- if ((m_free_lanes >= num_lanes) &&
- (m_free_log_entries >= num_log_entries)) {
- m_free_lanes -= num_lanes;
- m_free_log_entries -= num_log_entries;
- m_unpublished_reserves += num_unpublished_reserves;
- m_bytes_allocated += bytes_allocated;
- m_bytes_cached += bytes_cached;
- m_bytes_dirty += bytes_dirtied;
- } else {
- alloc_succeeds = false;
- }
- }
-
- if (!alloc_succeeds) {
- /* On alloc failure, free any buffers we did allocate */
- for (auto &buffer : buffers) {
- if (buffer.allocated) {
- pmemobj_cancel(m_log_pool, &buffer.buffer_alloc_action, 1);
- }
- }
- if (no_space) {
- /* Expedite flushing and/or retiring */
- std::lock_guard locker(m_lock);
- m_alloc_failed_since_retire = true;
- m_last_alloc_fail = ceph_clock_now();
- }
- }
-
- req->set_allocated(alloc_succeeds);
-
- return alloc_succeeds;
-}
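-
-/* Editorial note, not in the original source: alloc_resources() above is a
- * three-phase reservation: (1) check the lane/entry/byte limits under
- * m_lock, (2) pmemobj_reserve() the data buffers outside m_lock, (3)
- * re-check and commit the usage counters under m_lock, cancelling the pmem
- * reservations if anything failed. */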
-
-template <typename I>
-C_FlushRequest<AbstractWriteLog<I>>* AbstractWriteLog<I>::make_flush_req(Context *on_finish) {
- utime_t flush_begins = ceph_clock_now();
- bufferlist bl;
- auto *flush_req =
- new C_FlushRequestT(*this, flush_begins, Extents({whole_volume_extent()}),
- std::move(bl), 0, m_lock, m_perfcounter, on_finish);
-
- return flush_req;
-}
-
-template <typename I>
-void AbstractWriteLog<I>::wake_up() {
- CephContext *cct = m_image_ctx.cct;
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
-
- if (!m_wake_up_enabled) {
- // wake_up is disabled during shutdown after flushing completes
- ldout(m_image_ctx.cct, 6) << "deferred processing disabled" << dendl;
- return;
- }
-
- if (m_wake_up_requested && m_wake_up_scheduled) {
- return;
- }
-
- ldout(cct, 20) << dendl;
-
- /* Wake-up can be requested while it's already scheduled */
- m_wake_up_requested = true;
-
- /* Wake-up cannot be scheduled if it's already scheduled */
- if (m_wake_up_scheduled) {
- return;
- }
- m_wake_up_scheduled = true;
- m_async_process_work++;
- m_async_op_tracker.start_op();
- m_work_queue.queue(new LambdaContext(
- [this](int r) {
- process_work();
- m_async_op_tracker.finish_op();
- m_async_process_work--;
- }), 0);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::process_work() {
- CephContext *cct = m_image_ctx.cct;
- int max_iterations = 4;
- bool wake_up_requested = false;
- uint64_t aggressive_high_water_bytes = m_bytes_allocated_cap * AGGRESSIVE_RETIRE_HIGH_WATER;
- uint64_t high_water_bytes = m_bytes_allocated_cap * RETIRE_HIGH_WATER;
- uint64_t low_water_bytes = m_bytes_allocated_cap * RETIRE_LOW_WATER;
- uint64_t aggressive_high_water_entries = m_total_log_entries * AGGRESSIVE_RETIRE_HIGH_WATER;
- uint64_t high_water_entries = m_total_log_entries * RETIRE_HIGH_WATER;
- uint64_t low_water_entries = m_total_log_entries * RETIRE_LOW_WATER;
-
- ldout(cct, 20) << dendl;
-
- do {
- {
- std::lock_guard locker(m_lock);
- m_wake_up_requested = false;
- }
- if (m_alloc_failed_since_retire || m_invalidating ||
- m_bytes_allocated > high_water_bytes ||
- (m_log_entries.size() > high_water_entries)) {
- int retired = 0;
- utime_t started = ceph_clock_now();
- ldout(m_image_ctx.cct, 10) << "alloc_fail=" << m_alloc_failed_since_retire
- << ", allocated > high_water="
- << (m_bytes_allocated > high_water_bytes)
- << ", allocated_entries > high_water="
- << (m_log_entries.size() > high_water_entries)
- << dendl;
- while (m_alloc_failed_since_retire || m_invalidating ||
- (m_bytes_allocated > high_water_bytes) ||
- (m_log_entries.size() > high_water_entries) ||
- (((m_bytes_allocated > low_water_bytes) || (m_log_entries.size() > low_water_entries)) &&
- (utime_t(ceph_clock_now() - started).to_msec() < RETIRE_BATCH_TIME_LIMIT_MS))) {
- if (!retire_entries((m_shutting_down || m_invalidating ||
- (m_bytes_allocated > aggressive_high_water_bytes) ||
- (m_log_entries.size() > aggressive_high_water_entries))
- ? MAX_ALLOC_PER_TRANSACTION
- : MAX_FREE_PER_TRANSACTION)) {
- break;
- }
- retired++;
- dispatch_deferred_writes();
- process_writeback_dirty_entries();
- }
- ldout(m_image_ctx.cct, 10) << "Retired " << retired << " times" << dendl;
- }
- dispatch_deferred_writes();
- process_writeback_dirty_entries();
-
- {
- std::lock_guard locker(m_lock);
- wake_up_requested = m_wake_up_requested;
- }
- } while (wake_up_requested && --max_iterations > 0);
-
- {
- std::lock_guard locker(m_lock);
- m_wake_up_scheduled = false;
- /* Reschedule if it's still requested */
- if (m_wake_up_requested) {
- wake_up();
- }
- }
-}
-
-template <typename I>
-bool AbstractWriteLog<I>::can_flush_entry(std::shared_ptr<GenericLogEntry> log_entry) {
- CephContext *cct = m_image_ctx.cct;
-
- ldout(cct, 20) << "" << dendl;
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
-
- if (m_invalidating) {
- return true;
- }
-
- /* For OWB we can flush entries with the same sync gen number (writes between
- * aio_flush() calls) concurrently. Here we'll consider an entry flushable if
- * its sync gen number is <= the lowest sync gen number carried by all the
- * entries currently flushing.
- *
- * If the entry considered here bears a sync gen number lower than a
- * previously flushed entry, the application had to have submitted the write
- * bearing the higher gen number before the write with the lower gen number
- * completed. So, flushing these concurrently is OK.
- *
- * If the entry considered here bears a sync gen number higher than a
- * currently flushing entry, the write with the lower gen number may have
- * completed to the application before the write with the higher sync gen
- * number was submitted, and the application may rely on that completion
- * order for volume consistency. In this case the entry will not be
- * considered flushable until all the entries bearing lower sync gen numbers
- * finish flushing.
- */
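-
- /* Editorial example, not in the original source: if a gen-4 entry is
- * flushing (m_lowest_flushing_sync_gen == 4), further gen-4 or lower
- * entries remain flushable, while a gen-5 entry must wait until all
- * lower-gen flushes complete. */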
-
- if (m_flush_ops_in_flight &&
- (log_entry->ram_entry.sync_gen_number > m_lowest_flushing_sync_gen)) {
- return false;
- }
-
- return (log_entry->can_writeback() &&
- (m_flush_ops_in_flight <= IN_FLIGHT_FLUSH_WRITE_LIMIT) &&
- (m_flush_bytes_in_flight <= IN_FLIGHT_FLUSH_BYTES_LIMIT));
-}
-
-template <typename I>
-Context* AbstractWriteLog<I>::construct_flush_entry_ctx(std::shared_ptr<GenericLogEntry> log_entry) {
- CephContext *cct = m_image_ctx.cct;
- bool invalidating = m_invalidating; // snapshot so we behave consistently
-
- ldout(cct, 20) << "" << dendl;
- ceph_assert(m_entry_reader_lock.is_locked());
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
- if (!m_flush_ops_in_flight ||
- (log_entry->ram_entry.sync_gen_number < m_lowest_flushing_sync_gen)) {
- m_lowest_flushing_sync_gen = log_entry->ram_entry.sync_gen_number;
- }
- m_flush_ops_in_flight += 1;
- /* For write same this is the bytes affected by the flush op, not the bytes transferred */
- m_flush_bytes_in_flight += log_entry->ram_entry.write_bytes;
-
- /* Flush write completion action */
- Context *ctx = new LambdaContext(
- [this, log_entry, invalidating](int r) {
- {
- std::lock_guard locker(m_lock);
- if (r < 0) {
- lderr(m_image_ctx.cct) << "failed to flush log entry"
- << cpp_strerror(r) << dendl;
- m_dirty_log_entries.push_front(log_entry);
- } else {
- ceph_assert(m_bytes_dirty >= log_entry->bytes_dirty());
- log_entry->set_flushed(true);
- m_bytes_dirty -= log_entry->bytes_dirty();
- sync_point_writer_flushed(log_entry->get_sync_point_entry());
- ldout(m_image_ctx.cct, 20) << "flushed: " << log_entry
- << " invalidating=" << invalidating
- << dendl;
- }
- m_flush_ops_in_flight -= 1;
- m_flush_bytes_in_flight -= log_entry->ram_entry.write_bytes;
- wake_up();
- }
- });
- /* Flush through lower cache before completing */
- ctx = new LambdaContext(
- [this, ctx](int r) {
- if (r < 0) {
- lderr(m_image_ctx.cct) << "failed to flush log entry"
- << cpp_strerror(r) << dendl;
- ctx->complete(r);
- } else {
- m_image_writeback.aio_flush(io::FLUSH_SOURCE_WRITEBACK, ctx);
- }
- });
-
- if (invalidating) {
- return ctx;
- }
- return new LambdaContext(
- [this, log_entry, ctx](int r) {
- m_image_ctx.op_work_queue->queue(new LambdaContext(
- [this, log_entry, ctx](int r) {
- ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
- << " " << *log_entry << dendl;
- log_entry->writeback(m_image_writeback, ctx);
- }), 0);
- });
-}
-
-template <typename I>
-void AbstractWriteLog<I>::process_writeback_dirty_entries() {
- CephContext *cct = m_image_ctx.cct;
- bool all_clean = false;
- int flushed = 0;
-
- ldout(cct, 20) << "Look for dirty entries" << dendl;
- {
- DeferredContexts post_unlock;
- std::shared_lock entry_reader_locker(m_entry_reader_lock);
- while (flushed < IN_FLIGHT_FLUSH_WRITE_LIMIT) {
- std::lock_guard locker(m_lock);
- if (m_shutting_down) {
- ldout(cct, 5) << "Flush during shutdown supressed" << dendl;
- /* Do flush complete only when all flush ops are finished */
- all_clean = !m_flush_ops_in_flight;
- break;
- }
- if (m_dirty_log_entries.empty()) {
- ldout(cct, 20) << "Nothing new to flush" << dendl;
- /* Do flush complete only when all flush ops are finished */
- all_clean = !m_flush_ops_in_flight;
- break;
- }
- auto candidate = m_dirty_log_entries.front();
- bool flushable = can_flush_entry(candidate);
- if (flushable) {
- post_unlock.add(construct_flush_entry_ctx(candidate));
- flushed++;
- m_dirty_log_entries.pop_front();
- } else {
- ldout(cct, 20) << "Next dirty entry isn't flushable yet" << dendl;
- break;
- }
- }
- }
-
- if (all_clean) {
- /* All flushing complete, drain outside lock */
- Contexts flush_contexts;
- {
- std::lock_guard locker(m_lock);
- flush_contexts.swap(m_flush_complete_contexts);
- }
- finish_contexts(m_image_ctx.cct, flush_contexts, 0);
- }
-}
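-
-/* Editorial note, not in the original source: DeferredContexts above
- * collects the flush contexts while the locks are held and completes them
- * from its destructor, after both m_lock and m_entry_reader_lock have been
- * released. */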
-
-/**
- * Update/persist the last flushed sync point in the log
- */
-template <typename I>
-void AbstractWriteLog<I>::persist_last_flushed_sync_gen()
-{
- TOID(struct WriteLogPoolRoot) pool_root;
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
- uint64_t flushed_sync_gen;
-
- std::lock_guard append_locker(m_log_append_lock);
- {
- std::lock_guard locker(m_lock);
- flushed_sync_gen = m_flushed_sync_gen;
- }
-
- if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
- ldout(m_image_ctx.cct, 15) << "flushed_sync_gen in log updated from "
- << D_RO(pool_root)->flushed_sync_gen << " to "
- << flushed_sync_gen << dendl;
- TX_BEGIN(m_log_pool) {
- D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
- } TX_ONCOMMIT {
- } TX_ONABORT {
- lderr(m_image_ctx.cct) << "failed to commit update of flushed sync point" << dendl;
- ceph_assert(false);
- } TX_FINALLY {
- } TX_END;
- }
-}
-
-/* Returns true if the specified SyncPointLogEntry is considered flushed, and
- * the log will be updated to reflect this. */
-template <typename I>
-bool AbstractWriteLog<I>::handle_flushed_sync_point(std::shared_ptr<SyncPointLogEntry> log_entry)
-{
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
- ceph_assert(log_entry);
-
- if ((log_entry->writes_flushed == log_entry->writes) &&
- log_entry->completed && log_entry->prior_sync_point_flushed &&
- log_entry->next_sync_point_entry) {
- ldout(m_image_ctx.cct, 20) << "All writes flushed up to sync point="
- << *log_entry << dendl;
- log_entry->next_sync_point_entry->prior_sync_point_flushed = true;
- /* Don't move the flushed sync gen num backwards. */
- if (m_flushed_sync_gen < log_entry->ram_entry.sync_gen_number) {
- m_flushed_sync_gen = log_entry->ram_entry.sync_gen_number;
- }
- m_async_op_tracker.start_op();
- m_work_queue.queue(new LambdaContext(
- [this, log_entry](int r) {
- bool handled_by_next;
- {
- std::lock_guard locker(m_lock);
- handled_by_next = handle_flushed_sync_point(log_entry->next_sync_point_entry);
- }
- if (!handled_by_next) {
- persist_last_flushed_sync_gen();
- }
- m_async_op_tracker.finish_op();
- }));
- return true;
- }
- return false;
-}
-
-template <typename I>
-void AbstractWriteLog<I>::sync_point_writer_flushed(std::shared_ptr<SyncPointLogEntry> log_entry)
-{
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
- ceph_assert(log_entry);
- log_entry->writes_flushed++;
-
- /* If this entry might be completely flushed, look closer */
- if ((log_entry->writes_flushed == log_entry->writes) && log_entry->completed) {
- ldout(m_image_ctx.cct, 15) << "All writes flushed for sync point="
- << *log_entry << dendl;
- handle_flushed_sync_point(log_entry);
- }
-}
-
-/* Make a new sync point and flush the previous during initialization, when there may or may
- * not be a previous sync point */
-template <typename I>
-void AbstractWriteLog<I>::init_flush_new_sync_point(DeferredContexts &later) {
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
- ceph_assert(!m_initialized); /* Don't use this after init */
-
- if (!m_current_sync_point) {
- /* First sync point since start */
- new_sync_point(later);
- } else {
- flush_new_sync_point(nullptr, later);
- }
-}
-
-/**
- * Begin a new sync point
- */
-template <typename I>
-void AbstractWriteLog<I>::new_sync_point(DeferredContexts &later) {
- CephContext *cct = m_image_ctx.cct;
- std::shared_ptr<SyncPoint> old_sync_point = m_current_sync_point;
- std::shared_ptr<SyncPoint> new_sync_point;
- ldout(cct, 20) << dendl;
-
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
-
- /* The first time this is called, if this is a newly created log,
- * this makes the first sync gen number we'll use 1. On the first
- * call for a re-opened log m_current_sync_gen will be the highest
- * gen number from all the sync point entries found in the re-opened
- * log, and this advances to the next sync gen number. */
- ++m_current_sync_gen;
-
- new_sync_point = std::make_shared<SyncPoint>(m_current_sync_gen, cct);
- m_current_sync_point = new_sync_point;
-
- /* If this log has been re-opened, old_sync_point will initially be
- * nullptr, but m_current_sync_gen may not be zero. */
- if (old_sync_point) {
- new_sync_point->setup_earlier_sync_point(old_sync_point, m_last_op_sequence_num);
- m_perfcounter->hinc(l_librbd_rwl_syncpoint_hist,
- old_sync_point->log_entry->writes,
- old_sync_point->log_entry->bytes);
- /* This sync point will acquire no more sub-ops. Activation needs
- * to acquire m_lock, so defer to later. */
- later.add(new LambdaContext(
- [this, old_sync_point](int r) {
- old_sync_point->prior_persisted_gather_activate();
- }));
- }
-
- new_sync_point->prior_persisted_gather_set_finisher();
-
- if (old_sync_point) {
- ldout(cct,6) << "new sync point = [" << *m_current_sync_point
- << "], prior = [" << *old_sync_point << "]" << dendl;
- } else {
- ldout(cct,6) << "first sync point = [" << *m_current_sync_point
- << "]" << dendl;
- }
-}
-
-template <typename I>
-void AbstractWriteLog<I>::flush_new_sync_point(C_FlushRequestT *flush_req,
- DeferredContexts &later) {
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
-
- if (!flush_req) {
- m_async_null_flush_finish++;
- m_async_op_tracker.start_op();
- Context *flush_ctx = new LambdaContext([this](int r) {
- m_async_null_flush_finish--;
- m_async_op_tracker.finish_op();
- });
- flush_req = make_flush_req(flush_ctx);
- flush_req->internal = true;
- }
-
- /* Add a new sync point. */
- new_sync_point(later);
- std::shared_ptr<SyncPoint> to_append = m_current_sync_point->earlier_sync_point;
- ceph_assert(to_append);
-
- /* This flush request will append/persist the (now) previous sync point */
- flush_req->to_append = to_append;
-
- /* When the m_sync_point_persist Gather completes this sync point can be
- * appended. The only sub for this Gather is the finisher Context for
- * m_prior_log_entries_persisted, which records the result of the Gather in
- * the sync point, and completes. TODO: Do we still need both of these
- * Gathers? */
- Context * ctx = new LambdaContext([this, flush_req](int r) {
- ldout(m_image_ctx.cct, 20) << "Flush req=" << flush_req
- << " sync point =" << flush_req->to_append
- << ". Ready to persist." << dendl;
- alloc_and_dispatch_io_req(flush_req);
- });
- to_append->persist_gather_set_finisher(ctx);
-
- /* The m_sync_point_persist Gather has all the subs it will ever have, and
- * now has its finisher. If the sub is already complete, activation will
- * complete the Gather. The finisher will acquire m_lock, so we'll activate
- * this when we release m_lock. */
- later.add(new LambdaContext([this, to_append](int r) {
- to_append->persist_gather_activate();
- }));
-
- /* The flush request completes when the sync point persists */
- to_append->add_in_on_persisted_ctxs(flush_req);
-}
-
-template <typename I>
-void AbstractWriteLog<I>::flush_new_sync_point_if_needed(C_FlushRequestT *flush_req,
- DeferredContexts &later) {
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
-
- /* If there have been writes since the last sync point ... */
- if (m_current_sync_point->log_entry->writes) {
- flush_new_sync_point(flush_req, later);
- } else {
- /* There have been no writes to the current sync point. */
- if (m_current_sync_point->earlier_sync_point) {
- /* If previous sync point hasn't completed, complete this flush
- * with the earlier sync point. No alloc or dispatch needed. */
- m_current_sync_point->earlier_sync_point->on_sync_point_persisted.push_back(flush_req);
- } else {
- /* The previous sync point has already completed and been
- * appended. The current sync point has no writes, so this flush
- * has nothing to wait for. This flush completes now. */
- later.add(flush_req);
- }
- }
-}
-
-/*
- * RWL internal flush - will actually flush the RWL.
- *
- * User flushes should arrive at aio_flush(), and only flush prior
- * writes to all log replicas.
- *
- * Librbd internal flushes will arrive at flush(invalidate=false,
- * discard=false), and traverse the block guard to ensure in-flight writes are
- * flushed.
- */
-template <typename I>
-void AbstractWriteLog<I>::flush_dirty_entries(Context *on_finish) {
- CephContext *cct = m_image_ctx.cct;
- bool all_clean;
- bool flushing;
- bool stop_flushing;
-
- {
- std::lock_guard locker(m_lock);
- flushing = (0 != m_flush_ops_in_flight);
- all_clean = m_dirty_log_entries.empty();
- stop_flushing = (m_shutting_down);
- }
-
- if (!flushing && (all_clean || stop_flushing)) {
- /* Complete without holding m_lock */
- if (all_clean) {
- ldout(cct, 20) << "no dirty entries" << dendl;
- } else {
- ldout(cct, 5) << "flush during shutdown suppressed" << dendl;
- }
- on_finish->complete(0);
- } else {
- if (all_clean) {
- ldout(cct, 5) << "flush ops still in progress" << dendl;
- } else {
- ldout(cct, 20) << "dirty entries remain" << dendl;
- }
- std::lock_guard locker(m_lock);
- /* on_finish can't be completed yet */
- m_flush_complete_contexts.push_back(new LambdaContext(
- [this, on_finish](int r) {
- flush_dirty_entries(on_finish);
- }));
- wake_up();
- }
-}
-
-template <typename I>
-void AbstractWriteLog<I>::internal_flush(bool invalidate, Context *on_finish) {
- ldout(m_image_ctx.cct, 20) << "invalidate=" << invalidate << dendl;
-
- if (m_perfcounter) {
- if (invalidate) {
- m_perfcounter->inc(l_librbd_rwl_invalidate_cache, 1);
- } else {
- m_perfcounter->inc(l_librbd_rwl_flush, 1);
- }
- }
-
- /* May be called even if initialization fails */
- if (!m_initialized) {
- ldout(m_image_ctx.cct, 05) << "never initialized" << dendl;
- /* Deadlock if completed here */
- m_image_ctx.op_work_queue->queue(on_finish, 0);
- return;
- }
-
- /* Flush/invalidate must pass through block guard to ensure all layers of
- * cache are consistently flushed/invalidated. This ensures no in-flight write leaves
- * some layers with valid regions, which may later produce inconsistent read
- * results. */
- GuardedRequestFunctionContext *guarded_ctx =
- new GuardedRequestFunctionContext(
- [this, on_finish, invalidate](GuardedRequestFunctionContext &guard_ctx) {
- DeferredContexts on_exit;
- ldout(m_image_ctx.cct, 20) << "cell=" << guard_ctx.cell << dendl;
- ceph_assert(guard_ctx.cell);
-
- Context *ctx = new LambdaContext(
- [this, cell=guard_ctx.cell, invalidate, on_finish](int r) {
- std::lock_guard locker(m_lock);
- m_invalidating = false;
- ldout(m_image_ctx.cct, 6) << "Done flush/invalidating (invalidate="
- << invalidate << ")" << dendl;
- if (m_log_entries.size()) {
- ldout(m_image_ctx.cct, 1) << "m_log_entries.size()="
- << m_log_entries.size() << ", "
- << "front()=" << *m_log_entries.front()
- << dendl;
- }
- if (invalidate) {
- ceph_assert(m_log_entries.size() == 0);
- }
- ceph_assert(m_dirty_log_entries.size() == 0);
- m_image_ctx.op_work_queue->queue(on_finish, r);
- release_guarded_request(cell);
- });
- ctx = new LambdaContext(
- [this, ctx, invalidate](int r) {
- Context *next_ctx = ctx;
- if (r < 0) {
- /* Override on_finish status with this error */
- next_ctx = new LambdaContext([r, ctx](int _r) {
- ctx->complete(r);
- });
- }
- if (invalidate) {
- {
- std::lock_guard locker(m_lock);
- ceph_assert(m_dirty_log_entries.size() == 0);
- ceph_assert(!m_invalidating);
- ldout(m_image_ctx.cct, 6) << "Invalidating" << dendl;
- m_invalidating = true;
- }
- /* Discards all RWL entries */
- while (retire_entries(MAX_ALLOC_PER_TRANSACTION)) { }
- next_ctx->complete(0);
- } else {
- {
- std::lock_guard locker(m_lock);
- ceph_assert(m_dirty_log_entries.size() == 0);
- ceph_assert(!m_invalidating);
- }
- m_image_writeback.aio_flush(io::FLUSH_SOURCE_WRITEBACK, next_ctx);
- }
- });
- ctx = new LambdaContext(
- [this, ctx](int r) {
- flush_dirty_entries(ctx);
- });
- std::lock_guard locker(m_lock);
- /* Even if we're throwing everything away, we want the last entry to
- * be a sync point so we can cleanly resume.
- *
- * Also, the blockguard only guarantees the replication of this op
- * can't overlap with prior ops. It doesn't guarantee those are all
- * completed and eligible for flush & retire, which we require here.
- */
- auto flush_req = make_flush_req(ctx);
- flush_new_sync_point_if_needed(flush_req, on_exit);
- });
- detain_guarded_request(nullptr, guarded_ctx, true);
-}
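-
-/* Editorial note, not in the original source: the contexts in
- * internal_flush() are chained in reverse, so the execution order is
- * flush_dirty_entries(), then the invalidate-or-writeback-flush step, then
- * the final completion that clears m_invalidating and releases the guard
- * cell. */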
-
-template <typename I>
-void AbstractWriteLog<I>::add_into_log_map(GenericWriteLogEntries &log_entries) {
- m_blocks_to_log_entries.add_log_entries(log_entries);
-}
-
-template <typename I>
-bool AbstractWriteLog<I>::can_retire_entry(std::shared_ptr<GenericLogEntry> log_entry) {
- CephContext *cct = m_image_ctx.cct;
-
- ldout(cct, 20) << dendl;
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
- return log_entry->can_retire();
-}
-
-/**
- * Retire up to frees_per_tx of the oldest log entries
- * that are eligible to be retired. Returns true if anything was
- * retired.
- */
-template <typename I>
-bool AbstractWriteLog<I>::retire_entries(const unsigned long int frees_per_tx) {
- CephContext *cct = m_image_ctx.cct;
- GenericLogEntriesVector retiring_entries;
- uint32_t initial_first_valid_entry;
- uint32_t first_valid_entry;
-
- std::lock_guard retire_locker(m_log_retire_lock);
- ldout(cct, 20) << "Look for entries to retire" << dendl;
- {
- /* Entry readers can't be added while we hold m_entry_reader_lock */
- RWLock::WLocker entry_reader_locker(m_entry_reader_lock);
- std::lock_guard locker(m_lock);
- initial_first_valid_entry = m_first_valid_entry;
- first_valid_entry = m_first_valid_entry;
- auto entry = m_log_entries.front();
- while (!m_log_entries.empty() &&
- retiring_entries.size() < frees_per_tx &&
- can_retire_entry(entry)) {
- if (entry->log_entry_index != first_valid_entry) {
- lderr(cct) << "Retiring entry index (" << entry->log_entry_index
- << ") and first valid log entry index (" << first_valid_entry
- << ") must be ==." << dendl;
- }
- ceph_assert(entry->log_entry_index == first_valid_entry);
- first_valid_entry = (first_valid_entry + 1) % m_total_log_entries;
- m_log_entries.pop_front();
- retiring_entries.push_back(entry);
- /* Remove entry from map so there will be no more readers */
- if ((entry->write_bytes() > 0) || (entry->bytes_dirty() > 0)) {
- auto gen_write_entry = static_pointer_cast<GenericWriteLogEntry>(entry);
- if (gen_write_entry) {
- m_blocks_to_log_entries.remove_log_entry(gen_write_entry);
- }
- }
- entry = m_log_entries.front();
- }
- }
-
- if (retiring_entries.size()) {
- ldout(cct, 20) << "Retiring " << retiring_entries.size() << " entries" << dendl;
- TOID(struct WriteLogPoolRoot) pool_root;
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
-
- utime_t tx_start;
- utime_t tx_end;
- /* Advance first valid entry and release buffers */
- {
- uint64_t flushed_sync_gen;
- std::lock_guard append_locker(m_log_append_lock);
- {
- std::lock_guard locker(m_lock);
- flushed_sync_gen = m_flushed_sync_gen;
- }
-
- tx_start = ceph_clock_now();
- TX_BEGIN(m_log_pool) {
- if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
- ldout(m_image_ctx.cct, 20) << "flushed_sync_gen in log updated from "
- << D_RO(pool_root)->flushed_sync_gen << " to "
- << flushed_sync_gen << dendl;
- D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
- }
- D_RW(pool_root)->first_valid_entry = first_valid_entry;
- for (auto &entry: retiring_entries) {
- if (entry->write_bytes()) {
- ldout(cct, 20) << "Freeing " << entry->ram_entry.write_data.oid.pool_uuid_lo
- << "." << entry->ram_entry.write_data.oid.off << dendl;
- TX_FREE(entry->ram_entry.write_data);
- } else {
- ldout(cct, 20) << "Retiring non-write: " << *entry << dendl;
- }
- }
- } TX_ONCOMMIT {
- } TX_ONABORT {
- lderr(cct) << "failed to commit free of" << retiring_entries.size() << " log entries (" << m_log_pool_name << ")" << dendl;
- ceph_assert(false);
- } TX_FINALLY {
- } TX_END;
- tx_end = ceph_clock_now();
- }
- m_perfcounter->tinc(l_librbd_rwl_retire_tx_t, tx_end - tx_start);
- m_perfcounter->hinc(l_librbd_rwl_retire_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), retiring_entries.size());
-
- /* Update runtime copies of first_valid_entry and the free entry/byte counts */
- {
- std::lock_guard locker(m_lock);
-
- ceph_assert(m_first_valid_entry == initial_first_valid_entry);
- m_first_valid_entry = first_valid_entry;
- m_free_log_entries += retiring_entries.size();
- for (auto &entry: retiring_entries) {
- if (entry->write_bytes()) {
- ceph_assert(m_bytes_cached >= entry->write_bytes());
- m_bytes_cached -= entry->write_bytes();
- uint64_t entry_allocation_size = entry->write_bytes();
- if (entry_allocation_size < MIN_WRITE_ALLOC_SIZE) {
- entry_allocation_size = MIN_WRITE_ALLOC_SIZE;
- }
- ceph_assert(m_bytes_allocated >= entry_allocation_size);
- m_bytes_allocated -= entry_allocation_size;
- }
- }
- m_alloc_failed_since_retire = false;
- wake_up();
- }
- } else {
- ldout(cct, 20) << "Nothing to retire" << dendl;
- return false;
- }
- return true;
-}
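-
-/* Editorial note, not in the original source: retirement above is
- * two-phase: a pmem transaction persists the advanced first_valid_entry
- * and frees the retired data buffers, and only afterwards are the
- * in-memory counters (m_first_valid_entry, m_free_log_entries, m_bytes_*)
- * updated under m_lock. */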
-
-} // namespace cache
-} // namespace librbd
-
-template class librbd::cache::AbstractWriteLog<librbd::ImageCtx>;
-template void librbd::cache::AbstractWriteLog<librbd::ImageCtx>::
- flush_pmem_buffer(std::vector<std::shared_ptr<
- librbd::cache::rwl::GenericLogOperation>>&);
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG
-#define CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG
-
-#include "common/RWLock.h"
-#include "common/WorkQueue.h"
-#include "common/AsyncOpTracker.h"
-#include "librbd/cache/ImageCache.h"
-#include "librbd/cache/ImageWriteback.h"
-#include "librbd/Utils.h"
-#include "librbd/BlockGuard.h"
-#include "librbd/cache/Types.h"
-#include "librbd/cache/rwl/LogOperation.h"
-#include "librbd/cache/rwl/Request.h"
-#include "librbd/cache/rwl/LogMap.h"
-#include <functional>
-#include <list>
-
-class Context;
-class SafeTimer;
-
-namespace librbd {
-
-struct ImageCtx;
-
-namespace cache {
-
-namespace rwl {
-
-class SyncPointLogEntry;
-class GenericWriteLogEntry;
-class WriteLogEntry;
-class GenericLogEntry;
-
-typedef std::list<std::shared_ptr<WriteLogEntry>> WriteLogEntries;
-typedef std::list<std::shared_ptr<GenericLogEntry>> GenericLogEntries;
-typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
-typedef std::vector<std::shared_ptr<GenericLogEntry>> GenericLogEntriesVector;
-
-typedef LogMapEntries<GenericWriteLogEntry> WriteLogMapEntries;
-typedef LogMap<GenericWriteLogEntry> WriteLogMap;
-
-/**** Write log entries end ****/
-
-typedef librbd::BlockGuard<GuardedRequest> WriteLogGuard;
-
-class DeferredContexts;
-template <typename> class ImageCacheState;
-
-template <typename T>
-struct C_BlockIORequest;
-
-template <typename T>
-struct C_WriteRequest;
-
-using GenericLogOperations = std::list<GenericLogOperationSharedPtr>;
-
-} // namespace rwl
-
-
-template <typename ImageCtxT>
-class AbstractWriteLog {
-public:
- typedef io::Extent Extent;
- typedef io::Extents Extents;
-
- AbstractWriteLog(ImageCtxT &image_ctx, librbd::cache::rwl::ImageCacheState<ImageCtxT>* cache_state);
- ~AbstractWriteLog();
- AbstractWriteLog(const AbstractWriteLog&) = delete;
- AbstractWriteLog &operator=(const AbstractWriteLog&) = delete;
-
- /// IO methods
- void read(Extents&& image_extents, ceph::bufferlist *bl,
- int fadvise_flags, Context *on_finish);
- void write(Extents&& image_extents, ceph::bufferlist&& bl,
- int fadvise_flags,
- Context *on_finish);
- void discard(uint64_t offset, uint64_t length,
- uint32_t discard_granularity_bytes,
- Context *on_finish);
- void flush(io::FlushSource flush_source, Context *on_finish);
- void writesame(uint64_t offset, uint64_t length,
- ceph::bufferlist&& bl,
- int fadvise_flags, Context *on_finish);
- void compare_and_write(Extents&& image_extents,
- ceph::bufferlist&& cmp_bl, ceph::bufferlist&& bl,
- uint64_t *mismatch_offset,int fadvise_flags,
- Context *on_finish);
-
- /// internal state methods
- void init(Context *on_finish);
- void shut_down(Context *on_finish);
- void invalidate(Context *on_finish);
- void flush(Context *on_finish);
-
- using This = AbstractWriteLog<ImageCtxT>;
- using C_WriteRequestT = rwl::C_WriteRequest<This>;
- using C_BlockIORequestT = rwl::C_BlockIORequest<This>;
- using C_FlushRequestT = rwl::C_FlushRequest<This>;
- using C_DiscardRequestT = rwl::C_DiscardRequest<This>;
- using C_WriteSameRequestT = rwl::C_WriteSameRequest<This>;
- using C_CompAndWriteRequestT = rwl::C_CompAndWriteRequest<This>;
-
- CephContext * get_context();
- void release_guarded_request(BlockGuardCell *cell);
- void release_write_lanes(C_BlockIORequestT *req);
- bool alloc_resources(C_BlockIORequestT *req);
- template <typename V>
- void flush_pmem_buffer(V& ops);
- void schedule_append(rwl::GenericLogOperationsVector &ops);
- void schedule_append(rwl::GenericLogOperationSharedPtr op);
- void schedule_flush_and_append(rwl::GenericLogOperationsVector &ops);
- void flush_new_sync_point(C_FlushRequestT *flush_req, rwl::DeferredContexts &later);
- std::shared_ptr<rwl::SyncPoint> get_current_sync_point() {
- return m_current_sync_point;
- }
- bool get_persist_on_flush() {
- return m_persist_on_flush;
- }
- void inc_last_op_sequence_num() {
- m_perfcounter->inc(l_librbd_rwl_log_ops, 1);
- ++m_last_op_sequence_num;
- }
- uint64_t get_last_op_sequence_num() {
- return m_last_op_sequence_num;
- }
- uint64_t get_current_sync_gen() {
- return m_current_sync_gen;
- }
- unsigned int get_free_lanes() {
- return m_free_lanes;
- }
- uint32_t get_free_log_entries() {
- return m_free_log_entries;
- }
- void add_into_log_map(rwl::GenericWriteLogEntries &log_entries);
-protected:
- typedef std::list<rwl::C_WriteRequest<This> *> C_WriteRequests;
- typedef std::list<rwl::C_BlockIORequest<This> *> C_BlockIORequests;
-
- BlockGuardCell* detain_guarded_request_helper(rwl::GuardedRequest &req);
- BlockGuardCell* detain_guarded_request_barrier_helper(rwl::GuardedRequest &req);
- void detain_guarded_request(C_BlockIORequestT *request,
- rwl::GuardedRequestFunctionContext *guarded_ctx,
- bool is_barrier);
-
- librbd::cache::rwl::ImageCacheState<ImageCtxT>* m_cache_state = nullptr;
-
- std::atomic<bool> m_initialized = {false};
- std::atomic<bool> m_shutting_down = {false};
- std::atomic<bool> m_invalidating = {false};
- PMEMobjpool *m_log_pool = nullptr;
- const char* m_rwl_pool_layout_name;
-
- ImageCtxT &m_image_ctx;
-
- std::string m_log_pool_name;
- bool m_log_is_poolset = false;
- uint64_t m_log_pool_config_size; /* Configured size of RWL */
- uint64_t m_log_pool_actual_size = 0; /* Actual size of RWL pool */
-
- uint32_t m_total_log_entries = 0;
- uint32_t m_free_log_entries = 0;
-
- std::atomic<uint64_t> m_bytes_allocated = {0}; /* Total bytes allocated in write buffers */
- uint64_t m_bytes_cached = 0; /* Total bytes used in write buffers */
- uint64_t m_bytes_dirty = 0; /* Total bytes yet to flush to RBD */
- uint64_t m_bytes_allocated_cap = 0;
-
- utime_t m_last_alloc_fail; /* Time of last entry or buffer allocation failure */
- std::atomic<bool> m_alloc_failed_since_retire = {false};
-
- ImageWriteback<ImageCtxT> m_image_writeback;
- rwl::WriteLogGuard m_write_log_guard;
- /*
- * When m_first_free_entry == m_first_valid_entry, the log is
- * empty. There is always at least one free entry, which can't be
- * used.
- */
- uint64_t m_first_free_entry = 0; /* Entries from here to m_first_valid_entry-1 are free */
- uint64_t m_first_valid_entry = 0; /* Entries from here to m_first_free_entry-1 are valid */
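-
- /* Editorial example, not in the original source: with m_total_log_entries
- * == 8, m_first_valid_entry == 2 and m_first_free_entry == 6, entries 2..5
- * are in use and 6, 7, 0, 1 are free; the ring is full when advancing
- * m_first_free_entry would make it equal m_first_valid_entry, which is why
- * one entry always remains unused. */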
-
- /* Starts at 0 for a new write log. Incremented on every flush. */
- uint64_t m_current_sync_gen = 0;
- /* Starts at 0 on each sync gen increase. Incremented before being applied
- to an operation */
- uint64_t m_last_op_sequence_num = 0;
- /* All writes bearing this and all prior sync gen numbers are flushed */
- uint64_t m_flushed_sync_gen = 0;
-
- bool m_persist_on_write_until_flush = true;
-
- AsyncOpTracker m_async_op_tracker;
- /* Debug counters for the places m_async_op_tracker is used */
- std::atomic<int> m_async_flush_ops = {0};
- std::atomic<int> m_async_append_ops = {0};
- std::atomic<int> m_async_complete_ops = {0};
- std::atomic<int> m_async_null_flush_finish = {0};
- std::atomic<int> m_async_process_work = {0};
-
- /* Acquire locks in order declared here */
-
- mutable ceph::mutex m_log_retire_lock;
- /* Hold a read lock on m_entry_reader_lock to add readers to log entry
- * bufs. Hold a write lock to prevent readers from being added (e.g. when
- * removing log entries from the map). No lock required to remove readers. */
- mutable RWLock m_entry_reader_lock;
- /* Hold m_deferred_dispatch_lock while consuming from m_deferred_ios. */
- mutable ceph::mutex m_deferred_dispatch_lock;
- /* Hold m_log_append_lock while appending or retiring log entries. */
- mutable ceph::mutex m_log_append_lock;
- /* Used for most synchronization */
- mutable ceph::mutex m_lock;
-
- /* Used in release/detain to make BlockGuard preserve submission order */
- mutable ceph::mutex m_blockguard_lock;
-
- /* Use m_blockguard_lock for the following 3 things */
- rwl::WriteLogGuard::BlockOperations m_awaiting_barrier;
- bool m_barrier_in_progress = false;
- BlockGuardCell *m_barrier_cell = nullptr;
-
- bool m_wake_up_requested = false;
- bool m_wake_up_scheduled = false;
- bool m_wake_up_enabled = true;
- bool m_appending = false;
- bool m_dispatching_deferred_ops = false;
-
- Contexts m_flush_complete_contexts;
-
- rwl::GenericLogOperations m_ops_to_flush; /* Write ops needing flush in local log */
- rwl::GenericLogOperations m_ops_to_append; /* Write ops needing event append in local log */
-
- rwl::WriteLogMap m_blocks_to_log_entries;
-
- /* New entries are at the back. Oldest at the front */
- rwl::GenericLogEntries m_log_entries;
- rwl::GenericLogEntries m_dirty_log_entries;
-
- PerfCounters *m_perfcounter = nullptr;
-
- std::shared_ptr<rwl::SyncPoint> m_current_sync_point = nullptr;
- bool m_persist_on_flush = false; /* If false, persist each write before completion */
-
- int m_flush_ops_in_flight = 0;
- int m_flush_bytes_in_flight = 0;
- uint64_t m_lowest_flushing_sync_gen = 0;
-
- /* Writes that have left the block guard, but are waiting for resources */
- C_BlockIORequests m_deferred_ios;
- /* Throttle writes concurrently allocating & replicating */
- unsigned int m_free_lanes = rwl::MAX_CONCURRENT_WRITES;
- unsigned int m_unpublished_reserves = 0;
-
- /* Initialized from config, then set false during shutdown */
- std::atomic<bool> m_periodic_stats_enabled = {false};
- SafeTimer *m_timer = nullptr; /* Used with m_timer_lock */
- mutable ceph::mutex *m_timer_lock = nullptr; /* Used with and by m_timer */
- Context *m_timer_ctx = nullptr;
-
- ThreadPool m_thread_pool;
- ContextWQ m_work_queue;
-
- uint32_t m_discard_granularity_bytes;
-
- void perf_start(const std::string name);
- void perf_stop();
- void log_perf();
- void periodic_stats();
- void arm_periodic_stats();
-
- void rwl_init(Context *on_finish, rwl::DeferredContexts &later);
- void update_image_cache_state(Context *on_finish);
- void load_existing_entries(rwl::DeferredContexts &later);
- void wake_up();
- void process_work();
-
- void flush_dirty_entries(Context *on_finish);
- bool can_flush_entry(const std::shared_ptr<rwl::GenericLogEntry> log_entry);
- Context *construct_flush_entry_ctx(const std::shared_ptr<rwl::GenericLogEntry> log_entry);
- void persist_last_flushed_sync_gen();
- bool handle_flushed_sync_point(std::shared_ptr<rwl::SyncPointLogEntry> log_entry);
- void sync_point_writer_flushed(std::shared_ptr<rwl::SyncPointLogEntry> log_entry);
- void process_writeback_dirty_entries();
- bool can_retire_entry(const std::shared_ptr<rwl::GenericLogEntry> log_entry);
- bool retire_entries(const unsigned long int frees_per_tx);
-
- void init_flush_new_sync_point(rwl::DeferredContexts &later);
- void new_sync_point(rwl::DeferredContexts &later);
- rwl::C_FlushRequest<AbstractWriteLog<ImageCtxT>>* make_flush_req(Context *on_finish);
- void flush_new_sync_point_if_needed(C_FlushRequestT *flush_req, rwl::DeferredContexts &later);
-
- void dispatch_deferred_writes(void);
- void alloc_and_dispatch_io_req(C_BlockIORequestT *write_req);
- void append_scheduled_ops(void);
- void enlist_op_appender();
- void schedule_append(rwl::GenericLogOperations &ops);
- void flush_then_append_scheduled_ops(void);
- void enlist_op_flusher();
- void alloc_op_log_entries(rwl::GenericLogOperations &ops);
- void flush_op_log_entries(rwl::GenericLogOperationsVector &ops);
- int append_op_log_entries(rwl::GenericLogOperations &ops);
- void complete_op_log_entries(rwl::GenericLogOperations &&ops, const int r);
- void schedule_complete_op_log_entries(rwl::GenericLogOperations &&ops, const int r);
- void internal_flush(bool invalidate, Context *on_finish);
-};
-
-} // namespace cache
-} // namespace librbd
-
-extern template class librbd::cache::AbstractWriteLog<librbd::ImageCtx>;
-
-#endif // CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "ReplicatedWriteLog.h"
-#include "include/buffer.h"
-#include "include/Context.h"
-#include "include/ceph_assert.h"
-#include "common/deleter.h"
-#include "common/dout.h"
-#include "common/environment.h"
-#include "common/errno.h"
-#include "common/WorkQueue.h"
-#include "common/Timer.h"
-#include "common/perf_counters.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/cache/rwl/ImageCacheState.h"
-#include "librbd/cache/rwl/LogEntry.h"
-#include <map>
-#include <vector>
-
-#undef dout_subsys
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::ReplicatedWriteLog: " << this << " " \
- << __func__ << ": "
-
-namespace librbd {
- namespace cache {
-
- using namespace librbd::cache::rwl;
-
- template <typename I>
- ReplicatedWriteLog<I>::ReplicatedWriteLog(I &image_ctx, librbd::cache::rwl::ImageCacheState<I>* cache_state)
- : AbstractWriteLog<I>(image_ctx, cache_state)
- {
- }
-
-
- } // namespace cache
-} // namespace librbd
-
-template class librbd::cache::ReplicatedWriteLog<librbd::ImageCtx>;
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
-#define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
-
-#include "common/RWLock.h"
-#include "common/WorkQueue.h"
-#include "common/AsyncOpTracker.h"
-#include "librbd/cache/ImageCache.h"
-#include "librbd/cache/ImageWriteback.h"
-#include "librbd/Utils.h"
-#include "librbd/BlockGuard.h"
-#include "librbd/cache/Types.h"
-#include "librbd/cache/rwl/LogOperation.h"
-#include "librbd/cache/rwl/Request.h"
-#include "librbd/cache/rwl/LogMap.h"
-#include "AbstractWriteLog.h"
-#include <functional>
-#include <list>
-
-class Context;
-class SafeTimer;
-
-namespace librbd {
-
-struct ImageCtx;
-
-namespace cache {
-
-template <typename ImageCtxT>
-class ReplicatedWriteLog : public AbstractWriteLog<ImageCtxT> {
-public:
- typedef io::Extent Extent;
- typedef io::Extents Extents;
-
- ReplicatedWriteLog(ImageCtxT &image_ctx, librbd::cache::rwl::ImageCacheState<ImageCtxT>* cache_state);
- ~ReplicatedWriteLog();
- ReplicatedWriteLog(const ReplicatedWriteLog&) = delete;
- ReplicatedWriteLog &operator=(const ReplicatedWriteLog&) = delete;
-
-private:
- using This = AbstractWriteLog<ImageCtxT>;
- using C_WriteRequestT = rwl::C_WriteRequest<This>;
- using C_BlockIORequestT = rwl::C_BlockIORequest<This>;
- using C_FlushRequestT = rwl::C_FlushRequest<This>;
- using C_DiscardRequestT = rwl::C_DiscardRequest<This>;
- using C_WriteSameRequestT = rwl::C_WriteSameRequest<This>;
- using C_CompAndWriteRequestT = rwl::C_CompAndWriteRequest<This>;
-
-};
-
-} // namespace cache
-} // namespace librbd
-
-extern template class librbd::cache::ReplicatedWriteLog<librbd::ImageCtx>;
-
-#endif // CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
namespace util {
template <typename T>
-bool is_rwl_enabled(T& image_ctx) {
+bool is_pwl_enabled(T& image_ctx) {
#if defined(WITH_RBD_RWL)
return image_ctx.config.template get_val<bool>("rbd_rwl_enabled");
#else
// vim: ts=8 sw=2 smarttab
#include "WriteLogCache.h"
-#include "ReplicatedWriteLog.h"
-#include "librbd/cache/rwl/ImageCacheState.h"
+#include "librbd/cache/pwl/ReplicatedWriteLog.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
#undef dout_subsys
-#define dout_subsys ceph_subsys_rbd_rwl
+#define dout_subsys ceph_subsys_rbd_pwl
#undef dout_prefix
#define dout_prefix *_dout << "librbd::cache::WriteLogCache: " << this << " " \
<< __func__ << ": "
namespace librbd {
namespace cache {
-using namespace librbd::cache::rwl;
+using namespace librbd::cache::pwl;
typedef WriteLogCache<ImageCtx>::Extent Extent;
typedef WriteLogCache<ImageCtx>::Extents Extents;
template <typename I>
-WriteLogCache<I>::WriteLogCache(I &image_ctx, librbd::cache::rwl::ImageCacheState<I>* cache_state) {
- m_write_log = new ReplicatedWriteLog<I>(image_ctx, cache_state);
+WriteLogCache<I>::WriteLogCache(I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state) {
+ m_write_log = new librbd::cache::pwl::ReplicatedWriteLog<I>(image_ctx, cache_state);
}
template <typename I>
#include "librbd/cache/ImageCache.h"
-class Context;
-class SafeTimer;
-
-class Context;
-class SafeTimer;
-
namespace librbd {
struct ImageCtx;
namespace cache {
+namespace pwl {
template <typename> class AbstractWriteLog;
-
-namespace rwl {
template <typename> class ImageCacheState;
}
using typename ImageCache<ImageCtxT>::Extent;
using typename ImageCache<ImageCtxT>::Extents;
- WriteLogCache(ImageCtxT &image_ctx, librbd::cache::rwl::ImageCacheState<ImageCtxT>* cache_state);
+ WriteLogCache(ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state);
~WriteLogCache();
WriteLogCache(const WriteLogCache&) = delete;
WriteLogCache &operator=(const WriteLogCache&) = delete;
void invalidate(Context *on_finish) override;
void flush(Context *on_finish) override;
- AbstractWriteLog<ImageCtxT> *m_write_log;
+ librbd::cache::pwl::AbstractWriteLog<ImageCtxT> *m_write_log;
};
} // namespace cache
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <libpmemobj.h>
+#include "AbstractWriteLog.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/cache/pwl/ReadRequest.h"
+#include "librbd/cache/pwl/Types.h"
+#include <map>
+#include <vector>
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::AbstractWriteLog: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+
+typedef AbstractWriteLog<ImageCtx>::Extent Extent;
+typedef AbstractWriteLog<ImageCtx>::Extents Extents;
+
+const unsigned long int OPS_APPENDED_TOGETHER = MAX_ALLOC_PER_TRANSACTION;
+
+template <typename I>
+AbstractWriteLog<I>::AbstractWriteLog(I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state)
+ : m_cache_state(cache_state),
+ m_pwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_pwl)),
+ m_image_ctx(image_ctx),
+ m_log_pool_config_size(DEFAULT_POOL_SIZE),
+ m_image_writeback(image_ctx), m_write_log_guard(image_ctx.cct),
+ m_log_retire_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::cache::pwl::AbstractWriteLog::m_log_retire_lock", this))),
+ m_entry_reader_lock("librbd::cache::pwl::AbstractWriteLog::m_entry_reader_lock"),
+ m_deferred_dispatch_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::cache::pwl::AbstractWriteLog::m_deferred_dispatch_lock", this))),
+ m_log_append_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::cache::pwl::AbstractWriteLog::m_log_append_lock", this))),
+ m_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::cache::pwl::AbstractWriteLog::m_lock", this))),
+ m_blockguard_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::cache::pwl::AbstractWriteLog::m_blockguard_lock", this))),
+ m_blocks_to_log_entries(image_ctx.cct),
+ m_thread_pool(image_ctx.cct, "librbd::cache::pwl::AbstractWriteLog::thread_pool", "tp_pwl",
+ 4,
+ ""),
+    m_work_queue("librbd::cache::pwl::AbstractWriteLog::work_queue",
+ ceph::make_timespan(
+ image_ctx.config.template get_val<uint64_t>(
+ "rbd_op_thread_timeout")),
+ &m_thread_pool)
+{
+ CephContext *cct = m_image_ctx.cct;
+ ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock);
+}
+
+template <typename I>
+AbstractWriteLog<I>::~AbstractWriteLog() {
+ ldout(m_image_ctx.cct, 15) << "enter" << dendl;
+ {
+ std::lock_guard timer_locker(*m_timer_lock);
+ std::lock_guard locker(m_lock);
+ m_timer->cancel_event(m_timer_ctx);
+ m_thread_pool.stop();
+ ceph_assert(m_deferred_ios.size() == 0);
+ ceph_assert(m_ops_to_flush.size() == 0);
+ ceph_assert(m_ops_to_append.size() == 0);
+ ceph_assert(m_flush_ops_in_flight == 0);
+
+ m_log_pool = nullptr;
+ delete m_cache_state;
+ m_cache_state = nullptr;
+ }
+ ldout(m_image_ctx.cct, 15) << "exit" << dendl;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::perf_start(std::string name) {
+ PerfCountersBuilder plb(m_image_ctx.cct, name, l_librbd_pwl_first, l_librbd_pwl_last);
+
+ // Latency axis configuration for op histograms, values are in nanoseconds
+ PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
+ "Latency (nsec)",
+ PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
+ 0, ///< Start at 0
+ 5000, ///< Quantization unit is 5usec
+    16, ///< Ranges into the milliseconds
+ };
+
+ // Syncpoint logentry number x-axis configuration for op histograms
+ PerfHistogramCommon::axis_config_d sp_logentry_number_config{
+ "logentry number",
+ PerfHistogramCommon::SCALE_LINEAR, // log entry number in linear scale
+ 0, // Start at 0
+ 1, // Quantization unit is 1
+ 260, // Up to 260 > (MAX_WRITES_PER_SYNC_POINT)
+ };
+
+ // Syncpoint bytes number y-axis configuration for op histogram
+ PerfHistogramCommon::axis_config_d sp_bytes_number_config{
+ "Number of SyncPoint",
+ PerfHistogramCommon::SCALE_LOG2, // Request size in logarithmic scale
+ 0, // Start at 0
+ 512, // Quantization unit is 512
+ 17, // Writes up to 8M >= MAX_BYTES_PER_SYNC_POINT
+ };
+
+ // Op size axis configuration for op histogram y axis, values are in bytes
+ PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
+ "Request size (bytes)",
+ PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
+ 0, ///< Start at 0
+ 512, ///< Quantization unit is 512 bytes
+ 16, ///< Writes up to >32k
+ };
+
+ // Num items configuration for op histogram y axis, values are in items
+ PerfHistogramCommon::axis_config_d op_hist_y_axis_count_config{
+ "Number of items",
+    PerfHistogramCommon::SCALE_LINEAR, ///< Item count in linear scale
+    0,                                 ///< Start at 0
+    1,                                 ///< Quantization unit is 1
+    32,                                ///< Up to 32 items
+ };
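+  /* How these axes bucket values (a sketch, assuming the usual
+   * PerfHistogramCommon semantics): with SCALE_LOG2, a quantization unit of
+   * 5000 ns and 16 buckets, bucket widths roughly double from the unit
+   * upward (0-5us, 5-10us, 10-20us, ...), which is how the latency x axis
+   * above ranges into the milliseconds. The SCALE_LINEAR axes instead step
+   * uniformly by their quantization unit. */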
+
+ plb.add_u64_counter(l_librbd_pwl_rd_req, "rd", "Reads");
+ plb.add_u64_counter(l_librbd_pwl_rd_bytes, "rd_bytes", "Data size in reads");
+ plb.add_time_avg(l_librbd_pwl_rd_latency, "rd_latency", "Latency of reads");
+
+ plb.add_u64_counter(l_librbd_pwl_rd_hit_req, "hit_rd", "Reads completely hitting RWL");
+ plb.add_u64_counter(l_librbd_pwl_rd_hit_bytes, "rd_hit_bytes", "Bytes read from RWL");
+ plb.add_time_avg(l_librbd_pwl_rd_hit_latency, "hit_rd_latency", "Latency of read hits");
+
+ plb.add_u64_counter(l_librbd_pwl_rd_part_hit_req, "part_hit_rd", "reads partially hitting RWL");
+
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_syncpoint_hist, "syncpoint_logentry_bytes_histogram",
+ sp_logentry_number_config, sp_bytes_number_config,
+ "Histogram of syncpoint's logentry numbers vs bytes number");
+
+ plb.add_u64_counter(l_librbd_pwl_wr_req, "wr", "Writes");
+ plb.add_u64_counter(l_librbd_pwl_wr_req_def, "wr_def", "Writes deferred for resources");
+ plb.add_u64_counter(l_librbd_pwl_wr_req_def_lanes, "wr_def_lanes", "Writes deferred for lanes");
+ plb.add_u64_counter(l_librbd_pwl_wr_req_def_log, "wr_def_log", "Writes deferred for log entries");
+ plb.add_u64_counter(l_librbd_pwl_wr_req_def_buf, "wr_def_buf", "Writes deferred for buffers");
+ plb.add_u64_counter(l_librbd_pwl_wr_req_overlap, "wr_overlap", "Writes overlapping with prior in-progress writes");
+ plb.add_u64_counter(l_librbd_pwl_wr_req_queued, "wr_q_barrier", "Writes queued for prior barriers (aio_flush)");
+ plb.add_u64_counter(l_librbd_pwl_wr_bytes, "wr_bytes", "Data size in writes");
+
+ plb.add_u64_counter(l_librbd_pwl_log_ops, "log_ops", "Log appends");
+ plb.add_u64_avg(l_librbd_pwl_log_op_bytes, "log_op_bytes", "Average log append bytes");
+
+ plb.add_time_avg(
+ l_librbd_pwl_req_arr_to_all_t, "req_arr_to_all_t",
+ "Average arrival to allocation time (time deferred for overlap)");
+ plb.add_time_avg(
+ l_librbd_pwl_req_arr_to_dis_t, "req_arr_to_dis_t",
+ "Average arrival to dispatch time (includes time deferred for overlaps and allocation)");
+ plb.add_time_avg(
+ l_librbd_pwl_req_all_to_dis_t, "req_all_to_dis_t",
+ "Average allocation to dispatch time (time deferred for log resources)");
+ plb.add_time_avg(
+ l_librbd_pwl_wr_latency, "wr_latency",
+ "Latency of writes (persistent completion)");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_wr_latency_hist, "wr_latency_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of write request latency (nanoseconds) vs. bytes written");
+ plb.add_time_avg(
+ l_librbd_pwl_wr_caller_latency, "caller_wr_latency",
+ "Latency of write completion to caller");
+ plb.add_time_avg(
+ l_librbd_pwl_nowait_req_arr_to_all_t, "req_arr_to_all_nw_t",
+ "Average arrival to allocation time (time deferred for overlap)");
+ plb.add_time_avg(
+ l_librbd_pwl_nowait_req_arr_to_dis_t, "req_arr_to_dis_nw_t",
+ "Average arrival to dispatch time (includes time deferred for overlaps and allocation)");
+ plb.add_time_avg(
+ l_librbd_pwl_nowait_req_all_to_dis_t, "req_all_to_dis_nw_t",
+ "Average allocation to dispatch time (time deferred for log resources)");
+ plb.add_time_avg(
+ l_librbd_pwl_nowait_wr_latency, "wr_latency_nw",
+ "Latency of writes (persistent completion) not deferred for free space");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_nowait_wr_latency_hist, "wr_latency_nw_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of write request latency (nanoseconds) vs. bytes written for writes not deferred for free space");
+ plb.add_time_avg(
+ l_librbd_pwl_nowait_wr_caller_latency, "caller_wr_latency_nw",
+ "Latency of write completion to callerfor writes not deferred for free space");
+ plb.add_time_avg(l_librbd_pwl_log_op_alloc_t, "op_alloc_t", "Average buffer pmemobj_reserve() time");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_log_op_alloc_t_hist, "op_alloc_t_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of buffer pmemobj_reserve() time (nanoseconds) vs. bytes written");
+ plb.add_time_avg(l_librbd_pwl_log_op_dis_to_buf_t, "op_dis_to_buf_t", "Average dispatch to buffer persist time");
+ plb.add_time_avg(l_librbd_pwl_log_op_dis_to_app_t, "op_dis_to_app_t", "Average dispatch to log append time");
+ plb.add_time_avg(l_librbd_pwl_log_op_dis_to_cmp_t, "op_dis_to_cmp_t", "Average dispatch to persist completion time");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_log_op_dis_to_cmp_t_hist, "op_dis_to_cmp_t_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of op dispatch to persist complete time (nanoseconds) vs. bytes written");
+
+ plb.add_time_avg(
+ l_librbd_pwl_log_op_buf_to_app_t, "op_buf_to_app_t",
+ "Average buffer persist to log append time (write data persist/replicate + wait for append time)");
+ plb.add_time_avg(
+ l_librbd_pwl_log_op_buf_to_bufc_t, "op_buf_to_bufc_t",
+ "Average buffer persist time (write data persist/replicate time)");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_log_op_buf_to_bufc_t_hist, "op_buf_to_bufc_t_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of write buffer persist time (nanoseconds) vs. bytes written");
+ plb.add_time_avg(
+ l_librbd_pwl_log_op_app_to_cmp_t, "op_app_to_cmp_t",
+ "Average log append to persist complete time (log entry append/replicate + wait for complete time)");
+ plb.add_time_avg(
+ l_librbd_pwl_log_op_app_to_appc_t, "op_app_to_appc_t",
+ "Average log append to persist complete time (log entry append/replicate time)");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_log_op_app_to_appc_t_hist, "op_app_to_appc_t_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of log append persist time (nanoseconds) (vs. op bytes)");
+
+ plb.add_u64_counter(l_librbd_pwl_discard, "discard", "Discards");
+ plb.add_u64_counter(l_librbd_pwl_discard_bytes, "discard_bytes", "Bytes discarded");
+ plb.add_time_avg(l_librbd_pwl_discard_latency, "discard_lat", "Discard latency");
+
+ plb.add_u64_counter(l_librbd_pwl_aio_flush, "aio_flush", "AIO flush (flush to RWL)");
+ plb.add_u64_counter(l_librbd_pwl_aio_flush_def, "aio_flush_def", "AIO flushes deferred for resources");
+ plb.add_time_avg(l_librbd_pwl_aio_flush_latency, "aio_flush_lat", "AIO flush latency");
+
+ plb.add_u64_counter(l_librbd_pwl_ws,"ws", "Write Sames");
+ plb.add_u64_counter(l_librbd_pwl_ws_bytes, "ws_bytes", "Write Same bytes to image");
+ plb.add_time_avg(l_librbd_pwl_ws_latency, "ws_lat", "Write Same latency");
+
+ plb.add_u64_counter(l_librbd_pwl_cmp, "cmp", "Compare and Write requests");
+ plb.add_u64_counter(l_librbd_pwl_cmp_bytes, "cmp_bytes", "Compare and Write bytes compared/written");
+  plb.add_time_avg(l_librbd_pwl_cmp_latency, "cmp_lat", "Compare and Write latency");
+ plb.add_u64_counter(l_librbd_pwl_cmp_fails, "cmp_fails", "Compare and Write compare fails");
+
+ plb.add_u64_counter(l_librbd_pwl_flush, "flush", "Flush (flush RWL)");
+ plb.add_u64_counter(l_librbd_pwl_invalidate_cache, "invalidate", "Invalidate RWL");
+  plb.add_u64_counter(l_librbd_pwl_invalidate_discard_cache, "invalidate_discard", "Discard and invalidate RWL");
+
+ plb.add_time_avg(l_librbd_pwl_append_tx_t, "append_tx_lat", "Log append transaction latency");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_append_tx_t_hist, "append_tx_lat_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_count_config,
+ "Histogram of log append transaction time (nanoseconds) vs. entries appended");
+ plb.add_time_avg(l_librbd_pwl_retire_tx_t, "retire_tx_lat", "Log retire transaction latency");
+ plb.add_u64_counter_histogram(
+ l_librbd_pwl_retire_tx_t_hist, "retire_tx_lat_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_count_config,
+ "Histogram of log retire transaction time (nanoseconds) vs. entries retired");
+
+ m_perfcounter = plb.create_perf_counters();
+ m_image_ctx.cct->get_perfcounters_collection()->add(m_perfcounter);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::perf_stop() {
+ ceph_assert(m_perfcounter);
+ m_image_ctx.cct->get_perfcounters_collection()->remove(m_perfcounter);
+ delete m_perfcounter;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::log_perf() {
+ bufferlist bl;
+ Formatter *f = Formatter::create("json-pretty");
+ bl.append("Perf dump follows\n--- Begin perf dump ---\n");
+ bl.append("{\n");
+ stringstream ss;
+ utime_t now = ceph_clock_now();
+ ss << "\"test_time\": \"" << now << "\",";
+ ss << "\"image\": \"" << m_image_ctx.name << "\",";
+ bl.append(ss);
+ bl.append("\"stats\": ");
+ m_image_ctx.cct->get_perfcounters_collection()->dump_formatted(f, 0);
+ f->flush(bl);
+ bl.append(",\n\"histograms\": ");
+ m_image_ctx.cct->get_perfcounters_collection()->dump_formatted_histograms(f, 0);
+ f->flush(bl);
+ delete f;
+ bl.append("}\n--- End perf dump ---\n");
+ bl.append('\0');
+ ldout(m_image_ctx.cct, 1) << bl.c_str() << dendl;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::periodic_stats() {
+ std::lock_guard locker(m_lock);
+ ldout(m_image_ctx.cct, 1) << "STATS: "
+ << "m_free_log_entries=" << m_free_log_entries << ", "
+ << "m_log_entries=" << m_log_entries.size() << ", "
+ << "m_dirty_log_entries=" << m_dirty_log_entries.size() << ", "
+ << "m_bytes_allocated=" << m_bytes_allocated << ", "
+ << "m_bytes_cached=" << m_bytes_cached << ", "
+ << "m_bytes_dirty=" << m_bytes_dirty << ", "
+ << "bytes available=" << m_bytes_allocated_cap - m_bytes_allocated << ", "
+ << "m_current_sync_gen=" << m_current_sync_gen << ", "
+ << "m_flushed_sync_gen=" << m_flushed_sync_gen << ", "
+ << dendl;
+}
+
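+/* A self-rearming timer pattern: the LambdaContext below runs
+ * periodic_stats() and then re-arms itself while the timer lock is still
+ * held, so stats are logged every LOG_STATS_INTERVAL_SECONDS until
+ * m_periodic_stats_enabled is cleared; the destructor cancels the pending
+ * event via m_timer->cancel_event(). */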
+template <typename I>
+void AbstractWriteLog<I>::arm_periodic_stats() {
+ ceph_assert(ceph_mutex_is_locked(*m_timer_lock));
+ if (m_periodic_stats_enabled) {
+ m_timer_ctx = new LambdaContext(
+ [this](int r) {
+ /* m_timer_lock is held */
+ periodic_stats();
+ arm_periodic_stats();
+ });
+ m_timer->add_event_after(LOG_STATS_INTERVAL_SECONDS, m_timer_ctx);
+ }
+}
+
+/*
+ * Loads the log entries from an existing log.
+ *
+ * Creates the in-memory structures to represent the state of the
+ * re-opened log.
+ *
+ * Finds the last appended sync point, and any sync points referred to
+ * in log entries, but missing from the log. These missing sync points
+ * are created and scheduled for append. Some rudimentary consistency
+ * checking is done.
+ *
+ * Rebuilds the m_blocks_to_log_entries map, to make log entries
+ * readable.
+ *
+ * Places all writes on the dirty entries list, which causes them all
+ * to be flushed.
+ *
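+ * A worked example with hypothetical contents: suppose the ring holds
+ * [sync point gen=4, write gen=5, write gen=5] because the gen-5 sync
+ * point was never appended (or was retired). The first pass records gen 5
+ * in missing_sync_points; the recreation loop then calls
+ * init_flush_new_sync_point() to recreate it, and the second pass links
+ * both writes to it, marking them dirty if gen 5 is newer than the
+ * flushed_sync_gen recorded in the pool root.
+ *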
+ */
+template <typename I>
+void AbstractWriteLog<I>::load_existing_entries(DeferredContexts &later) {
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ struct WriteLogPmemEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries);
+ uint64_t entry_index = m_first_valid_entry;
+ /* The map below allows us to find sync point log entries by sync
+ * gen number, which is necessary so write entries can be linked to
+ * their sync points. */
+ std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> sync_point_entries;
+ /* The map below tracks sync points referred to in writes but not
+ * appearing in the sync_point_entries map. We'll use this to
+ * determine which sync points are missing and need to be
+ * created. */
+ std::map<uint64_t, bool> missing_sync_points;
+
+ /*
+ * Read the existing log entries. Construct an in-memory log entry
+ * object of the appropriate type for each. Add these to the global
+ * log entries list.
+ *
+ * Write entries will not link to their sync points yet. We'll do
+ * that in the next pass. Here we'll accumulate a map of sync point
+   * gen numbers that are referred to in writes but do not appear in
+ * the log.
+ */
+ while (entry_index != m_first_free_entry) {
+ WriteLogPmemEntry *pmem_entry = &pmem_log_entries[entry_index];
+ std::shared_ptr<GenericLogEntry> log_entry = nullptr;
+ bool writer = pmem_entry->is_writer();
+
+ ceph_assert(pmem_entry->entry_index == entry_index);
+ if (pmem_entry->is_sync_point()) {
+ ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
+ << " is a sync point. pmem_entry=[" << *pmem_entry << "]" << dendl;
+ auto sync_point_entry = std::make_shared<SyncPointLogEntry>(pmem_entry->sync_gen_number);
+ log_entry = sync_point_entry;
+ sync_point_entries[pmem_entry->sync_gen_number] = sync_point_entry;
+ missing_sync_points.erase(pmem_entry->sync_gen_number);
+ m_current_sync_gen = pmem_entry->sync_gen_number;
+ } else if (pmem_entry->is_write()) {
+ ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
+ << " is a write. pmem_entry=[" << *pmem_entry << "]" << dendl;
+ auto write_entry =
+ std::make_shared<WriteLogEntry>(nullptr, pmem_entry->image_offset_bytes, pmem_entry->write_bytes);
+ write_entry->pmem_buffer = D_RW(pmem_entry->write_data);
+ log_entry = write_entry;
+ } else if (pmem_entry->is_writesame()) {
+ ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
+ << " is a write same. pmem_entry=[" << *pmem_entry << "]" << dendl;
+ auto ws_entry =
+ std::make_shared<WriteSameLogEntry>(nullptr, pmem_entry->image_offset_bytes,
+ pmem_entry->write_bytes, pmem_entry->ws_datalen);
+ ws_entry->pmem_buffer = D_RW(pmem_entry->write_data);
+ log_entry = ws_entry;
+ } else if (pmem_entry->is_discard()) {
+ ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
+ << " is a discard. pmem_entry=[" << *pmem_entry << "]" << dendl;
+ auto discard_entry =
+ std::make_shared<DiscardLogEntry>(nullptr, pmem_entry->image_offset_bytes, pmem_entry->write_bytes,
+ m_discard_granularity_bytes);
+ log_entry = discard_entry;
+ } else {
+ lderr(m_image_ctx.cct) << "Unexpected entry type in entry " << entry_index
+ << ", pmem_entry=[" << *pmem_entry << "]" << dendl;
+ }
+
+ if (writer) {
+ ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
+ << " writes. pmem_entry=[" << *pmem_entry << "]" << dendl;
+ if (!sync_point_entries[pmem_entry->sync_gen_number]) {
+ missing_sync_points[pmem_entry->sync_gen_number] = true;
+ }
+ }
+
+ log_entry->ram_entry = *pmem_entry;
+ log_entry->pmem_entry = pmem_entry;
+ log_entry->log_entry_index = entry_index;
+ log_entry->completed = true;
+
+ m_log_entries.push_back(log_entry);
+
+ entry_index = (entry_index + 1) % m_total_log_entries;
+ }
+
+ /* Create missing sync points. These must not be appended until the
+ * entry reload is complete and the write map is up to
+ * date. Currently this is handled by the deferred contexts object
+ * passed to new_sync_point(). These contexts won't be completed
+ * until this function returns. */
+ for (auto &kv : missing_sync_points) {
+ ldout(m_image_ctx.cct, 5) << "Adding sync point " << kv.first << dendl;
+ if (0 == m_current_sync_gen) {
+ /* The unlikely case where the log contains writing entries, but no sync
+ * points (e.g. because they were all retired) */
+ m_current_sync_gen = kv.first-1;
+ }
+ ceph_assert(kv.first == m_current_sync_gen+1);
+ init_flush_new_sync_point(later);
+ ceph_assert(kv.first == m_current_sync_gen);
+    sync_point_entries[kv.first] = m_current_sync_point->log_entry;
+ }
+
+ /*
+ * Iterate over the log entries again (this time via the global
+ * entries list), connecting write entries to their sync points and
+ * updating the sync point stats.
+ *
+ * Add writes to the write log map.
+ */
+ std::shared_ptr<SyncPointLogEntry> previous_sync_point_entry = nullptr;
+ for (auto &log_entry : m_log_entries) {
+ if ((log_entry->write_bytes() > 0) || (log_entry->bytes_dirty() > 0)) {
+ /* This entry is one of the types that write */
+ auto gen_write_entry = static_pointer_cast<GenericWriteLogEntry>(log_entry);
+ if (gen_write_entry) {
+ auto sync_point_entry = sync_point_entries[gen_write_entry->ram_entry.sync_gen_number];
+ if (!sync_point_entry) {
+ lderr(m_image_ctx.cct) << "Sync point missing for entry=[" << *gen_write_entry << "]" << dendl;
+ ceph_assert(false);
+ } else {
+ gen_write_entry->sync_point_entry = sync_point_entry;
+ sync_point_entry->writes++;
+ sync_point_entry->bytes += gen_write_entry->ram_entry.write_bytes;
+ sync_point_entry->writes_completed++;
+ m_blocks_to_log_entries.add_log_entry(gen_write_entry);
+ /* This entry is only dirty if its sync gen number is > the flushed
+ * sync gen number from the root object. */
+ if (gen_write_entry->ram_entry.sync_gen_number > m_flushed_sync_gen) {
+ m_dirty_log_entries.push_back(log_entry);
+ m_bytes_dirty += gen_write_entry->bytes_dirty();
+ } else {
+ gen_write_entry->set_flushed(true);
+ sync_point_entry->writes_flushed++;
+ }
+ if (log_entry->write_bytes() == log_entry->bytes_dirty()) {
+ /* This entry is a basic write */
+ uint64_t bytes_allocated = MIN_WRITE_ALLOC_SIZE;
+ if (gen_write_entry->ram_entry.write_bytes > bytes_allocated) {
+ bytes_allocated = gen_write_entry->ram_entry.write_bytes;
+ }
+ m_bytes_allocated += bytes_allocated;
+ m_bytes_cached += gen_write_entry->ram_entry.write_bytes;
+ }
+ }
+ }
+ } else {
+      /* This entry is a sync point entry */
+ auto sync_point_entry = static_pointer_cast<SyncPointLogEntry>(log_entry);
+ if (sync_point_entry) {
+ if (previous_sync_point_entry) {
+ previous_sync_point_entry->next_sync_point_entry = sync_point_entry;
+ if (previous_sync_point_entry->ram_entry.sync_gen_number > m_flushed_sync_gen) {
+ sync_point_entry->prior_sync_point_flushed = false;
+ ceph_assert(!previous_sync_point_entry->prior_sync_point_flushed ||
+ (0 == previous_sync_point_entry->writes) ||
+ (previous_sync_point_entry->writes >= previous_sync_point_entry->writes_flushed));
+ } else {
+ sync_point_entry->prior_sync_point_flushed = true;
+ ceph_assert(previous_sync_point_entry->prior_sync_point_flushed);
+ ceph_assert(previous_sync_point_entry->writes == previous_sync_point_entry->writes_flushed);
+ }
+        } else {
+          /* There are no previous sync points, so we'll consider them flushed */
+          sync_point_entry->prior_sync_point_flushed = true;
+        }
+        /* Track the previous entry from the first sync point onward, so the
+         * next_sync_point_entry chain starts at the first one in the log. */
+        previous_sync_point_entry = sync_point_entry;
+        ldout(m_image_ctx.cct, 10) << "Loaded to sync point=[" << *sync_point_entry << "]" << dendl;
+ }
+ }
+ }
+ if (0 == m_current_sync_gen) {
+ /* If a re-opened log was completely flushed, we'll have found no sync point entries here,
+ * and not advanced m_current_sync_gen. Here we ensure it starts past the last flushed sync
+ * point recorded in the log. */
+ m_current_sync_gen = m_flushed_sync_gen;
+ }
+}
+
+template <typename I>
+void AbstractWriteLog<I>::pwl_init(Context *on_finish, DeferredContexts &later) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+ TOID(struct WriteLogPoolRoot) pool_root;
+ ceph_assert(m_cache_state);
+ std::lock_guard locker(m_lock);
+ ceph_assert(!m_initialized);
+ ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl;
+ ldout(cct,5) << "pwl_size: " << m_cache_state->size << dendl;
+ std::string pwl_path = m_cache_state->path;
+ ldout(cct,5) << "pwl_path: " << pwl_path << dendl;
+
+ std::string pool_name = m_image_ctx.md_ctx.get_pool_name();
+ std::string log_pool_name = pwl_path + "/rbd-pwl." + pool_name + "." + m_image_ctx.id + ".pool";
+ std::string log_poolset_name = pwl_path + "/rbd-pwl." + pool_name + "." + m_image_ctx.id + ".poolset";
+ m_log_pool_config_size = max(m_cache_state->size, MIN_POOL_SIZE);
+
+ if (access(log_poolset_name.c_str(), F_OK) == 0) {
+ m_log_pool_name = log_poolset_name;
+ m_log_is_poolset = true;
+ } else {
+ m_log_pool_name = log_pool_name;
+ ldout(cct, 5) << "Poolset file " << log_poolset_name
+ << " not present (or can't open). Using unreplicated pool" << dendl;
+ }
+
+ if ((!m_cache_state->present) &&
+ (access(m_log_pool_name.c_str(), F_OK) == 0)) {
+ ldout(cct, 5) << "There's an existing pool/poolset file " << m_log_pool_name
+ << ", While there's no cache in the image metatata." << dendl;
+ if (remove(m_log_pool_name.c_str()) != 0) {
+ lderr(cct) << "Failed to remove the pool/poolset file " << m_log_pool_name
+ << dendl;
+ on_finish->complete(-errno);
+ return;
+ } else {
+ ldout(cct, 5) << "Removed the existing pool/poolset file." << dendl;
+ }
+ }
+
+ if (access(m_log_pool_name.c_str(), F_OK) != 0) {
+ if ((m_log_pool =
+ pmemobj_create(m_log_pool_name.c_str(),
+ m_pwl_pool_layout_name,
+ m_log_pool_config_size,
+ (S_IWUSR | S_IRUSR))) == NULL) {
+ lderr(cct) << "failed to create pool (" << m_log_pool_name << ")"
+ << pmemobj_errormsg() << dendl;
+ m_cache_state->present = false;
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ /* TODO: filter/replace errnos that are meaningless to the caller */
+ on_finish->complete(-errno);
+ return;
+ }
+ m_cache_state->present = true;
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+
+ /* new pool, calculate and store metadata */
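+    /* A sizing sketch with illustrative numbers (the real constants come
+     * from the pwl types header): for a 1 GiB pool and a USABLE_SIZE
+     * fraction of 0.9, effective_pool_size is ~966 MiB; if
+     * MIN_WRITE_ALLOC_SIZE were 512 bytes with ~320 bytes of allocator
+     * overhead and a ~64 byte pmem entry, small_write_size would be ~896
+     * bytes, indexing ~1.1M small writes before the MAX_LOG_ENTRIES clamp
+     * below. */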
+ size_t effective_pool_size = (size_t)(m_log_pool_config_size * USABLE_SIZE);
+ size_t small_write_size = MIN_WRITE_ALLOC_SIZE + BLOCK_ALLOC_OVERHEAD_BYTES + sizeof(struct WriteLogPmemEntry);
+ uint64_t num_small_writes = (uint64_t)(effective_pool_size / small_write_size);
+ if (num_small_writes > MAX_LOG_ENTRIES) {
+ num_small_writes = MAX_LOG_ENTRIES;
+ }
+ if (num_small_writes <= 2) {
+      lderr(cct) << "num_small_writes needs to be > 2" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ m_log_pool_actual_size = m_log_pool_config_size;
+ m_bytes_allocated_cap = effective_pool_size;
+ /* Log ring empty */
+ m_first_free_entry = 0;
+ m_first_valid_entry = 0;
+ TX_BEGIN(m_log_pool) {
+ TX_ADD(pool_root);
+ D_RW(pool_root)->header.layout_version = RWL_POOL_VERSION;
+ D_RW(pool_root)->log_entries =
+ TX_ZALLOC(struct WriteLogPmemEntry,
+ sizeof(struct WriteLogPmemEntry) * num_small_writes);
+ D_RW(pool_root)->pool_size = m_log_pool_actual_size;
+ D_RW(pool_root)->flushed_sync_gen = m_flushed_sync_gen;
+ D_RW(pool_root)->block_size = MIN_WRITE_ALLOC_SIZE;
+ D_RW(pool_root)->num_log_entries = num_small_writes;
+ D_RW(pool_root)->first_free_entry = m_first_free_entry;
+ D_RW(pool_root)->first_valid_entry = m_first_valid_entry;
+ } TX_ONCOMMIT {
+ m_total_log_entries = D_RO(pool_root)->num_log_entries;
+ m_free_log_entries = D_RO(pool_root)->num_log_entries - 1; // leave one free
+ } TX_ONABORT {
+ m_total_log_entries = 0;
+ m_free_log_entries = 0;
+ lderr(cct) << "failed to initialize pool (" << m_log_pool_name << ")" << dendl;
+ on_finish->complete(-pmemobj_tx_errno());
+ return;
+ } TX_FINALLY {
+ } TX_END;
+ } else {
+ m_cache_state->present = true;
+ /* Open existing pool */
+ if ((m_log_pool =
+ pmemobj_open(m_log_pool_name.c_str(),
+ m_pwl_pool_layout_name)) == NULL) {
+ lderr(cct) << "failed to open pool (" << m_log_pool_name << "): "
+ << pmemobj_errormsg() << dendl;
+ on_finish->complete(-errno);
+ return;
+ }
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ if (D_RO(pool_root)->header.layout_version != RWL_POOL_VERSION) {
+ // TODO: will handle upgrading version in the future
+ lderr(cct) << "Pool layout version is " << D_RO(pool_root)->header.layout_version
+ << " expected " << RWL_POOL_VERSION << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ if (D_RO(pool_root)->block_size != MIN_WRITE_ALLOC_SIZE) {
+ lderr(cct) << "Pool block size is " << D_RO(pool_root)->block_size
+ << " expected " << MIN_WRITE_ALLOC_SIZE << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ m_log_pool_actual_size = D_RO(pool_root)->pool_size;
+ m_flushed_sync_gen = D_RO(pool_root)->flushed_sync_gen;
+ m_total_log_entries = D_RO(pool_root)->num_log_entries;
+ m_first_free_entry = D_RO(pool_root)->first_free_entry;
+ m_first_valid_entry = D_RO(pool_root)->first_valid_entry;
+ if (m_first_free_entry < m_first_valid_entry) {
+ /* Valid entries wrap around the end of the ring, so first_free is lower
+ * than first_valid. If first_valid was == first_free+1, the entry at
+ * first_free would be empty. The last entry is never used, so in
+ * that case there would be zero free log entries. */
+      m_free_log_entries = m_total_log_entries - (m_first_valid_entry - m_first_free_entry) - 1;
+ } else {
+ /* first_valid is <= first_free. If they are == we have zero valid log
+ * entries, and n-1 free log entries */
+      m_free_log_entries = m_total_log_entries - (m_first_free_entry - m_first_valid_entry) - 1;
+ }
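+    /* Worked example of the ring arithmetic above (hypothetical values):
+     * with m_total_log_entries = 10, m_first_valid_entry = 7 and
+     * m_first_free_entry = 2, the valid entries wrap around the end
+     * (7, 8, 9, 0, 1), so m_free_log_entries = 10 - (7 - 2) - 1 = 4:
+     * slots 2..6 minus the one entry that is always kept free. */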
+ size_t effective_pool_size = (size_t)(m_log_pool_config_size * USABLE_SIZE);
+ m_bytes_allocated_cap = effective_pool_size;
+ load_existing_entries(later);
+ m_cache_state->clean = m_dirty_log_entries.empty();
+ m_cache_state->empty = m_log_entries.empty();
+ }
+
+ ldout(cct,1) << "pool " << m_log_pool_name << " has " << m_total_log_entries
+ << " log entries, " << m_free_log_entries << " of which are free."
+ << " first_valid=" << m_first_valid_entry
+ << ", first_free=" << m_first_free_entry
+ << ", flushed_sync_gen=" << m_flushed_sync_gen
+ << ", m_current_sync_gen=" << m_current_sync_gen << dendl;
+ if (m_first_free_entry == m_first_valid_entry) {
+ ldout(cct,1) << "write log is empty" << dendl;
+ m_cache_state->empty = true;
+ }
+
+ /* Start the sync point following the last one seen in the
+ * log. Flush the last sync point created during the loading of the
+ * existing log entries. */
+ init_flush_new_sync_point(later);
+ ldout(cct,20) << "new sync point = [" << m_current_sync_point << "]" << dendl;
+
+ m_initialized = true;
+ // Start the thread
+ m_thread_pool.start();
+
+ m_periodic_stats_enabled = m_cache_state->log_periodic_stats;
+ /* Do these after we drop lock */
+ later.add(new LambdaContext([this](int r) {
+ if (m_periodic_stats_enabled) {
+ /* Log stats for the first time */
+ periodic_stats();
+ /* Arm periodic stats logging for the first time */
+ std::lock_guard timer_locker(*m_timer_lock);
+ arm_periodic_stats();
+ }
+ }));
+ m_image_ctx.op_work_queue->queue(on_finish, 0);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::update_image_cache_state(Context *on_finish) {
+ m_cache_state->write_image_cache_state(on_finish);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::init(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+ perf_start(m_image_ctx.id);
+
+ ceph_assert(!m_initialized);
+
+ Context *ctx = new LambdaContext(
+ [this, on_finish](int r) {
+ if (r >= 0) {
+ update_image_cache_state(on_finish);
+ } else {
+ on_finish->complete(r);
+ }
+ });
+
+ DeferredContexts later;
+ pwl_init(ctx, later);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::shut_down(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << dendl;
+
+ ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl;
+
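+  /* The LambdaContexts below are declared in reverse order of execution:
+   * internal_flush() at the bottom runs first, and each context then hands
+   * off to the one declared above it, so the effective sequence is
+   * internal_flush -> requeue on m_work_queue -> wait for in-flight ops ->
+   * flush_dirty_entries -> close/remove the pmem pool and update the image
+   * cache state -> complete on_finish. */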
+ Context *ctx = new LambdaContext(
+ [this, on_finish](int r) {
+ ldout(m_image_ctx.cct, 6) << "shutdown complete" << dendl;
+ m_image_ctx.op_work_queue->queue(on_finish, r);
+ });
+ ctx = new LambdaContext(
+ [this, ctx](int r) {
+ Context *next_ctx = override_ctx(r, ctx);
+ bool periodic_stats_enabled = m_periodic_stats_enabled;
+ m_periodic_stats_enabled = false;
+
+ if (periodic_stats_enabled) {
+ /* Log stats one last time if they were enabled */
+ periodic_stats();
+ }
+ {
+ std::lock_guard locker(m_lock);
+ ceph_assert(m_dirty_log_entries.size() == 0);
+ m_wake_up_enabled = false;
+ m_cache_state->clean = true;
+ m_log_entries.clear();
+ if (m_log_pool) {
+ ldout(m_image_ctx.cct, 6) << "closing pmem pool" << dendl;
+ pmemobj_close(m_log_pool);
+ }
+ if (m_cache_state->clean) {
+ if (m_log_is_poolset) {
+ ldout(m_image_ctx.cct, 5) << "Not removing poolset " << m_log_pool_name << dendl;
+ } else {
+ ldout(m_image_ctx.cct, 5) << "Removing empty pool file: " << m_log_pool_name << dendl;
+ if (remove(m_log_pool_name.c_str()) != 0) {
+ lderr(m_image_ctx.cct) << "failed to remove empty pool \"" << m_log_pool_name << "\": "
+ << pmemobj_errormsg() << dendl;
+ } else {
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ m_cache_state->present = false;
+ }
+ }
+ } else {
+ if (m_log_is_poolset) {
+ ldout(m_image_ctx.cct, 5) << "Not removing poolset " << m_log_pool_name << dendl;
+ } else {
+ ldout(m_image_ctx.cct, 5) << "Not removing pool file: " << m_log_pool_name << dendl;
+ }
+ }
+ if (m_perfcounter) {
+ perf_stop();
+ }
+ }
+ update_image_cache_state(next_ctx);
+ });
+ ctx = new LambdaContext(
+ [this, ctx](int r) {
+ Context *next_ctx = override_ctx(r, ctx);
+ {
+ /* Sync with process_writeback_dirty_entries() */
+ RWLock::WLocker entry_reader_wlocker(m_entry_reader_lock);
+ m_shutting_down = true;
+ /* Flush all writes to OSDs (unless disabled) and wait for all
+ in-progress flush writes to complete */
+ ldout(m_image_ctx.cct, 6) << "flushing" << dendl;
+ if (m_periodic_stats_enabled) {
+ periodic_stats();
+ }
+ }
+ flush_dirty_entries(next_ctx);
+ });
+ ctx = new LambdaContext(
+ [this, ctx](int r) {
+ Context *next_ctx = override_ctx(r, ctx);
+ ldout(m_image_ctx.cct, 6) << "waiting for in flight operations" << dendl;
+ // Wait for in progress IOs to complete
+ next_ctx = util::create_async_context_callback(m_image_ctx, next_ctx);
+ m_async_op_tracker.wait_for_ops(next_ctx);
+ });
+ ctx = new LambdaContext(
+ [this, ctx](int r) {
+ ldout(m_image_ctx.cct, 6) << "Done internal_flush in shutdown" << dendl;
+ m_work_queue.queue(ctx, r);
+ });
+ /* Complete all in-flight writes before shutting down */
+ ldout(m_image_ctx.cct, 6) << "internal_flush in shutdown" << dendl;
+ internal_flush(false, ctx);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::read(Extents&& image_extents,
+ ceph::bufferlist* bl,
+ int fadvise_flags, Context *on_finish) {
+ // TODO: handle writesame and discard case in later PRs
+ CephContext *cct = m_image_ctx.cct;
+ utime_t now = ceph_clock_now();
+ C_ReadRequest *read_ctx = new C_ReadRequest(cct, now, m_perfcounter, bl, on_finish);
+ ldout(cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id
+ << "image_extents=" << image_extents << ", "
+ << "bl=" << bl << ", "
+ << "on_finish=" << on_finish << dendl;
+
+ ceph_assert(m_initialized);
+ bl->clear();
+ m_perfcounter->inc(l_librbd_pwl_rd_req, 1);
+
+ /*
+ * The strategy here is to look up all the WriteLogMapEntries that overlap
+ * this read, and iterate through those to separate this read into hits and
+ * misses. A new Extents object is produced here with Extents for each miss
+ * region. The miss Extents is then passed on to the read cache below RWL. We
+ * also produce an ImageExtentBufs for all the extents (hit or miss) in this
+ * read. When the read from the lower cache layer completes, we iterate
+ * through the ImageExtentBufs and insert buffers for each cache hit at the
+ * appropriate spot in the bufferlist returned from below for the miss
+ * read. The buffers we insert here refer directly to regions of various
+ * write log entry data buffers.
+ *
+ * Locking: These buffer objects hold a reference on the write log entries
+ * they refer to. Log entries can't be retired until there are no references.
+ * The GenericWriteLogEntry references are released by the buffer destructor.
+ */
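+  /* For example (hypothetical extents): a read of {0, 8192} with a single
+   * log entry covering image bytes [2048, 4096) produces three read_extents
+   * entries in image order -- miss [0, 2048), hit [2048, 4096) backed by
+   * the entry's buffer, miss [4096, 8192) -- and the two misses are also
+   * pushed onto miss_extents for the aio_read() below. */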
+ for (auto &extent : image_extents) {
+ uint64_t extent_offset = 0;
+ RWLock::RLocker entry_reader_locker(m_entry_reader_lock);
+ WriteLogMapEntries map_entries = m_blocks_to_log_entries.find_map_entries(block_extent(extent));
+ for (auto &map_entry : map_entries) {
+ Extent entry_image_extent(pwl::image_extent(map_entry.block_extent));
+ /* If this map entry starts after the current image extent offset ... */
+ if (entry_image_extent.first > extent.first + extent_offset) {
+ /* ... add range before map_entry to miss extents */
+ uint64_t miss_extent_start = extent.first + extent_offset;
+ uint64_t miss_extent_length = entry_image_extent.first - miss_extent_start;
+ Extent miss_extent(miss_extent_start, miss_extent_length);
+ read_ctx->miss_extents.push_back(miss_extent);
+ /* Add miss range to read extents */
+ ImageExtentBuf miss_extent_buf(miss_extent);
+ read_ctx->read_extents.push_back(miss_extent_buf);
+ extent_offset += miss_extent_length;
+ }
+ ceph_assert(entry_image_extent.first <= extent.first + extent_offset);
+ uint64_t entry_offset = 0;
+ /* If this map entry starts before the current image extent offset ... */
+ if (entry_image_extent.first < extent.first + extent_offset) {
+ /* ... compute offset into log entry for this read extent */
+ entry_offset = (extent.first + extent_offset) - entry_image_extent.first;
+ }
+ /* This read hit ends at the end of the extent or the end of the log
+ entry, whichever is less. */
+ uint64_t entry_hit_length = min(entry_image_extent.second - entry_offset,
+ extent.second - extent_offset);
+ Extent hit_extent(entry_image_extent.first, entry_hit_length);
+ if (0 == map_entry.log_entry->write_bytes() && 0 < map_entry.log_entry->bytes_dirty()) {
+ /* discard log entry */
+ auto discard_entry = map_entry.log_entry;
+ ldout(cct, 20) << "read hit on discard entry: log_entry=" << *discard_entry << dendl;
+ /* Discards read as zero, so we'll construct a bufferlist of zeros */
+ bufferlist zero_bl;
+ zero_bl.append_zero(entry_hit_length);
+ /* Add hit extent to read extents */
+ ImageExtentBuf hit_extent_buf(hit_extent, zero_bl);
+ read_ctx->read_extents.push_back(hit_extent_buf);
+ } else {
+ /* write and writesame log entry */
+ /* Offset of the map entry into the log entry's buffer */
+ uint64_t map_entry_buffer_offset = entry_image_extent.first - map_entry.log_entry->ram_entry.image_offset_bytes;
+ /* Offset into the log entry buffer of this read hit */
+ uint64_t read_buffer_offset = map_entry_buffer_offset + entry_offset;
+ /* Create buffer object referring to pmem pool for this read hit */
+ auto write_entry = map_entry.log_entry;
+
+      /* Make a bl for this hit extent. The write entry's pmem buffer is
+       * copied out below (via copy_pmem_bl), so hit_bl owns its data rather
+       * than referencing pmem directly */
+ buffer::list hit_bl;
+
+ buffer::list entry_bl_copy;
+ write_entry->copy_pmem_bl(&entry_bl_copy);
+ entry_bl_copy.begin(read_buffer_offset).copy(entry_hit_length, hit_bl);
+
+ ceph_assert(hit_bl.length() == entry_hit_length);
+
+ /* Add hit extent to read extents */
+ ImageExtentBuf hit_extent_buf(hit_extent, hit_bl);
+ read_ctx->read_extents.push_back(hit_extent_buf);
+ }
+ /* Exclude RWL hit range from buffer and extent */
+ extent_offset += entry_hit_length;
+ ldout(cct, 20) << map_entry << dendl;
+ }
+ /* If the last map entry didn't consume the entire image extent ... */
+ if (extent.second > extent_offset) {
+ /* ... add the rest of this extent to miss extents */
+ uint64_t miss_extent_start = extent.first + extent_offset;
+ uint64_t miss_extent_length = extent.second - extent_offset;
+ Extent miss_extent(miss_extent_start, miss_extent_length);
+ read_ctx->miss_extents.push_back(miss_extent);
+ /* Add miss range to read extents */
+ ImageExtentBuf miss_extent_buf(miss_extent);
+ read_ctx->read_extents.push_back(miss_extent_buf);
+ extent_offset += miss_extent_length;
+ }
+ }
+
+ ldout(cct, 20) << "miss_extents=" << read_ctx->miss_extents << ", "
+ << "miss_bl=" << read_ctx->miss_bl << dendl;
+
+ if (read_ctx->miss_extents.empty()) {
+ /* All of this read comes from RWL */
+ read_ctx->complete(0);
+ } else {
+ /* Pass the read misses on to the layer below RWL */
+ m_image_writeback.aio_read(std::move(read_ctx->miss_extents), &read_ctx->miss_bl, fadvise_flags, read_ctx);
+ }
+}
+
+template <typename I>
+void AbstractWriteLog<I>::write(Extents &&image_extents,
+ bufferlist&& bl,
+ int fadvise_flags,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+
+ ldout(cct, 20) << "aio_write" << dendl;
+
+ utime_t now = ceph_clock_now();
+ m_perfcounter->inc(l_librbd_pwl_wr_req, 1);
+
+ ceph_assert(m_initialized);
+
+ auto *write_req =
+ new C_WriteRequestT(*this, now, std::move(image_extents), std::move(bl), fadvise_flags,
+ m_lock, m_perfcounter, on_finish);
+ m_perfcounter->inc(l_librbd_pwl_wr_bytes, write_req->image_extents_summary.total_bytes);
+
+ /* The lambda below will be called when the block guard for all
+ * blocks affected by this write is obtained */
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext([this, write_req](GuardedRequestFunctionContext &guard_ctx) {
+ write_req->blockguard_acquired(guard_ctx);
+ alloc_and_dispatch_io_req(write_req);
+ });
+
+ detain_guarded_request(write_req, guarded_ctx, false);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::discard(uint64_t offset, uint64_t length,
+ uint32_t discard_granularity_bytes,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+
+ ldout(cct, 20) << dendl;
+
+ utime_t now = ceph_clock_now();
+ m_perfcounter->inc(l_librbd_pwl_discard, 1);
+ Extents discard_extents = {{offset, length}};
+ m_discard_granularity_bytes = discard_granularity_bytes;
+
+ ceph_assert(m_initialized);
+
+ auto *discard_req =
+ new C_DiscardRequestT(*this, now, std::move(discard_extents), discard_granularity_bytes,
+ m_lock, m_perfcounter, on_finish);
+
+ /* The lambda below will be called when the block guard for all
+ * blocks affected by this write is obtained */
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext([this, discard_req](GuardedRequestFunctionContext &guard_ctx) {
+ discard_req->blockguard_acquired(guard_ctx);
+ alloc_and_dispatch_io_req(discard_req);
+ });
+
+ detain_guarded_request(discard_req, guarded_ctx, false);
+}
+
+/**
+ * Aio_flush completes when all previously completed writes are
+ * flushed to persistent cache. We make a best-effort attempt to also
+ * defer until all in-progress writes complete, but we may not know
+ * about all of the writes the application considers in-progress yet,
+ * due to uncertainty in the IO submission workq (multiple WQ threads
+ * may allow out-of-order submission).
+ *
+ * This flush operation will not wait for writes deferred for overlap
+ * in the block guard.
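+ *
+ * For example: if write A has already completed to its caller and write B
+ * is still in flight when an aio_flush arrives, the flush is guaranteed to
+ * cover A, covers B only on a best-effort basis, and does not wait for
+ * writes still detained for overlap in the block guard.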
+ */
+template <typename I>
+void AbstractWriteLog<I>::flush(io::FlushSource flush_source, Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "on_finish=" << on_finish << " flush_source=" << flush_source << dendl;
+
+ if (io::FLUSH_SOURCE_SHUTDOWN == flush_source || io::FLUSH_SOURCE_INTERNAL == flush_source) {
+ internal_flush(false, on_finish);
+ return;
+ }
+ m_perfcounter->inc(l_librbd_pwl_aio_flush, 1);
+
+ /* May be called even if initialization fails */
+ if (!m_initialized) {
+    ldout(cct, 5) << "never initialized" << dendl;
+ /* Deadlock if completed here */
+ m_image_ctx.op_work_queue->queue(on_finish, 0);
+ return;
+ }
+
+ {
+ std::shared_lock image_locker(m_image_ctx.image_lock);
+ if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only) {
+ on_finish->complete(-EROFS);
+ return;
+ }
+ }
+
+ auto flush_req = make_flush_req(on_finish);
+
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext([this, flush_req](GuardedRequestFunctionContext &guard_ctx) {
+ ldout(m_image_ctx.cct, 20) << "flush_req=" << flush_req << " cell=" << guard_ctx.cell << dendl;
+ ceph_assert(guard_ctx.cell);
+ flush_req->detained = guard_ctx.state.detained;
+ /* We don't call flush_req->set_cell(), because the block guard will be released here */
+ {
+ DeferredContexts post_unlock; /* Do these when the lock below is released */
+ std::lock_guard locker(m_lock);
+
+ if (!m_persist_on_flush && m_persist_on_write_until_flush) {
+ m_persist_on_flush = true;
+ ldout(m_image_ctx.cct, 5) << "now persisting on flush" << dendl;
+ }
+
+ /*
+ * Create a new sync point if there have been writes since the last
+ * one.
+ *
+ * We do not flush the caches below the RWL here.
+ */
+ flush_new_sync_point_if_needed(flush_req, post_unlock);
+ }
+
+ release_guarded_request(guard_ctx.cell);
+ });
+
+ detain_guarded_request(flush_req, guarded_ctx, true);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::writesame(uint64_t offset, uint64_t length,
+ bufferlist&& bl, int fadvise_flags,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+
+ ldout(cct, 20) << "aio_writesame" << dendl;
+
+ utime_t now = ceph_clock_now();
+ Extents ws_extents = {{offset, length}};
+ m_perfcounter->inc(l_librbd_pwl_ws, 1);
+ ceph_assert(m_initialized);
+
+ /* A write same request is also a write request. The key difference is the
+ * write same data buffer is shorter than the extent of the request. The full
+ * extent will be used in the block guard, and appear in
+ * m_blocks_to_log_entries_map. The data buffer allocated for the WS is only
+ * as long as the length of the bl here, which is the pattern that's repeated
+ * in the image for the entire length of this WS. Read hits and flushing of
+ * write sames are different than normal writes. */
+ auto *ws_req =
+ new C_WriteSameRequestT(*this, now, std::move(ws_extents), std::move(bl),
+ fadvise_flags, m_lock, m_perfcounter, on_finish);
+ m_perfcounter->inc(l_librbd_pwl_ws_bytes, ws_req->image_extents_summary.total_bytes);
+
+ /* The lambda below will be called when the block guard for all
+ * blocks affected by this write is obtained */
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext([this, ws_req](GuardedRequestFunctionContext &guard_ctx) {
+ ws_req->blockguard_acquired(guard_ctx);
+ alloc_and_dispatch_io_req(ws_req);
+ });
+
+ detain_guarded_request(ws_req, guarded_ctx, false);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::compare_and_write(Extents &&image_extents,
+ bufferlist&& cmp_bl,
+ bufferlist&& bl,
+ uint64_t *mismatch_offset,
+ int fadvise_flags,
+ Context *on_finish) {
+ ldout(m_image_ctx.cct, 20) << dendl;
+
+ utime_t now = ceph_clock_now();
+ m_perfcounter->inc(l_librbd_pwl_cmp, 1);
+ ceph_assert(m_initialized);
+
+ /* A compare and write request is also a write request. We only allocate
+ * resources and dispatch this write request if the compare phase
+ * succeeds. */
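+  /* In other words (a sketch of the behaviour implemented below): when
+   * cmp_bl matches the data read back through the cache, the write is
+   * dispatched and *mismatch_offset is set to 0; otherwise the user request
+   * completes with -EILSEQ and *mismatch_offset holds the index of the
+   * first byte where cmp_bl and the read data differ. */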
+ auto *cw_req =
+ new C_CompAndWriteRequestT(*this, now, std::move(image_extents), std::move(cmp_bl), std::move(bl),
+ mismatch_offset, fadvise_flags, m_lock, m_perfcounter, on_finish);
+ m_perfcounter->inc(l_librbd_pwl_cmp_bytes, cw_req->image_extents_summary.total_bytes);
+
+ /* The lambda below will be called when the block guard for all
+ * blocks affected by this write is obtained */
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext([this, cw_req](GuardedRequestFunctionContext &guard_ctx) {
+ cw_req->blockguard_acquired(guard_ctx);
+
+ auto read_complete_ctx = new LambdaContext(
+ [this, cw_req](int r) {
+          ldout(m_image_ctx.cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id
+                                     << ", cw_req=" << cw_req << dendl;
+
+ /* Compare read_bl to cmp_bl to determine if this will produce a write */
+ buffer::list aligned_read_bl;
+ if (cw_req->cmp_bl.length() < cw_req->read_bl.length()) {
+ aligned_read_bl.substr_of(cw_req->read_bl, 0, cw_req->cmp_bl.length());
+ }
+ if (cw_req->cmp_bl.contents_equal(cw_req->read_bl) ||
+ cw_req->cmp_bl.contents_equal(aligned_read_bl)) {
+ /* Compare phase succeeds. Begin write */
+ ldout(m_image_ctx.cct, 5) << " cw_req=" << cw_req << " compare matched" << dendl;
+ cw_req->compare_succeeded = true;
+ *cw_req->mismatch_offset = 0;
+ /* Continue with this request as a write. Blockguard release and
+ * user request completion handled as if this were a plain
+ * write. */
+ alloc_and_dispatch_io_req(cw_req);
+ } else {
+ /* Compare phase fails. Comp-and write ends now. */
+ ldout(m_image_ctx.cct, 15) << " cw_req=" << cw_req << " compare failed" << dendl;
+ /* Bufferlist doesn't tell us where they differed, so we'll have to determine that here */
+ uint64_t bl_index = 0;
+ for (bl_index = 0; bl_index < cw_req->cmp_bl.length(); bl_index++) {
+ if (cw_req->cmp_bl[bl_index] != cw_req->read_bl[bl_index]) {
+ ldout(m_image_ctx.cct, 15) << " cw_req=" << cw_req << " mismatch at " << bl_index << dendl;
+ break;
+ }
+ }
+ cw_req->compare_succeeded = false;
+ *cw_req->mismatch_offset = bl_index;
+ cw_req->complete_user_request(-EILSEQ);
+ cw_req->release_cell();
+ cw_req->complete(0);
+ }
+ });
+
+ /* Read phase of comp-and-write must read through RWL */
+ Extents image_extents_copy = cw_req->image_extents;
+ read(std::move(image_extents_copy), &cw_req->read_bl, cw_req->fadvise_flags, read_complete_ctx);
+ });
+
+ detain_guarded_request(cw_req, guarded_ctx, false);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::flush(Context *on_finish) {
+ internal_flush(false, on_finish);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::invalidate(Context *on_finish) {
+ internal_flush(true, on_finish);
+}
+
+template <typename I>
+CephContext *AbstractWriteLog<I>::get_context() {
+ return m_image_ctx.cct;
+}
+
+template <typename I>
+BlockGuardCell* AbstractWriteLog<I>::detain_guarded_request_helper(GuardedRequest &req)
+{
+ CephContext *cct = m_image_ctx.cct;
+ BlockGuardCell *cell;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_blockguard_lock));
+ ldout(cct, 20) << dendl;
+
+ int r = m_write_log_guard.detain(req.block_extent, &req, &cell);
+ ceph_assert(r>=0);
+ if (r > 0) {
+ ldout(cct, 20) << "detaining guarded request due to in-flight requests: "
+ << "req=" << req << dendl;
+ return nullptr;
+ }
+
+ ldout(cct, 20) << "in-flight request cell: " << cell << dendl;
+ return cell;
+}
+
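+/* Barrier sketch: while a barrier request (e.g. the flush path) holds the
+ * guard, every later request -- barrier or not -- is parked on
+ * m_awaiting_barrier rather than entering the block guard; when the barrier
+ * cell is released, release_guarded_request() drains that queue back
+ * through this helper until it is empty or another barrier begins. So for
+ * the sequence w1, flush, w2, the write w2 waits for the flush's cell even
+ * if it does not overlap w1. */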
+template <typename I>
+BlockGuardCell* AbstractWriteLog<I>::detain_guarded_request_barrier_helper(
+ GuardedRequest &req)
+{
+ BlockGuardCell *cell = nullptr;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_blockguard_lock));
+ ldout(m_image_ctx.cct, 20) << dendl;
+
+ if (m_barrier_in_progress) {
+ req.guard_ctx->state.queued = true;
+ m_awaiting_barrier.push_back(req);
+ } else {
+ bool barrier = req.guard_ctx->state.barrier;
+ if (barrier) {
+ m_barrier_in_progress = true;
+ req.guard_ctx->state.current_barrier = true;
+ }
+ cell = detain_guarded_request_helper(req);
+ if (barrier) {
+ /* Only non-null if the barrier acquires the guard now */
+ m_barrier_cell = cell;
+ }
+ }
+
+ return cell;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::detain_guarded_request(
+ C_BlockIORequestT *request,
+ GuardedRequestFunctionContext *guarded_ctx,
+ bool is_barrier)
+{
+ BlockExtent extent;
+ if (request) {
+ extent = request->image_extents_summary.block_extent();
+ } else {
+ extent = block_extent(whole_volume_extent());
+ }
+ auto req = GuardedRequest(extent, guarded_ctx, is_barrier);
+ BlockGuardCell *cell = nullptr;
+
+ ldout(m_image_ctx.cct, 20) << dendl;
+ {
+ std::lock_guard locker(m_blockguard_lock);
+ cell = detain_guarded_request_barrier_helper(req);
+ }
+ if (cell) {
+ req.guard_ctx->cell = cell;
+ req.guard_ctx->complete(0);
+ }
+}
+
+template <typename I>
+void AbstractWriteLog<I>::release_guarded_request(BlockGuardCell *released_cell)
+{
+ CephContext *cct = m_image_ctx.cct;
+ WriteLogGuard::BlockOperations block_reqs;
+ ldout(cct, 20) << "released_cell=" << released_cell << dendl;
+
+ {
+ std::lock_guard locker(m_blockguard_lock);
+ m_write_log_guard.release(released_cell, &block_reqs);
+
+ for (auto &req : block_reqs) {
+ req.guard_ctx->state.detained = true;
+ BlockGuardCell *detained_cell = detain_guarded_request_helper(req);
+ if (detained_cell) {
+ if (req.guard_ctx->state.current_barrier) {
+ /* The current barrier is acquiring the block guard, so now we know its cell */
+ m_barrier_cell = detained_cell;
+ /* detained_cell could be == released_cell here */
+ ldout(cct, 20) << "current barrier cell=" << detained_cell << " req=" << req << dendl;
+ }
+ req.guard_ctx->cell = detained_cell;
+ m_work_queue.queue(req.guard_ctx);
+ }
+ }
+
+ if (m_barrier_in_progress && (released_cell == m_barrier_cell)) {
+ ldout(cct, 20) << "current barrier released cell=" << released_cell << dendl;
+ /* The released cell is the current barrier request */
+ m_barrier_in_progress = false;
+ m_barrier_cell = nullptr;
+ /* Move waiting requests into the blockguard. Stop if there's another barrier */
+ while (!m_barrier_in_progress && !m_awaiting_barrier.empty()) {
+ auto &req = m_awaiting_barrier.front();
+ ldout(cct, 20) << "submitting queued request to blockguard: " << req << dendl;
+ BlockGuardCell *detained_cell = detain_guarded_request_barrier_helper(req);
+ if (detained_cell) {
+ req.guard_ctx->cell = detained_cell;
+ m_work_queue.queue(req.guard_ctx);
+ }
+ m_awaiting_barrier.pop_front();
+ }
+ }
+ }
+
+ ldout(cct, 20) << "exit" << dendl;
+}
+
+/*
+ * Performs the log event append operation for all of the scheduled
+ * events.
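+ *
+ * Batching sketch: ops are drained from m_ops_to_append in chunks of at
+ * most OPS_APPENDED_TOGETHER (== MAX_ALLOC_PER_TRANSACTION). If, say, 70
+ * ops were queued and the limit were 64, the first pass would append 64
+ * and a second pass the remaining 6, re-taking m_lock between passes;
+ * any concurrent caller returns early once it sees m_appending set.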
+ */
+template <typename I>
+void AbstractWriteLog<I>::append_scheduled_ops(void)
+{
+ GenericLogOperations ops;
+ int append_result = 0;
+ bool ops_remain = false;
+ bool appending = false; /* true if we set m_appending */
+ ldout(m_image_ctx.cct, 20) << dendl;
+ do {
+ ops.clear();
+
+ {
+ std::lock_guard locker(m_lock);
+ if (!appending && m_appending) {
+ /* Another thread is appending */
+ ldout(m_image_ctx.cct, 15) << "Another thread is appending" << dendl;
+ return;
+ }
+ if (m_ops_to_append.size()) {
+ appending = true;
+ m_appending = true;
+ auto last_in_batch = m_ops_to_append.begin();
+ unsigned int ops_to_append = m_ops_to_append.size();
+ if (ops_to_append > OPS_APPENDED_TOGETHER) {
+ ops_to_append = OPS_APPENDED_TOGETHER;
+ }
+ std::advance(last_in_batch, ops_to_append);
+ ops.splice(ops.end(), m_ops_to_append, m_ops_to_append.begin(), last_in_batch);
+ ops_remain = true; /* Always check again before leaving */
+ ldout(m_image_ctx.cct, 20) << "appending " << ops.size() << ", "
+ << m_ops_to_append.size() << " remain" << dendl;
+ } else {
+ ops_remain = false;
+ if (appending) {
+ appending = false;
+ m_appending = false;
+ }
+ }
+ }
+
+ if (ops.size()) {
+ std::lock_guard locker(m_log_append_lock);
+ alloc_op_log_entries(ops);
+ append_result = append_op_log_entries(ops);
+ }
+
+ int num_ops = ops.size();
+ if (num_ops) {
+ /* New entries may be flushable. Completion will wake up flusher. */
+ complete_op_log_entries(std::move(ops), append_result);
+ }
+ } while (ops_remain);
+}
+
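+/*
+ * Schedule an append of everything on m_ops_to_append via the work
+ * queue. The async op tracker keeps shutdown from completing while an
+ * append is still in flight.
+ */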
+template <typename I>
+void AbstractWriteLog<I>::enlist_op_appender()
+{
+ m_async_append_ops++;
+ m_async_op_tracker.start_op();
+ Context *append_ctx = new LambdaContext([this](int r) {
+ append_scheduled_ops();
+ m_async_append_ops--;
+ m_async_op_tracker.finish_op();
+ });
+ m_work_queue.queue(append_ctx);
+}
+
+/*
+ * Takes custody of ops. They'll all get their log entries appended,
+ * and have their on_write_persist contexts completed once they and
+ * all prior log entries are persisted everywhere.
+ */
+template <typename I>
+void AbstractWriteLog<I>::schedule_append(GenericLogOperations &ops)
+{
+ bool need_finisher;
+ GenericLogOperationsVector appending;
+
+ std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending));
+ {
+ std::lock_guard locker(m_lock);
+
+ need_finisher = m_ops_to_append.empty() && !m_appending;
+ m_ops_to_append.splice(m_ops_to_append.end(), ops);
+ }
+
+ if (need_finisher) {
+ enlist_op_appender();
+ }
+
+ for (auto &op : appending) {
+ op->appending();
+ }
+}
+
+template <typename I>
+void AbstractWriteLog<I>::schedule_append(GenericLogOperationsVector &ops)
+{
+ GenericLogOperations to_append(ops.begin(), ops.end());
+
+ schedule_append(to_append);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::schedule_append(GenericLogOperationSharedPtr op)
+{
+ GenericLogOperations to_append { op };
+
+ schedule_append(to_append);
+}
+
+const unsigned long int ops_flushed_together = 4;
+/*
+ * Performs the pmem buffer flush on all scheduled ops, then schedules
+ * the log event append operation for all of them.
+ */
+template <typename I>
+void AbstractWriteLog<I>::flush_then_append_scheduled_ops(void)
+{
+ GenericLogOperations ops;
+ bool ops_remain = false;
+ ldout(m_image_ctx.cct, 20) << dendl;
+ do {
+ {
+ ops.clear();
+ std::lock_guard locker(m_lock);
+ if (m_ops_to_flush.size()) {
+ auto last_in_batch = m_ops_to_flush.begin();
+ unsigned int ops_to_flush = m_ops_to_flush.size();
+ if (ops_to_flush > ops_flushed_together) {
+ ops_to_flush = ops_flushed_together;
+ }
+ ldout(m_image_ctx.cct, 20) << "should flush " << ops_to_flush << dendl;
+ std::advance(last_in_batch, ops_to_flush);
+ ops.splice(ops.end(), m_ops_to_flush, m_ops_to_flush.begin(), last_in_batch);
+ ops_remain = !m_ops_to_flush.empty();
+ ldout(m_image_ctx.cct, 20) << "flushing " << ops.size() << ", "
+ << m_ops_to_flush.size() << " remain" << dendl;
+ } else {
+ ops_remain = false;
+ }
+ }
+ if (ops_remain) {
+ enlist_op_flusher();
+ }
+
+ /* Ops subsequently scheduled for flush may finish before these,
+ * which is fine. We're unconcerned with completion order until we
+ * get to the log message append step. */
+ if (ops.size()) {
+ flush_pmem_buffer(ops);
+ schedule_append(ops);
+ }
+ } while (ops_remain);
+ append_scheduled_ops();
+}
+
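+/*
+ * Schedule a flush-then-append pass over m_ops_to_flush via the work
+ * queue, tracked the same way as the appender.
+ */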
+template <typename I>
+void AbstractWriteLog<I>::enlist_op_flusher()
+{
+ m_async_flush_ops++;
+ m_async_op_tracker.start_op();
+ Context *flush_ctx = new LambdaContext([this](int r) {
+ flush_then_append_scheduled_ops();
+ m_async_flush_ops--;
+ m_async_op_tracker.finish_op();
+ });
+ m_work_queue.queue(flush_ctx);
+}
+
+/*
+ * Takes custody of ops. They'll all get their pmem blocks flushed,
+ * then get their log entries appended.
+ */
+template <typename I>
+void AbstractWriteLog<I>::schedule_flush_and_append(GenericLogOperationsVector &ops)
+{
+ GenericLogOperations to_flush(ops.begin(), ops.end());
+ bool need_finisher;
+ ldout(m_image_ctx.cct, 20) << dendl;
+ {
+ std::lock_guard locker(m_lock);
+
+ need_finisher = m_ops_to_flush.empty();
+ m_ops_to_flush.splice(m_ops_to_flush.end(), to_flush);
+ }
+
+ if (need_finisher) {
+ enlist_op_flusher();
+ }
+}
+
+/*
+ * Flush the pmem regions for the data blocks of a set of operations
+ *
+ * V is expected to be GenericLogOperations or GenericLogOperationsVector
+ */
+template <typename I>
+template <typename V>
+void AbstractWriteLog<I>::flush_pmem_buffer(V& ops)
+{
+ for (auto &operation : ops) {
+ operation->flush_pmem_buf_to_cache(m_log_pool);
+ }
+
+ /* Drain once for all */
+ pmemobj_drain(m_log_pool);
+
+ utime_t now = ceph_clock_now();
+ for (auto &operation : ops) {
+ if (operation->reserved_allocated()) {
+ operation->buf_persist_comp_time = now;
+ } else {
+ ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
+ }
+ }
+}
+
+/*
+ * Allocate the (already reserved) write log entries for a set of operations.
+ *
+ * Locking:
+ * Acquires lock
+ */
+template <typename I>
+void AbstractWriteLog<I>::alloc_op_log_entries(GenericLogOperations &ops)
+{
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ struct WriteLogPmemEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries);
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_log_append_lock));
+
+ /* Allocate the (already reserved) log entries */
+ std::lock_guard locker(m_lock);
+
+ for (auto &operation : ops) {
+ uint32_t entry_index = m_first_free_entry;
+ m_first_free_entry = (m_first_free_entry + 1) % m_total_log_entries;
+ auto &log_entry = operation->get_log_entry();
+ log_entry->log_entry_index = entry_index;
+ log_entry->ram_entry.entry_index = entry_index;
+ log_entry->pmem_entry = &pmem_log_entries[entry_index];
+ log_entry->ram_entry.entry_valid = 1;
+ m_log_entries.push_back(log_entry);
+ ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl;
+ }
+}
+
+/*
+ * Flush the persistent write log entries for a set of ops. The
+ * entries must be contiguous in persistent memory.
+ */
+template <typename I>
+void AbstractWriteLog<I>::flush_op_log_entries(GenericLogOperationsVector &ops)
+{
+ if (ops.empty()) {
+ return;
+ }
+
+ if (ops.size() > 1) {
+ ceph_assert(ops.front()->get_log_entry()->pmem_entry < ops.back()->get_log_entry()->pmem_entry);
+ }
+
+ ldout(m_image_ctx.cct, 20) << "entry count=" << ops.size() << " "
+ << "start address="
+ << ops.front()->get_log_entry()->pmem_entry << " "
+ << "bytes="
+ << ops.size() * sizeof(*(ops.front()->get_log_entry()->pmem_entry))
+ << dendl;
+ pmemobj_flush(m_log_pool,
+ ops.front()->get_log_entry()->pmem_entry,
+ ops.size() * sizeof(*(ops.front()->get_log_entry()->pmem_entry)));
+}
+
+/*
+ * Write and persist the (already allocated) write log entries and
+ * data buffer allocations for a set of ops. The data buffer for each
+ * of these must already have been persisted to its reserved area.
+ */
+template <typename I>
+int AbstractWriteLog<I>::append_op_log_entries(GenericLogOperations &ops)
+{
+ CephContext *cct = m_image_ctx.cct;
+ GenericLogOperationsVector entries_to_flush;
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ int ret = 0;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_log_append_lock));
+
+ if (ops.empty()) {
+ return 0;
+ }
+ entries_to_flush.reserve(OPS_APPENDED_TOGETHER);
+
+ /* Write log entries to ring and persist */
+ utime_t now = ceph_clock_now();
+ for (auto &operation : ops) {
+ if (!entries_to_flush.empty()) {
+ /* Flush these and reset the list if the current entry wraps to the
+ * tail of the ring */
+ if (entries_to_flush.back()->get_log_entry()->log_entry_index >
+ operation->get_log_entry()->log_entry_index) {
+ ldout(m_image_ctx.cct, 20) << "entries to flush wrap around the end of the ring at "
+ << "operation=[" << *operation << "]" << dendl;
+ flush_op_log_entries(entries_to_flush);
+ entries_to_flush.clear();
+ now = ceph_clock_now();
+ }
+ }
+ ldout(m_image_ctx.cct, 20) << "Copying entry for operation at index="
+ << operation->get_log_entry()->log_entry_index << " "
+ << "from " << &operation->get_log_entry()->ram_entry << " "
+ << "to " << operation->get_log_entry()->pmem_entry << " "
+ << "operation=[" << *operation << "]" << dendl;
+ ldout(m_image_ctx.cct, 5) << "APPENDING: index="
+ << operation->get_log_entry()->log_entry_index << " "
+ << "operation=[" << *operation << "]" << dendl;
+ operation->log_append_time = now;
+ *operation->get_log_entry()->pmem_entry = operation->get_log_entry()->ram_entry;
+ ldout(m_image_ctx.cct, 20) << "APPENDING: index="
+ << operation->get_log_entry()->log_entry_index << " "
+ << "pmem_entry=[" << *operation->get_log_entry()->pmem_entry
+ << "]" << dendl;
+ entries_to_flush.push_back(operation);
+ }
+ flush_op_log_entries(entries_to_flush);
+
+ /* Drain once for all */
+ pmemobj_drain(m_log_pool);
+
+ /*
+ * Atomically advance the log head pointer and publish the
+ * allocations for all the data buffers they refer to.
+ */
+ utime_t tx_start = ceph_clock_now();
+ TX_BEGIN(m_log_pool) {
+ D_RW(pool_root)->first_free_entry = m_first_free_entry;
+ for (auto &operation : ops) {
+ if (operation->reserved_allocated()) {
+ auto write_op = (std::shared_ptr<WriteLogOperation>&) operation;
+ pmemobj_tx_publish(&write_op->buffer_alloc->buffer_alloc_action, 1);
+ } else {
+ ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
+ }
+ }
+ } TX_ONCOMMIT {
+ } TX_ONABORT {
+ lderr(cct) << "failed to commit " << ops.size()
+ << " log entries (" << m_log_pool_name << ")" << dendl;
+ ceph_assert(false);
+ ret = -EIO;
+ } TX_FINALLY {
+ } TX_END;
+
+ utime_t tx_end = ceph_clock_now();
+ m_perfcounter->tinc(l_librbd_pwl_append_tx_t, tx_end - tx_start);
+ m_perfcounter->hinc(
+ l_librbd_pwl_append_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), ops.size());
+ for (auto &operation : ops) {
+ operation->log_append_comp_time = tx_end;
+ }
+
+ return ret;
+}
+
+/*
+ * Complete a set of write ops with the result of append_op_log_entries().
+ */
+template <typename I>
+void AbstractWriteLog<I>::complete_op_log_entries(GenericLogOperations &&ops,
+ const int result)
+{
+ GenericLogEntries dirty_entries;
+ int published_reserves = 0;
+ ldout(m_image_ctx.cct, 20) << "completing" << dendl;
+ for (auto &op : ops) {
+ utime_t now = ceph_clock_now();
+ auto log_entry = op->get_log_entry();
+ log_entry->completed = true;
+ if (op->is_writing_op()) {
+ op->mark_log_entry_completed();
+ dirty_entries.push_back(log_entry);
+ }
+ if (op->reserved_allocated()) {
+ published_reserves++;
+ }
+ op->complete(result);
+ m_perfcounter->tinc(l_librbd_pwl_log_op_dis_to_app_t,
+ op->log_append_time - op->dispatch_time);
+ m_perfcounter->tinc(l_librbd_pwl_log_op_dis_to_cmp_t, now - op->dispatch_time);
+ m_perfcounter->hinc(l_librbd_pwl_log_op_dis_to_cmp_t_hist,
+ utime_t(now - op->dispatch_time).to_nsec(),
+ log_entry->ram_entry.write_bytes);
+ utime_t app_lat = op->log_append_comp_time - op->log_append_time;
+ m_perfcounter->tinc(l_librbd_pwl_log_op_app_to_appc_t, app_lat);
+ m_perfcounter->hinc(l_librbd_pwl_log_op_app_to_appc_t_hist, app_lat.to_nsec(),
+ log_entry->ram_entry.write_bytes);
+ m_perfcounter->tinc(l_librbd_pwl_log_op_app_to_cmp_t, now - op->log_append_time);
+ }
+
+ {
+ std::lock_guard locker(m_lock);
+ m_unpublished_reserves -= published_reserves;
+ m_dirty_log_entries.splice(m_dirty_log_entries.end(), dirty_entries);
+
+ /* New entries may be flushable */
+ wake_up();
+ }
+}
+
+/**
+ * Dispatch as many deferred writes as possible
+ */
+template <typename I>
+void AbstractWriteLog<I>::dispatch_deferred_writes(void)
+{
+ C_BlockIORequestT *front_req = nullptr; /* req still on front of deferred list */
+ C_BlockIORequestT *allocated_req = nullptr; /* req that was allocated, and is now off the list */
+ bool allocated = false; /* front_req allocate succeeded */
+ bool cleared_dispatching_flag = false;
+
+ /* If we can't become the dispatcher, we'll exit */
+ {
+ std::lock_guard locker(m_lock);
+ if (m_dispatching_deferred_ops ||
+ !m_deferred_ios.size()) {
+ return;
+ }
+ m_dispatching_deferred_ops = true;
+ }
+
+ /* There are ops to dispatch, and this should be the only thread dispatching them */
+ {
+ std::lock_guard deferred_dispatch(m_deferred_dispatch_lock);
+ do {
+ {
+ std::lock_guard locker(m_lock);
+ ceph_assert(m_dispatching_deferred_ops);
+ if (allocated) {
+ /* On every pass after the first, if front_req->alloc_resources()
+ * succeeded we pop it off the deferred ops list here. */
+ ceph_assert(front_req);
+ ceph_assert(!allocated_req);
+ m_deferred_ios.pop_front();
+ allocated_req = front_req;
+ front_req = nullptr;
+ allocated = false;
+ }
+ ceph_assert(!allocated);
+ if (!allocated && front_req) {
+ /* front_req->alloc_resources() failed on the last iteration. We'll stop dispatching. */
+ front_req = nullptr;
+ ceph_assert(!cleared_dispatching_flag);
+ m_dispatching_deferred_ops = false;
+ cleared_dispatching_flag = true;
+ } else {
+ ceph_assert(!front_req);
+ if (m_deferred_ios.size()) {
+ /* New allocation candidate */
+ front_req = m_deferred_ios.front();
+ } else {
+ ceph_assert(!cleared_dispatching_flag);
+ m_dispatching_deferred_ops = false;
+ cleared_dispatching_flag = true;
+ }
+ }
+ }
+ /* Try allocating for front_req before we decide what to do with allocated_req
+ * (if any) */
+ if (front_req) {
+ ceph_assert(!cleared_dispatching_flag);
+ allocated = front_req->alloc_resources();
+ }
+ if (allocated_req && front_req && allocated) {
+ /* Push dispatch of the first allocated req to a wq */
+ m_work_queue.queue(new LambdaContext(
+ [this, allocated_req](int r) {
+ allocated_req->dispatch();
+ }), 0);
+ allocated_req = nullptr;
+ }
+ ceph_assert(!(allocated_req && front_req && allocated));
+
+ /* Continue while we're still considering the front of the deferred ops list */
+ } while (front_req);
+ ceph_assert(!allocated);
+ }
+ ceph_assert(cleared_dispatching_flag);
+
+ /* If any deferred requests were allocated, the last one will still be in allocated_req */
+ if (allocated_req) {
+ allocated_req->dispatch();
+ }
+}
+
+/**
+ * Releases the lanes used by this write back to the free pool, then
+ * attempts to dispatch the next deferred write.
+ */
+template <typename I>
+void AbstractWriteLog<I>::release_write_lanes(C_BlockIORequestT *req)
+{
+ {
+ std::lock_guard locker(m_lock);
+ m_free_lanes += req->image_extents.size();
+ }
+ dispatch_deferred_writes();
+}
+
+/**
+ * Attempts to allocate log resources for a write. Write is dispatched if
+ * resources are available, or queued if they aren't.
+ */
+template <typename I>
+void AbstractWriteLog<I>::alloc_and_dispatch_io_req(C_BlockIORequestT *req)
+{
+ bool dispatch_here = false;
+
+ {
+ /* If there are already deferred writes, queue behind them for resources */
+ {
+ std::lock_guard locker(m_lock);
+ dispatch_here = m_deferred_ios.empty();
+ }
+ if (dispatch_here) {
+ dispatch_here = req->alloc_resources();
+ }
+ if (dispatch_here) {
+ ldout(m_image_ctx.cct, 20) << "dispatching" << dendl;
+ req->dispatch();
+ } else {
+ req->deferred();
+ {
+ std::lock_guard locker(m_lock);
+ m_deferred_ios.push_back(req);
+ }
+ ldout(m_image_ctx.cct, 20) << "deferred IOs: " << m_deferred_ios.size() << dendl;
+ dispatch_deferred_writes();
+ }
+ }
+}
+
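+/*
+ * Attempt to reserve everything a request needs before dispatch:
+ * write lanes (a throttling mechanism), free log entries, headroom
+ * under the allocation cap, and pmem data buffers (reserved with
+ * pmemobj_reserve() and published later at log append). Any shortfall
+ * fails the whole allocation and cancels any buffers already
+ * reserved.
+ */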
+template <typename I>
+bool AbstractWriteLog<I>::alloc_resources(C_BlockIORequestT *req) {
+ bool alloc_succeeds = true;
+ bool no_space = false;
+ uint64_t bytes_allocated = 0;
+ uint64_t bytes_cached = 0;
+ uint64_t bytes_dirtied = 0;
+ uint64_t num_lanes = 0;
+ uint64_t num_unpublished_reserves = 0;
+ uint64_t num_log_entries = 0;
+
+ // Set up the buffer and count the resources this request will need
+ req->setup_buffer_resources(bytes_cached, bytes_dirtied, bytes_allocated,
+ num_lanes, num_log_entries, num_unpublished_reserves);
+
+ {
+ std::lock_guard locker(m_lock);
+ if (m_free_lanes < num_lanes) {
+ req->set_io_waited_for_lanes(true);
+ ldout(m_image_ctx.cct, 20) << "not enough free lanes (need "
+ << num_lanes
+ << ", have " << m_free_lanes << ") "
+ << *req << dendl;
+ alloc_succeeds = false;
+ /* This isn't considered a "no space" alloc fail. Lanes are a throttling mechanism. */
+ }
+ if (m_free_log_entries < num_log_entries) {
+ req->set_io_waited_for_entries(true);
+ ldout(m_image_ctx.cct, 20) << "not enough free entries (need "
+ << num_log_entries
+ << ", have " << m_free_log_entries << ") "
+ << *req << dendl;
+ alloc_succeeds = false;
+ no_space = true; /* Entries must be retired */
+ }
+ /* Don't attempt buffer allocate if we've exceeded the "full" threshold */
+ if (m_bytes_allocated + bytes_allocated > m_bytes_allocated_cap) {
+ if (!req->has_io_waited_for_buffers()) {
+ req->set_io_waited_for_buffers(true);
+ ldout(m_image_ctx.cct, 1) << "Waiting for allocation cap (cap="
+ << m_bytes_allocated_cap
+ << ", allocated=" << m_bytes_allocated
+ << ") in write [" << *req << "]" << dendl;
+ }
+ alloc_succeeds = false;
+ no_space = true; /* Entries must be retired */
+ }
+ }
+
+ std::vector<WriteBufferAllocation>& buffers = req->get_resources_buffers();
+ if (alloc_succeeds) {
+ for (auto &buffer : buffers) {
+ utime_t before_reserve = ceph_clock_now();
+ buffer.buffer_oid = pmemobj_reserve(m_log_pool,
+ &buffer.buffer_alloc_action,
+ buffer.allocation_size,
+ 0 /* Object type */);
+ buffer.allocation_lat = ceph_clock_now() - before_reserve;
+ if (TOID_IS_NULL(buffer.buffer_oid)) {
+ if (!req->has_io_waited_for_buffers()) {
+ req->set_io_waited_for_buffers(true);
+ }
+ ldout(m_image_ctx.cct, 5) << "can't allocate all data buffers: "
+ << pmemobj_errormsg() << ". "
+ << *req << dendl;
+ alloc_succeeds = false;
+ no_space = true; /* Entries need to be retired */
+ break;
+ } else {
+ buffer.allocated = true;
+ }
+ ldout(m_image_ctx.cct, 20) << "Allocated " << buffer.buffer_oid.oid.pool_uuid_lo
+ << "." << buffer.buffer_oid.oid.off
+ << ", size=" << buffer.allocation_size << dendl;
+ }
+ }
+
+ if (alloc_succeeds) {
+ std::lock_guard locker(m_lock);
+ /* We need one free log entry per extent (each is a separate entry), and
+ * one free "lane" for remote replication. */
+ if ((m_free_lanes >= num_lanes) &&
+ (m_free_log_entries >= num_log_entries)) {
+ m_free_lanes -= num_lanes;
+ m_free_log_entries -= num_log_entries;
+ m_unpublished_reserves += num_unpublished_reserves;
+ m_bytes_allocated += bytes_allocated;
+ m_bytes_cached += bytes_cached;
+ m_bytes_dirty += bytes_dirtied;
+ } else {
+ alloc_succeeds = false;
+ }
+ }
+
+ if (!alloc_succeeds) {
+ /* On alloc failure, free any buffers we did allocate */
+ for (auto &buffer : buffers) {
+ if (buffer.allocated) {
+ pmemobj_cancel(m_log_pool, &buffer.buffer_alloc_action, 1);
+ }
+ }
+ if (no_space) {
+ /* Expedite flushing and/or retiring */
+ std::lock_guard locker(m_lock);
+ m_alloc_failed_since_retire = true;
+ m_last_alloc_fail = ceph_clock_now();
+ }
+ }
+
+ req->set_allocated(alloc_succeeds);
+
+ return alloc_succeeds;
+}
+
+template <typename I>
+C_FlushRequest<AbstractWriteLog<I>>* AbstractWriteLog<I>::make_flush_req(Context *on_finish) {
+ utime_t flush_begins = ceph_clock_now();
+ bufferlist bl;
+ auto *flush_req =
+ new C_FlushRequestT(*this, flush_begins, Extents({whole_volume_extent()}),
+ std::move(bl), 0, m_lock, m_perfcounter, on_finish);
+
+ return flush_req;
+}
+
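+/*
+ * Request a deferred-work pass (flush, retire, dispatch). "Requested"
+ * and "scheduled" are tracked separately so that a wake-up arriving
+ * while process_work() runs triggers another iteration rather than a
+ * redundant work queue entry.
+ */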
+template <typename I>
+void AbstractWriteLog<I>::wake_up() {
+ CephContext *cct = m_image_ctx.cct;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ if (!m_wake_up_enabled) {
+ // wake_up is disabled during shutdown after flushing completes
+ ldout(m_image_ctx.cct, 6) << "deferred processing disabled" << dendl;
+ return;
+ }
+
+ if (m_wake_up_requested && m_wake_up_scheduled) {
+ return;
+ }
+
+ ldout(cct, 20) << dendl;
+
+ /* Wake-up can be requested while it's already scheduled */
+ m_wake_up_requested = true;
+
+ /* Wake-up cannot be scheduled if it's already scheduled */
+ if (m_wake_up_scheduled) {
+ return;
+ }
+ m_wake_up_scheduled = true;
+ m_async_process_work++;
+ m_async_op_tracker.start_op();
+ m_work_queue.queue(new LambdaContext(
+ [this](int r) {
+ process_work();
+ m_async_op_tracker.finish_op();
+ m_async_process_work--;
+ }), 0);
+}
+
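+/*
+ * The deferred-work loop. Retires log entries in large batches when
+ * past the high-water marks (or when invalidating, or after an
+ * allocation failure), then dispatches deferred writes and writes
+ * back dirty entries. Iterates up to four times if wake-ups keep
+ * arriving, then reschedules itself if one is still pending.
+ */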
+template <typename I>
+void AbstractWriteLog<I>::process_work() {
+ CephContext *cct = m_image_ctx.cct;
+ int max_iterations = 4;
+ bool wake_up_requested = false;
+ uint64_t aggressive_high_water_bytes = m_bytes_allocated_cap * AGGRESSIVE_RETIRE_HIGH_WATER;
+ uint64_t high_water_bytes = m_bytes_allocated_cap * RETIRE_HIGH_WATER;
+ uint64_t low_water_bytes = m_bytes_allocated_cap * RETIRE_LOW_WATER;
+ uint64_t aggressive_high_water_entries = m_total_log_entries * AGGRESSIVE_RETIRE_HIGH_WATER;
+ uint64_t high_water_entries = m_total_log_entries * RETIRE_HIGH_WATER;
+ uint64_t low_water_entries = m_total_log_entries * RETIRE_LOW_WATER;
+
+ ldout(cct, 20) << dendl;
+
+ do {
+ {
+ std::lock_guard locker(m_lock);
+ m_wake_up_requested = false;
+ }
+ if (m_alloc_failed_since_retire || m_invalidating ||
+ m_bytes_allocated > high_water_bytes ||
+ (m_log_entries.size() > high_water_entries)) {
+ int retired = 0;
+ utime_t started = ceph_clock_now();
+ ldout(m_image_ctx.cct, 10) << "alloc_fail=" << m_alloc_failed_since_retire
+ << ", allocated > high_water="
+ << (m_bytes_allocated > high_water_bytes)
+ << ", allocated_entries > high_water="
+ << (m_log_entries.size() > high_water_entries)
+ << dendl;
+ while (m_alloc_failed_since_retire || m_invalidating ||
+ (m_bytes_allocated > high_water_bytes) ||
+ (m_log_entries.size() > high_water_entries) ||
+ (((m_bytes_allocated > low_water_bytes) || (m_log_entries.size() > low_water_entries)) &&
+ (utime_t(ceph_clock_now() - started).to_msec() < RETIRE_BATCH_TIME_LIMIT_MS))) {
+ if (!retire_entries((m_shutting_down || m_invalidating ||
+ (m_bytes_allocated > aggressive_high_water_bytes) ||
+ (m_log_entries.size() > aggressive_high_water_entries))
+ ? MAX_ALLOC_PER_TRANSACTION
+ : MAX_FREE_PER_TRANSACTION)) {
+ break;
+ }
+ retired++;
+ dispatch_deferred_writes();
+ process_writeback_dirty_entries();
+ }
+ ldout(m_image_ctx.cct, 10) << "Retired " << retired << " times" << dendl;
+ }
+ dispatch_deferred_writes();
+ process_writeback_dirty_entries();
+
+ {
+ std::lock_guard locker(m_lock);
+ wake_up_requested = m_wake_up_requested;
+ }
+ } while (wake_up_requested && --max_iterations > 0);
+
+ {
+ std::lock_guard locker(m_lock);
+ m_wake_up_scheduled = false;
+ /* Reschedule if it's still requested */
+ if (m_wake_up_requested) {
+ wake_up();
+ }
+ }
+}
+
+template <typename I>
+bool AbstractWriteLog<I>::can_flush_entry(std::shared_ptr<GenericLogEntry> log_entry) {
+ CephContext *cct = m_image_ctx.cct;
+
+ ldout(cct, 20) << dendl;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ if (m_invalidating) {
+ return true;
+ }
+
+ /* For OWB we can flush entries with the same sync gen number (writes between
+ * aio_flush() calls) concurrently. Here we'll consider an entry flushable if
+ * its sync gen number is <= the lowest sync gen number carried by all the
+ * entries currently flushing.
+ *
+ * If the entry considered here bears a sync gen number lower than a
+ * previously flushed entry, the application had to have submitted the write
+ * bearing the higher gen number before the write with the lower gen number
+ * completed. So, flushing these concurrently is OK.
+ *
+ * If the entry considered here bears a sync gen number higher than a
+ * currently flushing entry, the write with the lower gen number may have
+ * completed to the application before the write with the higher sync gen
+ * number was submitted, and the application may rely on that completion
+ * order for volume consistency. In this case the entry will not be
+ * considered flushable until all the entries bearing lower sync gen numbers
+ * finish flushing.
+ */
+
+ if (m_flush_ops_in_flight &&
+ (log_entry->ram_entry.sync_gen_number > m_lowest_flushing_sync_gen)) {
+ return false;
+ }
+
+ return (log_entry->can_writeback() &&
+ (m_flush_ops_in_flight <= IN_FLIGHT_FLUSH_WRITE_LIMIT) &&
+ (m_flush_bytes_in_flight <= IN_FLIGHT_FLUSH_BYTES_LIMIT));
+}
+
+template <typename I>
+Context* AbstractWriteLog<I>::construct_flush_entry_ctx(std::shared_ptr<GenericLogEntry> log_entry) {
+ CephContext *cct = m_image_ctx.cct;
+ bool invalidating = m_invalidating; // snapshot so we behave consistently
+
+ ldout(cct, 20) << dendl;
+ ceph_assert(m_entry_reader_lock.is_locked());
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ if (!m_flush_ops_in_flight ||
+ (log_entry->ram_entry.sync_gen_number < m_lowest_flushing_sync_gen)) {
+ m_lowest_flushing_sync_gen = log_entry->ram_entry.sync_gen_number;
+ }
+ m_flush_ops_in_flight += 1;
+ /* For writesame this is the bytes affected by the flush op, not the bytes transferred */
+ m_flush_bytes_in_flight += log_entry->ram_entry.write_bytes;
+
+ /* Flush write completion action */
+ Context *ctx = new LambdaContext(
+ [this, log_entry, invalidating](int r) {
+ {
+ std::lock_guard locker(m_lock);
+ if (r < 0) {
+ lderr(m_image_ctx.cct) << "failed to flush log entry"
+ << cpp_strerror(r) << dendl;
+ m_dirty_log_entries.push_front(log_entry);
+ } else {
+ ceph_assert(m_bytes_dirty >= log_entry->bytes_dirty());
+ log_entry->set_flushed(true);
+ m_bytes_dirty -= log_entry->bytes_dirty();
+ sync_point_writer_flushed(log_entry->get_sync_point_entry());
+ ldout(m_image_ctx.cct, 20) << "flushed: " << log_entry
+ << " invalidating=" << invalidating
+ << dendl;
+ }
+ m_flush_ops_in_flight -= 1;
+ m_flush_bytes_in_flight -= log_entry->ram_entry.write_bytes;
+ wake_up();
+ }
+ });
+ /* Flush through lower cache before completing */
+ ctx = new LambdaContext(
+ [this, ctx](int r) {
+ if (r < 0) {
+ lderr(m_image_ctx.cct) << "failed to flush log entry"
+ << cpp_strerror(r) << dendl;
+ ctx->complete(r);
+ } else {
+ m_image_writeback.aio_flush(io::FLUSH_SOURCE_WRITEBACK, ctx);
+ }
+ });
+
+ if (invalidating) {
+ return ctx;
+ }
+ return new LambdaContext(
+ [this, log_entry, ctx](int r) {
+ m_image_ctx.op_work_queue->queue(new LambdaContext(
+ [this, log_entry, ctx](int r) {
+ ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
+ << " " << *log_entry << dendl;
+ log_entry->writeback(m_image_writeback, ctx);
+ }), 0);
+ });
+}
+
+template <typename I>
+void AbstractWriteLog<I>::process_writeback_dirty_entries() {
+ CephContext *cct = m_image_ctx.cct;
+ bool all_clean = false;
+ int flushed = 0;
+
+ ldout(cct, 20) << "Look for dirty entries" << dendl;
+ {
+ DeferredContexts post_unlock;
+ std::shared_lock entry_reader_locker(m_entry_reader_lock);
+ while (flushed < IN_FLIGHT_FLUSH_WRITE_LIMIT) {
+ std::lock_guard locker(m_lock);
+ if (m_shutting_down) {
+ ldout(cct, 5) << "Flush during shutdown suppressed" << dendl;
+ /* Do flush complete only when all flush ops are finished */
+ all_clean = !m_flush_ops_in_flight;
+ break;
+ }
+ if (m_dirty_log_entries.empty()) {
+ ldout(cct, 20) << "Nothing new to flush" << dendl;
+ /* Do flush complete only when all flush ops are finished */
+ all_clean = !m_flush_ops_in_flight;
+ break;
+ }
+ auto candidate = m_dirty_log_entries.front();
+ bool flushable = can_flush_entry(candidate);
+ if (flushable) {
+ post_unlock.add(construct_flush_entry_ctx(candidate));
+ flushed++;
+ m_dirty_log_entries.pop_front();
+ } else {
+ ldout(cct, 20) << "Next dirty entry isn't flushable yet" << dendl;
+ break;
+ }
+ }
+ }
+
+ if (all_clean) {
+ /* All flushing complete, drain outside lock */
+ Contexts flush_contexts;
+ {
+ std::lock_guard locker(m_lock);
+ flush_contexts.swap(m_flush_complete_contexts);
+ }
+ finish_contexts(m_image_ctx.cct, flush_contexts, 0);
+ }
+}
+
+/**
+ * Update/persist the last flushed sync point in the log
+ */
+template <typename I>
+void AbstractWriteLog<I>::persist_last_flushed_sync_gen()
+{
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ uint64_t flushed_sync_gen;
+
+ std::lock_guard append_locker(m_log_append_lock);
+ {
+ std::lock_guard locker(m_lock);
+ flushed_sync_gen = m_flushed_sync_gen;
+ }
+
+ if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
+ ldout(m_image_ctx.cct, 15) << "flushed_sync_gen in log updated from "
+ << D_RO(pool_root)->flushed_sync_gen << " to "
+ << flushed_sync_gen << dendl;
+ TX_BEGIN(m_log_pool) {
+ D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
+ } TX_ONCOMMIT {
+ } TX_ONABORT {
+ lderr(m_image_ctx.cct) << "failed to commit update of flushed sync point" << dendl;
+ ceph_assert(false);
+ } TX_FINALLY {
+ } TX_END;
+ }
+}
+
+/* Returns true if the specified SyncPointLogEntry is considered flushed, and
+ * the log will be updated to reflect this. */
+template <typename I>
+bool AbstractWriteLog<I>::handle_flushed_sync_point(std::shared_ptr<SyncPointLogEntry> log_entry)
+{
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ ceph_assert(log_entry);
+
+ if ((log_entry->writes_flushed == log_entry->writes) &&
+ log_entry->completed && log_entry->prior_sync_point_flushed &&
+ log_entry->next_sync_point_entry) {
+ ldout(m_image_ctx.cct, 20) << "All writes flushed up to sync point="
+ << *log_entry << dendl;
+ log_entry->next_sync_point_entry->prior_sync_point_flushed = true;
+ /* Don't move the flushed sync gen num backwards. */
+ if (m_flushed_sync_gen < log_entry->ram_entry.sync_gen_number) {
+ m_flushed_sync_gen = log_entry->ram_entry.sync_gen_number;
+ }
+ m_async_op_tracker.start_op();
+ m_work_queue.queue(new LambdaContext(
+ [this, log_entry](int r) {
+ bool handled_by_next;
+ {
+ std::lock_guard locker(m_lock);
+ handled_by_next = handle_flushed_sync_point(log_entry->next_sync_point_entry);
+ }
+ if (!handled_by_next) {
+ persist_last_flushed_sync_gen();
+ }
+ m_async_op_tracker.finish_op();
+ }));
+ return true;
+ }
+ return false;
+}
+
+template <typename I>
+void AbstractWriteLog<I>::sync_point_writer_flushed(std::shared_ptr<SyncPointLogEntry> log_entry)
+{
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ ceph_assert(log_entry);
+ log_entry->writes_flushed++;
+
+ /* If this entry might be completely flushed, look closer */
+ if ((log_entry->writes_flushed == log_entry->writes) && log_entry->completed) {
+ ldout(m_image_ctx.cct, 15) << "All writes flushed for sync point="
+ << *log_entry << dendl;
+ handle_flushed_sync_point(log_entry);
+ }
+}
+
+/* Make a new sync point and flush the previous one during initialization,
+ * when there may or may not be a previous sync point */
+template <typename I>
+void AbstractWriteLog<I>::init_flush_new_sync_point(DeferredContexts &later) {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ ceph_assert(!m_initialized); /* Don't use this after init */
+
+ if (!m_current_sync_point) {
+ /* First sync point since start */
+ new_sync_point(later);
+ } else {
+ flush_new_sync_point(nullptr, later);
+ }
+}
+
+/**
+ * Begin a new sync point
+ */
+template <typename I>
+void AbstractWriteLog<I>::new_sync_point(DeferredContexts &later) {
+ CephContext *cct = m_image_ctx.cct;
+ std::shared_ptr<SyncPoint> old_sync_point = m_current_sync_point;
+ std::shared_ptr<SyncPoint> new_sync_point;
+ ldout(cct, 20) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ /* The first time this is called, if this is a newly created log,
+ * this makes the first sync gen number we'll use 1. On the first
+ * call for a re-opened log m_current_sync_gen will be the highest
+ * gen number from all the sync point entries found in the re-opened
+ * log, and this advances to the next sync gen number. */
+ ++m_current_sync_gen;
+
+ new_sync_point = std::make_shared<SyncPoint>(m_current_sync_gen, cct);
+ m_current_sync_point = new_sync_point;
+
+ /* If this log has been re-opened, old_sync_point will initially be
+ * nullptr, but m_current_sync_gen may not be zero. */
+ if (old_sync_point) {
+ new_sync_point->setup_earlier_sync_point(old_sync_point, m_last_op_sequence_num);
+ m_perfcounter->hinc(l_librbd_pwl_syncpoint_hist,
+ old_sync_point->log_entry->writes,
+ old_sync_point->log_entry->bytes);
+ /* This sync point will acquire no more sub-ops. Activation needs
+ * to acquire m_lock, so defer to later */
+ later.add(new LambdaContext(
+ [this, old_sync_point](int r) {
+ old_sync_point->prior_persisted_gather_activate();
+ }));
+ }
+
+ new_sync_point->prior_persisted_gather_set_finisher();
+
+ if (old_sync_point) {
+ ldout(cct, 6) << "new sync point = [" << *m_current_sync_point
+ << "], prior = [" << *old_sync_point << "]" << dendl;
+ } else {
+ ldout(cct, 6) << "first sync point = [" << *m_current_sync_point
+ << "]" << dendl;
+ }
+}
+
+template <typename I>
+void AbstractWriteLog<I>::flush_new_sync_point(C_FlushRequestT *flush_req,
+ DeferredContexts &later) {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ if (!flush_req) {
+ m_async_null_flush_finish++;
+ m_async_op_tracker.start_op();
+ Context *flush_ctx = new LambdaContext([this](int r) {
+ m_async_null_flush_finish--;
+ m_async_op_tracker.finish_op();
+ });
+ flush_req = make_flush_req(flush_ctx);
+ flush_req->internal = true;
+ }
+
+ /* Add a new sync point. */
+ new_sync_point(later);
+ std::shared_ptr<SyncPoint> to_append = m_current_sync_point->earlier_sync_point;
+ ceph_assert(to_append);
+
+ /* This flush request will append/persist the (now) previous sync point */
+ flush_req->to_append = to_append;
+
+ /* When the m_sync_point_persist Gather completes this sync point can be
+ * appended. The only sub for this Gather is the finisher Context for
+ * m_prior_log_entries_persisted, which records the result of the Gather in
+ * the sync point, and completes. TODO: Do we still need both of these
+ * Gathers? */
+ Context * ctx = new LambdaContext([this, flush_req](int r) {
+ ldout(m_image_ctx.cct, 20) << "Flush req=" << flush_req
+ << " sync point =" << flush_req->to_append
+ << ". Ready to persist." << dendl;
+ alloc_and_dispatch_io_req(flush_req);
+ });
+ to_append->persist_gather_set_finisher(ctx);
+
+ /* The m_sync_point_persist Gather has all the subs it will ever have, and
+ * now has its finisher. If the sub is already complete, activation will
+ * complete the Gather. The finisher will acquire m_lock, so we'll activate
+ * this when we release m_lock.*/
+ later.add(new LambdaContext([this, to_append](int r) {
+ to_append->persist_gather_activate();
+ }));
+
+ /* The flush request completes when the sync point persists */
+ to_append->add_in_on_persisted_ctxs(flush_req);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::flush_new_sync_point_if_needed(C_FlushRequestT *flush_req,
+ DeferredContexts &later) {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ /* If there have been writes since the last sync point ... */
+ if (m_current_sync_point->log_entry->writes) {
+ flush_new_sync_point(flush_req, later);
+ } else {
+ /* There have been no writes to the current sync point. */
+ if (m_current_sync_point->earlier_sync_point) {
+ /* If previous sync point hasn't completed, complete this flush
+ * with the earlier sync point. No alloc or dispatch needed. */
+ m_current_sync_point->earlier_sync_point->on_sync_point_persisted.push_back(flush_req);
+ } else {
+ /* The previous sync point has already completed and been
+ * appended. The current sync point has no writes, so this flush
+ * has nothing to wait for. This flush completes now. */
+ later.add(flush_req);
+ }
+ }
+}
+
+/*
+ * RWL internal flush - will actually flush the RWL.
+ *
+ * User flushes should arrive at aio_flush(), and only flush prior
+ * writes to all log replicas.
+ *
+ * Librbd internal flushes will arrive at flush(invalidate=false,
+ * discard=false), and traverse the block guard to ensure in-flight writes are
+ * flushed.
+ */
+template <typename I>
+void AbstractWriteLog<I>::flush_dirty_entries(Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ bool all_clean;
+ bool flushing;
+ bool stop_flushing;
+
+ {
+ std::lock_guard locker(m_lock);
+ flushing = (0 != m_flush_ops_in_flight);
+ all_clean = m_dirty_log_entries.empty();
+ stop_flushing = (m_shutting_down);
+ }
+
+ if (!flushing && (all_clean || stop_flushing)) {
+ /* Complete without holding m_lock */
+ if (all_clean) {
+ ldout(cct, 20) << "no dirty entries" << dendl;
+ } else {
+ ldout(cct, 5) << "flush during shutdown suppressed" << dendl;
+ }
+ on_finish->complete(0);
+ } else {
+ if (all_clean) {
+ ldout(cct, 5) << "flush ops still in progress" << dendl;
+ } else {
+ ldout(cct, 20) << "dirty entries remain" << dendl;
+ }
+ std::lock_guard locker(m_lock);
+ /* on_finish can't be completed yet */
+ m_flush_complete_contexts.push_back(new LambdaContext(
+ [this, on_finish](int r) {
+ flush_dirty_entries(on_finish);
+ }));
+ wake_up();
+ }
+}
+
+template <typename I>
+void AbstractWriteLog<I>::internal_flush(bool invalidate, Context *on_finish) {
+ ldout(m_image_ctx.cct, 20) << "invalidate=" << invalidate << dendl;
+
+ if (m_perfcounter) {
+ if (invalidate) {
+ m_perfcounter->inc(l_librbd_pwl_invalidate_cache, 1);
+ } else {
+ m_perfcounter->inc(l_librbd_pwl_flush, 1);
+ }
+ }
+
+ /* May be called even if initialization fails */
+ if (!m_initialized) {
+ ldout(m_image_ctx.cct, 5) << "never initialized" << dendl;
+ /* Deadlock if completed here */
+ m_image_ctx.op_work_queue->queue(on_finish, 0);
+ return;
+ }
+
+ /* Flush/invalidate must pass through block guard to ensure all layers of
+ * cache are consistently flush/invalidated. This ensures no in-flight write leaves
+ * some layers with valid regions, which may later produce inconsistent read
+ * results. */
+ GuardedRequestFunctionContext *guarded_ctx =
+ new GuardedRequestFunctionContext(
+ [this, on_finish, invalidate](GuardedRequestFunctionContext &guard_ctx) {
+ DeferredContexts on_exit;
+ ldout(m_image_ctx.cct, 20) << "cell=" << guard_ctx.cell << dendl;
+ ceph_assert(guard_ctx.cell);
+
+ Context *ctx = new LambdaContext(
+ [this, cell=guard_ctx.cell, invalidate, on_finish](int r) {
+ std::lock_guard locker(m_lock);
+ m_invalidating = false;
+ ldout(m_image_ctx.cct, 6) << "Done flush/invalidating (invalidate="
+ << invalidate << ")" << dendl;
+ if (m_log_entries.size()) {
+ ldout(m_image_ctx.cct, 1) << "m_log_entries.size()="
+ << m_log_entries.size() << ", "
+ << "front()=" << *m_log_entries.front()
+ << dendl;
+ }
+ if (invalidate) {
+ ceph_assert(m_log_entries.size() == 0);
+ }
+ ceph_assert(m_dirty_log_entries.size() == 0);
+ m_image_ctx.op_work_queue->queue(on_finish, r);
+ release_guarded_request(cell);
+ });
+ ctx = new LambdaContext(
+ [this, ctx, invalidate](int r) {
+ Context *next_ctx = ctx;
+ if (r < 0) {
+ /* Override on_finish status with this error */
+ next_ctx = new LambdaContext([r, ctx](int _r) {
+ ctx->complete(r);
+ });
+ }
+ if (invalidate) {
+ {
+ std::lock_guard locker(m_lock);
+ ceph_assert(m_dirty_log_entries.size() == 0);
+ ceph_assert(!m_invalidating);
+ ldout(m_image_ctx.cct, 6) << "Invalidating" << dendl;
+ m_invalidating = true;
+ }
+ /* Discards all RWL entries */
+ while (retire_entries(MAX_ALLOC_PER_TRANSACTION)) { }
+ next_ctx->complete(0);
+ } else {
+ {
+ std::lock_guard locker(m_lock);
+ ceph_assert(m_dirty_log_entries.size() == 0);
+ ceph_assert(!m_invalidating);
+ }
+ m_image_writeback.aio_flush(io::FLUSH_SOURCE_WRITEBACK, next_ctx);
+ }
+ });
+ ctx = new LambdaContext(
+ [this, ctx](int r) {
+ flush_dirty_entries(ctx);
+ });
+ std::lock_guard locker(m_lock);
+ /* Even if we're throwing everything away, we want the last entry to
+ * be a sync point so we can cleanly resume.
+ *
+ * Also, the blockguard only guarantees the replication of this op
+ * can't overlap with prior ops. It doesn't guarantee those are all
+ * completed and eligible for flush & retire, which we require here.
+ */
+ auto flush_req = make_flush_req(ctx);
+ flush_new_sync_point_if_needed(flush_req, on_exit);
+ });
+ detain_guarded_request(nullptr, guarded_ctx, true);
+}
+
+template <typename I>
+void AbstractWriteLog<I>::add_into_log_map(GenericWriteLogEntries &log_entries) {
+ m_blocks_to_log_entries.add_log_entries(log_entries);
+}
+
+template <typename I>
+bool AbstractWriteLog<I>::can_retire_entry(std::shared_ptr<GenericLogEntry> log_entry) {
+ CephContext *cct = m_image_ctx.cct;
+
+ ldout(cct, 20) << dendl;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ return log_entry->can_retire();
+}
+
+/**
+ * Retire up to MAX_ALLOC_PER_TRANSACTION of the oldest log entries
+ * that are eligible to be retired. Returns true if anything was
+ * retired.
+ */
+template <typename I>
+bool AbstractWriteLog<I>::retire_entries(const unsigned long int frees_per_tx) {
+ CephContext *cct = m_image_ctx.cct;
+ GenericLogEntriesVector retiring_entries;
+ uint32_t initial_first_valid_entry;
+ uint32_t first_valid_entry;
+
+ std::lock_guard retire_locker(m_log_retire_lock);
+ ldout(cct, 20) << "Look for entries to retire" << dendl;
+ {
+ /* Entry readers can't be added while we hold m_entry_reader_lock */
+ RWLock::WLocker entry_reader_locker(m_entry_reader_lock);
+ std::lock_guard locker(m_lock);
+ initial_first_valid_entry = m_first_valid_entry;
+ first_valid_entry = m_first_valid_entry;
+ while (!m_log_entries.empty() &&
+ retiring_entries.size() < frees_per_tx &&
+ can_retire_entry(m_log_entries.front())) {
+ auto entry = m_log_entries.front();
+ if (entry->log_entry_index != first_valid_entry) {
+ lderr(cct) << "Retiring entry index (" << entry->log_entry_index
+ << ") and first valid log entry index (" << first_valid_entry
+ << ") must be ==." << dendl;
+ }
+ ceph_assert(entry->log_entry_index == first_valid_entry);
+ first_valid_entry = (first_valid_entry + 1) % m_total_log_entries;
+ m_log_entries.pop_front();
+ retiring_entries.push_back(entry);
+ /* Remove entry from map so there will be no more readers */
+ if ((entry->write_bytes() > 0) || (entry->bytes_dirty() > 0)) {
+ auto gen_write_entry = static_pointer_cast<GenericWriteLogEntry>(entry);
+ if (gen_write_entry) {
+ m_blocks_to_log_entries.remove_log_entry(gen_write_entry);
+ }
+ }
+ }
+ }
+
+ if (retiring_entries.size()) {
+ ldout(cct, 20) << "Retiring " << retiring_entries.size() << " entries" << dendl;
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+
+ utime_t tx_start;
+ utime_t tx_end;
+ /* Advance first valid entry and release buffers */
+ {
+ uint64_t flushed_sync_gen;
+ std::lock_guard append_locker(m_log_append_lock);
+ {
+ std::lock_guard locker(m_lock);
+ flushed_sync_gen = m_flushed_sync_gen;
+ }
+
+ tx_start = ceph_clock_now();
+ TX_BEGIN(m_log_pool) {
+ if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
+ ldout(m_image_ctx.cct, 20) << "flushed_sync_gen in log updated from "
+ << D_RO(pool_root)->flushed_sync_gen << " to "
+ << flushed_sync_gen << dendl;
+ D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
+ }
+ D_RW(pool_root)->first_valid_entry = first_valid_entry;
+ for (auto &entry: retiring_entries) {
+ if (entry->write_bytes()) {
+ ldout(cct, 20) << "Freeing " << entry->ram_entry.write_data.oid.pool_uuid_lo
+ << "." << entry->ram_entry.write_data.oid.off << dendl;
+ TX_FREE(entry->ram_entry.write_data);
+ } else {
+ ldout(cct, 20) << "Retiring non-write: " << *entry << dendl;
+ }
+ }
+ } TX_ONCOMMIT {
+ } TX_ONABORT {
+ lderr(cct) << "failed to commit free of " << retiring_entries.size()
+ << " log entries (" << m_log_pool_name << ")" << dendl;
+ ceph_assert(false);
+ } TX_FINALLY {
+ } TX_END;
+ tx_end = ceph_clock_now();
+ }
+ m_perfcounter->tinc(l_librbd_pwl_retire_tx_t, tx_end - tx_start);
+ m_perfcounter->hinc(l_librbd_pwl_retire_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), retiring_entries.size());
+
+ /* Update runtime copy of first_valid, and free entries counts */
+ {
+ std::lock_guard locker(m_lock);
+
+ ceph_assert(m_first_valid_entry == initial_first_valid_entry);
+ m_first_valid_entry = first_valid_entry;
+ m_free_log_entries += retiring_entries.size();
+ for (auto &entry: retiring_entries) {
+ if (entry->write_bytes()) {
+ ceph_assert(m_bytes_cached >= entry->write_bytes());
+ m_bytes_cached -= entry->write_bytes();
+ uint64_t entry_allocation_size = entry->write_bytes();
+ if (entry_allocation_size < MIN_WRITE_ALLOC_SIZE) {
+ entry_allocation_size = MIN_WRITE_ALLOC_SIZE;
+ }
+ ceph_assert(m_bytes_allocated >= entry_allocation_size);
+ m_bytes_allocated -= entry_allocation_size;
+ }
+ }
+ m_alloc_failed_since_retire = false;
+ wake_up();
+ }
+ } else {
+ ldout(cct, 20) << "Nothing to retire" << dendl;
+ return false;
+ }
+ return true;
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx>;
+template void librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx>::
+ flush_pmem_buffer(std::vector<std::shared_ptr<
+ librbd::cache::pwl::GenericLogOperation>>&);
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG
+#define CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG
+
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "common/AsyncOpTracker.h"
+#include "librbd/cache/ImageCache.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/Utils.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/cache/Types.h"
+#include "librbd/cache/pwl/LogOperation.h"
+#include "librbd/cache/pwl/Request.h"
+#include "librbd/cache/pwl/LogMap.h"
+#include <functional>
+#include <list>
+
+class Context;
+class SafeTimer;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+
+namespace pwl {
+
+class SyncPointLogEntry;
+class GenericWriteLogEntry;
+class WriteLogEntry;
+class GenericLogEntry;
+
+typedef std::list<std::shared_ptr<WriteLogEntry>> WriteLogEntries;
+typedef std::list<std::shared_ptr<GenericLogEntry>> GenericLogEntries;
+typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
+typedef std::vector<std::shared_ptr<GenericLogEntry>> GenericLogEntriesVector;
+
+typedef LogMapEntries<GenericWriteLogEntry> WriteLogMapEntries;
+typedef LogMap<GenericWriteLogEntry> WriteLogMap;
+
+/**** Write log entries end ****/
+
+typedef librbd::BlockGuard<GuardedRequest> WriteLogGuard;
+
+class DeferredContexts;
+template <typename> class ImageCacheState;
+
+template <typename T>
+struct C_BlockIORequest;
+
+template <typename T>
+struct C_WriteRequest;
+
+using GenericLogOperations = std::list<GenericLogOperationSharedPtr>;
+
+
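+/*
+ * Common base of the write log cache implementations. Owns the
+ * pmem-backed log ring (entries from m_first_valid_entry up to but
+ * not including m_first_free_entry are valid), the resource
+ * accounting that defers writes when lanes, entries, or buffer space
+ * run short, and the sync point machinery that orders writeback.
+ */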
+template <typename ImageCtxT>
+class AbstractWriteLog {
+public:
+ typedef io::Extent Extent;
+ typedef io::Extents Extents;
+
+ AbstractWriteLog(ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state);
+ ~AbstractWriteLog();
+ AbstractWriteLog(const AbstractWriteLog&) = delete;
+ AbstractWriteLog &operator=(const AbstractWriteLog&) = delete;
+
+ /// IO methods
+ void read(Extents&& image_extents, ceph::bufferlist *bl,
+ int fadvise_flags, Context *on_finish);
+ void write(Extents&& image_extents, ceph::bufferlist&& bl,
+ int fadvise_flags,
+ Context *on_finish);
+ void discard(uint64_t offset, uint64_t length,
+ uint32_t discard_granularity_bytes,
+ Context *on_finish);
+ void flush(io::FlushSource flush_source, Context *on_finish);
+ void writesame(uint64_t offset, uint64_t length,
+ ceph::bufferlist&& bl,
+ int fadvise_flags, Context *on_finish);
+ void compare_and_write(Extents&& image_extents,
+ ceph::bufferlist&& cmp_bl, ceph::bufferlist&& bl,
+ uint64_t *mismatch_offset,int fadvise_flags,
+ Context *on_finish);
+
+ /// internal state methods
+ void init(Context *on_finish);
+ void shut_down(Context *on_finish);
+ void invalidate(Context *on_finish);
+ void flush(Context *on_finish);
+
+ using This = AbstractWriteLog<ImageCtxT>;
+ using C_WriteRequestT = pwl::C_WriteRequest<This>;
+ using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
+ using C_FlushRequestT = pwl::C_FlushRequest<This>;
+ using C_DiscardRequestT = pwl::C_DiscardRequest<This>;
+ using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
+ using C_CompAndWriteRequestT = pwl::C_CompAndWriteRequest<This>;
+
+ CephContext * get_context();
+ void release_guarded_request(BlockGuardCell *cell);
+ void release_write_lanes(C_BlockIORequestT *req);
+ bool alloc_resources(C_BlockIORequestT *req);
+ template <typename V>
+ void flush_pmem_buffer(V& ops);
+ void schedule_append(pwl::GenericLogOperationsVector &ops);
+ void schedule_append(pwl::GenericLogOperationSharedPtr op);
+ void schedule_flush_and_append(pwl::GenericLogOperationsVector &ops);
+ void flush_new_sync_point(C_FlushRequestT *flush_req, pwl::DeferredContexts &later);
+ std::shared_ptr<pwl::SyncPoint> get_current_sync_point() {
+ return m_current_sync_point;
+ }
+ bool get_persist_on_flush() {
+ return m_persist_on_flush;
+ }
+ void inc_last_op_sequence_num() {
+ m_perfcounter->inc(l_librbd_pwl_log_ops, 1);
+ ++m_last_op_sequence_num;
+ }
+ uint64_t get_last_op_sequence_num() {
+ return m_last_op_sequence_num;
+ }
+ uint64_t get_current_sync_gen() {
+ return m_current_sync_gen;
+ }
+ unsigned int get_free_lanes() {
+ return m_free_lanes;
+ }
+ uint32_t get_free_log_entries() {
+ return m_free_log_entries;
+ }
+ void add_into_log_map(pwl::GenericWriteLogEntries &log_entries);
+protected:
+ typedef std::list<pwl::C_WriteRequest<This> *> C_WriteRequests;
+ typedef std::list<pwl::C_BlockIORequest<This> *> C_BlockIORequests;
+
+ BlockGuardCell* detain_guarded_request_helper(pwl::GuardedRequest &req);
+ BlockGuardCell* detain_guarded_request_barrier_helper(pwl::GuardedRequest &req);
+ void detain_guarded_request(C_BlockIORequestT *request,
+ pwl::GuardedRequestFunctionContext *guarded_ctx,
+ bool is_barrier);
+
+ librbd::cache::pwl::ImageCacheState<ImageCtxT>* m_cache_state = nullptr;
+
+ std::atomic<bool> m_initialized = {false};
+ std::atomic<bool> m_shutting_down = {false};
+ std::atomic<bool> m_invalidating = {false};
+ PMEMobjpool *m_log_pool = nullptr;
+ const char* m_pwl_pool_layout_name;
+
+ ImageCtxT &m_image_ctx;
+
+ std::string m_log_pool_name;
+ bool m_log_is_poolset = false;
+ uint64_t m_log_pool_config_size; /* Configured size of RWL */
+ uint64_t m_log_pool_actual_size = 0; /* Actual size of RWL pool */
+
+ uint32_t m_total_log_entries = 0;
+ uint32_t m_free_log_entries = 0;
+
+ std::atomic<uint64_t> m_bytes_allocated = {0}; /* Total bytes allocated in write buffers */
+ uint64_t m_bytes_cached = 0; /* Total bytes used in write buffers */
+ uint64_t m_bytes_dirty = 0; /* Total bytes yet to flush to RBD */
+ uint64_t m_bytes_allocated_cap = 0;
+
+ utime_t m_last_alloc_fail; /* Entry or buffer allocation fail seen */
+ std::atomic<bool> m_alloc_failed_since_retire = {false};
+
+ ImageWriteback<ImageCtxT> m_image_writeback;
+ pwl::WriteLogGuard m_write_log_guard;
+ /*
+ * When m_first_free_entry == m_first_valid_entry, the log is
+ * empty. There is always at least one free entry, which can't be
+ * used.
+ */
+ uint64_t m_first_free_entry = 0; /* Entries from here to m_first_valid_entry-1 are free */
+ uint64_t m_first_valid_entry = 0; /* Entries from here to m_first_free_entry-1 are valid */
+
+ /* Starts at 0 for a new write log. Incremented on every flush. */
+ uint64_t m_current_sync_gen = 0;
+ /* Starts at 0 on each sync gen increase. Incremented before being
+ * applied to an operation */
+ uint64_t m_last_op_sequence_num = 0;
+ /* All writes bearing this and all prior sync gen numbers are flushed */
+ uint64_t m_flushed_sync_gen = 0;
+
+ bool m_persist_on_write_until_flush = true;
+
+ AsyncOpTracker m_async_op_tracker;
+ /* Debug counters for the places m_async_op_tracker is used */
+ std::atomic<int> m_async_flush_ops = {0};
+ std::atomic<int> m_async_append_ops = {0};
+ std::atomic<int> m_async_complete_ops = {0};
+ std::atomic<int> m_async_null_flush_finish = {0};
+ std::atomic<int> m_async_process_work = {0};
+
+ /* Acquire locks in order declared here */
+
+ mutable ceph::mutex m_log_retire_lock;
+ /* Hold a read lock on m_entry_reader_lock to add readers to log entry
+ * bufs. Hold a write lock to prevent readers from being added (e.g. when
+ * removing log entries from the map). No lock required to remove readers. */
+ mutable RWLock m_entry_reader_lock;
+ /* Hold m_deferred_dispatch_lock while consuming from m_deferred_ios. */
+ mutable ceph::mutex m_deferred_dispatch_lock;
+ /* Hold m_log_append_lock while appending or retiring log entries. */
+ mutable ceph::mutex m_log_append_lock;
+ /* Used for most synchronization */
+ mutable ceph::mutex m_lock;
+
+ /* Used in release/detain to make BlockGuard preserve submission order */
+ mutable ceph::mutex m_blockguard_lock;
+
+ /* Use m_blockguard_lock for the following 3 things */
+ pwl::WriteLogGuard::BlockOperations m_awaiting_barrier;
+ bool m_barrier_in_progress = false;
+ BlockGuardCell *m_barrier_cell = nullptr;
+
+ bool m_wake_up_requested = false;
+ bool m_wake_up_scheduled = false;
+ bool m_wake_up_enabled = true;
+ bool m_appending = false;
+ bool m_dispatching_deferred_ops = false;
+
+ Contexts m_flush_complete_contexts;
+
+ pwl::GenericLogOperations m_ops_to_flush; /* Write ops needing flush in local log */
+ pwl::GenericLogOperations m_ops_to_append; /* Write ops needing event append in local log */
+
+ pwl::WriteLogMap m_blocks_to_log_entries;
+
+ /* New entries are at the back. Oldest at the front */
+ pwl::GenericLogEntries m_log_entries;
+ pwl::GenericLogEntries m_dirty_log_entries;
+
+ PerfCounters *m_perfcounter = nullptr;
+
+ std::shared_ptr<pwl::SyncPoint> m_current_sync_point = nullptr;
+ bool m_persist_on_flush = false; /* If false, persist each write before completion */
+
+ int m_flush_ops_in_flight = 0;
+ int m_flush_bytes_in_flight = 0;
+ uint64_t m_lowest_flushing_sync_gen = 0;
+
+ /* Writes that have left the block guard, but are waiting for resources */
+ C_BlockIORequests m_deferred_ios;
+ /* Throttle writes concurrently allocating & replicating */
+ unsigned int m_free_lanes = pwl::MAX_CONCURRENT_WRITES;
+ unsigned int m_unpublished_reserves = 0;
+
+ /* Initialized from config, then set false during shutdown */
+ std::atomic<bool> m_periodic_stats_enabled = {false};
+ SafeTimer *m_timer = nullptr; /* Used with m_timer_lock */
+ mutable ceph::mutex *m_timer_lock = nullptr; /* Used with and by m_timer */
+ Context *m_timer_ctx = nullptr;
+
+ ThreadPool m_thread_pool;
+ ContextWQ m_work_queue;
+
+ uint32_t m_discard_granularity_bytes;
+
+ void perf_start(const std::string name);
+ void perf_stop();
+ void log_perf();
+ void periodic_stats();
+ void arm_periodic_stats();
+
+ void pwl_init(Context *on_finish, pwl::DeferredContexts &later);
+ void update_image_cache_state(Context *on_finish);
+ void load_existing_entries(pwl::DeferredContexts &later);
+ void wake_up();
+ void process_work();
+
+ void flush_dirty_entries(Context *on_finish);
+ bool can_flush_entry(const std::shared_ptr<pwl::GenericLogEntry> log_entry);
+ Context *construct_flush_entry_ctx(const std::shared_ptr<pwl::GenericLogEntry> log_entry);
+ void persist_last_flushed_sync_gen();
+ bool handle_flushed_sync_point(std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
+ void sync_point_writer_flushed(std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
+ void process_writeback_dirty_entries();
+ bool can_retire_entry(const std::shared_ptr<pwl::GenericLogEntry> log_entry);
+ bool retire_entries(const unsigned long int frees_per_tx);
+
+ void init_flush_new_sync_point(pwl::DeferredContexts &later);
+ void new_sync_point(pwl::DeferredContexts &later);
+ pwl::C_FlushRequest<AbstractWriteLog<ImageCtxT>>* make_flush_req(Context *on_finish);
+ void flush_new_sync_point_if_needed(C_FlushRequestT *flush_req, pwl::DeferredContexts &later);
+
+ void dispatch_deferred_writes(void);
+ void alloc_and_dispatch_io_req(C_BlockIORequestT *write_req);
+ void append_scheduled_ops(void);
+ void enlist_op_appender();
+ void schedule_append(pwl::GenericLogOperations &ops);
+ void flush_then_append_scheduled_ops(void);
+ void enlist_op_flusher();
+ void alloc_op_log_entries(pwl::GenericLogOperations &ops);
+ void flush_op_log_entries(pwl::GenericLogOperationsVector &ops);
+ int append_op_log_entries(pwl::GenericLogOperations &ops);
+ void complete_op_log_entries(pwl::GenericLogOperations &&ops, const int r);
+ void schedule_complete_op_log_entries(pwl::GenericLogOperations &&ops, const int r);
+ void internal_flush(bool invalidate, Context *on_finish);
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/Types.h"
+#include "librbd/cache/Utils.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Operations.h"
+#include "common/environment.h"
+#include "common/hostname.h"
+#include "common/config_proxy.h"
+#include "common/ceph_json.h"
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ImageCacheState: " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+namespace {
+bool get_json_format(const std::string& s, JSONFormattable *f) {
+ JSONParser p;
+ bool success = p.parse(s.c_str(), s.size());
+ if (success) {
+ decode_json_obj(*f, &p);
+ }
+ return success;
+}
+} // namespace
+
+template <typename I>
+ImageCacheState<I>::ImageCacheState(I *image_ctx) : m_image_ctx(image_ctx) {
+ ldout(image_ctx->cct, 20) << "Initialize RWL cache state with config data. "
+ << dendl;
+
+ ConfigProxy &config = image_ctx->config;
+ host = ceph_get_short_hostname();
+ path = config.get_val<std::string>("rbd_rwl_path");
+ size = config.get_val<uint64_t>("rbd_rwl_size");
+ log_periodic_stats = config.get_val<bool>("rbd_rwl_log_periodic_stats");
+}
+
+template <typename I>
+ImageCacheState<I>::ImageCacheState(
+ I *image_ctx, JSONFormattable &f) : m_image_ctx(image_ctx) {
+ ldout(image_ctx->cct, 20) << "Initialize RWL cache state with data from "
+                            << "server side" << dendl;
+
+ present = (bool)f["present"];
+ empty = (bool)f["empty"];
+ clean = (bool)f["clean"];
+  host = (string)f["pwl_host"];
+  path = (string)f["pwl_path"];
+  uint64_t pwl_size;
+  std::istringstream iss(f["pwl_size"]);
+ iss >> pwl_size;
+ size = pwl_size;
+
+ // Others from config
+ ConfigProxy &config = image_ctx->config;
+ log_periodic_stats = config.get_val<bool>("rbd_rwl_log_periodic_stats");
+}
+
+template <typename I>
+void ImageCacheState<I>::write_image_cache_state(Context *on_finish) {
+ std::shared_lock owner_lock{m_image_ctx->owner_lock};
+ JSONFormattable f;
+ ::encode_json(IMAGE_CACHE_STATE.c_str(), *this, &f);
+ std::ostringstream oss;
+ f.flush(oss);
+ std::string image_state_json = oss.str();
+
+ ldout(m_image_ctx->cct, 20) << __func__ << " Store state: "
+ << image_state_json << dendl;
+ m_image_ctx->operations->execute_metadata_set(IMAGE_CACHE_STATE,
+ image_state_json, on_finish);
+}
+
+template <typename I>
+void ImageCacheState<I>::clear_image_cache_state(Context *on_finish) {
+ std::shared_lock owner_lock{m_image_ctx->owner_lock};
+  ldout(m_image_ctx->cct, 20) << __func__ << " Remove state" << dendl;
+ m_image_ctx->operations->execute_metadata_remove(IMAGE_CACHE_STATE, on_finish);
+}
+
+template <typename I>
+void ImageCacheState<I>::dump(ceph::Formatter *f) const {
+ ::encode_json("present", present, f);
+ ::encode_json("empty", empty, f);
+ ::encode_json("clean", clean, f);
+ ::encode_json("cache_type", (int)get_image_cache_type(), f);
+ ::encode_json("pwl_host", host, f);
+ ::encode_json("pwl_path", path, f);
+ ::encode_json("pwl_size", size, f);
+}
+
+template <typename I>
+ImageCacheState<I>* ImageCacheState<I>::get_image_cache_state(
+ I* image_ctx, int &r) {
+  std::string cache_state_str;
+  ImageCacheState<I>* cache_state = nullptr;
+
+  r = 0;
+  bool dirty_cache = image_ctx->test_features(RBD_FEATURE_DIRTY_CACHE);
+  if (dirty_cache) {
+    cls_client::metadata_get(&image_ctx->md_ctx, image_ctx->header_oid,
+                             IMAGE_CACHE_STATE, &cache_state_str);
+  }
+  ldout(image_ctx->cct, 20) << "image_cache_state: " << cache_state_str << dendl;
+
+ bool pwl_enabled = cache::util::is_pwl_enabled(*image_ctx);
+ bool cache_desired = pwl_enabled;
+ cache_desired &= !image_ctx->read_only;
+ cache_desired &= !image_ctx->test_features(RBD_FEATURE_MIGRATING);
+ cache_desired &= !image_ctx->test_features(RBD_FEATURE_JOURNALING);
+ cache_desired &= !image_ctx->old_format;
+
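+  /* Four cases: neither dirty nor desired -> no cache; dirty but no longer
+   * desired -> error, since the dirty cache contents would be lost; desired
+   * with no prior state -> build fresh state from config; otherwise reload
+   * the state previously persisted in the image metadata. */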
+ if (!dirty_cache && !cache_desired) {
+    ldout(image_ctx->cct, 20) << "Image cache not desired and no dirty "
+                              << "cache is present." << dendl;
+ } else if (dirty_cache && !cache_desired) {
+ lderr(image_ctx->cct) << "There's a dirty cache, but RWL cache is disabled."
+ << dendl;
+ r = -EINVAL;
+  } else if ((!dirty_cache || cache_state_str.empty()) && cache_desired) {
+ cache_state = new ImageCacheState<I>(image_ctx);
+ } else {
+ ceph_assert(!cache_state_str.empty());
+ JSONFormattable f;
+ bool success = get_json_format(cache_state_str, &f);
+ if (!success) {
+ lderr(image_ctx->cct) << "Failed to parse cache state: "
+ << cache_state_str << dendl;
+ r = -EINVAL;
+ return nullptr;
+ }
+
+ bool cache_exists = (bool)f["present"];
+ int cache_type = (int)f["cache_type"];
+
+ switch (cache_type) {
+ case IMAGE_CACHE_TYPE_RWL:
+ if (!cache_exists) {
+ cache_state = new ImageCacheState<I>(image_ctx);
+ } else {
+ cache_state = new ImageCacheState<I>(image_ctx, f);
+ }
+ break;
+ default:
+ r = -EINVAL;
+ }
+ }
+ return cache_state;
+}
+
+template <typename I>
+bool ImageCacheState<I>::is_valid() {
+ if (this->present &&
+ (host.compare(ceph_get_short_hostname()) != 0)) {
+ auto cleanstring = "dirty";
+ if (this->clean) {
+ cleanstring = "clean";
+ }
+ lderr(m_image_ctx->cct) << "An image cache (RWL) remains on another host "
+ << host << " which is " << cleanstring
+ << ". Flush/close the image there to remove the "
+ << "image cache" << dendl;
+ return false;
+ }
+ return true;
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ImageCacheState<librbd::ImageCtx>;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
+#define CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
+
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/Types.h"
+#include <string>
+
+class JSONFormattable;
+namespace ceph {
+ class Formatter;
+}
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+template <typename ImageCtxT = ImageCtx>
+class ImageCacheState {
+private:
+ ImageCtxT* m_image_ctx;
+public:
+ bool present = true;
+ bool empty = true;
+ bool clean = true;
+ std::string host;
+ std::string path;
+  uint64_t size = 0;
+  bool log_periodic_stats = false;
+
+ ImageCacheState(ImageCtxT* image_ctx);
+
+ ImageCacheState(ImageCtxT* image_ctx, JSONFormattable& f);
+
+ ~ImageCacheState() {}
+
+ ImageCacheType get_image_cache_type() const {
+ return IMAGE_CACHE_TYPE_RWL;
+ }
+
+ void write_image_cache_state(Context *on_finish);
+
+ void clear_image_cache_state(Context *on_finish);
+
+ void dump(ceph::Formatter *f) const;
+
+ static ImageCacheState<ImageCtxT>* get_image_cache_state(
+ ImageCtxT* image_ctx, int &r);
+
+ bool is_valid();
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::ImageCacheState<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/pwl/InitRequest.h"
+#include "librbd/Utils.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/asio/ContextWQ.h"
+
+#if defined(WITH_RBD_RWL)
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/WriteLogCache.h"
+#endif // WITH_RBD_RWL
+
+#include "librbd/cache/Utils.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::InitRequest " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+template <typename I>
+InitRequest<I>* InitRequest<I>::create(I &image_ctx,
+ Context *on_finish) {
+ return new InitRequest(image_ctx, on_finish);
+}
+
+template <typename I>
+InitRequest<I>::InitRequest(I &image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx),
+ m_on_finish(create_async_context_callback(image_ctx, on_finish)),
+ m_error_result(0) {
+}
+
+template <typename I>
+void InitRequest<I>::send() {
+#if defined(WITH_RBD_RWL)
+ get_image_cache_state();
+#else
+ finish();
+#endif // WITH_RBD_RWL
+}
+
+#if defined(WITH_RBD_RWL)
+template <typename I>
+void InitRequest<I>::get_image_cache_state() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ int r;
+ auto cache_state = ImageCacheState<I>::get_image_cache_state(&m_image_ctx, r);
+
+ if (r < 0 || !cache_state) {
+ save_result(r);
+ finish();
+ return;
+ } else if (!cache_state->is_valid()) {
+ delete cache_state;
+ cache_state = nullptr;
+    lderr(cct) << "invalid image cache state" << dendl;
+ save_result(-ENOENT);
+ finish();
+ return;
+ }
+
+ auto cache_type = cache_state->get_image_cache_type();
+ switch(cache_type) {
+ case cache::IMAGE_CACHE_TYPE_RWL:
+ m_image_ctx.image_cache =
+ new librbd::cache::WriteLogCache<I>(m_image_ctx,
+ cache_state);
+ break;
+ default:
+ delete cache_state;
+ cache_state = nullptr;
+ save_result(-ENOENT);
+ finish();
+ return;
+ }
+
+ init_image_cache();
+}
+
+template <typename I>
+void InitRequest<I>::init_image_cache() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = InitRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_init_image_cache>(
+ this);
+ m_image_ctx.image_cache->init(ctx);
+}
+
+template <typename I>
+void InitRequest<I>::handle_init_image_cache(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to init image cache: " << cpp_strerror(r)
+ << dendl;
+ delete m_image_ctx.image_cache;
+ m_image_ctx.image_cache = nullptr;
+ save_result(r);
+ finish();
+ return;
+ }
+ set_feature_bit();
+}
+
+template <typename I>
+void InitRequest<I>::set_feature_bit() {
+ CephContext *cct = m_image_ctx.cct;
+
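+  /* RBD_FEATURE_DIRTY_CACHE marks the image as possibly holding writes in a
+   * client-side cache that have not reached the OSDs, so other clients can
+   * detect a dirty cache left behind by this client. */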
+ uint64_t new_features = m_image_ctx.features | RBD_FEATURE_DIRTY_CACHE;
+ uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE;
+ ldout(cct, 10) << "old_features=" << m_image_ctx.features
+ << ", new_features=" << new_features
+ << ", features_mask=" << features_mask
+ << dendl;
+
+ int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx,
+ m_image_ctx.header_oid,
+ new_features, features_mask);
+ m_image_ctx.features |= RBD_FEATURE_DIRTY_CACHE;
+ using klass = InitRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_set_feature_bit>(
+ this);
+ ctx->complete(r);
+}
+
+template <typename I>
+void InitRequest<I>::handle_set_feature_bit(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to set feature bit: " << cpp_strerror(r)
+ << dendl;
+ save_result(r);
+ } else if (m_image_ctx.discard_granularity_bytes) {
+    ldout(cct, 1) << "RWL image cache is enabled; "
+                  << "overriding discard_granularity_bytes to 0." << dendl;
+ m_image_ctx.discard_granularity_bytes = 0;
+ }
+ finish();
+}
+
+#endif // WITH_RBD_RWL
+
+template <typename I>
+void InitRequest<I>::finish() {
+ m_on_finish->complete(m_error_result);
+ delete this;
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::InitRequest<librbd::ImageCtx>;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H
+#define CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace cache {
+namespace pwl {
+
+template<typename>
+class ImageCacheState;
+
+template <typename ImageCtxT = ImageCtx>
+class InitRequest {
+public:
+ static InitRequest* create(ImageCtxT &image_ctx, Context *on_finish);
+
+ void send();
+
+private:
+
+ /**
+ * @verbatim
+ *
+ * Init request goes through the following state machine:
+ *
+ * <start>
+ * |
+ * v
+ * GET_IMAGE_CACHE_STATE
+ * |
+ * v
+ * INIT_IMAGE_CACHE
+ * |
+ * v
+ * SET_FEATURE_BIT
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ InitRequest(ImageCtxT &image_ctx, Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_finish;
+
+ int m_error_result;
+
+ bool is_pwl_enabled();
+
+ void get_image_cache_state();
+
+ void init_image_cache();
+ void handle_init_image_cache(int r);
+
+ void set_feature_bit();
+ void handle_set_feature_bit(int r);
+
+ void finish();
+
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::InitRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "LogEntry.h"
+#include "librbd/cache/ImageWriteback.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::LogEntry: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+
+namespace cache {
+
+namespace pwl {
+
+std::ostream& GenericLogEntry::format(std::ostream &os) const {
+ os << "ram_entry=[" << ram_entry << "], "
+ << "pmem_entry=" << (void*)pmem_entry << ", "
+ << "log_entry_index=" << log_entry_index << ", "
+ << "completed=" << completed;
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const GenericLogEntry &entry) {
+ return entry.format(os);
+}
+
+std::ostream& SyncPointLogEntry::format(std::ostream &os) const {
+ os << "(Sync Point) ";
+ GenericLogEntry::format(os);
+ os << ", "
+ << "writes=" << writes << ", "
+ << "bytes=" << bytes << ", "
+ << "writes_completed=" << writes_completed << ", "
+ << "writes_flushed=" << writes_flushed << ", "
+ << "prior_sync_point_flushed=" << prior_sync_point_flushed << ", "
+ << "next_sync_point_entry=" << next_sync_point_entry;
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const SyncPointLogEntry &entry) {
+ return entry.format(os);
+}
+
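+/* An entry can be written back once it has completed and its ordering is
+ * guaranteed: either it carries its own sequence number (persist-on-write)
+ * or its sync point has completed (persist-on-flush). */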
+bool GenericWriteLogEntry::can_writeback() const {
+ return (this->completed &&
+ (ram_entry.sequenced ||
+ (sync_point_entry &&
+ sync_point_entry->completed)));
+}
+
+std::ostream& GenericWriteLogEntry::format(std::ostream &os) const {
+ GenericLogEntry::format(os);
+ os << ", "
+ << "sync_point_entry=[";
+ if (sync_point_entry) {
+ os << *sync_point_entry;
+ } else {
+ os << "nullptr";
+ }
+ os << "], "
+ << "referring_map_entries=" << referring_map_entries;
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const GenericWriteLogEntry &entry) {
+ return entry.format(os);
+}
+
+void WriteLogEntry::init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation,
+ uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush) {
+ ram_entry.has_data = 1;
+ ram_entry.write_data = allocation->buffer_oid;
+ ceph_assert(!TOID_IS_NULL(ram_entry.write_data));
+ pmem_buffer = D_RW(ram_entry.write_data);
+ ram_entry.sync_gen_number = current_sync_gen;
+ if (persist_on_flush) {
+ /* Persist on flush. Sequence #0 is never used. */
+ ram_entry.write_sequence_number = 0;
+ } else {
+ /* Persist on write */
+ ram_entry.write_sequence_number = last_op_sequence_num;
+ ram_entry.sequenced = 1;
+ }
+ ram_entry.sync_point = 0;
+ ram_entry.discard = 0;
+}
+
+void WriteLogEntry::init_pmem_bp() {
+ ceph_assert(!pmem_bp.have_raw());
+ pmem_bp = buffer::ptr(buffer::create_static(this->write_bytes(), (char*)pmem_buffer));
+}
+
+void WriteLogEntry::init_pmem_bl() {
+ pmem_bl.clear();
+ init_pmem_bp();
+ ceph_assert(pmem_bp.have_raw());
+ int before_bl = pmem_bp.raw_nref();
+ this->init_bl(pmem_bp, pmem_bl);
+ int after_bl = pmem_bp.raw_nref();
+ bl_refs = after_bl - before_bl;
+}
+
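+/* Readers hold extra refs on the raw pmem buffer while they use it. The
+ * total raw ref count, minus the refs held by pmem_bl and minus pmem_bp's
+ * own ref, leaves the number of outstanding readers. */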
+unsigned int WriteLogEntry::reader_count() const {
+ if (pmem_bp.have_raw()) {
+ return (pmem_bp.raw_nref() - bl_refs - 1);
+ } else {
+ return 0;
+ }
+}
+
+/* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */
+buffer::list& WriteLogEntry::get_pmem_bl() {
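+  /* Double-checked locking: bl_refs doubles as an "initialized" flag. It is
+   * tested once without m_entry_bl_lock for the fast path, then again under
+   * the lock, so pmem_bl is initialized at most once even with concurrent
+   * readers. */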
+ if (0 == bl_refs) {
+ std::lock_guard locker(m_entry_bl_lock);
+ if (0 == bl_refs) {
+ init_pmem_bl();
+ }
+ ceph_assert(0 != bl_refs);
+ }
+ return pmem_bl;
+}
+
+/* Constructs a new bl containing copies of pmem_bp */
+void WriteLogEntry::copy_pmem_bl(bufferlist *out_bl) {
+ this->get_pmem_bl();
+ /* pmem_bp is now initialized */
+ buffer::ptr cloned_bp(pmem_bp.clone());
+ out_bl->clear();
+ this->init_bl(cloned_bp, *out_bl);
+}
+
+void WriteLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) {
+ /* Pass a copy of the pmem buffer to ImageWriteback (which may hang on to the bl even after flush()). */
+ bufferlist entry_bl;
+ buffer::list entry_bl_copy;
+ copy_pmem_bl(&entry_bl_copy);
+ entry_bl_copy.begin(0).copy(write_bytes(), entry_bl);
+ image_writeback.aio_write({{ram_entry.image_offset_bytes, ram_entry.write_bytes}},
+ std::move(entry_bl), 0, ctx);
+}
+
+std::ostream& WriteLogEntry::format(std::ostream &os) const {
+ os << "(Write) ";
+ GenericWriteLogEntry::format(os);
+ os << ", "
+ << "pmem_buffer=" << (void*)pmem_buffer << ", ";
+ os << "pmem_bp=" << pmem_bp << ", ";
+ os << "pmem_bl=" << pmem_bl << ", ";
+ os << "bl_refs=" << bl_refs;
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const WriteLogEntry &entry) {
+ return entry.format(os);
+}
+
+void DiscardLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) {
+ image_writeback.aio_discard(ram_entry.image_offset_bytes, ram_entry.write_bytes,
+ m_discard_granularity_bytes, ctx);
+}
+
+void DiscardLogEntry::init(uint64_t current_sync_gen, bool persist_on_flush, uint64_t last_op_sequence_num) {
+ ram_entry.sync_gen_number = current_sync_gen;
+ if (persist_on_flush) {
+ /* Persist on flush. Sequence #0 is never used. */
+ ram_entry.write_sequence_number = 0;
+ } else {
+ /* Persist on write */
+ ram_entry.write_sequence_number = last_op_sequence_num;
+ ram_entry.sequenced = 1;
+ }
+}
+
+std::ostream &DiscardLogEntry::format(std::ostream &os) const {
+ os << "(Discard) ";
+ GenericWriteLogEntry::format(os);
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const DiscardLogEntry &entry) {
+ return entry.format(os);
+}
+
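+/* A writesame entry stores one copy of the pattern; expand it to cover the
+ * whole extent by appending the pattern bufferptr write_bytes/ws_datalen
+ * times, plus a partial slice for any trailing remainder (e.g. a 4096-byte
+ * writesame of a 512-byte pattern appends the pattern eight times). */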
+void WriteSameLogEntry::init_bl(buffer::ptr &bp, buffer::list &bl) {
+ for (uint64_t i = 0; i < ram_entry.write_bytes / ram_entry.ws_datalen; i++) {
+ bl.append(bp);
+ }
+ int trailing_partial = ram_entry.write_bytes % ram_entry.ws_datalen;
+ if (trailing_partial) {
+ bl.append(bp, 0, trailing_partial);
+ }
+}
+
+void WriteSameLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) {
+ bufferlist entry_bl;
+ buffer::list entry_bl_copy;
+ copy_pmem_bl(&entry_bl_copy);
+ entry_bl_copy.begin(0).copy(write_bytes(), entry_bl);
+ image_writeback.aio_writesame(ram_entry.image_offset_bytes, ram_entry.write_bytes,
+ std::move(entry_bl), 0, ctx);
+}
+
+std::ostream &WriteSameLogEntry::format(std::ostream &os) const {
+ os << "(WriteSame) ";
+ WriteLogEntry::format(os);
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const WriteSameLogEntry &entry) {
+ return entry.format(os);
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
+#define CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
+
+#include "common/ceph_mutex.h"
+#include "librbd/Utils.h"
+#include "librbd/cache/pwl/Types.h"
+#include <atomic>
+#include <memory>
+
+namespace librbd {
+namespace cache {
+class ImageWritebackInterface;
+namespace pwl {
+
+class SyncPointLogEntry;
+class GenericWriteLogEntry;
+class WriteLogEntry;
+
+typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
+
+class GenericLogEntry {
+public:
+ WriteLogPmemEntry ram_entry;
+ WriteLogPmemEntry *pmem_entry = nullptr;
+ uint32_t log_entry_index = 0;
+ bool completed = false;
+ GenericLogEntry(const uint64_t image_offset_bytes = 0, const uint64_t write_bytes = 0)
+ : ram_entry(image_offset_bytes, write_bytes) {
+ };
+ virtual ~GenericLogEntry() { };
+ GenericLogEntry(const GenericLogEntry&) = delete;
+ GenericLogEntry &operator=(const GenericLogEntry&) = delete;
+ virtual bool can_writeback() const {
+ return false;
+ }
+ virtual bool can_retire() const {
+ return false;
+ }
+ virtual void set_flushed(bool flushed) {
+ ceph_assert(false);
+ }
+ virtual unsigned int write_bytes() const {
+ return 0;
+ };
+ virtual unsigned int bytes_dirty() const {
+ return 0;
+ };
+ virtual std::shared_ptr<SyncPointLogEntry> get_sync_point_entry() {
+ return nullptr;
+ }
+ virtual void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) {
+ ceph_assert(false);
+ };
+ virtual std::ostream& format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const GenericLogEntry &entry);
+};
+
+class SyncPointLogEntry : public GenericLogEntry {
+public:
+ /* Writing entries using this sync gen number */
+ std::atomic<unsigned int> writes = {0};
+ /* Total bytes for all writing entries using this sync gen number */
+ std::atomic<uint64_t> bytes = {0};
+ /* Writing entries using this sync gen number that have completed */
+ std::atomic<unsigned int> writes_completed = {0};
+ /* Writing entries using this sync gen number that have completed flushing to the writeback interface */
+ std::atomic<unsigned int> writes_flushed = {0};
+ /* All writing entries using all prior sync gen numbers have been flushed */
+ std::atomic<bool> prior_sync_point_flushed = {true};
+ std::shared_ptr<SyncPointLogEntry> next_sync_point_entry = nullptr;
+ SyncPointLogEntry(const uint64_t sync_gen_number) {
+ ram_entry.sync_gen_number = sync_gen_number;
+ ram_entry.sync_point = 1;
+ };
+ ~SyncPointLogEntry() override {};
+ SyncPointLogEntry(const SyncPointLogEntry&) = delete;
+ SyncPointLogEntry &operator=(const SyncPointLogEntry&) = delete;
+ bool can_retire() const override {
+ return this->completed;
+ }
+ std::ostream& format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const SyncPointLogEntry &entry);
+};
+
+class GenericWriteLogEntry : public GenericLogEntry {
+public:
+ uint32_t referring_map_entries = 0;
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry;
+ GenericWriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ const uint64_t image_offset_bytes, const uint64_t write_bytes)
+ : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(sync_point_entry) { }
+ GenericWriteLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+ : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(nullptr) { }
+ ~GenericWriteLogEntry() override {};
+ GenericWriteLogEntry(const GenericWriteLogEntry&) = delete;
+ GenericWriteLogEntry &operator=(const GenericWriteLogEntry&) = delete;
+ unsigned int write_bytes() const override {
+    /* The valid bytes in this op's data buffer. Discard and WS override. */
+ return ram_entry.write_bytes;
+ };
+ unsigned int bytes_dirty() const override {
+ /* The bytes in the image this op makes dirty. Discard and WS override. */
+ return write_bytes();
+ };
+ BlockExtent block_extent() {
+ return ram_entry.block_extent();
+ }
+ uint32_t get_map_ref() {
+    return referring_map_entries;
+ }
+ void inc_map_ref() { referring_map_entries++; }
+ void dec_map_ref() { referring_map_entries--; }
+ bool can_writeback() const override;
+ std::shared_ptr<SyncPointLogEntry> get_sync_point_entry() override {
+ return sync_point_entry;
+ }
+ virtual void copy_pmem_bl(bufferlist *out_bl) = 0;
+ void set_flushed(bool flushed) override {
+ m_flushed = flushed;
+ }
+ bool get_flushed() const {
+ return m_flushed;
+ }
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const GenericWriteLogEntry &entry);
+
+private:
+ bool m_flushed = false; /* or invalidated */
+};
+
+class WriteLogEntry : public GenericWriteLogEntry {
+protected:
+ buffer::ptr pmem_bp;
+ buffer::list pmem_bl;
+ std::atomic<int> bl_refs = {0}; /* The refs held on pmem_bp by pmem_bl */
+  /* Used in WriteLogEntry::get_pmem_bl() to synchronize between threads making entries readable */
+ mutable ceph::mutex m_entry_bl_lock;
+
+ void init_pmem_bp();
+
+ /* Write same will override */
+ virtual void init_bl(buffer::ptr &bp, buffer::list &bl) {
+ bl.append(bp);
+ }
+
+ void init_pmem_bl();
+
+public:
+ uint8_t *pmem_buffer = nullptr;
+ WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ const uint64_t image_offset_bytes, const uint64_t write_bytes)
+ : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes),
+ m_entry_bl_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::cache::pwl::WriteLogEntry::m_entry_bl_lock", this)))
+ { }
+ WriteLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+ : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes),
+ m_entry_bl_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::cache::pwl::WriteLogEntry::m_entry_bl_lock", this)))
+ { }
+ ~WriteLogEntry() override {};
+ WriteLogEntry(const WriteLogEntry&) = delete;
+ WriteLogEntry &operator=(const WriteLogEntry&) = delete;
+ void init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation,
+ uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush);
+ BlockExtent block_extent();
+ unsigned int reader_count() const;
+ /* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */
+ buffer::list &get_pmem_bl();
+ /* Constructs a new bl containing copies of pmem_bp */
+ void copy_pmem_bl(bufferlist *out_bl) override;
+ void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) override;
+ bool can_retire() const override {
+ return (this->completed && this->get_flushed() && (0 == reader_count()));
+ }
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const WriteLogEntry &entry);
+};
+
+class DiscardLogEntry : public GenericWriteLogEntry {
+public:
+ DiscardLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ const uint64_t image_offset_bytes, const uint64_t write_bytes,
+ uint32_t discard_granularity_bytes)
+ : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes),
+ m_discard_granularity_bytes(discard_granularity_bytes) {
+ ram_entry.discard = 1;
+ };
+ DiscardLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+ : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes) {
+ ram_entry.discard = 1;
+ };
+ DiscardLogEntry(const DiscardLogEntry&) = delete;
+ DiscardLogEntry &operator=(const DiscardLogEntry&) = delete;
+ unsigned int write_bytes() const override {
+    /* The valid bytes in this op's data buffer. */
+ return 0;
+ };
+ unsigned int bytes_dirty() const override {
+ /* The bytes in the image this op makes dirty. */
+ return ram_entry.write_bytes;
+ };
+ bool can_retire() const override {
+ return this->completed;
+ }
+ void copy_pmem_bl(bufferlist *out_bl) override {
+ ceph_assert(false);
+ }
+ void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) override;
+ void init(uint64_t current_sync_gen, bool persist_on_flush, uint64_t last_op_sequence_num);
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const DiscardLogEntry &entry);
+private:
+  uint32_t m_discard_granularity_bytes = 0;
+};
+
+class WriteSameLogEntry : public WriteLogEntry {
+protected:
+ void init_bl(buffer::ptr &bp, buffer::list &bl) override;
+
+public:
+ WriteSameLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ const uint64_t image_offset_bytes, const uint64_t write_bytes,
+ const uint32_t data_length)
+ : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {
+ ram_entry.writesame = 1;
+ ram_entry.ws_datalen = data_length;
+ };
+ WriteSameLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes,
+ const uint32_t data_length)
+ : WriteLogEntry(nullptr, image_offset_bytes, write_bytes) {
+ ram_entry.writesame = 1;
+ ram_entry.ws_datalen = data_length;
+ };
+ WriteSameLogEntry(const WriteSameLogEntry&) = delete;
+ WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete;
+ unsigned int write_bytes() const override {
+    /* The valid bytes in this op's data buffer. */
+ return ram_entry.ws_datalen;
+ };
+ unsigned int bytes_dirty() const override {
+ /* The bytes in the image this op makes dirty. */
+ return ram_entry.write_bytes;
+ };
+ void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) override;
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const WriteSameLogEntry &entry);
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "LogMap.h"
+#include "include/ceph_assert.h"
+#include "librbd/Utils.h"
+#include "librbd/cache/pwl/LogEntry.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::LogMap: " << this << " " \
+ << __func__ << ": "
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ LogMapEntry<T> &e) {
+ os << "block_extent=" << e.block_extent << ", "
+ << "log_entry=[" << e.log_entry << "]";
+ return os;
+}
+
+template <typename T>
+LogMapEntry<T>::LogMapEntry(const BlockExtent block_extent,
+ std::shared_ptr<T> log_entry)
+ : block_extent(block_extent) , log_entry(log_entry) {
+}
+
+template <typename T>
+LogMapEntry<T>::LogMapEntry(std::shared_ptr<T> log_entry)
+ : block_extent(log_entry->block_extent()) , log_entry(log_entry) {
+}
+
+template <typename T>
+LogMap<T>::LogMap(CephContext *cct)
+ : m_cct(cct),
+ m_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::cache::pwl::LogMap::m_lock", this))) {
+}
+
+/**
+ * Add a write log entry to the map. Subsequent queries for blocks
+ * within this log entry's extent will find this log entry. Portions
+ * of prior write log entries overlapping with this log entry will
+ * be replaced in the map by this log entry.
+ *
+ * The map_entries field of the log entry object will be updated to
+ * contain this map entry.
+ *
+ * The map_entries fields of all log entries overlapping with this
+ * entry will be updated to remove the regions that overlap with
+ * this.
+ */
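+/* Example: if the map holds an entry covering blocks [0,100) and an entry
+ * for [40,60) is added, the old entry is split into [0,40) and [60,100),
+ * and [40,60) now maps to the new entry. */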
+template <typename T>
+void LogMap<T>::add_log_entry(std::shared_ptr<T> log_entry) {
+ std::lock_guard locker(m_lock);
+ add_log_entry_locked(log_entry);
+}
+
+template <typename T>
+void LogMap<T>::add_log_entries(std::list<std::shared_ptr<T>> &log_entries) {
+ std::lock_guard locker(m_lock);
+ ldout(m_cct, 20) << dendl;
+ for (auto &log_entry : log_entries) {
+ add_log_entry_locked(log_entry);
+ }
+}
+
+/**
+ * Remove any map entries that refer to the supplied write log
+ * entry.
+ */
+template <typename T>
+void LogMap<T>::remove_log_entry(std::shared_ptr<T> log_entry) {
+ std::lock_guard locker(m_lock);
+ remove_log_entry_locked(log_entry);
+}
+
+template <typename T>
+void LogMap<T>::remove_log_entries(std::list<std::shared_ptr<T>> &log_entries) {
+ std::lock_guard locker(m_lock);
+ ldout(m_cct, 20) << dendl;
+ for (auto &log_entry : log_entries) {
+ remove_log_entry_locked(log_entry);
+ }
+}
+
+/**
+ * Returns the list of all write log entries that overlap the specified block
+ * extent. This doesn't tell you which portions of these entries overlap the
+ * extent, or each other. For that, use find_map_entries(). A log entry may
+ * appear in the list more than once, if multiple map entries refer to it
+ * (e.g. the middle of that write log entry has been overwritten).
+ */
+template <typename T>
+std::list<std::shared_ptr<T>> LogMap<T>::find_log_entries(BlockExtent block_extent) {
+ std::lock_guard locker(m_lock);
+ ldout(m_cct, 20) << dendl;
+ return find_log_entries_locked(block_extent);
+}
+
+/**
+ * Returns the list of all write log map entries that overlap the
+ * specified block extent.
+ */
+template <typename T>
+LogMapEntries<T> LogMap<T>::find_map_entries(BlockExtent block_extent) {
+ std::lock_guard locker(m_lock);
+ ldout(m_cct, 20) << dendl;
+ return find_map_entries_locked(block_extent);
+}
+
+template <typename T>
+void LogMap<T>::add_log_entry_locked(std::shared_ptr<T> log_entry) {
+ LogMapEntry<T> map_entry(log_entry);
+ ldout(m_cct, 20) << "block_extent=" << map_entry.block_extent
+ << dendl;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ LogMapEntries<T> overlap_entries = find_map_entries_locked(map_entry.block_extent);
+ for (auto &entry : overlap_entries) {
+ ldout(m_cct, 20) << entry << dendl;
+ if (map_entry.block_extent.block_start <= entry.block_extent.block_start) {
+ if (map_entry.block_extent.block_end >= entry.block_extent.block_end) {
+ ldout(m_cct, 20) << "map entry completely occluded by new log entry" << dendl;
+ remove_map_entry_locked(entry);
+ } else {
+ ceph_assert(map_entry.block_extent.block_end < entry.block_extent.block_end);
+ /* The new entry occludes the beginning of the old entry */
+ BlockExtent adjusted_extent(map_entry.block_extent.block_end,
+ entry.block_extent.block_end);
+ adjust_map_entry_locked(entry, adjusted_extent);
+ }
+ } else {
+ if (map_entry.block_extent.block_end >= entry.block_extent.block_end) {
+ /* The new entry occludes the end of the old entry */
+ BlockExtent adjusted_extent(entry.block_extent.block_start,
+ map_entry.block_extent.block_start);
+ adjust_map_entry_locked(entry, adjusted_extent);
+ } else {
+ /* The new entry splits the old entry */
+ split_map_entry_locked(entry, map_entry.block_extent);
+ }
+ }
+ }
+ add_map_entry_locked(map_entry);
+}
+
+template <typename T>
+void LogMap<T>::remove_log_entry_locked(std::shared_ptr<T> log_entry) {
+ ldout(m_cct, 20) << "*log_entry=" << *log_entry << dendl;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ LogMapEntries<T> possible_hits = find_map_entries_locked(log_entry->block_extent());
+ for (auto &possible_hit : possible_hits) {
+ if (possible_hit.log_entry == log_entry) {
+ /* This map entry refers to the specified log entry */
+ remove_map_entry_locked(possible_hit);
+ }
+ }
+}
+
+template <typename T>
+void LogMap<T>::add_map_entry_locked(LogMapEntry<T> &map_entry) {
+ ceph_assert(map_entry.log_entry);
+ m_block_to_log_entry_map.insert(map_entry);
+ map_entry.log_entry->inc_map_ref();
+}
+
+template <typename T>
+void LogMap<T>::remove_map_entry_locked(LogMapEntry<T> &map_entry) {
+ auto it = m_block_to_log_entry_map.find(map_entry);
+ ceph_assert(it != m_block_to_log_entry_map.end());
+
+ LogMapEntry<T> erased = *it;
+ m_block_to_log_entry_map.erase(it);
+ erased.log_entry->dec_map_ref();
+ if (0 == erased.log_entry->get_map_ref()) {
+ ldout(m_cct, 20) << "log entry has zero map entries: " << erased.log_entry << dendl;
+ }
+}
+
+template <typename T>
+void LogMap<T>::adjust_map_entry_locked(LogMapEntry<T> &map_entry, BlockExtent &new_extent) {
+ auto it = m_block_to_log_entry_map.find(map_entry);
+ ceph_assert(it != m_block_to_log_entry_map.end());
+
+ LogMapEntry<T> adjusted = *it;
+ m_block_to_log_entry_map.erase(it);
+
+ m_block_to_log_entry_map.insert(LogMapEntry<T>(new_extent, adjusted.log_entry));
+}
+
+template <typename T>
+void LogMap<T>::split_map_entry_locked(LogMapEntry<T> &map_entry, BlockExtent &removed_extent) {
+ auto it = m_block_to_log_entry_map.find(map_entry);
+ ceph_assert(it != m_block_to_log_entry_map.end());
+
+ LogMapEntry<T> split = *it;
+ m_block_to_log_entry_map.erase(it);
+
+ BlockExtent left_extent(split.block_extent.block_start,
+ removed_extent.block_start);
+ m_block_to_log_entry_map.insert(LogMapEntry<T>(left_extent, split.log_entry));
+
+ BlockExtent right_extent(removed_extent.block_end,
+ split.block_extent.block_end);
+ m_block_to_log_entry_map.insert(LogMapEntry<T>(right_extent, split.log_entry));
+
+ split.log_entry->inc_map_ref();
+}
+
+template <typename T>
+std::list<std::shared_ptr<T>> LogMap<T>::find_log_entries_locked(const BlockExtent &block_extent) {
+ std::list<std::shared_ptr<T>> overlaps;
+ ldout(m_cct, 20) << "block_extent=" << block_extent << dendl;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ LogMapEntries<T> map_entries = find_map_entries_locked(block_extent);
+ for (auto &map_entry : map_entries) {
+ overlaps.emplace_back(map_entry.log_entry);
+ }
+ return overlaps;
+}
+
+/**
+ * TODO: Generalize this to do some arbitrary thing to each map
+ * extent, instead of returning a list.
+ */
+template <typename T>
+LogMapEntries<T> LogMap<T>::find_map_entries_locked(const BlockExtent &block_extent) {
+ LogMapEntries<T> overlaps;
+
+ ldout(m_cct, 20) << "block_extent=" << block_extent << dendl;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ auto p = m_block_to_log_entry_map.equal_range(LogMapEntry<T>(block_extent));
+ ldout(m_cct, 20) << "count=" << std::distance(p.first, p.second) << dendl;
+  for (auto i = p.first; i != p.second; ++i) {
+ LogMapEntry<T> entry = *i;
+ overlaps.emplace_back(entry);
+ ldout(m_cct, 20) << entry << dendl;
+ }
+ return overlaps;
+}
+
+/* We map block extents to write log entries, or portions of write log
+ * entries. These are both represented by a WriteLogMapEntry. When a
+ * GenericWriteLogEntry is added to this map, a WriteLogMapEntry is created to
+ * represent the entire block extent of the GenericWriteLogEntry, and the
+ * WriteLogMapEntry is added to the set.
+ *
+ * The set must not contain overlapping WriteLogMapEntrys. WriteLogMapEntrys
+ * in the set that overlap with one being added are adjusted (shrunk, split,
+ * or removed) before the new entry is added.
+ *
+ * The comparison below is ambiguous for overlapping extents (neither
+ * compares less than the other), but that never causes trouble because we
+ * ensure the set contains no overlapping entries. The same ambiguity is
+ * what lets equal_range() find every entry overlapping a given block
+ * extent: the range begins at the first entry whose extent doesn't end
+ * before the given extent starts, and ends after the last entry whose
+ * extent starts before the given extent ends.
+ */
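+/* Example: with non-overlapping entries [0,10) and [10,20) in the set, a
+ * query key of [5,15) compares "not less" against both entries and both
+ * compare "not less" against it, so equal_range() returns both. */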
+template <typename T>
+bool LogMap<T>::LogMapEntryCompare::operator()(const LogMapEntry<T> &lhs,
+ const LogMapEntry<T> &rhs) const {
+ if (lhs.block_extent.block_end <= rhs.block_extent.block_start) {
+ return true;
+ }
+ return false;
+}
+
+} //namespace pwl
+} //namespace cache
+} //namespace librbd
+
+template class librbd::cache::pwl::LogMap<librbd::cache::pwl::GenericWriteLogEntry>;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H
+#define CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H
+
+#include "librbd/BlockGuard.h"
+#include <list>
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+/**
+ * WriteLogMap: maps block extents to GenericWriteLogEntries
+ *
+ * A WriteLogMapEntry (based on LogMapEntry) refers to a portion of a GenericWriteLogEntry
+ */
+template <typename T>
+class LogMapEntry {
+public:
+ BlockExtent block_extent;
+ std::shared_ptr<T> log_entry;
+
+ LogMapEntry(BlockExtent block_extent,
+ std::shared_ptr<T> log_entry = nullptr);
+ LogMapEntry(std::shared_ptr<T> log_entry);
+
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ LogMapEntry<U> &e);
+};
+
+template <typename T>
+using LogMapEntries = std::list<LogMapEntry<T>>;
+
+template <typename T>
+class LogMap {
+public:
+ LogMap(CephContext *cct);
+ LogMap(const LogMap&) = delete;
+ LogMap &operator=(const LogMap&) = delete;
+
+ void add_log_entry(std::shared_ptr<T> log_entry);
+ void add_log_entries(std::list<std::shared_ptr<T>> &log_entries);
+ void remove_log_entry(std::shared_ptr<T> log_entry);
+ void remove_log_entries(std::list<std::shared_ptr<T>> &log_entries);
+ std::list<std::shared_ptr<T>> find_log_entries(BlockExtent block_extent);
+ LogMapEntries<T> find_map_entries(BlockExtent block_extent);
+
+private:
+ void add_log_entry_locked(std::shared_ptr<T> log_entry);
+ void remove_log_entry_locked(std::shared_ptr<T> log_entry);
+ void add_map_entry_locked(LogMapEntry<T> &map_entry);
+ void remove_map_entry_locked(LogMapEntry<T> &map_entry);
+ void adjust_map_entry_locked(LogMapEntry<T> &map_entry, BlockExtent &new_extent);
+ void split_map_entry_locked(LogMapEntry<T> &map_entry, BlockExtent &removed_extent);
+ std::list<std::shared_ptr<T>> find_log_entries_locked(const BlockExtent &block_extent);
+ LogMapEntries<T> find_map_entries_locked(const BlockExtent &block_extent);
+
+ using LogMapEntryT = LogMapEntry<T>;
+
+ class LogMapEntryCompare {
+ public:
+ bool operator()(const LogMapEntryT &lhs,
+ const LogMapEntryT &rhs) const;
+ };
+
+ using BlockExtentToLogMapEntries = std::set<LogMapEntryT,
+ LogMapEntryCompare>;
+
+ CephContext *m_cct;
+ ceph::mutex m_lock;
+ BlockExtentToLogMapEntries m_block_to_log_entry_map;
+};
+
+} //namespace pwl
+} //namespace cache
+} //namespace librbd
+
+#endif //CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "LogOperation.h"
+#include "librbd/cache/pwl/Types.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::LogOperation: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+
+namespace cache {
+
+namespace pwl {
+
+GenericLogOperation::GenericLogOperation(const utime_t dispatch_time, PerfCounters *perfcounter)
+ : m_perfcounter(perfcounter), dispatch_time(dispatch_time) {
+}
+
+std::ostream& GenericLogOperation::format(std::ostream &os) const {
+ os << "dispatch_time=[" << dispatch_time << "], "
+ << "buf_persist_time=[" << buf_persist_time << "], "
+ << "buf_persist_comp_time=[" << buf_persist_comp_time << "], "
+ << "log_append_time=[" << log_append_time << "], "
+ << "log_append_comp_time=[" << log_append_comp_time << "], ";
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const GenericLogOperation &op) {
+ return op.format(os);
+}
+
+SyncPointLogOperation::SyncPointLogOperation(ceph::mutex &lock,
+ std::shared_ptr<SyncPoint> sync_point,
+ const utime_t dispatch_time,
+ PerfCounters *perfcounter,
+ CephContext *cct)
+ : GenericLogOperation(dispatch_time, perfcounter), m_cct(cct), m_lock(lock), sync_point(sync_point) {
+}
+
+SyncPointLogOperation::~SyncPointLogOperation() { }
+
+std::ostream &SyncPointLogOperation::format(std::ostream &os) const {
+ os << "(Sync Point) ";
+ GenericLogOperation::format(os);
+ os << ", "
+ << "sync_point=[" << *sync_point << "]";
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const SyncPointLogOperation &op) {
+ return op.format(os);
+}
+
+std::vector<Context*> SyncPointLogOperation::append_sync_point() {
+ std::vector<Context*> appending_contexts;
+ std::lock_guard locker(m_lock);
+ if (!sync_point->appending) {
+ sync_point->appending = true;
+ }
+ appending_contexts.swap(sync_point->on_sync_point_appending);
+ return appending_contexts;
+}
+
+void SyncPointLogOperation::clear_earlier_sync_point() {
+ std::lock_guard locker(m_lock);
+ ceph_assert(sync_point->later_sync_point);
+ ceph_assert(sync_point->later_sync_point->earlier_sync_point ==
+ sync_point);
+ sync_point->later_sync_point->earlier_sync_point = nullptr;
+}
+
+std::vector<Context*> SyncPointLogOperation::swap_on_sync_point_persisted() {
+ std::lock_guard locker(m_lock);
+ std::vector<Context*> persisted_contexts;
+ persisted_contexts.swap(sync_point->on_sync_point_persisted);
+ return persisted_contexts;
+}
+
+void SyncPointLogOperation::appending() {
+ ceph_assert(sync_point);
+ ldout(m_cct, 20) << "Sync point op=[" << *this
+ << "] appending" << dendl;
+ auto appending_contexts = append_sync_point();
+ for (auto &ctx : appending_contexts) {
+ ctx->complete(0);
+ }
+}
+
+void SyncPointLogOperation::complete(int result) {
+ ceph_assert(sync_point);
+  ldout(m_cct, 20) << "Sync point op=[" << *this
+ << "] completed" << dendl;
+ clear_earlier_sync_point();
+
+ /* Do append now in case completion occurred before the
+ * normal append callback executed, and to handle
+ * on_append work that was queued after the sync point
+ * entered the appending state. */
+ appending();
+ auto persisted_contexts = swap_on_sync_point_persisted();
+ for (auto &ctx : persisted_contexts) {
+ ctx->complete(result);
+ }
+}
+
+GenericWriteLogOperation::GenericWriteLogOperation(std::shared_ptr<SyncPoint> sync_point,
+ const utime_t dispatch_time,
+ PerfCounters *perfcounter,
+ CephContext *cct)
+ : GenericLogOperation(dispatch_time, perfcounter),
+ m_lock(ceph::make_mutex(util::unique_lock_name(
+ "librbd::cache::pwl::GenericWriteLogOperation::m_lock", this))),
+ m_cct(cct),
+ sync_point(sync_point) {
+}
+
+GenericWriteLogOperation::~GenericWriteLogOperation() { }
+
+std::ostream &GenericWriteLogOperation::format(std::ostream &os) const {
+ GenericLogOperation::format(os);
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const GenericWriteLogOperation &op) {
+ return op.format(os);
+}
+
+/* Called when the write log operation is appending and its log position is guaranteed */
+void GenericWriteLogOperation::appending() {
+ Context *on_append = nullptr;
+ ldout(m_cct, 20) << __func__ << " " << this << dendl;
+ {
+ std::lock_guard locker(m_lock);
+ on_append = on_write_append;
+ on_write_append = nullptr;
+ }
+ if (on_append) {
+ ldout(m_cct, 20) << __func__ << " " << this << " on_append=" << on_append << dendl;
+ on_append->complete(0);
+ }
+}
+
+/* Called when the write log operation is completed in all log replicas */
+void GenericWriteLogOperation::complete(int result) {
+ appending();
+ Context *on_persist = nullptr;
+ ldout(m_cct, 20) << __func__ << " " << this << dendl;
+ {
+ std::lock_guard locker(m_lock);
+ on_persist = on_write_persist;
+ on_write_persist = nullptr;
+ }
+ if (on_persist) {
+ ldout(m_cct, 20) << __func__ << " " << this << " on_persist=" << on_persist << dendl;
+ on_persist->complete(result);
+ }
+}
+
+WriteLogOperation::WriteLogOperation(WriteLogOperationSet &set,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ CephContext *cct)
+ : GenericWriteLogOperation(set.sync_point, set.dispatch_time, set.perfcounter, cct),
+ log_entry(std::make_shared<WriteLogEntry>(set.sync_point->log_entry, image_offset_bytes, write_bytes)) {
+ on_write_append = set.extent_ops_appending->new_sub();
+ on_write_persist = set.extent_ops_persist->new_sub();
+ log_entry->sync_point_entry->writes++;
+ log_entry->sync_point_entry->bytes += write_bytes;
+}
+
+WriteLogOperation::~WriteLogOperation() { }
+
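+/* Bind this op to its reserved write buffer allocation and take a zero-copy
+ * slice of the request's bufferlist covering only this op's bytes
+ * (bufferlist::substr_of shares the underlying buffers). */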
+void WriteLogOperation::init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation, uint64_t current_sync_gen,
+ uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset,
+ bool persist_on_flush) {
+ log_entry->init(has_data, allocation, current_sync_gen, last_op_sequence_num, persist_on_flush);
+ buffer_alloc = &(*allocation);
+ bl.substr_of(write_req_bl, buffer_offset,
+ log_entry->write_bytes());
+}
+
+std::ostream &WriteLogOperation::format(std::ostream &os) const {
+ os << "(Write) ";
+ GenericWriteLogOperation::format(os);
+ os << ", ";
+ if (log_entry) {
+ os << "log_entry=[" << *log_entry << "], ";
+ } else {
+ os << "log_entry=nullptr, ";
+ }
+  os << "bl=[" << bl << "], "
+ << "buffer_alloc=" << buffer_alloc;
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const WriteLogOperation &op) {
+ return op.format(os);
+}
+
+
+void WriteLogOperation::complete(int result) {
+ GenericWriteLogOperation::complete(result);
+ m_perfcounter->tinc(l_librbd_pwl_log_op_dis_to_buf_t, buf_persist_time - dispatch_time);
+ utime_t buf_lat = buf_persist_comp_time - buf_persist_time;
+ m_perfcounter->tinc(l_librbd_pwl_log_op_buf_to_bufc_t, buf_lat);
+ m_perfcounter->hinc(l_librbd_pwl_log_op_buf_to_bufc_t_hist, buf_lat.to_nsec(),
+ log_entry->ram_entry.write_bytes);
+ m_perfcounter->tinc(l_librbd_pwl_log_op_buf_to_app_t, log_append_time - buf_persist_time);
+}
+
+void WriteLogOperation::copy_bl_to_pmem_buffer() {
+  /* This op is held via shared_ptr; log_entry and its pmem buffer are only
+   * valid while the op remains in scope */
+ bufferlist::iterator i(&bl);
+ m_perfcounter->inc(l_librbd_pwl_log_op_bytes, log_entry->write_bytes());
+ ldout(m_cct, 20) << bl << dendl;
+ i.copy((unsigned)log_entry->write_bytes(), (char*)log_entry->pmem_buffer);
+}
+
+void WriteLogOperation::flush_pmem_buf_to_cache(PMEMobjpool *log_pool) {
+ buf_persist_time = ceph_clock_now();
+ pmemobj_flush(log_pool, log_entry->pmem_buffer, log_entry->write_bytes());
+}
+
+WriteLogOperationSet::WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr<SyncPoint> sync_point,
+ bool persist_on_flush, CephContext *cct, Context *on_finish)
+ : m_cct(cct), m_on_finish(on_finish),
+ persist_on_flush(persist_on_flush),
+ dispatch_time(dispatched),
+ perfcounter(perfcounter),
+ sync_point(sync_point) {
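+  /* Two chained gathers: extent_ops_appending completes when every extent
+   * op in this set is appending (log position guaranteed), and it holds one
+   * sub of extent_ops_persist, so extent_ops_persist completes only after
+   * all extent ops have persisted and the appending gather has finished. */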
+ on_ops_appending = sync_point->prior_persisted_gather_new_sub();
+ on_ops_persist = nullptr;
+ extent_ops_persist =
+ new C_Gather(m_cct,
+ new LambdaContext( [this](int r) {
+                       ldout(this->m_cct, 20) << __func__ << " " << this << " extent_ops_persist completed" << dendl;
+ if (on_ops_persist) {
+ on_ops_persist->complete(r);
+ }
+ m_on_finish->complete(r);
+ }));
+ auto appending_persist_sub = extent_ops_persist->new_sub();
+ extent_ops_appending =
+ new C_Gather(m_cct,
+ new LambdaContext( [this, appending_persist_sub](int r) {
+                        ldout(this->m_cct, 20) << __func__ << " " << this << " extent_ops_appending completed" << dendl;
+ on_ops_appending->complete(r);
+ appending_persist_sub->complete(r);
+ }));
+}
+
+WriteLogOperationSet::~WriteLogOperationSet() { }
+
+std::ostream &operator<<(std::ostream &os,
+ const WriteLogOperationSet &s) {
+ os << "cell=" << (void*)s.cell << ", "
+     << "extent_ops_appending=[" << s.extent_ops_appending << "], "
+ << "extent_ops_persist=[" << s.extent_ops_persist << "]";
+ return os;
+}
+
+DiscardLogOperation::DiscardLogOperation(std::shared_ptr<SyncPoint> sync_point,
+ const uint64_t image_offset_bytes,
+ const uint64_t write_bytes,
+ uint32_t discard_granularity_bytes,
+ const utime_t dispatch_time,
+ PerfCounters *perfcounter,
+ CephContext *cct)
+ : GenericWriteLogOperation(sync_point, dispatch_time, perfcounter, cct),
+ log_entry(std::make_shared<DiscardLogEntry>(sync_point->log_entry,
+ image_offset_bytes,
+ write_bytes,
+ discard_granularity_bytes)) {
+ on_write_append = sync_point->prior_persisted_gather_new_sub();
+ on_write_persist = nullptr;
+ log_entry->sync_point_entry->writes++;
+ log_entry->sync_point_entry->bytes += write_bytes;
+}
+
+DiscardLogOperation::~DiscardLogOperation() { }
+
+void DiscardLogOperation::init(uint64_t current_sync_gen, bool persist_on_flush,
+ uint64_t last_op_sequence_num, Context *write_persist) {
+ log_entry->init(current_sync_gen, persist_on_flush, last_op_sequence_num);
+ this->on_write_persist = write_persist;
+}
+
+std::ostream &DiscardLogOperation::format(std::ostream &os) const {
+ os << "(Discard) ";
+ GenericWriteLogOperation::format(os);
+ os << ", ";
+ if (log_entry) {
+ os << "log_entry=[" << *log_entry << "], ";
+ } else {
+ os << "log_entry=nullptr, ";
+ }
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const DiscardLogOperation &op) {
+ return op.format(os);
+}
+
+WriteSameLogOperation::WriteSameLogOperation(WriteLogOperationSet &set,
+ uint64_t image_offset_bytes,
+ uint64_t write_bytes,
+ uint32_t data_len,
+ CephContext *cct)
+ : WriteLogOperation(set, image_offset_bytes, write_bytes, cct) {
+ log_entry =
+ std::make_shared<WriteSameLogEntry>(set.sync_point->log_entry, image_offset_bytes, write_bytes, data_len);
+ ldout(m_cct, 20) << __func__ << " " << this << dendl;
+}
+
+WriteSameLogOperation::~WriteSameLogOperation() { }
+
+std::ostream &WriteSameLogOperation::format(std::ostream &os) const {
+ os << "(Write Same) ";
+ WriteLogOperation::format(os);
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const WriteSameLogOperation &op) {
+ return op.format(os);
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H
+#define CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H
+
+#include "include/utime.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/cache/pwl/SyncPoint.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+struct WriteBufferAllocation;
+
+class WriteLogOperationSet;
+
+class WriteLogOperation;
+
+class GenericWriteLogOperation;
+
+class SyncPointLogOperation;
+
+class GenericLogOperation;
+
+using GenericLogOperationSharedPtr = std::shared_ptr<GenericLogOperation>;
+
+using GenericLogOperationsVector = std::vector<GenericLogOperationSharedPtr>;
+
+class GenericLogOperation {
+protected:
+ PerfCounters *m_perfcounter = nullptr;
+public:
+ utime_t dispatch_time; // When op created
+ utime_t buf_persist_time; // When buffer persist begins
+ utime_t buf_persist_comp_time; // When buffer persist completes
+ utime_t log_append_time; // When log append begins
+ utime_t log_append_comp_time; // When log append completes
+ GenericLogOperation(const utime_t dispatch_time, PerfCounters *perfcounter);
+ virtual ~GenericLogOperation() { };
+ GenericLogOperation(const GenericLogOperation&) = delete;
+ GenericLogOperation &operator=(const GenericLogOperation&) = delete;
+ virtual std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const GenericLogOperation &op);
+ virtual const std::shared_ptr<GenericLogEntry> get_log_entry() = 0;
+ virtual void appending() = 0;
+ virtual void complete(int r) = 0;
+ virtual void mark_log_entry_completed() {};
+ virtual bool reserved_allocated() const {
+ return false;
+ }
+ virtual bool is_writing_op() const {
+ return false;
+ }
+ virtual void copy_bl_to_pmem_buffer() {};
+ virtual void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) {};
+};
+
+class SyncPointLogOperation : public GenericLogOperation {
+private:
+ CephContext *m_cct;
+ ceph::mutex &m_lock;
+ std::vector<Context*> append_sync_point();
+ void clear_earlier_sync_point();
+ std::vector<Context*> swap_on_sync_point_persisted();
+public:
+ std::shared_ptr<SyncPoint> sync_point;
+ SyncPointLogOperation(ceph::mutex &lock,
+ std::shared_ptr<SyncPoint> sync_point,
+ const utime_t dispatch_time,
+ PerfCounters *perfcounter,
+ CephContext *cct);
+ ~SyncPointLogOperation() override;
+ SyncPointLogOperation(const SyncPointLogOperation&) = delete;
+ SyncPointLogOperation &operator=(const SyncPointLogOperation&) = delete;
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const SyncPointLogOperation &op);
+ const std::shared_ptr<GenericLogEntry> get_log_entry() override {
+ return sync_point->log_entry;
+ }
+ void appending() override;
+ void complete(int r) override;
+};
+
+class GenericWriteLogOperation : public GenericLogOperation {
+protected:
+ ceph::mutex m_lock;
+ CephContext *m_cct;
+public:
+ std::shared_ptr<SyncPoint> sync_point;
+ Context *on_write_append = nullptr; /* Completion for things waiting on this
+ * write's position in the log to be
+ * guaranteed */
+ Context *on_write_persist = nullptr; /* Completion for things waiting on this
+ * write to persist */
+ GenericWriteLogOperation(std::shared_ptr<SyncPoint> sync_point,
+ const utime_t dispatch_time,
+ PerfCounters *perfcounter,
+ CephContext *cct);
+ ~GenericWriteLogOperation() override;
+ GenericWriteLogOperation(const GenericWriteLogOperation&) = delete;
+ GenericWriteLogOperation &operator=(const GenericWriteLogOperation&) = delete;
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const GenericWriteLogOperation &op);
+  void mark_log_entry_completed() override {
+ sync_point->log_entry->writes_completed++;
+ }
+ bool reserved_allocated() const override {
+ return true;
+ }
+ bool is_writing_op() const override {
+ return true;
+ }
+ void appending() override;
+ void complete(int r) override;
+};
+
+class WriteLogOperation : public GenericWriteLogOperation {
+public:
+ using GenericWriteLogOperation::m_lock;
+ using GenericWriteLogOperation::sync_point;
+ using GenericWriteLogOperation::on_write_append;
+ using GenericWriteLogOperation::on_write_persist;
+ std::shared_ptr<WriteLogEntry> log_entry;
+ bufferlist bl;
+ WriteBufferAllocation *buffer_alloc = nullptr;
+ WriteLogOperation(WriteLogOperationSet &set, const uint64_t image_offset_bytes,
+ const uint64_t write_bytes, CephContext *cct);
+ ~WriteLogOperation() override;
+ WriteLogOperation(const WriteLogOperation&) = delete;
+ WriteLogOperation &operator=(const WriteLogOperation&) = delete;
+ void init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation, uint64_t current_sync_gen,
+ uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset,
+ bool persist_on_flush);
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const WriteLogOperation &op);
+ const std::shared_ptr<GenericLogEntry> get_log_entry() override {
+ return log_entry;
+ }
+
+ void complete(int r) override;
+ void copy_bl_to_pmem_buffer() override;
+ void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) override;
+};
+
+
+class WriteLogOperationSet {
+private:
+ CephContext *m_cct;
+ Context *m_on_finish;
+public:
+ bool persist_on_flush;
+ BlockGuardCell *cell;
+ C_Gather *extent_ops_appending;
+ Context *on_ops_appending;
+ C_Gather *extent_ops_persist;
+ Context *on_ops_persist;
+ GenericLogOperationsVector operations;
+ utime_t dispatch_time; /* When set created */
+ PerfCounters *perfcounter = nullptr;
+ std::shared_ptr<SyncPoint> sync_point;
+ WriteLogOperationSet(const utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr<SyncPoint> sync_point,
+ const bool persist_on_flush, CephContext *cct, Context *on_finish);
+ ~WriteLogOperationSet();
+ WriteLogOperationSet(const WriteLogOperationSet&) = delete;
+ WriteLogOperationSet &operator=(const WriteLogOperationSet&) = delete;
+ friend std::ostream &operator<<(std::ostream &os,
+ const WriteLogOperationSet &s);
+};
+
+class DiscardLogOperation : public GenericWriteLogOperation {
+public:
+ using GenericWriteLogOperation::m_lock;
+ using GenericWriteLogOperation::sync_point;
+ using GenericWriteLogOperation::on_write_append;
+ using GenericWriteLogOperation::on_write_persist;
+ std::shared_ptr<DiscardLogEntry> log_entry;
+ DiscardLogOperation(std::shared_ptr<SyncPoint> sync_point,
+ const uint64_t image_offset_bytes,
+ const uint64_t write_bytes,
+ uint32_t discard_granularity_bytes,
+ const utime_t dispatch_time,
+ PerfCounters *perfcounter,
+ CephContext *cct);
+ ~DiscardLogOperation() override;
+ DiscardLogOperation(const DiscardLogOperation&) = delete;
+ DiscardLogOperation &operator=(const DiscardLogOperation&) = delete;
+ const std::shared_ptr<GenericLogEntry> get_log_entry() override {
+ return log_entry;
+ }
+ bool reserved_allocated() const override {
+ return false;
+ }
+ void init(uint64_t current_sync_gen, bool persist_on_flush,
+ uint64_t last_op_sequence_num, Context *write_persist);
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const DiscardLogOperation &op);
+};
+
+class WriteSameLogOperation : public WriteLogOperation {
+public:
+ using GenericWriteLogOperation::m_lock;
+ using GenericWriteLogOperation::sync_point;
+ using GenericWriteLogOperation::on_write_append;
+ using GenericWriteLogOperation::on_write_persist;
+ using WriteLogOperation::log_entry;
+ using WriteLogOperation::bl;
+ using WriteLogOperation::buffer_alloc;
+ WriteSameLogOperation(WriteLogOperationSet &set,
+ const uint64_t image_offset_bytes,
+ const uint64_t write_bytes,
+ const uint32_t data_len,
+ CephContext *cct);
+  ~WriteSameLogOperation() override;
+ WriteSameLogOperation(const WriteSameLogOperation&) = delete;
+ WriteSameLogOperation &operator=(const WriteSameLogOperation&) = delete;
+ std::ostream &format(std::ostream &os) const;
+ friend std::ostream &operator<<(std::ostream &os,
+ const WriteSameLogOperation &op);
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_LOG_OPERATION_H
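
A minimal standalone sketch (not part of this patch; all names here are
illustrative stand-ins) of the pattern this header sets up: heterogeneous log
operations held as shared_ptr in one GenericLogOperationsVector and driven
through virtual dispatch:

    #include <iostream>
    #include <memory>
    #include <vector>

    struct Op {                              // stands in for GenericLogOperation
      virtual ~Op() = default;
      virtual bool is_writing_op() const { return false; }
      virtual void appending() = 0;
    };

    struct WriteOp : Op {                    // stands in for WriteLogOperation
      bool is_writing_op() const override { return true; }
      void appending() override { std::cout << "write appending\n"; }
    };

    struct SyncOp : Op {                     // stands in for SyncPointLogOperation
      void appending() override { std::cout << "sync point appending\n"; }
    };

    int main() {
      std::vector<std::shared_ptr<Op>> ops;  // like GenericLogOperationsVector
      ops.push_back(std::make_shared<WriteOp>());
      ops.push_back(std::make_shared<SyncOp>());
      for (auto &op : ops) {
        op->appending();                     // virtual dispatch per entry
      }
    }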
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ReadRequest.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ReadRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+void C_ReadRequest::finish(int r) {
+ ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << dendl;
+ int hits = 0;
+ int misses = 0;
+ int hit_bytes = 0;
+ int miss_bytes = 0;
+ if (r >= 0) {
+ /*
+ * At this point the miss read has completed. We'll iterate through
+ * read_extents and produce *m_out_bl by assembling pieces of miss_bl
+ * and the individual hit extent bufs in the read extents that represent
+ * hits.
+ */
+ uint64_t miss_bl_offset = 0;
+ for (auto &extent : read_extents) {
+ if (extent.m_bl.length()) {
+ /* This was a hit */
+ ceph_assert(extent.second == extent.m_bl.length());
+ ++hits;
+ hit_bytes += extent.second;
+ m_out_bl->claim_append(extent.m_bl);
+ } else {
+ /* This was a miss. */
+ ++misses;
+ miss_bytes += extent.second;
+ bufferlist miss_extent_bl;
+ miss_extent_bl.substr_of(miss_bl, miss_bl_offset, extent.second);
+ /* Add this read miss bufferlist to the output bufferlist */
+ m_out_bl->claim_append(miss_extent_bl);
+ /* Consume these bytes in the read miss bufferlist */
+ miss_bl_offset += extent.second;
+ }
+ }
+ }
+ ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << " bl=" << *m_out_bl << dendl;
+ utime_t now = ceph_clock_now();
+ ceph_assert((int)m_out_bl->length() == hit_bytes + miss_bytes);
+ m_on_finish->complete(r);
+ m_perfcounter->inc(l_librbd_pwl_rd_bytes, hit_bytes + miss_bytes);
+ m_perfcounter->inc(l_librbd_pwl_rd_hit_bytes, hit_bytes);
+ m_perfcounter->tinc(l_librbd_pwl_rd_latency, now - m_arrived_time);
+ if (!misses) {
+ m_perfcounter->inc(l_librbd_pwl_rd_hit_req, 1);
+ m_perfcounter->tinc(l_librbd_pwl_rd_hit_latency, now - m_arrived_time);
+ } else {
+ if (hits) {
+ m_perfcounter->inc(l_librbd_pwl_rd_part_hit_req, 1);
+ }
+ }
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
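
The hit/miss assembly in C_ReadRequest::finish() can be pictured with this
standalone sketch, which is not Ceph code: std::string stands in for
bufferlist, an empty per-extent buffer marks a miss, and misses are satisfied
from one contiguous miss buffer at a running offset:

    #include <cassert>
    #include <cstddef>
    #include <string>
    #include <vector>

    int main() {
      struct Extent { std::string hit_buf; std::size_t len; };
      std::vector<Extent> read_extents = {{"AAAA", 4}, {"", 3}, {"CC", 2}};
      std::string miss_bl = "BBB";                 // one read covering all misses
      std::string out;
      std::size_t miss_off = 0;
      for (auto &e : read_extents) {
        if (!e.hit_buf.empty()) {                  // hit: cached piece is exact
          assert(e.hit_buf.size() == e.len);
          out += e.hit_buf;                        // like claim_append(extent.m_bl)
        } else {                                   // miss: carve from miss buffer
          out += miss_bl.substr(miss_off, e.len);  // like substr_of + claim_append
          miss_off += e.len;                       // consume bytes from miss_bl
        }
      }
      assert(out == "AAAABBBCC");
    }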
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_READ_REQUEST_H
+#define CEPH_LIBRBD_CACHE_PWL_READ_REQUEST_H
+
+#include "include/Context.h"
+#include "librbd/cache/pwl/Types.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+typedef std::vector<pwl::ImageExtentBuf> ImageExtentBufs;
+
+class C_ReadRequest : public Context {
+public:
+ io::Extents miss_extents; // move back to caller
+ ImageExtentBufs read_extents;
+ bufferlist miss_bl;
+
+ C_ReadRequest(CephContext *cct, utime_t arrived, PerfCounters *perfcounter, bufferlist *out_bl, Context *on_finish)
+ : m_cct(cct), m_on_finish(on_finish), m_out_bl(out_bl),
+ m_arrived_time(arrived), m_perfcounter(perfcounter) {}
+ ~C_ReadRequest() {}
+
+ void finish(int r) override;
+
+ const char *get_name() const {
+ return "C_ReadRequest";
+ }
+
+private:
+ CephContext *m_cct;
+ Context *m_on_finish;
+ bufferlist *m_out_bl;
+ utime_t m_arrived_time;
+ PerfCounters *m_perfcounter;
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_READ_REQUEST_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ReplicatedWriteLog.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include <map>
+#include <vector>
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ReplicatedWriteLog: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+using namespace librbd::cache::pwl;
+
+template <typename I>
+ReplicatedWriteLog<I>::ReplicatedWriteLog(I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state)
+: AbstractWriteLog<I>(image_ctx, cache_state)
+{
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ReplicatedWriteLog<librbd::ImageCtx>;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
+#define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
+
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "common/AsyncOpTracker.h"
+#include "librbd/cache/ImageCache.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/Utils.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/cache/Types.h"
+#include "librbd/cache/pwl/LogOperation.h"
+#include "librbd/cache/pwl/Request.h"
+#include "librbd/cache/pwl/LogMap.h"
+#include "AbstractWriteLog.h"
+#include <functional>
+#include <list>
+
+class Context;
+class SafeTimer;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+
+namespace pwl {
+
+template <typename ImageCtxT>
+class ReplicatedWriteLog : public AbstractWriteLog<ImageCtxT> {
+public:
+ typedef io::Extent Extent;
+ typedef io::Extents Extents;
+
+ ReplicatedWriteLog(ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state);
+ ~ReplicatedWriteLog();
+ ReplicatedWriteLog(const ReplicatedWriteLog&) = delete;
+ ReplicatedWriteLog &operator=(const ReplicatedWriteLog&) = delete;
+
+private:
+ using This = AbstractWriteLog<ImageCtxT>;
+ using C_WriteRequestT = pwl::C_WriteRequest<This>;
+ using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
+ using C_FlushRequestT = pwl::C_FlushRequest<This>;
+ using C_DiscardRequestT = pwl::C_DiscardRequest<This>;
+ using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
+ using C_CompAndWriteRequestT = pwl::C_CompAndWriteRequest<This>;
+
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::ReplicatedWriteLog<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
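
The `extern template class` declaration above pairs with the `template class`
definition in ReplicatedWriteLog.cc so the template is instantiated in exactly
one translation unit. The idiom in a single-file sketch (illustrative names):

    #include <iostream>

    template <typename T>
    struct Log {
      T v;
      void dump() const { std::cout << v << "\n"; }
    };

    // In the header this would read: extern template struct Log<int>;
    // telling every includer not to instantiate Log<int> itself.
    extern template struct Log<int>;

    // In exactly one .cc file, the explicit instantiation definition:
    template struct Log<int>;

    int main() {
      Log<int> l{42};
      l.dump();  // prints 42
    }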
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Request.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/cache/pwl/ReplicatedWriteLog.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::Request: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+template <typename T>
+C_BlockIORequest<T>::C_BlockIORequest(T &pwl, const utime_t arrived, io::Extents &&extents,
+ bufferlist&& bl, const int fadvise_flags, Context *user_req)
+ : pwl(pwl), image_extents(std::move(extents)),
+ bl(std::move(bl)), fadvise_flags(fadvise_flags),
+ user_req(user_req), image_extents_summary(image_extents), m_arrived_time(arrived) {
+ ldout(pwl.get_context(), 99) << this << dendl;
+}
+
+template <typename T>
+C_BlockIORequest<T>::~C_BlockIORequest() {
+ ldout(pwl.get_context(), 99) << this << dendl;
+ ceph_assert(m_cell_released || !m_cell);
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_BlockIORequest<T> &req) {
+ os << "image_extents=[" << req.image_extents << "], "
+ << "image_extents_summary=[" << req.image_extents_summary << "], "
+ << "bl=" << req.bl << ", "
+ << "user_req=" << req.user_req << ", "
+ << "m_user_req_completed=" << req.m_user_req_completed << ", "
+ << "m_deferred=" << req.m_deferred << ", "
+ << "detained=" << req.detained << ", "
+ << "waited_lanes=" << req.waited_lanes << ", "
+ << "waited_entries=" << req.waited_entries << ", "
+ << "waited_buffers=" << req.waited_buffers << "";
+ return os;
+}
+
+template <typename T>
+void C_BlockIORequest<T>::set_cell(BlockGuardCell *cell) {
+ ldout(pwl.get_context(), 20) << this << " cell=" << cell << dendl;
+ ceph_assert(cell);
+ ceph_assert(!m_cell);
+ m_cell = cell;
+}
+
+template <typename T>
+BlockGuardCell *C_BlockIORequest<T>::get_cell(void) {
+ ldout(pwl.get_context(), 20) << this << " cell=" << m_cell << dendl;
+ return m_cell;
+}
+
+template <typename T>
+void C_BlockIORequest<T>::release_cell() {
+ ldout(pwl.get_context(), 20) << this << " cell=" << m_cell << dendl;
+ ceph_assert(m_cell);
+ bool initial = false;
+ if (m_cell_released.compare_exchange_strong(initial, true)) {
+ pwl.release_guarded_request(m_cell);
+ } else {
+ ldout(pwl.get_context(), 5) << "cell " << m_cell << " already released for " << this << dendl;
+ }
+}
+
+template <typename T>
+void C_BlockIORequest<T>::complete_user_request(int r) {
+ bool initial = false;
+ if (m_user_req_completed.compare_exchange_strong(initial, true)) {
+ ldout(pwl.get_context(), 15) << this << " completing user req" << dendl;
+ m_user_req_completed_time = ceph_clock_now();
+ user_req->complete(r);
+    // Clear user_req since complete() deletes it
+ user_req = nullptr;
+ } else {
+ ldout(pwl.get_context(), 20) << this << " user req already completed" << dendl;
+ }
+}
+
+template <typename T>
+void C_BlockIORequest<T>::finish(int r) {
+ ldout(pwl.get_context(), 20) << this << dendl;
+
+ complete_user_request(r);
+ bool initial = false;
+ if (m_finish_called.compare_exchange_strong(initial, true)) {
+ ldout(pwl.get_context(), 15) << this << " finishing" << dendl;
+ finish_req(0);
+ } else {
+ ldout(pwl.get_context(), 20) << this << " already finished" << dendl;
+ ceph_assert(0);
+ }
+}
+
+template <typename T>
+void C_BlockIORequest<T>::deferred() {
+ bool initial = false;
+ if (m_deferred.compare_exchange_strong(initial, true)) {
+ deferred_handler();
+ }
+}
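
release_cell(), complete_user_request(), finish() and deferred() above all
share one idiom: a std::atomic<bool> flipped with compare_exchange_strong so
that only the first caller performs the transition. A standalone sketch
(illustrative names):

    #include <atomic>
    #include <iostream>

    std::atomic<bool> done{false};

    void complete_once(int r) {
      bool initial = false;
      // Only the caller that swaps false -> true performs the completion.
      if (done.compare_exchange_strong(initial, true)) {
        std::cout << "completing with r=" << r << "\n";
      } else {
        std::cout << "already completed\n";
      }
    }

    int main() {
      complete_once(0);   // first call wins
      complete_once(-5);  // later calls are no-ops
    }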
+
+template <typename T>
+C_WriteRequest<T>::C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : C_BlockIORequest<T>(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, user_req),
+ m_perfcounter(perfcounter), m_lock(lock) {
+ ldout(pwl.get_context(), 99) << this << dendl;
+}
+
+template <typename T>
+C_WriteRequest<T>::~C_WriteRequest() {
+ ldout(pwl.get_context(), 99) << this << dendl;
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_WriteRequest<T> &req) {
+ os << (C_BlockIORequest<T>&)req
+ << " m_resources.allocated=" << req.m_resources.allocated;
+ if (req.op_set) {
+ os << "op_set=" << *req.op_set;
+ }
+ return os;
+}
+
+template <typename T>
+void C_WriteRequest<T>::blockguard_acquired(GuardedRequestFunctionContext &guard_ctx) {
+ ldout(pwl.get_context(), 20) << __func__ << " write_req=" << this << " cell=" << guard_ctx.cell << dendl;
+
+ ceph_assert(guard_ctx.cell);
+ this->detained = guard_ctx.state.detained; /* overlapped */
+ this->m_queued = guard_ctx.state.queued; /* queued behind at least one barrier */
+ this->set_cell(guard_ctx.cell);
+}
+
+template <typename T>
+void C_WriteRequest<T>::finish_req(int r) {
+ ldout(pwl.get_context(), 15) << "write_req=" << this << " cell=" << this->get_cell() << dendl;
+
+ /* Completed to caller by here (in finish(), which calls this) */
+ utime_t now = ceph_clock_now();
+ pwl.release_write_lanes(this);
+ ceph_assert(m_resources.allocated);
+ m_resources.allocated = false;
+ this->release_cell(); /* TODO: Consider doing this in appending state */
+ update_req_stats(now);
+}
+
+template <typename T>
+void C_WriteRequest<T>::setup_buffer_resources(
+ uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
+ uint64_t &number_lanes, uint64_t &number_log_entries,
+ uint64_t &number_unpublished_reserves) {
+
+ ceph_assert(!m_resources.allocated);
+
+ auto image_extents_size = this->image_extents.size();
+ m_resources.buffers.reserve(image_extents_size);
+
+ bytes_cached = 0;
+ bytes_allocated = 0;
+ number_lanes = image_extents_size;
+ number_log_entries = image_extents_size;
+ number_unpublished_reserves = image_extents_size;
+
+ for (auto &extent : this->image_extents) {
+ m_resources.buffers.emplace_back();
+ struct WriteBufferAllocation &buffer = m_resources.buffers.back();
+ buffer.allocation_size = MIN_WRITE_ALLOC_SIZE;
+ buffer.allocated = false;
+ bytes_cached += extent.second;
+ if (extent.second > buffer.allocation_size) {
+ buffer.allocation_size = extent.second;
+ }
+ bytes_allocated += buffer.allocation_size;
+ }
+ bytes_dirtied = bytes_cached;
+}
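
The accounting above reserves one buffer per extent, each at least
MIN_WRITE_ALLOC_SIZE bytes. A standalone sketch of the arithmetic, assuming an
illustrative 512-byte minimum (not necessarily the library's value):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
      constexpr uint64_t MIN_WRITE_ALLOC_SIZE = 512;        // assumed value
      std::vector<std::pair<uint64_t, uint64_t>> extents =  // {offset, length}
          {{0, 100}, {4096, 8192}};
      uint64_t bytes_cached = 0, bytes_allocated = 0;
      for (auto &e : extents) {
        bytes_cached += e.second;                           // payload bytes
        bytes_allocated += std::max(e.second, MIN_WRITE_ALLOC_SIZE);
      }
      std::cout << "cached=" << bytes_cached                  // 8292
                << " allocated=" << bytes_allocated << "\n";  // 8704
    }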
+
+template <typename T>
+std::shared_ptr<WriteLogOperation> C_WriteRequest<T>::create_operation(uint64_t offset, uint64_t len) {
+ return std::make_shared<WriteLogOperation>(*op_set, offset, len, pwl.get_context());
+}
+
+template <typename T>
+void C_WriteRequest<T>::setup_log_operations(DeferredContexts &on_exit) {
+ GenericWriteLogEntries log_entries;
+ {
+ std::lock_guard locker(m_lock);
+ std::shared_ptr<SyncPoint> current_sync_point = pwl.get_current_sync_point();
+ if ((!pwl.get_persist_on_flush() && current_sync_point->log_entry->writes_completed) ||
+ (current_sync_point->log_entry->writes > MAX_WRITES_PER_SYNC_POINT) ||
+ (current_sync_point->log_entry->bytes > MAX_BYTES_PER_SYNC_POINT)) {
+ /* Create new sync point and persist the previous one. This sequenced
+ * write will bear a sync gen number shared with no already completed
+ * writes. A group of sequenced writes may be safely flushed concurrently
+ * if they all arrived before any of them completed. We'll insert one on
+ * an aio_flush() from the application. Here we're inserting one to cap
+ * the number of bytes and writes per sync point. When the application is
+ * not issuing flushes, we insert sync points to record some observed
+ * write concurrency information that enables us to safely issue >1 flush
+ * write (for writes observed here to have been in flight simultaneously)
+ * at a time in persist-on-write mode.
+ */
+ pwl.flush_new_sync_point(nullptr, on_exit);
+ current_sync_point = pwl.get_current_sync_point();
+ }
+ uint64_t current_sync_gen = pwl.get_current_sync_gen();
+ op_set =
+    std::make_unique<WriteLogOperationSet>(this->m_dispatched_time,
+ m_perfcounter,
+ current_sync_point,
+ pwl.get_persist_on_flush(),
+ pwl.get_context(), this);
+ ldout(pwl.get_context(), 20) << "write_req=" << *this << " op_set=" << op_set.get() << dendl;
+ ceph_assert(m_resources.allocated);
+ /* op_set->operations initialized differently for plain write or write same */
+ auto allocation = m_resources.buffers.begin();
+ uint64_t buffer_offset = 0;
+ for (auto &extent : this->image_extents) {
+ /* operation->on_write_persist connected to m_prior_log_entries_persisted Gather */
+ auto operation = this->create_operation(extent.first, extent.second);
+ this->op_set->operations.emplace_back(operation);
+
+ /* A WS is also a write */
+ ldout(pwl.get_context(), 20) << "write_req=" << *this << " op_set=" << op_set.get()
+ << " operation=" << operation << dendl;
+ log_entries.emplace_back(operation->log_entry);
+ if (!op_set->persist_on_flush) {
+ pwl.inc_last_op_sequence_num();
+ }
+ operation->init(true, allocation, current_sync_gen,
+ pwl.get_last_op_sequence_num(), this->bl, buffer_offset, op_set->persist_on_flush);
+ buffer_offset += operation->log_entry->write_bytes();
+ ldout(pwl.get_context(), 20) << "operation=[" << *operation << "]" << dendl;
+ allocation++;
+ }
+ }
+ /* All extent ops subs created */
+ op_set->extent_ops_appending->activate();
+ op_set->extent_ops_persist->activate();
+
+ /* Write data */
+ for (auto &operation : op_set->operations) {
+ operation->copy_bl_to_pmem_buffer();
+ }
+ pwl.add_into_log_map(log_entries);
+}
+
+template <typename T>
+bool C_WriteRequest<T>::append_write_request(std::shared_ptr<SyncPoint> sync_point) {
+ std::lock_guard locker(m_lock);
+ auto write_req_sp = this;
+ if (sync_point->earlier_sync_point) {
+ Context *schedule_append_ctx = new LambdaContext([this, write_req_sp](int r) {
+ write_req_sp->schedule_append();
+ });
+ sync_point->earlier_sync_point->on_sync_point_appending.push_back(schedule_append_ctx);
+ return true;
+ }
+ return false;
+}
+
+template <typename T>
+void C_WriteRequest<T>::schedule_append() {
+ ceph_assert(++m_appended == 1);
+ if (m_do_early_flush) {
+ /* This caller is waiting for persist, so we'll use their thread to
+ * expedite it */
+ pwl.flush_pmem_buffer(this->op_set->operations);
+ pwl.schedule_append(this->op_set->operations);
+ } else {
+ /* This is probably not still the caller's thread, so do the payload
+ * flushing/replicating later. */
+ pwl.schedule_flush_and_append(this->op_set->operations);
+ }
+}
+
+/**
+ * Attempts to allocate log resources for a write. Returns true if successful.
+ *
+ * Resources include 1 lane per extent, 1 log entry per extent, and the payload
+ * data space for each extent.
+ *
+ * Lanes are released after the write persists via release_write_lanes()
+ */
+template <typename T>
+bool C_WriteRequest<T>::alloc_resources() {
+ this->allocated_time = ceph_clock_now();
+ return pwl.alloc_resources(this);
+}
+
+/**
+ * Takes custody of write_req. Resources must already be allocated.
+ *
+ * Locking:
+ * Acquires lock
+ */
+template <typename T>
+void C_WriteRequest<T>::dispatch()
+{
+ CephContext *cct = pwl.get_context();
+ DeferredContexts on_exit;
+ utime_t now = ceph_clock_now();
+ this->m_dispatched_time = now;
+
+ ldout(cct, 15) << "write_req=" << this << " cell=" << this->get_cell() << dendl;
+ this->setup_log_operations(on_exit);
+
+ bool append_deferred = false;
+ if (!op_set->persist_on_flush &&
+ append_write_request(op_set->sync_point)) {
+ /* In persist-on-write mode, we defer the append of this write until the
+ * previous sync point is appending (meaning all the writes before it are
+ * persisted and that previous sync point can now appear in the
+ * log). Since we insert sync points in persist-on-write mode when writes
+ * have already completed to the current sync point, this limits us to
+ * one inserted sync point in flight at a time, and gives the next
+ * inserted sync point some time to accumulate a few writes if they
+ * arrive soon. Without this we can insert an absurd number of sync
+ * points, each with one or two writes. That uses a lot of log entries,
+ * and limits flushing to very few writes at a time. */
+ m_do_early_flush = false;
+ append_deferred = true;
+ } else {
+ /* The prior sync point is done, so we'll schedule append here. If this is
+ * persist-on-write, and probably still the caller's thread, we'll use this
+ * caller's thread to perform the persist & replication of the payload
+ * buffer. */
+ m_do_early_flush =
+ !(this->detained || this->m_queued || this->m_deferred || op_set->persist_on_flush);
+ }
+ if (!append_deferred) {
+ this->schedule_append();
+ }
+}
+
+template <typename T>
+C_FlushRequest<T>::C_FlushRequest(T &pwl, const utime_t arrived,
+ io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags,
+ ceph::mutex &lock, PerfCounters *perfcounter,
+ Context *user_req)
+ : C_BlockIORequest<T>(pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, user_req),
+ m_lock(lock), m_perfcounter(perfcounter) {
+ ldout(pwl.get_context(), 20) << this << dendl;
+}
+
+template <typename T>
+void C_FlushRequest<T>::finish_req(int r) {
+ ldout(pwl.get_context(), 20) << "flush_req=" << this
+ << " cell=" << this->get_cell() << dendl;
+ /* Block guard already released */
+ ceph_assert(!this->get_cell());
+
+ /* Completed to caller by here */
+ utime_t now = ceph_clock_now();
+ m_perfcounter->tinc(l_librbd_pwl_aio_flush_latency, now - this->m_arrived_time);
+}
+
+template <typename T>
+bool C_FlushRequest<T>::alloc_resources() {
+ ldout(pwl.get_context(), 20) << "req type=" << get_name() << " "
+ << "req=[" << *this << "]" << dendl;
+ return pwl.alloc_resources(this);
+}
+
+template <typename T>
+void C_FlushRequest<T>::dispatch() {
+ utime_t now = ceph_clock_now();
+ ldout(pwl.get_context(), 20) << "req type=" << get_name() << " "
+ << "req=[" << *this << "]" << dendl;
+ ceph_assert(this->m_resources.allocated);
+ this->m_dispatched_time = now;
+
+ op = std::make_shared<SyncPointLogOperation>(m_lock,
+ to_append,
+ now,
+ m_perfcounter,
+ pwl.get_context());
+
+ m_perfcounter->inc(l_librbd_pwl_log_ops, 1);
+ pwl.schedule_append(op);
+}
+
+template <typename T>
+void C_FlushRequest<T>::setup_buffer_resources(
+ uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
+ uint64_t &number_lanes, uint64_t &number_log_entries,
+ uint64_t &number_unpublished_reserves) {
+ number_log_entries = 1;
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_FlushRequest<T> &req) {
+ os << (C_BlockIORequest<T>&)req
+ << " m_resources.allocated=" << req.m_resources.allocated;
+ return os;
+}
+
+template <typename T>
+C_DiscardRequest<T>::C_DiscardRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ uint32_t discard_granularity_bytes, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : C_BlockIORequest<T>(pwl, arrived, std::move(image_extents), bufferlist(), 0, user_req),
+ m_discard_granularity_bytes(discard_granularity_bytes),
+ m_lock(lock),
+ m_perfcounter(perfcounter) {
+ ldout(pwl.get_context(), 20) << this << dendl;
+}
+
+template <typename T>
+C_DiscardRequest<T>::~C_DiscardRequest() {
+ ldout(pwl.get_context(), 20) << this << dendl;
+}
+
+template <typename T>
+bool C_DiscardRequest<T>::alloc_resources() {
+ ldout(pwl.get_context(), 20) << "req type=" << get_name() << " "
+ << "req=[" << *this << "]" << dendl;
+ return pwl.alloc_resources(this);
+}
+
+template <typename T>
+void C_DiscardRequest<T>::setup_log_operations() {
+ std::lock_guard locker(m_lock);
+ GenericWriteLogEntries log_entries;
+ for (auto &extent : this->image_extents) {
+ op = std::make_shared<DiscardLogOperation>(pwl.get_current_sync_point(),
+ extent.first,
+ extent.second,
+ m_discard_granularity_bytes,
+ this->m_dispatched_time,
+ m_perfcounter,
+ pwl.get_context());
+ log_entries.emplace_back(op->log_entry);
+ break;
+ }
+ uint64_t current_sync_gen = pwl.get_current_sync_gen();
+ bool persist_on_flush = pwl.get_persist_on_flush();
+ if (!persist_on_flush) {
+ pwl.inc_last_op_sequence_num();
+ }
+ auto discard_req = this;
+ Context *on_write_persist = new LambdaContext(
+ [this, discard_req](int r) {
+ ldout(pwl.get_context(), 20) << "discard_req=" << discard_req
+ << " cell=" << discard_req->get_cell() << dendl;
+ ceph_assert(discard_req->get_cell());
+ discard_req->complete_user_request(r);
+ discard_req->release_cell();
+ });
+ op->init(current_sync_gen, persist_on_flush, pwl.get_last_op_sequence_num(), on_write_persist);
+ pwl.add_into_log_map(log_entries);
+}
+
+template <typename T>
+void C_DiscardRequest<T>::dispatch() {
+ utime_t now = ceph_clock_now();
+ ldout(pwl.get_context(), 20) << "req type=" << get_name() << " "
+ << "req=[" << *this << "]" << dendl;
+ ceph_assert(this->m_resources.allocated);
+ this->m_dispatched_time = now;
+ setup_log_operations();
+ m_perfcounter->inc(l_librbd_pwl_log_ops, 1);
+ pwl.schedule_append(op);
+}
+
+template <typename T>
+void C_DiscardRequest<T>::setup_buffer_resources(
+ uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
+ uint64_t &number_lanes, uint64_t &number_log_entries,
+ uint64_t &number_unpublished_reserves) {
+ number_log_entries = 1;
+ /* No bytes are allocated for a discard, but we count the discarded bytes
+ * as dirty. This means it's possible to have more bytes dirty than
+ * there are bytes cached or allocated. */
+ for (auto &extent : this->image_extents) {
+ bytes_dirtied = extent.second;
+ break;
+ }
+}
+
+template <typename T>
+void C_DiscardRequest<T>::blockguard_acquired(GuardedRequestFunctionContext &guard_ctx) {
+ ldout(pwl.get_context(), 20) << " cell=" << guard_ctx.cell << dendl;
+
+ ceph_assert(guard_ctx.cell);
+ this->detained = guard_ctx.state.detained; /* overlapped */
+ this->set_cell(guard_ctx.cell);
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_DiscardRequest<T> &req) {
+ os << (C_BlockIORequest<T>&)req;
+ if (req.op) {
+ os << " op=[" << *req.op << "]";
+ } else {
+ os << " op=nullptr";
+ }
+ return os;
+}
+
+template <typename T>
+C_WriteSameRequest<T>::C_WriteSameRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : C_WriteRequest<T>(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, lock, perfcounter, user_req) {
+ ldout(pwl.get_context(), 20) << this << dendl;
+}
+
+template <typename T>
+C_WriteSameRequest<T>::~C_WriteSameRequest() {
+ ldout(pwl.get_context(), 20) << this << dendl;
+}
+
+template <typename T>
+void C_WriteSameRequest<T>::update_req_stats(utime_t &now) {
+ /* Write same stats excluded from most write stats
+ * because the read phase will make them look like slow writes in
+ * those histograms. */
+ ldout(pwl.get_context(), 20) << this << dendl;
+ utime_t comp_latency = now - this->m_arrived_time;
+ this->m_perfcounter->tinc(l_librbd_pwl_ws_latency, comp_latency);
+}
+
+/* Write sames will allocate one buffer, the size of the repeating pattern */
+template <typename T>
+void C_WriteSameRequest<T>::setup_buffer_resources(
+ uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
+ uint64_t &number_lanes, uint64_t &number_log_entries,
+ uint64_t &number_unpublished_reserves) {
+ ldout(pwl.get_context(), 20) << this << dendl;
+ ceph_assert(this->image_extents.size() == 1);
+ bytes_dirtied += this->image_extents[0].second;
+ auto pattern_length = this->bl.length();
+ this->m_resources.buffers.emplace_back();
+ struct WriteBufferAllocation &buffer = this->m_resources.buffers.back();
+ buffer.allocation_size = MIN_WRITE_ALLOC_SIZE;
+ buffer.allocated = false;
+ bytes_cached += pattern_length;
+ if (pattern_length > buffer.allocation_size) {
+ buffer.allocation_size = pattern_length;
+ }
+ bytes_allocated += buffer.allocation_size;
+}
+
+template <typename T>
+std::shared_ptr<WriteLogOperation> C_WriteSameRequest<T>::create_operation(uint64_t offset, uint64_t len) {
+ ceph_assert(this->image_extents.size() == 1);
+ return std::make_shared<WriteSameLogOperation>(*this->op_set.get(), offset, len,
+ this->bl.length(), pwl.get_context());
+}
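
create_operation() is the virtual hook that lets the setup_log_operations()
loop shared with C_WriteRequest build WriteSameLogOperation objects instead of
plain WriteLogOperation ones. The pattern in a standalone miniature
(illustrative names):

    #include <cstdint>
    #include <iostream>
    #include <memory>
    #include <string>

    struct Operation { std::string kind; };

    struct WriteRequest {
      virtual ~WriteRequest() = default;
      // Subclasses override this factory to change the operation type built
      // by the shared driver below.
      virtual std::shared_ptr<Operation> create_operation(uint64_t, uint64_t) {
        return std::make_shared<Operation>(Operation{"write"});
      }
      void setup_log_operations() {           // shared driver loop
        auto op = create_operation(0, 4096);  // virtual hook
        std::cout << "built op kind=" << op->kind << "\n";
      }
    };

    struct WriteSameRequest : WriteRequest {
      std::shared_ptr<Operation> create_operation(uint64_t, uint64_t) override {
        return std::make_shared<Operation>(Operation{"writesame"});
      }
    };

    int main() {
      WriteSameRequest ws;
      ws.setup_log_operations();  // prints: built op kind=writesame
    }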
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_WriteSameRequest<T> &req) {
+ os << (C_WriteRequest<T>&)req;
+ return os;
+}
+
+template <typename T>
+C_CompAndWriteRequest<T>::C_CompAndWriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
+ Context *user_req)
+ : C_WriteRequest<T>(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, lock, perfcounter, user_req),
+ mismatch_offset(mismatch_offset), cmp_bl(std::move(cmp_bl)) {
+ ldout(pwl.get_context(), 20) << dendl;
+}
+
+template <typename T>
+C_CompAndWriteRequest<T>::~C_CompAndWriteRequest() {
+ ldout(pwl.get_context(), 20) << dendl;
+}
+
+template <typename T>
+void C_CompAndWriteRequest<T>::finish_req(int r) {
+ if (compare_succeeded) {
+ C_WriteRequest<T>::finish_req(r);
+ } else {
+ utime_t now = ceph_clock_now();
+ update_req_stats(now);
+ }
+}
+
+template <typename T>
+void C_CompAndWriteRequest<T>::update_req_stats(utime_t &now) {
+ /* Compare-and-write stats. Compare-and-write excluded from most write
+ * stats because the read phase will make them look like slow writes in
+ * those histograms. */
+ if (!compare_succeeded) {
+ this->m_perfcounter->inc(l_librbd_pwl_cmp_fails, 1);
+ }
+ utime_t comp_latency = now - this->m_arrived_time;
+ this->m_perfcounter->tinc(l_librbd_pwl_cmp_latency, comp_latency);
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_CompAndWriteRequest<T> &req) {
+ os << (C_WriteRequest<T>&)req
+ << "cmp_bl=" << req.cmp_bl << ", "
+ << "read_bl=" << req.read_bl << ", "
+ << "compare_succeeded=" << req.compare_succeeded << ", "
+ << "mismatch_offset=" << req.mismatch_offset;
+ return os;
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::C_BlockIORequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::C_WriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::C_FlushRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::C_DiscardRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::C_WriteSameRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::C_CompAndWriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_REQUEST_H
+#define CEPH_LIBRBD_CACHE_PWL_REQUEST_H
+
+#include "include/Context.h"
+#include "librbd/cache/ImageCache.h"
+#include "librbd/cache/pwl/Types.h"
+#include "librbd/cache/pwl/LogOperation.h"
+
+namespace librbd {
+class BlockGuardCell;
+
+namespace cache {
+namespace pwl {
+
+class GuardedRequestFunctionContext;
+
+struct WriteRequestResources {
+ bool allocated = false;
+ std::vector<WriteBufferAllocation> buffers;
+};
+
+/**
+ * A request that can be deferred in a BlockGuard to sequence
+ * overlapping operations.
+ * This is the custodian of the BlockGuard cell for this IO, and the
+ * state information about the progress of this IO. This object lives
+ * until the IO is persisted in all (live) log replicas. The user request
+ * may be completed from here before the IO persists.
+ */
+template <typename T>
+class C_BlockIORequest : public Context {
+public:
+ T &pwl;
+ io::Extents image_extents;
+ bufferlist bl;
+ int fadvise_flags;
+ Context *user_req; /* User write request */
+ ExtentsSummary<io::Extents> image_extents_summary;
+ bool detained = false; /* Detained in blockguard (overlapped with a prior IO) */
+ utime_t allocated_time; /* When allocation began */
+ bool waited_lanes = false; /* This IO waited for free persist/replicate lanes */
+ bool waited_entries = false; /* This IO waited for free log entries */
+ bool waited_buffers = false; /* This IO waited for data buffers (pmemobj_reserve() failed) */
+
+ C_BlockIORequest(T &pwl, const utime_t arrived, io::Extents &&extents,
+ bufferlist&& bl, const int fadvise_flags, Context *user_req);
+ ~C_BlockIORequest() override;
+ C_BlockIORequest(const C_BlockIORequest&) = delete;
+ C_BlockIORequest &operator=(const C_BlockIORequest&) = delete;
+
+ void set_cell(BlockGuardCell *cell);
+ BlockGuardCell *get_cell(void);
+ void release_cell();
+
+ void complete_user_request(int r);
+ void finish(int r);
+ virtual void finish_req(int r) = 0;
+
+ virtual bool alloc_resources() = 0;
+
+ void deferred();
+
+ virtual void deferred_handler() = 0;
+
+ virtual void dispatch() = 0;
+
+ virtual const char *get_name() const {
+ return "C_BlockIORequest";
+ }
+ uint64_t get_image_extents_size() {
+ return image_extents.size();
+ }
+ void set_io_waited_for_lanes(bool waited) {
+ waited_lanes = waited;
+ }
+ void set_io_waited_for_entries(bool waited) {
+ waited_entries = waited;
+ }
+ void set_io_waited_for_buffers(bool waited) {
+ waited_buffers = waited;
+ }
+ bool has_io_waited_for_buffers() {
+ return waited_buffers;
+ }
+ std::vector<WriteBufferAllocation>& get_resources_buffers() {
+ return m_resources.buffers;
+ }
+
+ void set_allocated(bool allocated) {
+ if (allocated) {
+ m_resources.allocated = true;
+ } else {
+ m_resources.buffers.clear();
+ }
+ }
+
+ virtual void setup_buffer_resources(
+ uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
+ uint64_t &number_lanes, uint64_t &number_log_entries,
+ uint64_t &number_unpublished_reserves) {};
+
+protected:
+ utime_t m_arrived_time;
+ utime_t m_dispatched_time; /* When dispatch began */
+ utime_t m_user_req_completed_time;
+ std::atomic<bool> m_deferred = {false}; /* Deferred because this or a prior IO had to wait for write resources */
+ WriteRequestResources m_resources;
+
+private:
+ std::atomic<bool> m_user_req_completed = {false};
+ std::atomic<bool> m_finish_called = {false};
+ std::atomic<bool> m_cell_released = {false};
+ BlockGuardCell* m_cell = nullptr;
+
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_BlockIORequest<U> &req);
+};
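
The `detained` flag above records that the BlockGuard queued this IO behind an
overlapping in-flight one. A toy sketch of that overlap test, not the real
BlockGuard (illustrative names):

    #include <cstdint>
    #include <iostream>
    #include <list>
    #include <utility>

    struct ToyGuard {
      using Range = std::pair<uint64_t, uint64_t>;  // inclusive [first, last]
      std::list<Range> in_flight;
      bool acquire(Range r) {                       // true => dispatched now
        for (auto &held : in_flight) {
          if (r.first <= held.second && held.first <= r.second) {
            return false;                           // overlapped: detained
          }
        }
        in_flight.push_back(r);
        return true;
      }
    };

    int main() {
      ToyGuard guard;
      std::cout << guard.acquire({0, 4095}) << "\n";     // 1: no overlap
      std::cout << guard.acquire({4096, 8191}) << "\n";  // 1: adjacent ranges
      std::cout << guard.acquire({1000, 2000}) << "\n";  // 0: detained
    }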
+
+/**
+ * This is the custodian of the BlockGuard cell for this write. Block
+ * guard is not released until the write persists everywhere (this is
+ * how we guarantee to each log replica that they will never see
+ * overlapping writes).
+ */
+template <typename T>
+class C_WriteRequest : public C_BlockIORequest<T> {
+public:
+ using C_BlockIORequest<T>::pwl;
+  std::unique_ptr<WriteLogOperationSet> op_set = nullptr;
+
+ C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req);
+
+ ~C_WriteRequest() override;
+
+ void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx);
+
+ /* Common finish to plain write and compare-and-write (if it writes) */
+ void finish_req(int r) override;
+
+ /* Compare and write will override this */
+ virtual void update_req_stats(utime_t &now) {
+ // TODO: Add in later PRs
+ }
+ bool alloc_resources() override;
+
+ void deferred_handler() override { }
+
+ void dispatch() override;
+
+ virtual std::shared_ptr<WriteLogOperation> create_operation(uint64_t offset, uint64_t len);
+
+ virtual void setup_log_operations(DeferredContexts &on_exit);
+
+ bool append_write_request(std::shared_ptr<SyncPoint> sync_point);
+
+ virtual void schedule_append();
+
+ const char *get_name() const override {
+ return "C_WriteRequest";
+ }
+
+protected:
+ using C_BlockIORequest<T>::m_resources;
+ PerfCounters *m_perfcounter = nullptr;
+ /* Plain writes will allocate one buffer per request extent */
+ void setup_buffer_resources(
+ uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
+ uint64_t &number_lanes, uint64_t &number_log_entries,
+ uint64_t &number_unpublished_reserves) override;
+
+private:
+ bool m_do_early_flush = false;
+ std::atomic<int> m_appended = {0};
+ bool m_queued = false;
+ ceph::mutex &m_lock;
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_WriteRequest<U> &req);
+};
+
+/**
+ * This is the custodian of the BlockGuard cell for this
+ * aio_flush. Block guard is released as soon as the new
+ * sync point (if required) is created. Subsequent IOs can
+ * proceed while this flush waits for prior IOs to complete
+ * and any required sync points to be persisted.
+ */
+template <typename T>
+class C_FlushRequest : public C_BlockIORequest<T> {
+public:
+ using C_BlockIORequest<T>::pwl;
+ bool internal = false;
+ std::shared_ptr<SyncPoint> to_append;
+
+ C_FlushRequest(T &pwl, const utime_t arrived,
+ io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags,
+ ceph::mutex &lock, PerfCounters *perfcounter,
+ Context *user_req);
+
+ ~C_FlushRequest() override {}
+
+ bool alloc_resources() override;
+
+ void dispatch() override;
+
+ const char *get_name() const override {
+ return "C_FlushRequest";
+ }
+
+ void setup_buffer_resources(
+ uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
+ uint64_t &number_lanes, uint64_t &number_log_entries,
+ uint64_t &number_unpublished_reserves) override;
+private:
+ std::shared_ptr<SyncPointLogOperation> op;
+ ceph::mutex &m_lock;
+ PerfCounters *m_perfcounter = nullptr;
+
+ void finish_req(int r) override;
+ void deferred_handler() override {
+ m_perfcounter->inc(l_librbd_pwl_aio_flush_def, 1);
+ }
+
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_FlushRequest<U> &req);
+};
+
+/**
+ * This is the custodian of the BlockGuard cell for this discard. As in the
+ * case of write, the block guard is not released until the discard persists
+ * everywhere.
+ */
+template <typename T>
+class C_DiscardRequest : public C_BlockIORequest<T> {
+public:
+ using C_BlockIORequest<T>::pwl;
+ std::shared_ptr<DiscardLogOperation> op;
+
+ C_DiscardRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ uint32_t discard_granularity_bytes, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req);
+
+ ~C_DiscardRequest() override;
+ void finish_req(int r) override {}
+
+ bool alloc_resources() override;
+
+ void deferred_handler() override { }
+
+ void setup_log_operations();
+
+ void dispatch() override;
+
+ void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx);
+
+ const char *get_name() const override {
+ return "C_DiscardRequest";
+ }
+ void setup_buffer_resources(
+ uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
+ uint64_t &number_lanes, uint64_t &number_log_entries,
+ uint64_t &number_unpublished_reserves) override;
+private:
+ uint32_t m_discard_granularity_bytes;
+ ceph::mutex &m_lock;
+ PerfCounters *m_perfcounter = nullptr;
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_DiscardRequest<U> &req);
+};
+
+/**
+ * This is the custodian of the BlockGuard cell for this write same.
+ *
+ * A writesame allocates and persists a data buffer like a write, but the
+ * data buffer is usually much shorter than the write same.
+ */
+template <typename T>
+class C_WriteSameRequest : public C_WriteRequest<T> {
+public:
+ using C_BlockIORequest<T>::pwl;
+ C_WriteSameRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req);
+
+ ~C_WriteSameRequest() override;
+
+ void update_req_stats(utime_t &now) override;
+
+ void setup_buffer_resources(
+ uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
+ uint64_t &number_lanes, uint64_t &number_log_entries,
+ uint64_t &number_unpublished_reserves) override;
+
+ std::shared_ptr<WriteLogOperation> create_operation(uint64_t offset, uint64_t len) override;
+
+ const char *get_name() const override {
+ return "C_WriteSameRequest";
+ }
+
+ template<typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_WriteSameRequest<U> &req);
+};
+
+/**
+ * This is the custodian of the BlockGuard cell for this compare and write. The
+ * block guard is acquired before the read begins to guarantee atomicity of this
+ * operation. If this results in a write, the block guard will be released
+ * when the write completes to all replicas.
+ */
+template <typename T>
+class C_CompAndWriteRequest : public C_WriteRequest<T> {
+public:
+ using C_BlockIORequest<T>::pwl;
+ bool compare_succeeded = false;
+ uint64_t *mismatch_offset;
+ bufferlist cmp_bl;
+ bufferlist read_bl;
+ C_CompAndWriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
+ Context *user_req);
+  ~C_CompAndWriteRequest() override;
+
+ void finish_req(int r) override;
+
+ void update_req_stats(utime_t &now) override;
+
+ /*
+ * Compare and write doesn't implement alloc_resources(), deferred_handler(),
   * or dispatch(). The implementations from C_WriteRequest are used; the write
   * is actually performed only if the compare phase succeeds.
+ */
+
+ const char *get_name() const override {
+ return "C_CompAndWriteRequest";
+ }
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_CompAndWriteRequest<U> &req);
+};
+
+struct BlockGuardReqState {
+ bool barrier = false; /* This is a barrier request */
+ bool current_barrier = false; /* This is the currently active barrier */
+ bool detained = false;
+ bool queued = false; /* Queued for barrier */
+ friend std::ostream &operator<<(std::ostream &os,
+ const BlockGuardReqState &r) {
+ os << "barrier=" << r.barrier << ", "
+ << "current_barrier=" << r.current_barrier << ", "
+ << "detained=" << r.detained << ", "
+ << "queued=" << r.queued;
+ return os;
+ }
+};
+
+class GuardedRequestFunctionContext : public Context {
+public:
+ BlockGuardCell *cell = nullptr;
+ BlockGuardReqState state;
+ GuardedRequestFunctionContext(boost::function<void(GuardedRequestFunctionContext&)> &&callback)
+ : m_callback(std::move(callback)){ }
+ ~GuardedRequestFunctionContext(void) override { };
+ GuardedRequestFunctionContext(const GuardedRequestFunctionContext&) = delete;
+ GuardedRequestFunctionContext &operator=(const GuardedRequestFunctionContext&) = delete;
+
+private:
+ boost::function<void(GuardedRequestFunctionContext&)> m_callback;
+ void finish(int r) override {
+ ceph_assert(cell);
+ m_callback(*this);
+ }
+};
+
+class GuardedRequest {
+public:
+ const BlockExtent block_extent;
+ GuardedRequestFunctionContext *guard_ctx; /* Work to do when guard on range obtained */
+
+ GuardedRequest(const BlockExtent block_extent,
+ GuardedRequestFunctionContext *on_guard_acquire, bool barrier = false)
+ : block_extent(block_extent), guard_ctx(on_guard_acquire) {
+ guard_ctx->state.barrier = barrier;
+ }
+ friend std::ostream &operator<<(std::ostream &os,
+ const GuardedRequest &r) {
+ os << "guard_ctx->state=[" << r.guard_ctx->state << "], "
+ << "block_extent.block_start=" << r.block_extent.block_start << ", "
+ << "block_extent.block_start=" << r.block_extent.block_end;
+ return os;
+ }
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_REQUEST_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/pwl/ShutdownRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/Operations.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/ImageCache.h"
+#include "librbd/cache/Types.h"
+
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl:ShutdownRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+template <typename I>
+ShutdownRequest<I>* ShutdownRequest<I>::create(I &image_ctx,
+ Context *on_finish) {
+ return new ShutdownRequest(image_ctx, on_finish);
+}
+
+template <typename I>
+ShutdownRequest<I>::ShutdownRequest(I &image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx),
+ m_on_finish(create_async_context_callback(image_ctx, on_finish)),
+ m_error_result(0) {
+}
+
+template <typename I>
+void ShutdownRequest<I>::send() {
+ send_shutdown_image_cache();
+}
+
+template <typename I>
+void ShutdownRequest<I>::send_shutdown_image_cache() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (m_image_ctx.image_cache == nullptr) {
+ finish();
+ return;
+ }
+
+ using klass = ShutdownRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_shutdown_image_cache>(
+ this);
+
+ m_image_ctx.image_cache->shut_down(ctx);
+}
+
+template <typename I>
+void ShutdownRequest<I>::handle_shutdown_image_cache(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to shut down the image cache: " << cpp_strerror(r)
+ << dendl;
+ save_result(r);
+ finish();
+ return;
+ } else {
+ delete m_image_ctx.image_cache;
+ m_image_ctx.image_cache = nullptr;
+ }
+ send_remove_feature_bit();
+}
+
+template <typename I>
+void ShutdownRequest<I>::send_remove_feature_bit() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ uint64_t new_features = m_image_ctx.features & ~RBD_FEATURE_DIRTY_CACHE;
+ uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE;
+ ldout(cct, 10) << "old_features=" << m_image_ctx.features
+ << ", new_features=" << new_features
+ << ", features_mask=" << features_mask
+ << dendl;
+
+ int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx, m_image_ctx.header_oid,
+ new_features, features_mask);
+ m_image_ctx.features &= ~RBD_FEATURE_DIRTY_CACHE;
+ using klass = ShutdownRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_remove_feature_bit>(
+ this);
+ ctx->complete(r);
+}
+
+template <typename I>
+void ShutdownRequest<I>::handle_remove_feature_bit(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to remove the feature bit: " << cpp_strerror(r)
+ << dendl;
+ save_result(r);
+ finish();
+ return;
+ }
+ send_remove_image_cache_state();
+}
+
+template <typename I>
+void ShutdownRequest<I>::send_remove_image_cache_state() {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ using klass = ShutdownRequest<I>;
+ Context *ctx = create_context_callback<klass, &klass::handle_remove_image_cache_state>(
+ this);
+ std::shared_lock owner_lock{m_image_ctx.owner_lock};
+ m_image_ctx.operations->execute_metadata_remove(IMAGE_CACHE_STATE, ctx);
+}
+
+template <typename I>
+void ShutdownRequest<I>::handle_remove_image_cache_state(int r) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 10) << dendl;
+
+ if (r < 0) {
+ lderr(cct) << "failed to remove the image cache state: " << cpp_strerror(r)
+ << dendl;
+ save_result(r);
+ }
+ finish();
+}
+
+template <typename I>
+void ShutdownRequest<I>::finish() {
+ m_on_finish->complete(m_error_result);
+ delete this;
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ShutdownRequest<librbd::ImageCtx>;
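
The send_*/handle_* pairs above form a linear state machine in which each
handler either advances to the next step or records the first error and
finishes. Reduced to a standalone sketch with no Ceph types (step names
illustrative):

    #include <iostream>

    struct MiniShutdown {
      int error = 0;
      void save_result(int r) { if (error == 0 && r < 0) error = r; }

      void send() { send_step_a(); }
      void send_step_a() { handle_step_a(0); }   // pretend step a completed ok
      void handle_step_a(int r) {
        if (r < 0) { save_result(r); finish(); return; }
        send_step_b();
      }
      void send_step_b() { handle_step_b(-5); }  // pretend step b failed
      void handle_step_b(int r) {
        if (r < 0) { save_result(r); }
        finish();
      }
      void finish() { std::cout << "finished, r=" << error << "\n"; }
    };

    int main() {
      MiniShutdown req;
      req.send();  // prints: finished, r=-5
    }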
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SHUTDOWN_REQUEST_H
+#define CEPH_LIBRBD_CACHE_PWL_SHUTDOWN_REQUEST_H
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace cache {
+namespace pwl {
+
+template<typename>
+class ImageCacheState;
+
+template <typename ImageCtxT = ImageCtx>
+class ShutdownRequest {
+public:
+ static ShutdownRequest* create(ImageCtxT &image_ctx, Context *on_finish);
+
+ void send();
+
+private:
+
+ /**
+ * @verbatim
+ *
+ * Shutdown request goes through the following state machine:
+ *
+ * <start>
+ * |
+ * v
+ * SHUTDOWN_IMAGE_CACHE
+ * |
+ * v
+ * REMOVE_IMAGE_FEATURE_BIT
+ * |
+ * v
+ * REMOVE_IMAGE_CACHE_STATE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ShutdownRequest(ImageCtxT &image_ctx, Context *on_finish);
+
+ ImageCtxT &m_image_ctx;
+ Context *m_on_finish;
+
+ int m_error_result;
+
+ void send_shutdown_image_cache();
+ void handle_shutdown_image_cache(int r);
+
+ void send_remove_feature_bit();
+ void handle_remove_feature_bit(int r);
+
+ void send_remove_image_cache_state();
+ void handle_remove_image_cache_state(int r);
+
+ void finish();
+
+ void save_result(int result) {
+ if (m_error_result == 0 && result < 0) {
+ m_error_result = result;
+ }
+ }
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::ShutdownRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SHUTDOWN_REQUEST_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SyncPoint.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::SyncPoint: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+SyncPoint::SyncPoint(uint64_t sync_gen_num, CephContext *cct)
+ : log_entry(std::make_shared<SyncPointLogEntry>(sync_gen_num)), m_cct(cct) {
+ m_prior_log_entries_persisted = new C_Gather(cct, nullptr);
+ m_sync_point_persist = new C_Gather(cct, nullptr);
+ on_sync_point_appending.reserve(MAX_WRITES_PER_SYNC_POINT + 2);
+ on_sync_point_persisted.reserve(MAX_WRITES_PER_SYNC_POINT + 2);
+ ldout(m_cct, 20) << "sync point " << sync_gen_num << dendl;
+}
+
+SyncPoint::~SyncPoint() {
+ ceph_assert(on_sync_point_appending.empty());
+ ceph_assert(on_sync_point_persisted.empty());
+ ceph_assert(!earlier_sync_point);
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const SyncPoint &p) {
+ os << "log_entry=[" << *p.log_entry << "], "
+ << "earlier_sync_point=" << p.earlier_sync_point << ", "
+ << "later_sync_point=" << p.later_sync_point << ", "
+ << "m_final_op_sequence_num=" << p.m_final_op_sequence_num << ", "
+ << "m_prior_log_entries_persisted=" << p.m_prior_log_entries_persisted << ", "
+ << "m_prior_log_entries_persisted_complete=" << p.m_prior_log_entries_persisted_complete << ", "
+ << "m_append_scheduled=" << p.m_append_scheduled << ", "
+ << "appending=" << p.appending << ", "
+ << "on_sync_point_appending=" << p.on_sync_point_appending.size() << ", "
+ << "on_sync_point_persisted=" << p.on_sync_point_persisted.size() << "";
+ return os;
+}
+
+void SyncPoint::persist_gather_set_finisher(Context *ctx) {
+ m_append_scheduled = true;
+ /* All prior sync points that are still in this list must already be scheduled for append */
+ std::shared_ptr<SyncPoint> previous = earlier_sync_point;
+ while (previous) {
+ ceph_assert(previous->m_append_scheduled);
+ previous = previous->earlier_sync_point;
+ }
+
+ m_sync_point_persist->set_finisher(ctx);
+}
+
+void SyncPoint::persist_gather_activate() {
+ m_sync_point_persist->activate();
+}
+
+Context* SyncPoint::persist_gather_new_sub() {
+ return m_sync_point_persist->new_sub();
+}
+
+void SyncPoint::prior_persisted_gather_activate() {
+ m_prior_log_entries_persisted->activate();
+}
+
+Context* SyncPoint::prior_persisted_gather_new_sub() {
+ return m_prior_log_entries_persisted->new_sub();
+}
+
+void SyncPoint::prior_persisted_gather_set_finisher() {
+ Context *sync_point_persist_ready = persist_gather_new_sub();
+ std::shared_ptr<SyncPoint> sp = shared_from_this();
+ m_prior_log_entries_persisted->
+ set_finisher(new LambdaContext([this, sp, sync_point_persist_ready](int r) {
+ ldout(m_cct, 20) << "Prior log entries persisted for sync point =["
+ << sp << "]" << dendl;
+ sp->m_prior_log_entries_persisted_result = r;
+ sp->m_prior_log_entries_persisted_complete = true;
+ sync_point_persist_ready->complete(r);
+ }));
+}
+
+void SyncPoint::add_in_on_persisted_ctxs(Context* ctx) {
+ on_sync_point_persisted.push_back(ctx);
+}
+
+void SyncPoint::add_in_on_appending_ctxs(Context* ctx) {
+ on_sync_point_appending.push_back(ctx);
+}
+
+void SyncPoint::setup_earlier_sync_point(std::shared_ptr<SyncPoint> sync_point,
+ uint64_t last_op_sequence_num) {
+ earlier_sync_point = sync_point;
+ log_entry->prior_sync_point_flushed = false;
+ earlier_sync_point->log_entry->next_sync_point_entry = log_entry;
+ earlier_sync_point->later_sync_point = shared_from_this();
+ earlier_sync_point->m_final_op_sequence_num = last_op_sequence_num;
+ if (!earlier_sync_point->appending) {
+ /* Append of new sync point deferred until old sync point is appending */
+ earlier_sync_point->add_in_on_appending_ctxs(prior_persisted_gather_new_sub());
+ }
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
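
The two C_Gather instances in SyncPoint behave like countdowns: new_sub()
hands out sub-completions, activate() declares that no more subs will be
added, and the finisher runs once every sub has completed. A standalone toy
model, not the real C_Gather:

    #include <functional>
    #include <iostream>

    struct MiniGather {
      int subs = 0;
      bool activated = false;
      std::function<void(int)> finisher;
      std::function<void(int)> new_sub() {  // hand out one sub-completion
        ++subs;
        return [this](int r) {
          if (--subs == 0 && activated) finisher(r);
        };
      }
      void activate() {                     // no more subs will be added
        activated = true;
        if (subs == 0) finisher(0);
      }
    };

    int main() {
      MiniGather prior_entries_persisted;
      prior_entries_persisted.finisher = [](int r) {
        std::cout << "sync point can append, r=" << r << "\n";
      };
      auto sub1 = prior_entries_persisted.new_sub();  // a write bearing this gen
      auto sub2 = prior_entries_persisted.new_sub();  // the prior sync point
      prior_entries_persisted.activate();
      sub1(0);
      sub2(0);  // last sub completion fires the finisher
    }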
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SYNC_POINT_H
+#define CEPH_LIBRBD_CACHE_PWL_SYNC_POINT_H
+
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/cache/pwl/Types.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+class SyncPoint: public std::enable_shared_from_this<SyncPoint> {
+public:
+ std::shared_ptr<SyncPointLogEntry> log_entry;
+ /* Use lock for earlier/later links */
+ std::shared_ptr<SyncPoint> earlier_sync_point; /* NULL if earlier has completed */
+ std::shared_ptr<SyncPoint> later_sync_point;
+ bool appending = false;
+ /* Signal these when this sync point is appending to the log, and its order
+   * of appearance is guaranteed. One of these is a sub-operation of the
+ * next sync point's m_prior_log_entries_persisted Gather. */
+ std::vector<Context*> on_sync_point_appending;
+ /* Signal these when this sync point is appended and persisted. User
+ * aio_flush() calls are added to this. */
+ std::vector<Context*> on_sync_point_persisted;
+
+ SyncPoint(uint64_t sync_gen_num, CephContext *cct);
+ ~SyncPoint();
+ SyncPoint(const SyncPoint&) = delete;
+ SyncPoint &operator=(const SyncPoint&) = delete;
+ void persist_gather_activate();
+ Context* persist_gather_new_sub();
+ void persist_gather_set_finisher(Context *ctx);
+ void prior_persisted_gather_activate();
+ Context* prior_persisted_gather_new_sub();
+ void prior_persisted_gather_set_finisher();
+  void add_in_on_persisted_ctxs(Context* ctx);
+  void add_in_on_appending_ctxs(Context* ctx);
+ void setup_earlier_sync_point(std::shared_ptr<SyncPoint> sync_point,
+ uint64_t last_op_sequence_num);
+private:
+ CephContext *m_cct;
+ bool m_append_scheduled = false;
+ uint64_t m_final_op_sequence_num = 0;
+ /* A sync point can't appear in the log until all the writes bearing
+ * it and all the prior sync points have been appended and
+ * persisted.
+ *
+ * Writes bearing this sync gen number and the prior sync point will be
+ * sub-ops of this Gather. This sync point will not be appended until all
+ * these complete to the point where their persist order is guaranteed. */
+ C_Gather *m_prior_log_entries_persisted;
+ /* The finisher for this will append the sync point to the log. The finisher
+ * for m_prior_log_entries_persisted will be a sub-op of this. */
+ C_Gather *m_sync_point_persist;
+ int m_prior_log_entries_persisted_result = 0;
+  bool m_prior_log_entries_persisted_complete = false;
+ friend std::ostream &operator<<(std::ostream &os,
+ const SyncPoint &p);
+};
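+
+/* Illustrative note (an assumption about the caller, not from this
+ * patch): a user aio_flush() would typically be tracked by attaching
+ * its completion to the current sync point:
+ *
+ *   sync_point->add_in_on_persisted_ctxs(flush_ctx);
+ *   // flush_ctx completes once this sync point is appended and
+ *   // persisted, per on_sync_point_persisted above.
+ */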
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SYNC_POINT_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "Types.h"
+#include "common/ceph_context.h"
+#include "include/Context.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::Types: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+
+namespace cache {
+
+namespace pwl {
+
+DeferredContexts::~DeferredContexts() {
+ finish_contexts(nullptr, contexts, 0);
+}
+
+void DeferredContexts::add(Context* ctx) {
+ contexts.push_back(ctx);
+}
+
+/*
+ * A BlockExtent identifies a range by first and last.
+ *
+ * An Extent ("image extent") identifies a range by start and length.
+ *
+ * The ImageCache interface is defined in terms of image extents, and
+ * requires no alignment of the beginning or end of the extent. We
+ * convert between image and block extents here using a "block size"
+ * of 1.
+ */
+BlockExtent convert_to_block_extent(const uint64_t offset_bytes, const uint64_t length_bytes)
+{
+ return BlockExtent(offset_bytes,
+ offset_bytes + length_bytes);
+}
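+
+/* Worked example (illustrative): with a block size of 1, an image
+ * extent at offset 4096 with length 512 maps to the half-open block
+ * extent [4096, 4608), and image_extent() in Types.h inverts it:
+ *
+ *   BlockExtent be = convert_to_block_extent(4096, 512);
+ *   // be.block_start == 4096, be.block_end == 4608
+ *   // image_extent(be) yields io::Extent{4096, 512}
+ */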
+
+BlockExtent WriteLogPmemEntry::block_extent() {
+ return convert_to_block_extent(image_offset_bytes, write_bytes);
+}
+
+uint64_t WriteLogPmemEntry::get_offset_bytes() {
+ return image_offset_bytes;
+}
+
+uint64_t WriteLogPmemEntry::get_write_bytes() {
+ return write_bytes;
+}
+
+std::ostream& operator<<(std::ostream& os,
+ const WriteLogPmemEntry &entry) {
+ os << "entry_valid=" << (bool)entry.entry_valid << ", "
+ << "sync_point=" << (bool)entry.sync_point << ", "
+ << "sequenced=" << (bool)entry.sequenced << ", "
+ << "has_data=" << (bool)entry.has_data << ", "
+ << "discard=" << (bool)entry.discard << ", "
+ << "writesame=" << (bool)entry.writesame << ", "
+ << "sync_gen_number=" << entry.sync_gen_number << ", "
+ << "write_sequence_number=" << entry.write_sequence_number << ", "
+ << "image_offset_bytes=" << entry.image_offset_bytes << ", "
+ << "write_bytes=" << entry.write_bytes << ", "
+ << "ws_datalen=" << entry.ws_datalen << ", "
+ << "entry_index=" << entry.entry_index;
+ return os;
+}
+
+template <typename ExtentsType>
+ExtentsSummary<ExtentsType>::ExtentsSummary(const ExtentsType &extents)
+ : total_bytes(0), first_image_byte(0), last_image_byte(0)
+{
+ if (extents.empty()) return;
+ /* These extents refer to image offsets between first_image_byte
+ * and last_image_byte, inclusive, but we don't guarantee here
+ * that they address all of those bytes. There may be gaps. */
+ first_image_byte = extents.front().first;
+ last_image_byte = first_image_byte + extents.front().second;
+ for (auto &extent : extents) {
+ /* Ignore zero length extents */
+ if (extent.second) {
+ total_bytes += extent.second;
+ if (extent.first < first_image_byte) {
+ first_image_byte = extent.first;
+ }
+ if ((extent.first + extent.second) > last_image_byte) {
+ last_image_byte = extent.first + extent.second;
+ }
+ }
+ }
+}
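+
+/* Worked example (illustrative): for extents {0, 512} and {4096, 512},
+ * the summary is total_bytes=1024, first_image_byte=0, and
+ * last_image_byte=4608; the gap between the two extents is not
+ * reflected in total_bytes.
+ *
+ *   io::Extents extents{{0, 512}, {4096, 512}};
+ *   ExtentsSummary<io::Extents> summary(extents);
+ */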
+
+io::Extent whole_volume_extent() {
+ return io::Extent({0, std::numeric_limits<uint64_t>::max()});
+}
+
+BlockExtent block_extent(const io::Extent& image_extent) {
+ return convert_to_block_extent(image_extent.first, image_extent.second);
+}
+
+Context * override_ctx(int r, Context *ctx) {
+ if (r < 0) {
+ /* Override next_ctx status with this error */
+ return new LambdaContext(
+ [r, ctx](int _r) {
+ ctx->complete(r);
+ });
+ } else {
+ return ctx;
+ }
+}
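+
+/* Usage sketch (the caller and do_next_step() are hypothetical): an
+ * earlier error r is propagated to ctx no matter what status the next
+ * step completes with.
+ *
+ *   Context *next_ctx = override_ctx(r, on_finish);
+ *   do_next_step(next_ctx);  // next_ctx->complete(0) still reports r
+ */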
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ExtentsSummary<librbd::io::Extents>;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_TYPES_H
+#define CEPH_LIBRBD_CACHE_PWL_TYPES_H
+
+#include <vector>
+#include <libpmemobj.h>
+#include "librbd/BlockGuard.h"
+#include "librbd/io/Types.h"
+
+class Context;
+
+enum {
+ l_librbd_pwl_first = 26500,
+
+ // All read requests
+ l_librbd_pwl_rd_req, // read requests
+ l_librbd_pwl_rd_bytes, // bytes read
+ l_librbd_pwl_rd_latency, // average req completion latency
+
+ // Read requests completed from RWL (no misses)
+ l_librbd_pwl_rd_hit_req, // read requests
+ l_librbd_pwl_rd_hit_bytes, // bytes read
+ l_librbd_pwl_rd_hit_latency, // average req completion latency
+
+  // Read requests with hit and miss extents
+ l_librbd_pwl_rd_part_hit_req, // read ops
+
+  // Distribution of log entry count and write bytes per SyncPoint
+ l_librbd_pwl_syncpoint_hist,
+
+ // All write requests
+ l_librbd_pwl_wr_req, // write requests
+ l_librbd_pwl_wr_req_def, // write requests deferred for resources
+ l_librbd_pwl_wr_req_def_lanes, // write requests deferred for lanes
+ l_librbd_pwl_wr_req_def_log, // write requests deferred for log entries
+ l_librbd_pwl_wr_req_def_buf, // write requests deferred for buffer space
+ l_librbd_pwl_wr_req_overlap, // write requests detained for overlap
+ l_librbd_pwl_wr_req_queued, // write requests queued for prior barrier
+ l_librbd_pwl_wr_bytes, // bytes written
+
+ // Write log operations (1 .. n per request that appends to the log)
+ l_librbd_pwl_log_ops, // log append ops
+ l_librbd_pwl_log_op_bytes, // average bytes written per log op
+
+ /*
+
+ Req and op average latencies to the beginning of and over various phases:
+
+ +------------------------------+------+-------------------------------+
+ | Phase | Name | Description |
+ +------------------------------+------+-------------------------------+
+ | Arrive at RWL | arr |Arrives as a request |
+ +------------------------------+------+-------------------------------+
+ | Allocate resources | all |time spent in block guard for |
+ | | |overlap sequencing occurs |
+ | | |before this point |
+ +------------------------------+------+-------------------------------+
+ | Dispatch | dis |Op lifetime begins here. time |
+ | | |spent in allocation waiting for|
+ | | |resources occurs before this |
+ | | |point |
+ +------------------------------+------+-------------------------------+
+ | Payload buffer persist and | buf |time spent queued for |
+ |replicate | |replication occurs before here |
+ +------------------------------+------+-------------------------------+
+ | Payload buffer persist | bufc |bufc - buf is just the persist |
+ |complete | |time |
+ +------------------------------+------+-------------------------------+
+ | Log append | app |time spent queued for append |
+ | | |occurs before here |
+ +------------------------------+------+-------------------------------+
+ | Append complete | appc |appc - app is just the time |
+ | | |spent in the append operation |
+ +------------------------------+------+-------------------------------+
+ | Complete | cmp |write persisted, replicated, |
+ | | |and globally visible |
+ +------------------------------+------+-------------------------------+
+
+ */
+
+ /* Request times */
+ l_librbd_pwl_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard
+ l_librbd_pwl_req_arr_to_dis_t, // arrival to dispatch elapsed time
+ l_librbd_pwl_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources
+ l_librbd_pwl_wr_latency, // average req (persist) completion latency
+ l_librbd_pwl_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written
+ l_librbd_pwl_wr_caller_latency, // average req completion (to caller) latency
+
+  /* Request times for requests that never waited for space */
+ l_librbd_pwl_nowait_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard
+ l_librbd_pwl_nowait_req_arr_to_dis_t, // arrival to dispatch elapsed time
+ l_librbd_pwl_nowait_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources
+ l_librbd_pwl_nowait_wr_latency, // average req (persist) completion latency
+ l_librbd_pwl_nowait_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written
+ l_librbd_pwl_nowait_wr_caller_latency, // average req completion (to caller) latency
+
+ /* Log operation times */
+ l_librbd_pwl_log_op_alloc_t, // elapsed time of pmemobj_reserve()
+ l_librbd_pwl_log_op_alloc_t_hist, // Histogram of elapsed time of pmemobj_reserve()
+
+ l_librbd_pwl_log_op_dis_to_buf_t, // dispatch to buffer persist elapsed time
+ l_librbd_pwl_log_op_dis_to_app_t, // dispatch to log append elapsed time
+ l_librbd_pwl_log_op_dis_to_cmp_t, // dispatch to persist completion elapsed time
+ l_librbd_pwl_log_op_dis_to_cmp_t_hist, // Histogram of dispatch to persist completion elapsed time
+
+ l_librbd_pwl_log_op_buf_to_app_t, // data buf persist + append wait time
+  l_librbd_pwl_log_op_buf_to_bufc_t, // data buf persist / replicate elapsed time
+  l_librbd_pwl_log_op_buf_to_bufc_t_hist, // data buf persist time vs bytes histogram
+ l_librbd_pwl_log_op_app_to_cmp_t, // log entry append + completion wait time
+ l_librbd_pwl_log_op_app_to_appc_t, // log entry append / replicate elapsed time
+ l_librbd_pwl_log_op_app_to_appc_t_hist, // log entry append time (vs. op bytes) histogram
+
+ l_librbd_pwl_discard,
+ l_librbd_pwl_discard_bytes,
+ l_librbd_pwl_discard_latency,
+
+ l_librbd_pwl_aio_flush,
+ l_librbd_pwl_aio_flush_def,
+ l_librbd_pwl_aio_flush_latency,
+ l_librbd_pwl_ws,
+ l_librbd_pwl_ws_bytes, // Bytes modified by write same, probably much larger than WS payload bytes
+ l_librbd_pwl_ws_latency,
+
+ l_librbd_pwl_cmp,
+ l_librbd_pwl_cmp_bytes,
+ l_librbd_pwl_cmp_latency,
+ l_librbd_pwl_cmp_fails,
+
+ l_librbd_pwl_flush,
+ l_librbd_pwl_invalidate_cache,
+ l_librbd_pwl_invalidate_discard_cache,
+
+ l_librbd_pwl_append_tx_t,
+ l_librbd_pwl_retire_tx_t,
+ l_librbd_pwl_append_tx_t_hist,
+ l_librbd_pwl_retire_tx_t_hist,
+
+ l_librbd_pwl_last,
+};
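+
+/* Sketch of how this enum range is typically consumed (hedged; the
+ * real registration lives in the write log implementation, and the
+ * counter group name and short names here are assumptions):
+ *
+ *   PerfCountersBuilder plb(cct, "librbd-pwl", l_librbd_pwl_first,
+ *                           l_librbd_pwl_last);
+ *   plb.add_u64_counter(l_librbd_pwl_rd_req, "rd", "Reads");
+ *   plb.add_time_avg(l_librbd_pwl_rd_latency, "rd_latency",
+ *                    "Read latency");
+ *   cct->get_perfcounters_collection()->add(plb.create_perf_counters());
+ */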
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+class ImageExtentBuf;
+typedef std::vector<ImageExtentBuf> ImageExtentBufs;
+
+const int IN_FLIGHT_FLUSH_WRITE_LIMIT = 64;
+const int IN_FLIGHT_FLUSH_BYTES_LIMIT = (1 * 1024 * 1024);
+
+/* Limit work between sync points */
+const uint64_t MAX_WRITES_PER_SYNC_POINT = 256;
+const uint64_t MAX_BYTES_PER_SYNC_POINT = (1024 * 1024 * 8);
+
+const uint32_t MIN_WRITE_ALLOC_SIZE = 512;
+const uint32_t LOG_STATS_INTERVAL_SECONDS = 5;
+
+/**** Write log entries ****/
+const unsigned long int MAX_ALLOC_PER_TRANSACTION = 8;
+const unsigned long int MAX_FREE_PER_TRANSACTION = 1;
+const unsigned int MAX_CONCURRENT_WRITES = 256;
+
+const uint64_t DEFAULT_POOL_SIZE = 1u<<30;
+const uint64_t MIN_POOL_SIZE = DEFAULT_POOL_SIZE;
+constexpr double USABLE_SIZE = (7.0 / 10);
+const uint64_t BLOCK_ALLOC_OVERHEAD_BYTES = 16;
+const uint8_t RWL_POOL_VERSION = 1;
+const uint64_t MAX_LOG_ENTRIES = (1024 * 1024);
+const double AGGRESSIVE_RETIRE_HIGH_WATER = 0.75;
+const double RETIRE_HIGH_WATER = 0.50;
+const double RETIRE_LOW_WATER = 0.40;
+const int RETIRE_BATCH_TIME_LIMIT_MS = 250;
+
+/* Defer a set of Contexts until destruct/exit. Used for deferring
+ * work on a given thread until a required lock is dropped. */
+class DeferredContexts {
+private:
+ std::vector<Context*> contexts;
+public:
+ ~DeferredContexts();
+ void add(Context* ctx);
+};
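+
+/* Usage sketch (illustrative; m_lock and ctx are hypothetical): the
+ * contexts collected here fire when the DeferredContexts instance is
+ * destroyed, i.e. after the lock below has been dropped.
+ *
+ *   DeferredContexts post_unlock;
+ *   {
+ *     std::lock_guard locker(m_lock);
+ *     post_unlock.add(ctx);  // completed later, without m_lock held
+ *   }
+ */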
+
+/* Pmem structures */
+POBJ_LAYOUT_BEGIN(rbd_pwl);
+POBJ_LAYOUT_ROOT(rbd_pwl, struct WriteLogPoolRoot);
+POBJ_LAYOUT_TOID(rbd_pwl, uint8_t);
+POBJ_LAYOUT_TOID(rbd_pwl, struct WriteLogPmemEntry);
+POBJ_LAYOUT_END(rbd_pwl);
+
+struct WriteLogPmemEntry {
+ uint64_t sync_gen_number = 0;
+ uint64_t write_sequence_number = 0;
+ uint64_t image_offset_bytes;
+ uint64_t write_bytes;
+ TOID(uint8_t) write_data;
+ struct {
+ uint8_t entry_valid :1; /* if 0, this entry is free */
+ uint8_t sync_point :1; /* No data. No write sequence number. Marks sync
+ point for this sync gen number */
+ uint8_t sequenced :1; /* write sequence number is valid */
+ uint8_t has_data :1; /* write_data field is valid (else ignore) */
+ uint8_t discard :1; /* has_data will be 0 if this is a discard */
+ uint8_t writesame :1; /* ws_datalen indicates length of data at write_bytes */
+ };
+ uint32_t ws_datalen = 0; /* Length of data buffer (writesame only) */
+ uint32_t entry_index = 0; /* For debug consistency check. Can be removed if
+ * we need the space */
+ WriteLogPmemEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+ : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes),
+ entry_valid(0), sync_point(0), sequenced(0), has_data(0), discard(0), writesame(0) {
+ }
+ BlockExtent block_extent();
+ uint64_t get_offset_bytes();
+ uint64_t get_write_bytes();
+ bool is_sync_point() {
+ return sync_point;
+ }
+ bool is_discard() {
+ return discard;
+ }
+ bool is_writesame() {
+ return writesame;
+ }
+ bool is_write() {
+ /* Log entry is a basic write */
+ return !is_sync_point() && !is_discard() && !is_writesame();
+ }
+ bool is_writer() {
+ /* Log entry is any type that writes data */
+ return is_write() || is_discard() || is_writesame();
+ }
+ friend std::ostream& operator<<(std::ostream& os,
+ const WriteLogPmemEntry &entry);
+};
+
+static_assert(sizeof(WriteLogPmemEntry) == 64);
+
+struct WriteLogPoolRoot {
+ union {
+ struct {
+ uint8_t layout_version; /* Version of this structure (RWL_POOL_VERSION) */
+ };
+ uint64_t _u64;
+ } header;
+ TOID(struct WriteLogPmemEntry) log_entries; /* contiguous array of log entries */
+ uint64_t pool_size;
+ uint64_t flushed_sync_gen; /* All writing entries with this or a lower
+ * sync gen number are flushed. */
+ uint32_t block_size; /* block size */
+ uint32_t num_log_entries;
+ uint32_t first_free_entry; /* Entry following the newest valid entry */
+ uint32_t first_valid_entry; /* Index of the oldest valid entry in the log */
+};
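+
+/* Sketch (an assumption about the intended semantics, not code from
+ * this change): the root describes a circular log, so the number of
+ * in-use entries follows from first_valid_entry and first_free_entry:
+ *
+ *   uint32_t entries_in_use(const WriteLogPoolRoot &root) {
+ *     return (root.first_free_entry >= root.first_valid_entry)
+ *       ? root.first_free_entry - root.first_valid_entry
+ *       : root.num_log_entries -
+ *           (root.first_valid_entry - root.first_free_entry);
+ *   }
+ */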
+
+struct WriteBufferAllocation {
+ unsigned int allocation_size = 0;
+ pobj_action buffer_alloc_action;
+ TOID(uint8_t) buffer_oid = OID_NULL;
+ bool allocated = false;
+ utime_t allocation_lat;
+};
+
+static inline io::Extent image_extent(const BlockExtent& block_extent) {
+ return io::Extent(block_extent.block_start,
+ block_extent.block_end - block_extent.block_start);
+}
+
+template <typename ExtentsType>
+class ExtentsSummary {
+public:
+ uint64_t total_bytes;
+ uint64_t first_image_byte;
+ uint64_t last_image_byte;
+ explicit ExtentsSummary(const ExtentsType &extents);
+ friend std::ostream &operator<<(std::ostream &os,
+ const ExtentsSummary &s) {
+ os << "total_bytes=" << s.total_bytes << ", "
+ << "first_image_byte=" << s.first_image_byte << ", "
+ << "last_image_byte=" << s.last_image_byte << "";
+ return os;
+ }
+ BlockExtent block_extent() {
+ return BlockExtent(first_image_byte, last_image_byte);
+ }
+ io::Extent image_extent() {
+ return librbd::cache::pwl::image_extent(block_extent());
+ }
+};
+
+io::Extent whole_volume_extent();
+
+BlockExtent block_extent(const io::Extent& image_extent);
+
+Context * override_ctx(int r, Context *ctx);
+
+class ImageExtentBuf : public io::Extent {
+public:
+ bufferlist m_bl;
+ ImageExtentBuf(io::Extent extent)
+ : io::Extent(extent) { }
+ ImageExtentBuf(io::Extent extent, bufferlist bl)
+ : io::Extent(extent), m_bl(bl) { }
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_TYPES_H
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "librbd/cache/Types.h"
-#include "librbd/cache/Utils.h"
-#include "librbd/cache/rwl/ImageCacheState.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/Operations.h"
-#include "common/environment.h"
-#include "common/hostname.h"
-#include "common/config_proxy.h"
-#include "common/ceph_json.h"
-
-#undef dout_subsys
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::rwl::ImageCacheState: " \
- << __func__ << ": "
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-namespace {
-bool get_json_format(const std::string& s, JSONFormattable *f) {
- JSONParser p;
- bool success = p.parse(s.c_str(), s.size());
- if (success) {
- decode_json_obj(*f, &p);
- }
- return success;
-}
-} // namespace
-
-template <typename I>
-ImageCacheState<I>::ImageCacheState(I *image_ctx) : m_image_ctx(image_ctx) {
- ldout(image_ctx->cct, 20) << "Initialize RWL cache state with config data. "
- << dendl;
-
- ConfigProxy &config = image_ctx->config;
- host = ceph_get_short_hostname();
- path = config.get_val<std::string>("rbd_rwl_path");
- size = config.get_val<uint64_t>("rbd_rwl_size");
- log_periodic_stats = config.get_val<bool>("rbd_rwl_log_periodic_stats");
-}
-
-template <typename I>
-ImageCacheState<I>::ImageCacheState(
- I *image_ctx, JSONFormattable &f) : m_image_ctx(image_ctx) {
- ldout(image_ctx->cct, 20) << "Initialize RWL cache state with data from "
- << "server side"<< dendl;
-
- present = (bool)f["present"];
- empty = (bool)f["empty"];
- clean = (bool)f["clean"];
- host = (string)f["rwl_host"];
- path = (string)f["rwl_path"];
- uint64_t rwl_size;
- std::istringstream iss(f["rwl_size"]);
- iss >> rwl_size;
- size = rwl_size;
-
- // Others from config
- ConfigProxy &config = image_ctx->config;
- log_periodic_stats = config.get_val<bool>("rbd_rwl_log_periodic_stats");
-}
-
-template <typename I>
-void ImageCacheState<I>::write_image_cache_state(Context *on_finish) {
- std::shared_lock owner_lock{m_image_ctx->owner_lock};
- JSONFormattable f;
- ::encode_json(IMAGE_CACHE_STATE.c_str(), *this, &f);
- std::ostringstream oss;
- f.flush(oss);
- std::string image_state_json = oss.str();
-
- ldout(m_image_ctx->cct, 20) << __func__ << " Store state: "
- << image_state_json << dendl;
- m_image_ctx->operations->execute_metadata_set(IMAGE_CACHE_STATE,
- image_state_json, on_finish);
-}
-
-template <typename I>
-void ImageCacheState<I>::clear_image_cache_state(Context *on_finish) {
- std::shared_lock owner_lock{m_image_ctx->owner_lock};
- ldout(m_image_ctx->cct, 20) << __func__ << " Remove state: " << dendl;
- m_image_ctx->operations->execute_metadata_remove(IMAGE_CACHE_STATE, on_finish);
-}
-
-template <typename I>
-void ImageCacheState<I>::dump(ceph::Formatter *f) const {
- ::encode_json("present", present, f);
- ::encode_json("empty", empty, f);
- ::encode_json("clean", clean, f);
- ::encode_json("cache_type", (int)get_image_cache_type(), f);
- ::encode_json("rwl_host", host, f);
- ::encode_json("rwl_path", path, f);
- ::encode_json("rwl_size", size, f);
-}
-
-template <typename I>
-ImageCacheState<I>* ImageCacheState<I>::get_image_cache_state(
- I* image_ctx, int &r) {
- std::string cache_state_str;
- ImageCacheState<I>* cache_state = nullptr;
- ldout(image_ctx->cct, 20) << "image_cache_state:" << cache_state_str << dendl;
-
- r = 0;
- bool dirty_cache = image_ctx->test_features(RBD_FEATURE_DIRTY_CACHE);
- if (dirty_cache) {
- cls_client::metadata_get(&image_ctx->md_ctx, image_ctx->header_oid,
- IMAGE_CACHE_STATE, &cache_state_str);
- }
-
- bool rwl_enabled = cache::util::is_rwl_enabled(*image_ctx);
- bool cache_desired = rwl_enabled;
- cache_desired &= !image_ctx->read_only;
- cache_desired &= !image_ctx->test_features(RBD_FEATURE_MIGRATING);
- cache_desired &= !image_ctx->test_features(RBD_FEATURE_JOURNALING);
- cache_desired &= !image_ctx->old_format;
-
- if (!dirty_cache && !cache_desired) {
- ldout(image_ctx->cct, 20) << "Do not desire to use image cache." << dendl;
- } else if (dirty_cache && !cache_desired) {
- lderr(image_ctx->cct) << "There's a dirty cache, but RWL cache is disabled."
- << dendl;
- r = -EINVAL;
- }else if ((!dirty_cache || cache_state_str.empty()) && cache_desired) {
- cache_state = new ImageCacheState<I>(image_ctx);
- } else {
- ceph_assert(!cache_state_str.empty());
- JSONFormattable f;
- bool success = get_json_format(cache_state_str, &f);
- if (!success) {
- lderr(image_ctx->cct) << "Failed to parse cache state: "
- << cache_state_str << dendl;
- r = -EINVAL;
- return nullptr;
- }
-
- bool cache_exists = (bool)f["present"];
- int cache_type = (int)f["cache_type"];
-
- switch (cache_type) {
- case IMAGE_CACHE_TYPE_RWL:
- if (!cache_exists) {
- cache_state = new ImageCacheState<I>(image_ctx);
- } else {
- cache_state = new ImageCacheState<I>(image_ctx, f);
- }
- break;
- default:
- r = -EINVAL;
- }
- }
- return cache_state;
-}
-
-template <typename I>
-bool ImageCacheState<I>::is_valid() {
- if (this->present &&
- (host.compare(ceph_get_short_hostname()) != 0)) {
- auto cleanstring = "dirty";
- if (this->clean) {
- cleanstring = "clean";
- }
- lderr(m_image_ctx->cct) << "An image cache (RWL) remains on another host "
- << host << " which is " << cleanstring
- << ". Flush/close the image there to remove the "
- << "image cache" << dendl;
- return false;
- }
- return true;
-}
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-template class librbd::cache::rwl::ImageCacheState<librbd::ImageCtx>;
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
-#define CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
-
-#include "librbd/ImageCtx.h"
-#include "librbd/cache/Types.h"
-#include <string>
-
-class JSONFormattable;
-namespace ceph {
- class Formatter;
-}
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-template <typename ImageCtxT = ImageCtx>
-class ImageCacheState {
-private:
- ImageCtxT* m_image_ctx;
-public:
- bool present = true;
- bool empty = true;
- bool clean = true;
- std::string host;
- std::string path;
- uint64_t size;
- bool log_periodic_stats;
-
- ImageCacheState(ImageCtxT* image_ctx);
-
- ImageCacheState(ImageCtxT* image_ctx, JSONFormattable& f);
-
- ~ImageCacheState() {}
-
- ImageCacheType get_image_cache_type() const {
- return IMAGE_CACHE_TYPE_RWL;
- }
-
-
- void write_image_cache_state(Context *on_finish);
-
- void clear_image_cache_state(Context *on_finish);
-
- void dump(ceph::Formatter *f) const;
-
- static ImageCacheState<ImageCtxT>* get_image_cache_state(
- ImageCtxT* image_ctx, int &r);
-
- bool is_valid();
-};
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-extern template class librbd::cache::rwl::ImageCacheState<librbd::ImageCtx>;
-
-#endif // CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "librbd/cache/rwl/InitRequest.h"
-#include "librbd/Utils.h"
-#include "common/dout.h"
-#include "common/errno.h"
-#include "librbd/asio/ContextWQ.h"
-
-#if defined(WITH_RBD_RWL)
-#include "librbd/cache/rwl/ImageCacheState.h"
-#include "librbd/cache/WriteLogCache.h"
-#endif // WITH_RBD_RWL
-
-#include "librbd/cache/Utils.h"
-#include "librbd/ImageCtx.h"
-
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::rwl:InitRequest " \
- << this << " " << __func__ << ": "
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-using librbd::util::create_async_context_callback;
-using librbd::util::create_context_callback;
-
-template <typename I>
-InitRequest<I>* InitRequest<I>::create(I &image_ctx,
- Context *on_finish) {
- return new InitRequest(image_ctx, on_finish);
-}
-
-template <typename I>
-InitRequest<I>::InitRequest(I &image_ctx, Context *on_finish)
- : m_image_ctx(image_ctx),
- m_on_finish(create_async_context_callback(image_ctx, on_finish)),
- m_error_result(0) {
-}
-
-template <typename I>
-void InitRequest<I>::send() {
-#if defined(WITH_RBD_RWL)
- get_image_cache_state();
-#else
- finish();
-#endif // WITH_RBD_RWL
-}
-
-#if defined(WITH_RBD_RWL)
-template <typename I>
-void InitRequest<I>::get_image_cache_state() {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 10) << dendl;
-
- int r;
- auto cache_state = ImageCacheState<I>::get_image_cache_state(&m_image_ctx, r);
-
- if (r < 0 || !cache_state) {
- save_result(r);
- finish();
- return;
- } else if (!cache_state->is_valid()) {
- delete cache_state;
- cache_state = nullptr;
- lderr(cct) << "failed to get image cache state: " << cpp_strerror(r)
- << dendl;
- save_result(-ENOENT);
- finish();
- return;
- }
-
- auto cache_type = cache_state->get_image_cache_type();
- switch(cache_type) {
- case cache::IMAGE_CACHE_TYPE_RWL:
- m_image_ctx.image_cache =
- new librbd::cache::WriteLogCache<I>(m_image_ctx,
- cache_state);
- break;
- default:
- delete cache_state;
- cache_state = nullptr;
- save_result(-ENOENT);
- finish();
- return;
- }
-
- init_image_cache();
-}
-
-template <typename I>
-void InitRequest<I>::init_image_cache() {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 10) << dendl;
-
- using klass = InitRequest<I>;
- Context *ctx = create_context_callback<klass, &klass::handle_init_image_cache>(
- this);
- m_image_ctx.image_cache->init(ctx);
-}
-
-template <typename I>
-void InitRequest<I>::handle_init_image_cache(int r) {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 10) << dendl;
-
- if (r < 0) {
- lderr(cct) << "failed to init image cache: " << cpp_strerror(r)
- << dendl;
- delete m_image_ctx.image_cache;
- m_image_ctx.image_cache = nullptr;
- save_result(r);
- finish();
- return;
- }
- set_feature_bit();
-}
-
-template <typename I>
-void InitRequest<I>::set_feature_bit() {
- CephContext *cct = m_image_ctx.cct;
-
- uint64_t new_features = m_image_ctx.features | RBD_FEATURE_DIRTY_CACHE;
- uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE;
- ldout(cct, 10) << "old_features=" << m_image_ctx.features
- << ", new_features=" << new_features
- << ", features_mask=" << features_mask
- << dendl;
-
- int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx,
- m_image_ctx.header_oid,
- new_features, features_mask);
- m_image_ctx.features |= RBD_FEATURE_DIRTY_CACHE;
- using klass = InitRequest<I>;
- Context *ctx = create_context_callback<klass, &klass::handle_set_feature_bit>(
- this);
- ctx->complete(r);
-}
-
-template <typename I>
-void InitRequest<I>::handle_set_feature_bit(int r) {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 10) << "r=" << r << dendl;
-
- if (r < 0) {
- lderr(cct) << "failed to set feature bit: " << cpp_strerror(r)
- << dendl;
- save_result(r);
- } else if (m_image_ctx.discard_granularity_bytes) {
- ldout(cct, 1) << "RWL image cache is enabled and "
- << "set discard_granularity_bytes = 0." << dendl;
- m_image_ctx.discard_granularity_bytes = 0;
- }
- finish();
-}
-
-#endif // WITH_RBD_RWL
-
-template <typename I>
-void InitRequest<I>::finish() {
- m_on_finish->complete(m_error_result);
- delete this;
-}
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-template class librbd::cache::rwl::InitRequest<librbd::ImageCtx>;
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H
-#define CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H
-
-class Context;
-
-namespace librbd {
-
-class ImageCtx;
-
-namespace cache {
-namespace rwl {
-
-template<typename>
-class ImageCacheState;
-
-template <typename ImageCtxT = ImageCtx>
-class InitRequest {
-public:
- static InitRequest* create(ImageCtxT &image_ctx, Context *on_finish);
-
- void send();
-
-private:
-
- /**
- * @verbatim
- *
- * Init request goes through the following state machine:
- *
- * <start>
- * |
- * v
- * GET_IMAGE_CACHE_STATE
- * |
- * v
- * INIT_IMAGE_CACHE
- * |
- * v
- * SET_FEATURE_BIT
- * |
- * v
- * <finish>
- *
- * @endverbatim
- */
-
- InitRequest(ImageCtxT &image_ctx, Context *on_finish);
-
- ImageCtxT &m_image_ctx;
- Context *m_on_finish;
-
- int m_error_result;
-
- bool is_rwl_enabled();
-
- void get_image_cache_state();
-
- void init_image_cache();
- void handle_init_image_cache(int r);
-
- void set_feature_bit();
- void handle_set_feature_bit(int r);
-
- void finish();
-
- void save_result(int result) {
- if (m_error_result == 0 && result < 0) {
- m_error_result = result;
- }
- }
-};
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-extern template class librbd::cache::rwl::InitRequest<librbd::ImageCtx>;
-
-#endif // CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include <iostream>
-#include "LogEntry.h"
-#include "librbd/cache/ImageWriteback.h"
-
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::rwl::LogEntry: " << this << " " \
- << __func__ << ": "
-
-namespace librbd {
-
-namespace cache {
-
-namespace rwl {
-
-std::ostream& GenericLogEntry::format(std::ostream &os) const {
- os << "ram_entry=[" << ram_entry << "], "
- << "pmem_entry=" << (void*)pmem_entry << ", "
- << "log_entry_index=" << log_entry_index << ", "
- << "completed=" << completed;
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const GenericLogEntry &entry) {
- return entry.format(os);
-}
-
-std::ostream& SyncPointLogEntry::format(std::ostream &os) const {
- os << "(Sync Point) ";
- GenericLogEntry::format(os);
- os << ", "
- << "writes=" << writes << ", "
- << "bytes=" << bytes << ", "
- << "writes_completed=" << writes_completed << ", "
- << "writes_flushed=" << writes_flushed << ", "
- << "prior_sync_point_flushed=" << prior_sync_point_flushed << ", "
- << "next_sync_point_entry=" << next_sync_point_entry;
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const SyncPointLogEntry &entry) {
- return entry.format(os);
-}
-
-bool GenericWriteLogEntry::can_writeback() const {
- return (this->completed &&
- (ram_entry.sequenced ||
- (sync_point_entry &&
- sync_point_entry->completed)));
-}
-
-std::ostream& GenericWriteLogEntry::format(std::ostream &os) const {
- GenericLogEntry::format(os);
- os << ", "
- << "sync_point_entry=[";
- if (sync_point_entry) {
- os << *sync_point_entry;
- } else {
- os << "nullptr";
- }
- os << "], "
- << "referring_map_entries=" << referring_map_entries;
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const GenericWriteLogEntry &entry) {
- return entry.format(os);
-}
-
-void WriteLogEntry::init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation,
- uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush) {
- ram_entry.has_data = 1;
- ram_entry.write_data = allocation->buffer_oid;
- ceph_assert(!TOID_IS_NULL(ram_entry.write_data));
- pmem_buffer = D_RW(ram_entry.write_data);
- ram_entry.sync_gen_number = current_sync_gen;
- if (persist_on_flush) {
- /* Persist on flush. Sequence #0 is never used. */
- ram_entry.write_sequence_number = 0;
- } else {
- /* Persist on write */
- ram_entry.write_sequence_number = last_op_sequence_num;
- ram_entry.sequenced = 1;
- }
- ram_entry.sync_point = 0;
- ram_entry.discard = 0;
-}
-
-void WriteLogEntry::init_pmem_bp() {
- ceph_assert(!pmem_bp.have_raw());
- pmem_bp = buffer::ptr(buffer::create_static(this->write_bytes(), (char*)pmem_buffer));
-}
-
-void WriteLogEntry::init_pmem_bl() {
- pmem_bl.clear();
- init_pmem_bp();
- ceph_assert(pmem_bp.have_raw());
- int before_bl = pmem_bp.raw_nref();
- this->init_bl(pmem_bp, pmem_bl);
- int after_bl = pmem_bp.raw_nref();
- bl_refs = after_bl - before_bl;
-}
-
-unsigned int WriteLogEntry::reader_count() const {
- if (pmem_bp.have_raw()) {
- return (pmem_bp.raw_nref() - bl_refs - 1);
- } else {
- return 0;
- }
-}
-
-/* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */
-buffer::list& WriteLogEntry::get_pmem_bl() {
- if (0 == bl_refs) {
- std::lock_guard locker(m_entry_bl_lock);
- if (0 == bl_refs) {
- init_pmem_bl();
- }
- ceph_assert(0 != bl_refs);
- }
- return pmem_bl;
-}
-
-/* Constructs a new bl containing copies of pmem_bp */
-void WriteLogEntry::copy_pmem_bl(bufferlist *out_bl) {
- this->get_pmem_bl();
- /* pmem_bp is now initialized */
- buffer::ptr cloned_bp(pmem_bp.clone());
- out_bl->clear();
- this->init_bl(cloned_bp, *out_bl);
-}
-
-void WriteLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback,
- Context *ctx) {
- /* Pass a copy of the pmem buffer to ImageWriteback (which may hang on to the bl even after flush()). */
- bufferlist entry_bl;
- buffer::list entry_bl_copy;
- copy_pmem_bl(&entry_bl_copy);
- entry_bl_copy.begin(0).copy(write_bytes(), entry_bl);
- image_writeback.aio_write({{ram_entry.image_offset_bytes, ram_entry.write_bytes}},
- std::move(entry_bl), 0, ctx);
-}
-
-std::ostream& WriteLogEntry::format(std::ostream &os) const {
- os << "(Write) ";
- GenericWriteLogEntry::format(os);
- os << ", "
- << "pmem_buffer=" << (void*)pmem_buffer << ", ";
- os << "pmem_bp=" << pmem_bp << ", ";
- os << "pmem_bl=" << pmem_bl << ", ";
- os << "bl_refs=" << bl_refs;
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const WriteLogEntry &entry) {
- return entry.format(os);
-}
-
-void DiscardLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback,
- Context *ctx) {
- image_writeback.aio_discard(ram_entry.image_offset_bytes, ram_entry.write_bytes,
- m_discard_granularity_bytes, ctx);
-}
-
-void DiscardLogEntry::init(uint64_t current_sync_gen, bool persist_on_flush, uint64_t last_op_sequence_num) {
- ram_entry.sync_gen_number = current_sync_gen;
- if (persist_on_flush) {
- /* Persist on flush. Sequence #0 is never used. */
- ram_entry.write_sequence_number = 0;
- } else {
- /* Persist on write */
- ram_entry.write_sequence_number = last_op_sequence_num;
- ram_entry.sequenced = 1;
- }
-}
-
-std::ostream &DiscardLogEntry::format(std::ostream &os) const {
- os << "(Discard) ";
- GenericWriteLogEntry::format(os);
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const DiscardLogEntry &entry) {
- return entry.format(os);
-}
-
-void WriteSameLogEntry::init_bl(buffer::ptr &bp, buffer::list &bl) {
- for (uint64_t i = 0; i < ram_entry.write_bytes / ram_entry.ws_datalen; i++) {
- bl.append(bp);
- }
- int trailing_partial = ram_entry.write_bytes % ram_entry.ws_datalen;
- if (trailing_partial) {
- bl.append(bp, 0, trailing_partial);
- }
-}
-
-void WriteSameLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback,
- Context *ctx) {
- bufferlist entry_bl;
- buffer::list entry_bl_copy;
- copy_pmem_bl(&entry_bl_copy);
- entry_bl_copy.begin(0).copy(write_bytes(), entry_bl);
- image_writeback.aio_writesame(ram_entry.image_offset_bytes, ram_entry.write_bytes,
- std::move(entry_bl), 0, ctx);
-}
-
-std::ostream &WriteSameLogEntry::format(std::ostream &os) const {
- os << "(WriteSame) ";
- WriteLogEntry::format(os);
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const WriteSameLogEntry &entry) {
- return entry.format(os);
-}
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
-#define CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
-
-#include "common/ceph_mutex.h"
-#include "librbd/Utils.h"
-#include "librbd/cache/rwl/Types.h"
-#include <atomic>
-#include <memory>
-
-namespace librbd {
-namespace cache {
-class ImageWritebackInterface;
-namespace rwl {
-
-class SyncPointLogEntry;
-class GenericWriteLogEntry;
-class WriteLogEntry;
-
-typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
-
-class GenericLogEntry {
-public:
- WriteLogPmemEntry ram_entry;
- WriteLogPmemEntry *pmem_entry = nullptr;
- uint32_t log_entry_index = 0;
- bool completed = false;
- GenericLogEntry(const uint64_t image_offset_bytes = 0, const uint64_t write_bytes = 0)
- : ram_entry(image_offset_bytes, write_bytes) {
- };
- virtual ~GenericLogEntry() { };
- GenericLogEntry(const GenericLogEntry&) = delete;
- GenericLogEntry &operator=(const GenericLogEntry&) = delete;
- virtual bool can_writeback() const {
- return false;
- }
- virtual bool can_retire() const {
- return false;
- }
- virtual void set_flushed(bool flushed) {
- ceph_assert(false);
- }
- virtual unsigned int write_bytes() const {
- return 0;
- };
- virtual unsigned int bytes_dirty() const {
- return 0;
- };
- virtual std::shared_ptr<SyncPointLogEntry> get_sync_point_entry() {
- return nullptr;
- }
- virtual void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
- Context *ctx) {
- ceph_assert(false);
- };
- virtual std::ostream& format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const GenericLogEntry &entry);
-};
-
-class SyncPointLogEntry : public GenericLogEntry {
-public:
- /* Writing entries using this sync gen number */
- std::atomic<unsigned int> writes = {0};
- /* Total bytes for all writing entries using this sync gen number */
- std::atomic<uint64_t> bytes = {0};
- /* Writing entries using this sync gen number that have completed */
- std::atomic<unsigned int> writes_completed = {0};
- /* Writing entries using this sync gen number that have completed flushing to the writeback interface */
- std::atomic<unsigned int> writes_flushed = {0};
- /* All writing entries using all prior sync gen numbers have been flushed */
- std::atomic<bool> prior_sync_point_flushed = {true};
- std::shared_ptr<SyncPointLogEntry> next_sync_point_entry = nullptr;
- SyncPointLogEntry(const uint64_t sync_gen_number) {
- ram_entry.sync_gen_number = sync_gen_number;
- ram_entry.sync_point = 1;
- };
- ~SyncPointLogEntry() override {};
- SyncPointLogEntry(const SyncPointLogEntry&) = delete;
- SyncPointLogEntry &operator=(const SyncPointLogEntry&) = delete;
- bool can_retire() const override {
- return this->completed;
- }
- std::ostream& format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const SyncPointLogEntry &entry);
-};
-
-class GenericWriteLogEntry : public GenericLogEntry {
-public:
- uint32_t referring_map_entries = 0;
- std::shared_ptr<SyncPointLogEntry> sync_point_entry;
- GenericWriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
- const uint64_t image_offset_bytes, const uint64_t write_bytes)
- : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(sync_point_entry) { }
- GenericWriteLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
- : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(nullptr) { }
- ~GenericWriteLogEntry() override {};
- GenericWriteLogEntry(const GenericWriteLogEntry&) = delete;
- GenericWriteLogEntry &operator=(const GenericWriteLogEntry&) = delete;
- unsigned int write_bytes() const override {
- /* The valid bytes in this ops data buffer. Discard and WS override. */
- return ram_entry.write_bytes;
- };
- unsigned int bytes_dirty() const override {
- /* The bytes in the image this op makes dirty. Discard and WS override. */
- return write_bytes();
- };
- BlockExtent block_extent() {
- return ram_entry.block_extent();
- }
- uint32_t get_map_ref() {
- return(referring_map_entries);
- }
- void inc_map_ref() { referring_map_entries++; }
- void dec_map_ref() { referring_map_entries--; }
- bool can_writeback() const override;
- std::shared_ptr<SyncPointLogEntry> get_sync_point_entry() override {
- return sync_point_entry;
- }
- virtual void copy_pmem_bl(bufferlist *out_bl) = 0;
- void set_flushed(bool flushed) override {
- m_flushed = flushed;
- }
- bool get_flushed() const {
- return m_flushed;
- }
- std::ostream &format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const GenericWriteLogEntry &entry);
-
-private:
- bool m_flushed = false; /* or invalidated */
-};
-
-class WriteLogEntry : public GenericWriteLogEntry {
-protected:
- buffer::ptr pmem_bp;
- buffer::list pmem_bl;
- std::atomic<int> bl_refs = {0}; /* The refs held on pmem_bp by pmem_bl */
- /* Used in WriteLogEntry::get_pmem_bl() to syncronize between threads making entries readable */
- mutable ceph::mutex m_entry_bl_lock;
-
- void init_pmem_bp();
-
- /* Write same will override */
- virtual void init_bl(buffer::ptr &bp, buffer::list &bl) {
- bl.append(bp);
- }
-
- void init_pmem_bl();
-
-public:
- uint8_t *pmem_buffer = nullptr;
- WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
- const uint64_t image_offset_bytes, const uint64_t write_bytes)
- : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes),
- m_entry_bl_lock(ceph::make_mutex(util::unique_lock_name(
- "librbd::cache::rwl::WriteLogEntry::m_entry_bl_lock", this)))
- { }
- WriteLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
- : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes),
- m_entry_bl_lock(ceph::make_mutex(util::unique_lock_name(
- "librbd::cache::rwl::WriteLogEntry::m_entry_bl_lock", this)))
- { }
- ~WriteLogEntry() override {};
- WriteLogEntry(const WriteLogEntry&) = delete;
- WriteLogEntry &operator=(const WriteLogEntry&) = delete;
- void init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation,
- uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush);
- BlockExtent block_extent();
- unsigned int reader_count() const;
- /* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */
- buffer::list &get_pmem_bl();
- /* Constructs a new bl containing copies of pmem_bp */
- void copy_pmem_bl(bufferlist *out_bl) override;
- void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
- Context *ctx) override;
- bool can_retire() const override {
- return (this->completed && this->get_flushed() && (0 == reader_count()));
- }
- std::ostream &format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const WriteLogEntry &entry);
-};
-
-class DiscardLogEntry : public GenericWriteLogEntry {
-public:
- DiscardLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
- const uint64_t image_offset_bytes, const uint64_t write_bytes,
- uint32_t discard_granularity_bytes)
- : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes),
- m_discard_granularity_bytes(discard_granularity_bytes) {
- ram_entry.discard = 1;
- };
- DiscardLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
- : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes) {
- ram_entry.discard = 1;
- };
- DiscardLogEntry(const DiscardLogEntry&) = delete;
- DiscardLogEntry &operator=(const DiscardLogEntry&) = delete;
- unsigned int write_bytes() const override {
- /* The valid bytes in this ops data buffer. */
- return 0;
- };
- unsigned int bytes_dirty() const override {
- /* The bytes in the image this op makes dirty. */
- return ram_entry.write_bytes;
- };
- bool can_retire() const override {
- return this->completed;
- }
- void copy_pmem_bl(bufferlist *out_bl) override {
- ceph_assert(false);
- }
- void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
- Context *ctx) override;
- void init(uint64_t current_sync_gen, bool persist_on_flush, uint64_t last_op_sequence_num);
- std::ostream &format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const DiscardLogEntry &entry);
-private:
- uint32_t m_discard_granularity_bytes;
-};
-
-class WriteSameLogEntry : public WriteLogEntry {
-protected:
- void init_bl(buffer::ptr &bp, buffer::list &bl) override;
-
-public:
- WriteSameLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
- const uint64_t image_offset_bytes, const uint64_t write_bytes,
- const uint32_t data_length)
- : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {
- ram_entry.writesame = 1;
- ram_entry.ws_datalen = data_length;
- };
- WriteSameLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes,
- const uint32_t data_length)
- : WriteLogEntry(nullptr, image_offset_bytes, write_bytes) {
- ram_entry.writesame = 1;
- ram_entry.ws_datalen = data_length;
- };
- WriteSameLogEntry(const WriteSameLogEntry&) = delete;
- WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete;
- unsigned int write_bytes() const override {
- /* The valid bytes in this ops data buffer. */
- return ram_entry.ws_datalen;
- };
- unsigned int bytes_dirty() const override {
- /* The bytes in the image this op makes dirty. */
- return ram_entry.write_bytes;
- };
- void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
- Context *ctx) override;
- std::ostream &format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const WriteSameLogEntry &entry);
-};
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-#endif // CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "LogMap.h"
-#include "include/ceph_assert.h"
-#include "librbd/Utils.h"
-#include "librbd/cache/rwl/LogEntry.h"
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::rwl::LogMap: " << this << " " \
- << __func__ << ": "
-template <typename T>
-std::ostream &operator<<(std::ostream &os,
- LogMapEntry<T> &e) {
- os << "block_extent=" << e.block_extent << ", "
- << "log_entry=[" << e.log_entry << "]";
- return os;
-}
-
-template <typename T>
-LogMapEntry<T>::LogMapEntry(const BlockExtent block_extent,
- std::shared_ptr<T> log_entry)
- : block_extent(block_extent) , log_entry(log_entry) {
-}
-
-template <typename T>
-LogMapEntry<T>::LogMapEntry(std::shared_ptr<T> log_entry)
- : block_extent(log_entry->block_extent()) , log_entry(log_entry) {
-}
-
-template <typename T>
-LogMap<T>::LogMap(CephContext *cct)
- : m_cct(cct),
- m_lock(ceph::make_mutex(util::unique_lock_name(
- "librbd::cache::rwl::LogMap::m_lock", this))) {
-}
-
-/**
- * Add a write log entry to the map. Subsequent queries for blocks
- * within this log entry's extent will find this log entry. Portions
- * of prior write log entries overlapping with this log entry will
- * be replaced in the map by this log entry.
- *
- * The map_entries field of the log entry object will be updated to
- * contain this map entry.
- *
- * The map_entries fields of all log entries overlapping with this
- * entry will be updated to remove the regions that overlap with
- * this.
- */
-template <typename T>
-void LogMap<T>::add_log_entry(std::shared_ptr<T> log_entry) {
- std::lock_guard locker(m_lock);
- add_log_entry_locked(log_entry);
-}
-
-template <typename T>
-void LogMap<T>::add_log_entries(std::list<std::shared_ptr<T>> &log_entries) {
- std::lock_guard locker(m_lock);
- ldout(m_cct, 20) << dendl;
- for (auto &log_entry : log_entries) {
- add_log_entry_locked(log_entry);
- }
-}
-
-/**
- * Remove any map entries that refer to the supplied write log
- * entry.
- */
-template <typename T>
-void LogMap<T>::remove_log_entry(std::shared_ptr<T> log_entry) {
- std::lock_guard locker(m_lock);
- remove_log_entry_locked(log_entry);
-}
-
-template <typename T>
-void LogMap<T>::remove_log_entries(std::list<std::shared_ptr<T>> &log_entries) {
- std::lock_guard locker(m_lock);
- ldout(m_cct, 20) << dendl;
- for (auto &log_entry : log_entries) {
- remove_log_entry_locked(log_entry);
- }
-}
-
-/**
- * Returns the list of all write log entries that overlap the specified block
- * extent. This doesn't tell you which portions of these entries overlap the
- * extent, or each other. For that, use find_map_entries(). A log entry may
- * appear in the list more than once, if multiple map entries refer to it
- * (e.g. the middle of that write log entry has been overwritten).
- */
-template <typename T>
-std::list<std::shared_ptr<T>> LogMap<T>::find_log_entries(BlockExtent block_extent) {
- std::lock_guard locker(m_lock);
- ldout(m_cct, 20) << dendl;
- return find_log_entries_locked(block_extent);
-}
-
-/**
- * Returns the list of all write log map entries that overlap the
- * specified block extent.
- */
-template <typename T>
-LogMapEntries<T> LogMap<T>::find_map_entries(BlockExtent block_extent) {
- std::lock_guard locker(m_lock);
- ldout(m_cct, 20) << dendl;
- return find_map_entries_locked(block_extent);
-}
-
-template <typename T>
-void LogMap<T>::add_log_entry_locked(std::shared_ptr<T> log_entry) {
- LogMapEntry<T> map_entry(log_entry);
- ldout(m_cct, 20) << "block_extent=" << map_entry.block_extent
- << dendl;
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
- LogMapEntries<T> overlap_entries = find_map_entries_locked(map_entry.block_extent);
- for (auto &entry : overlap_entries) {
- ldout(m_cct, 20) << entry << dendl;
- if (map_entry.block_extent.block_start <= entry.block_extent.block_start) {
- if (map_entry.block_extent.block_end >= entry.block_extent.block_end) {
- ldout(m_cct, 20) << "map entry completely occluded by new log entry" << dendl;
- remove_map_entry_locked(entry);
- } else {
- ceph_assert(map_entry.block_extent.block_end < entry.block_extent.block_end);
- /* The new entry occludes the beginning of the old entry */
- BlockExtent adjusted_extent(map_entry.block_extent.block_end,
- entry.block_extent.block_end);
- adjust_map_entry_locked(entry, adjusted_extent);
- }
- } else {
- if (map_entry.block_extent.block_end >= entry.block_extent.block_end) {
- /* The new entry occludes the end of the old entry */
- BlockExtent adjusted_extent(entry.block_extent.block_start,
- map_entry.block_extent.block_start);
- adjust_map_entry_locked(entry, adjusted_extent);
- } else {
- /* The new entry splits the old entry */
- split_map_entry_locked(entry, map_entry.block_extent);
- }
- }
- }
- add_map_entry_locked(map_entry);
-}
-
-template <typename T>
-void LogMap<T>::remove_log_entry_locked(std::shared_ptr<T> log_entry) {
- ldout(m_cct, 20) << "*log_entry=" << *log_entry << dendl;
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
-
- LogMapEntries<T> possible_hits = find_map_entries_locked(log_entry->block_extent());
- for (auto &possible_hit : possible_hits) {
- if (possible_hit.log_entry == log_entry) {
- /* This map entry refers to the specified log entry */
- remove_map_entry_locked(possible_hit);
- }
- }
-}
-
-template <typename T>
-void LogMap<T>::add_map_entry_locked(LogMapEntry<T> &map_entry) {
- ceph_assert(map_entry.log_entry);
- m_block_to_log_entry_map.insert(map_entry);
- map_entry.log_entry->inc_map_ref();
-}
-
-template <typename T>
-void LogMap<T>::remove_map_entry_locked(LogMapEntry<T> &map_entry) {
- auto it = m_block_to_log_entry_map.find(map_entry);
- ceph_assert(it != m_block_to_log_entry_map.end());
-
- LogMapEntry<T> erased = *it;
- m_block_to_log_entry_map.erase(it);
- erased.log_entry->dec_map_ref();
- if (0 == erased.log_entry->get_map_ref()) {
- ldout(m_cct, 20) << "log entry has zero map entries: " << erased.log_entry << dendl;
- }
-}
-
-template <typename T>
-void LogMap<T>::adjust_map_entry_locked(LogMapEntry<T> &map_entry, BlockExtent &new_extent) {
- auto it = m_block_to_log_entry_map.find(map_entry);
- ceph_assert(it != m_block_to_log_entry_map.end());
-
- LogMapEntry<T> adjusted = *it;
- m_block_to_log_entry_map.erase(it);
-
- m_block_to_log_entry_map.insert(LogMapEntry<T>(new_extent, adjusted.log_entry));
-}
-
-template <typename T>
-void LogMap<T>::split_map_entry_locked(LogMapEntry<T> &map_entry, BlockExtent &removed_extent) {
- auto it = m_block_to_log_entry_map.find(map_entry);
- ceph_assert(it != m_block_to_log_entry_map.end());
-
- LogMapEntry<T> split = *it;
- m_block_to_log_entry_map.erase(it);
-
- BlockExtent left_extent(split.block_extent.block_start,
- removed_extent.block_start);
- m_block_to_log_entry_map.insert(LogMapEntry<T>(left_extent, split.log_entry));
-
- BlockExtent right_extent(removed_extent.block_end,
- split.block_extent.block_end);
- m_block_to_log_entry_map.insert(LogMapEntry<T>(right_extent, split.log_entry));
-
- split.log_entry->inc_map_ref();
-}
-
-template <typename T>
-std::list<std::shared_ptr<T>> LogMap<T>::find_log_entries_locked(const BlockExtent &block_extent) {
- std::list<std::shared_ptr<T>> overlaps;
- ldout(m_cct, 20) << "block_extent=" << block_extent << dendl;
-
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
- LogMapEntries<T> map_entries = find_map_entries_locked(block_extent);
- for (auto &map_entry : map_entries) {
- overlaps.emplace_back(map_entry.log_entry);
- }
- return overlaps;
-}
-
-/**
- * TODO: Generalize this to do some arbitrary thing to each map
- * extent, instead of returning a list.
- */
-template <typename T>
-LogMapEntries<T> LogMap<T>::find_map_entries_locked(const BlockExtent &block_extent) {
- LogMapEntries<T> overlaps;
-
- ldout(m_cct, 20) << "block_extent=" << block_extent << dendl;
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
- auto p = m_block_to_log_entry_map.equal_range(LogMapEntry<T>(block_extent));
- ldout(m_cct, 20) << "count=" << std::distance(p.first, p.second) << dendl;
- for ( auto i = p.first; i != p.second; ++i ) {
- LogMapEntry<T> entry = *i;
- overlaps.emplace_back(entry);
- ldout(m_cct, 20) << entry << dendl;
- }
- return overlaps;
-}
-
-/* We map block extents to write log entries, or portions of write log
- * entries. These are both represented by a WriteLogMapEntry. When a
- * GenericWriteLogEntry is added to this map, a WriteLogMapEntry is created to
- * represent the entire block extent of the GenericWriteLogEntry, and the
- * WriteLogMapEntry is added to the set.
- *
- * The set must not contain overlapping WriteLogMapEntrys. WriteLogMapEntrys
- * in the set that overlap with one being added are adjusted (shrunk, split,
- * or removed) before the new entry is added.
- *
- * This comparison works despite the ambiguity because we ensure the set
- * contains no overlapping entries. This comparison works to find entries
- * that overlap with a given block extent because equal_range() returns the
- * first entry in which the extent doesn't end before the given extent
- * starts, and the last entry for which the extent starts before the given
- * extent ends (the first entry that the key is less than, and the last entry
- * that is less than the key).
- */
-template <typename T>
-bool LogMap<T>::LogMapEntryCompare::operator()(const LogMapEntry<T> &lhs,
- const LogMapEntry<T> &rhs) const {
- if (lhs.block_extent.block_end <= rhs.block_extent.block_start) {
- return true;
- }
- return false;
-}
-
-} //namespace rwl
-} //namespace cache
-} //namespace librbd
-
-template class librbd::cache::rwl::LogMap<librbd::cache::rwl::GenericWriteLogEntry>;
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H
-#define CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H
-
-#include "librbd/BlockGuard.h"
-#include <list>
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-/**
- * WriteLogMap: maps block extents to GenericWriteLogEntries
- *
- * A WriteLogMapEntry (based on LogMapEntry) refers to a portion of a GenericWriteLogEntry
- */
-template <typename T>
-class LogMapEntry {
-public:
- BlockExtent block_extent;
- std::shared_ptr<T> log_entry;
-
- LogMapEntry(BlockExtent block_extent,
- std::shared_ptr<T> log_entry = nullptr);
- LogMapEntry(std::shared_ptr<T> log_entry);
-
- template <typename U>
- friend std::ostream &operator<<(std::ostream &os,
- LogMapEntry<U> &e);
-};
-
-template <typename T>
-using LogMapEntries = std::list<LogMapEntry<T>>;
-
-template <typename T>
-class LogMap {
-public:
- LogMap(CephContext *cct);
- LogMap(const LogMap&) = delete;
- LogMap &operator=(const LogMap&) = delete;
-
- void add_log_entry(std::shared_ptr<T> log_entry);
- void add_log_entries(std::list<std::shared_ptr<T>> &log_entries);
- void remove_log_entry(std::shared_ptr<T> log_entry);
- void remove_log_entries(std::list<std::shared_ptr<T>> &log_entries);
- std::list<std::shared_ptr<T>> find_log_entries(BlockExtent block_extent);
- LogMapEntries<T> find_map_entries(BlockExtent block_extent);
-
-private:
- void add_log_entry_locked(std::shared_ptr<T> log_entry);
- void remove_log_entry_locked(std::shared_ptr<T> log_entry);
- void add_map_entry_locked(LogMapEntry<T> &map_entry);
- void remove_map_entry_locked(LogMapEntry<T> &map_entry);
- void adjust_map_entry_locked(LogMapEntry<T> &map_entry, BlockExtent &new_extent);
- void split_map_entry_locked(LogMapEntry<T> &map_entry, BlockExtent &removed_extent);
- std::list<std::shared_ptr<T>> find_log_entries_locked(const BlockExtent &block_extent);
- LogMapEntries<T> find_map_entries_locked(const BlockExtent &block_extent);
-
- using LogMapEntryT = LogMapEntry<T>;
-
- class LogMapEntryCompare {
- public:
- bool operator()(const LogMapEntryT &lhs,
- const LogMapEntryT &rhs) const;
- };
-
- using BlockExtentToLogMapEntries = std::set<LogMapEntryT,
- LogMapEntryCompare>;
-
- CephContext *m_cct;
- ceph::mutex m_lock;
- BlockExtentToLogMapEntries m_block_to_log_entry_map;
-};
-
-} //namespace rwl
-} //namespace cache
-} //namespace librbd
-
-#endif //CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include <iostream>
-#include "LogOperation.h"
-#include "librbd/cache/rwl/Types.h"
-
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::rwl::LogOperation: " << this << " " \
- << __func__ << ": "
-
-namespace librbd {
-
-namespace cache {
-
-namespace rwl {
-
-GenericLogOperation::GenericLogOperation(const utime_t dispatch_time, PerfCounters *perfcounter)
- : m_perfcounter(perfcounter), dispatch_time(dispatch_time) {
-}
-
-std::ostream& GenericLogOperation::format(std::ostream &os) const {
- os << "dispatch_time=[" << dispatch_time << "], "
- << "buf_persist_time=[" << buf_persist_time << "], "
- << "buf_persist_comp_time=[" << buf_persist_comp_time << "], "
- << "log_append_time=[" << log_append_time << "], "
- << "log_append_comp_time=[" << log_append_comp_time << "], ";
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const GenericLogOperation &op) {
- return op.format(os);
-}
-
-SyncPointLogOperation::SyncPointLogOperation(ceph::mutex &lock,
- std::shared_ptr<SyncPoint> sync_point,
- const utime_t dispatch_time,
- PerfCounters *perfcounter,
- CephContext *cct)
- : GenericLogOperation(dispatch_time, perfcounter), m_cct(cct), m_lock(lock), sync_point(sync_point) {
-}
-
-SyncPointLogOperation::~SyncPointLogOperation() { }
-
-std::ostream &SyncPointLogOperation::format(std::ostream &os) const {
- os << "(Sync Point) ";
- GenericLogOperation::format(os);
- os << ", "
- << "sync_point=[" << *sync_point << "]";
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const SyncPointLogOperation &op) {
- return op.format(os);
-}
-
-std::vector<Context*> SyncPointLogOperation::append_sync_point() {
- std::vector<Context*> appending_contexts;
- std::lock_guard locker(m_lock);
- if (!sync_point->appending) {
- sync_point->appending = true;
- }
- appending_contexts.swap(sync_point->on_sync_point_appending);
- return appending_contexts;
-}
-
-void SyncPointLogOperation::clear_earlier_sync_point() {
- std::lock_guard locker(m_lock);
- ceph_assert(sync_point->later_sync_point);
- ceph_assert(sync_point->later_sync_point->earlier_sync_point ==
- sync_point);
- sync_point->later_sync_point->earlier_sync_point = nullptr;
-}
-
-std::vector<Context*> SyncPointLogOperation::swap_on_sync_point_persisted() {
- std::lock_guard locker(m_lock);
- std::vector<Context*> persisted_contexts;
- persisted_contexts.swap(sync_point->on_sync_point_persisted);
- return persisted_contexts;
-}
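-
-/* Note: append_sync_point() and swap_on_sync_point_persisted() swap their
- * pending context lists out while m_lock is held and return them to the
- * caller; appending() and complete() below fire those contexts only after
- * the lock is released, so the callbacks never run under m_lock. */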
-
-void SyncPointLogOperation::appending() {
- ceph_assert(sync_point);
- ldout(m_cct, 20) << "Sync point op=[" << *this
- << "] appending" << dendl;
- auto appending_contexts = append_sync_point();
- for (auto &ctx : appending_contexts) {
- ctx->complete(0);
- }
-}
-
-void SyncPointLogOperation::complete(int result) {
- ceph_assert(sync_point);
- ldout(m_cct, 20) << "Sync point op =[" << *this
- << "] completed" << dendl;
- clear_earlier_sync_point();
-
- /* Do append now in case completion occurred before the
- * normal append callback executed, and to handle
- * on_append work that was queued after the sync point
- * entered the appending state. */
- appending();
- auto persisted_contexts = swap_on_sync_point_persisted();
- for (auto &ctx : persisted_contexts) {
- ctx->complete(result);
- }
-}
-
-GenericWriteLogOperation::GenericWriteLogOperation(std::shared_ptr<SyncPoint> sync_point,
- const utime_t dispatch_time,
- PerfCounters *perfcounter,
- CephContext *cct)
- : GenericLogOperation(dispatch_time, perfcounter),
- m_lock(ceph::make_mutex(util::unique_lock_name(
- "librbd::cache::rwl::GenericWriteLogOperation::m_lock", this))),
- m_cct(cct),
- sync_point(sync_point) {
-}
-
-GenericWriteLogOperation::~GenericWriteLogOperation() { }
-
-std::ostream &GenericWriteLogOperation::format(std::ostream &os) const {
- GenericLogOperation::format(os);
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const GenericWriteLogOperation &op) {
- return op.format(os);
-}
-
-/* Called when the write log operation is appending and its log position is guaranteed */
-void GenericWriteLogOperation::appending() {
- Context *on_append = nullptr;
- ldout(m_cct, 20) << __func__ << " " << this << dendl;
- {
- std::lock_guard locker(m_lock);
- on_append = on_write_append;
- on_write_append = nullptr;
- }
- if (on_append) {
- ldout(m_cct, 20) << __func__ << " " << this << " on_append=" << on_append << dendl;
- on_append->complete(0);
- }
-}
-
-/* Called when the write log operation is completed in all log replicas */
-void GenericWriteLogOperation::complete(int result) {
- appending();
- Context *on_persist = nullptr;
- ldout(m_cct, 20) << __func__ << " " << this << dendl;
- {
- std::lock_guard locker(m_lock);
- on_persist = on_write_persist;
- on_write_persist = nullptr;
- }
- if (on_persist) {
- ldout(m_cct, 20) << __func__ << " " << this << " on_persist=" << on_persist << dendl;
- on_persist->complete(result);
- }
-}
-
-WriteLogOperation::WriteLogOperation(WriteLogOperationSet &set,
- uint64_t image_offset_bytes, uint64_t write_bytes,
- CephContext *cct)
- : GenericWriteLogOperation(set.sync_point, set.dispatch_time, set.perfcounter, cct),
- log_entry(std::make_shared<WriteLogEntry>(set.sync_point->log_entry, image_offset_bytes, write_bytes)) {
- on_write_append = set.extent_ops_appending->new_sub();
- on_write_persist = set.extent_ops_persist->new_sub();
- log_entry->sync_point_entry->writes++;
- log_entry->sync_point_entry->bytes += write_bytes;
-}
-
-WriteLogOperation::~WriteLogOperation() { }
-
-void WriteLogOperation::init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation, uint64_t current_sync_gen,
- uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset,
- bool persist_on_flush) {
- log_entry->init(has_data, allocation, current_sync_gen, last_op_sequence_num, persist_on_flush);
- buffer_alloc = &(*allocation);
- bl.substr_of(write_req_bl, buffer_offset,
- log_entry->write_bytes());
-}
-
-std::ostream &WriteLogOperation::format(std::ostream &os) const {
- os << "(Write) ";
- GenericWriteLogOperation::format(os);
- os << ", ";
- if (log_entry) {
- os << "log_entry=[" << *log_entry << "], ";
- } else {
- os << "log_entry=nullptr, ";
- }
- os << "bl=[" << bl << "],"
- << "buffer_alloc=" << buffer_alloc;
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const WriteLogOperation &op) {
- return op.format(os);
-}
-
-
-void WriteLogOperation::complete(int result) {
- GenericWriteLogOperation::complete(result);
- m_perfcounter->tinc(l_librbd_rwl_log_op_dis_to_buf_t, buf_persist_time - dispatch_time);
- utime_t buf_lat = buf_persist_comp_time - buf_persist_time;
- m_perfcounter->tinc(l_librbd_rwl_log_op_buf_to_bufc_t, buf_lat);
- m_perfcounter->hinc(l_librbd_rwl_log_op_buf_to_bufc_t_hist, buf_lat.to_nsec(),
- log_entry->ram_entry.write_bytes);
- m_perfcounter->tinc(l_librbd_rwl_log_op_buf_to_app_t, log_append_time - buf_persist_time);
-}
-
-void WriteLogOperation::copy_bl_to_pmem_buffer() {
-  /* The operation is held by a shared_ptr; bl and log_entry remain valid
-   * for the duration of this copy only while the operation is in scope */
- bufferlist::iterator i(&bl);
- m_perfcounter->inc(l_librbd_rwl_log_op_bytes, log_entry->write_bytes());
- ldout(m_cct, 20) << bl << dendl;
- i.copy((unsigned)log_entry->write_bytes(), (char*)log_entry->pmem_buffer);
-}
-
-void WriteLogOperation::flush_pmem_buf_to_cache(PMEMobjpool *log_pool) {
- buf_persist_time = ceph_clock_now();
- pmemobj_flush(log_pool, log_entry->pmem_buffer, log_entry->write_bytes());
-}
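-
-/* Persisting a payload is two-phase here: copy_bl_to_pmem_buffer() copies
- * the data into the pmem-backed buffer, and flush_pmem_buf_to_cache()
- * starts the cache-line write-back with pmemobj_flush(). pmemobj_flush()
- * does not by itself guarantee durability; the caller is expected to issue
- * a matching pmemobj_drain() before treating the buffer as persistent. */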
-
-WriteLogOperationSet::WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr<SyncPoint> sync_point,
- bool persist_on_flush, CephContext *cct, Context *on_finish)
- : m_cct(cct), m_on_finish(on_finish),
- persist_on_flush(persist_on_flush),
- dispatch_time(dispatched),
- perfcounter(perfcounter),
- sync_point(sync_point) {
- on_ops_appending = sync_point->prior_persisted_gather_new_sub();
- on_ops_persist = nullptr;
- extent_ops_persist =
- new C_Gather(m_cct,
- new LambdaContext( [this](int r) {
- ldout(this->m_cct,20) << __func__ << " " << this << " m_extent_ops_persist completed" << dendl;
- if (on_ops_persist) {
- on_ops_persist->complete(r);
- }
- m_on_finish->complete(r);
- }));
- auto appending_persist_sub = extent_ops_persist->new_sub();
- extent_ops_appending =
- new C_Gather(m_cct,
- new LambdaContext( [this, appending_persist_sub](int r) {
- ldout(this->m_cct, 20) << __func__ << " " << this << " m_extent_ops_appending completed" << dendl;
- on_ops_appending->complete(r);
- appending_persist_sub->complete(r);
- }));
-}
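-
-/* Sketch of the completion chain assembled above: each per-extent
- * operation holds one sub of extent_ops_appending and one of
- * extent_ops_persist. extent_ops_appending's finisher fires
- * on_ops_appending plus a dedicated sub of extent_ops_persist, so the
- * persist gather can never complete before the append gather.
- * extent_ops_persist's finisher then fires on_ops_persist (when set)
- * followed by m_on_finish. */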
-
-WriteLogOperationSet::~WriteLogOperationSet() { }
-
-std::ostream &operator<<(std::ostream &os,
- const WriteLogOperationSet &s) {
- os << "cell=" << (void*)s.cell << ", "
- << "extent_ops_appending=[" << s.extent_ops_appending << ", "
- << "extent_ops_persist=[" << s.extent_ops_persist << "]";
- return os;
-}
-
-DiscardLogOperation::DiscardLogOperation(std::shared_ptr<SyncPoint> sync_point,
- const uint64_t image_offset_bytes,
- const uint64_t write_bytes,
- uint32_t discard_granularity_bytes,
- const utime_t dispatch_time,
- PerfCounters *perfcounter,
- CephContext *cct)
- : GenericWriteLogOperation(sync_point, dispatch_time, perfcounter, cct),
- log_entry(std::make_shared<DiscardLogEntry>(sync_point->log_entry,
- image_offset_bytes,
- write_bytes,
- discard_granularity_bytes)) {
- on_write_append = sync_point->prior_persisted_gather_new_sub();
- on_write_persist = nullptr;
- log_entry->sync_point_entry->writes++;
- log_entry->sync_point_entry->bytes += write_bytes;
-}
-
-DiscardLogOperation::~DiscardLogOperation() { }
-
-void DiscardLogOperation::init(uint64_t current_sync_gen, bool persist_on_flush,
- uint64_t last_op_sequence_num, Context *write_persist) {
- log_entry->init(current_sync_gen, persist_on_flush, last_op_sequence_num);
- this->on_write_persist = write_persist;
-}
-
-std::ostream &DiscardLogOperation::format(std::ostream &os) const {
- os << "(Discard) ";
- GenericWriteLogOperation::format(os);
- os << ", ";
- if (log_entry) {
- os << "log_entry=[" << *log_entry << "], ";
- } else {
- os << "log_entry=nullptr, ";
- }
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const DiscardLogOperation &op) {
- return op.format(os);
-}
-
-WriteSameLogOperation::WriteSameLogOperation(WriteLogOperationSet &set,
- uint64_t image_offset_bytes,
- uint64_t write_bytes,
- uint32_t data_len,
- CephContext *cct)
- : WriteLogOperation(set, image_offset_bytes, write_bytes, cct) {
- log_entry =
- std::make_shared<WriteSameLogEntry>(set.sync_point->log_entry, image_offset_bytes, write_bytes, data_len);
- ldout(m_cct, 20) << __func__ << " " << this << dendl;
-}
-
-WriteSameLogOperation::~WriteSameLogOperation() { }
-
-std::ostream &WriteSameLogOperation::format(std::ostream &os) const {
- os << "(Write Same) ";
- WriteLogOperation::format(os);
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const WriteSameLogOperation &op) {
- return op.format(os);
-}
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H
-#define CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H
-
-#include "include/utime.h"
-#include "librbd/cache/rwl/LogEntry.h"
-#include "librbd/cache/rwl/SyncPoint.h"
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-struct WriteBufferAllocation;
-
-class WriteLogOperationSet;
-
-class WriteLogOperation;
-
-class GenericWriteLogOperation;
-
-class SyncPointLogOperation;
-
-class GenericLogOperation;
-
-using GenericLogOperationSharedPtr = std::shared_ptr<GenericLogOperation>;
-
-using GenericLogOperationsVector = std::vector<GenericLogOperationSharedPtr>;
-
-class GenericLogOperation {
-protected:
- PerfCounters *m_perfcounter = nullptr;
-public:
- utime_t dispatch_time; // When op created
- utime_t buf_persist_time; // When buffer persist begins
- utime_t buf_persist_comp_time; // When buffer persist completes
- utime_t log_append_time; // When log append begins
- utime_t log_append_comp_time; // When log append completes
- GenericLogOperation(const utime_t dispatch_time, PerfCounters *perfcounter);
- virtual ~GenericLogOperation() { };
- GenericLogOperation(const GenericLogOperation&) = delete;
- GenericLogOperation &operator=(const GenericLogOperation&) = delete;
- virtual std::ostream &format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const GenericLogOperation &op);
- virtual const std::shared_ptr<GenericLogEntry> get_log_entry() = 0;
- virtual void appending() = 0;
- virtual void complete(int r) = 0;
- virtual void mark_log_entry_completed() {};
- virtual bool reserved_allocated() const {
- return false;
- }
- virtual bool is_writing_op() const {
- return false;
- }
- virtual void copy_bl_to_pmem_buffer() {};
- virtual void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) {};
-};
-
-class SyncPointLogOperation : public GenericLogOperation {
-private:
- CephContext *m_cct;
- ceph::mutex &m_lock;
- std::vector<Context*> append_sync_point();
- void clear_earlier_sync_point();
- std::vector<Context*> swap_on_sync_point_persisted();
-public:
- std::shared_ptr<SyncPoint> sync_point;
- SyncPointLogOperation(ceph::mutex &lock,
- std::shared_ptr<SyncPoint> sync_point,
- const utime_t dispatch_time,
- PerfCounters *perfcounter,
- CephContext *cct);
- ~SyncPointLogOperation() override;
- SyncPointLogOperation(const SyncPointLogOperation&) = delete;
- SyncPointLogOperation &operator=(const SyncPointLogOperation&) = delete;
- std::ostream &format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const SyncPointLogOperation &op);
- const std::shared_ptr<GenericLogEntry> get_log_entry() override {
- return sync_point->log_entry;
- }
- void appending() override;
- void complete(int r) override;
-};
-
-class GenericWriteLogOperation : public GenericLogOperation {
-protected:
- ceph::mutex m_lock;
- CephContext *m_cct;
-public:
- std::shared_ptr<SyncPoint> sync_point;
- Context *on_write_append = nullptr; /* Completion for things waiting on this
- * write's position in the log to be
- * guaranteed */
- Context *on_write_persist = nullptr; /* Completion for things waiting on this
- * write to persist */
- GenericWriteLogOperation(std::shared_ptr<SyncPoint> sync_point,
- const utime_t dispatch_time,
- PerfCounters *perfcounter,
- CephContext *cct);
- ~GenericWriteLogOperation() override;
- GenericWriteLogOperation(const GenericWriteLogOperation&) = delete;
- GenericWriteLogOperation &operator=(const GenericWriteLogOperation&) = delete;
- std::ostream &format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const GenericWriteLogOperation &op);
-  void mark_log_entry_completed() override {
- sync_point->log_entry->writes_completed++;
- }
- bool reserved_allocated() const override {
- return true;
- }
- bool is_writing_op() const override {
- return true;
- }
- void appending() override;
- void complete(int r) override;
-};
-
-class WriteLogOperation : public GenericWriteLogOperation {
-public:
- using GenericWriteLogOperation::m_lock;
- using GenericWriteLogOperation::sync_point;
- using GenericWriteLogOperation::on_write_append;
- using GenericWriteLogOperation::on_write_persist;
- std::shared_ptr<WriteLogEntry> log_entry;
- bufferlist bl;
- WriteBufferAllocation *buffer_alloc = nullptr;
- WriteLogOperation(WriteLogOperationSet &set, const uint64_t image_offset_bytes,
- const uint64_t write_bytes, CephContext *cct);
- ~WriteLogOperation() override;
- WriteLogOperation(const WriteLogOperation&) = delete;
- WriteLogOperation &operator=(const WriteLogOperation&) = delete;
- void init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation, uint64_t current_sync_gen,
- uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset,
- bool persist_on_flush);
- std::ostream &format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const WriteLogOperation &op);
- const std::shared_ptr<GenericLogEntry> get_log_entry() override {
- return log_entry;
- }
-
- void complete(int r) override;
- void copy_bl_to_pmem_buffer() override;
- void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) override;
-};
-
-
-class WriteLogOperationSet {
-private:
- CephContext *m_cct;
- Context *m_on_finish;
-public:
- bool persist_on_flush;
- BlockGuardCell *cell;
- C_Gather *extent_ops_appending;
- Context *on_ops_appending;
- C_Gather *extent_ops_persist;
- Context *on_ops_persist;
- GenericLogOperationsVector operations;
- utime_t dispatch_time; /* When set created */
- PerfCounters *perfcounter = nullptr;
- std::shared_ptr<SyncPoint> sync_point;
- WriteLogOperationSet(const utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr<SyncPoint> sync_point,
- const bool persist_on_flush, CephContext *cct, Context *on_finish);
- ~WriteLogOperationSet();
- WriteLogOperationSet(const WriteLogOperationSet&) = delete;
- WriteLogOperationSet &operator=(const WriteLogOperationSet&) = delete;
- friend std::ostream &operator<<(std::ostream &os,
- const WriteLogOperationSet &s);
-};
-
-class DiscardLogOperation : public GenericWriteLogOperation {
-public:
- using GenericWriteLogOperation::m_lock;
- using GenericWriteLogOperation::sync_point;
- using GenericWriteLogOperation::on_write_append;
- using GenericWriteLogOperation::on_write_persist;
- std::shared_ptr<DiscardLogEntry> log_entry;
- DiscardLogOperation(std::shared_ptr<SyncPoint> sync_point,
- const uint64_t image_offset_bytes,
- const uint64_t write_bytes,
- uint32_t discard_granularity_bytes,
- const utime_t dispatch_time,
- PerfCounters *perfcounter,
- CephContext *cct);
- ~DiscardLogOperation() override;
- DiscardLogOperation(const DiscardLogOperation&) = delete;
- DiscardLogOperation &operator=(const DiscardLogOperation&) = delete;
- const std::shared_ptr<GenericLogEntry> get_log_entry() override {
- return log_entry;
- }
- bool reserved_allocated() const override {
- return false;
- }
- void init(uint64_t current_sync_gen, bool persist_on_flush,
- uint64_t last_op_sequence_num, Context *write_persist);
- std::ostream &format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const DiscardLogOperation &op);
-};
-
-class WriteSameLogOperation : public WriteLogOperation {
-public:
- using GenericWriteLogOperation::m_lock;
- using GenericWriteLogOperation::sync_point;
- using GenericWriteLogOperation::on_write_append;
- using GenericWriteLogOperation::on_write_persist;
- using WriteLogOperation::log_entry;
- using WriteLogOperation::bl;
- using WriteLogOperation::buffer_alloc;
- WriteSameLogOperation(WriteLogOperationSet &set,
- const uint64_t image_offset_bytes,
- const uint64_t write_bytes,
- const uint32_t data_len,
- CephContext *cct);
- ~WriteSameLogOperation();
- WriteSameLogOperation(const WriteSameLogOperation&) = delete;
- WriteSameLogOperation &operator=(const WriteSameLogOperation&) = delete;
- std::ostream &format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const WriteSameLogOperation &op);
-};
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-#endif // CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "ReadRequest.h"
-
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::rwl::ReadRequest: " << this << " " \
- << __func__ << ": "
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-void C_ReadRequest::finish(int r) {
- ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << dendl;
- int hits = 0;
- int misses = 0;
- int hit_bytes = 0;
- int miss_bytes = 0;
- if (r >= 0) {
- /*
- * At this point the miss read has completed. We'll iterate through
- * read_extents and produce *m_out_bl by assembling pieces of miss_bl
- * and the individual hit extent bufs in the read extents that represent
- * hits.
- */
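-    /* Assumption: read_extents and the miss reads concatenated into
-     * miss_bl are in the same ascending image-offset order, which is what
-     * lets a single miss_bl_offset cursor consume miss_bl while walking
-     * read_extents. */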
- uint64_t miss_bl_offset = 0;
- for (auto &extent : read_extents) {
- if (extent.m_bl.length()) {
- /* This was a hit */
- ceph_assert(extent.second == extent.m_bl.length());
- ++hits;
- hit_bytes += extent.second;
- m_out_bl->claim_append(extent.m_bl);
- } else {
- /* This was a miss. */
- ++misses;
- miss_bytes += extent.second;
- bufferlist miss_extent_bl;
- miss_extent_bl.substr_of(miss_bl, miss_bl_offset, extent.second);
- /* Add this read miss bufferlist to the output bufferlist */
- m_out_bl->claim_append(miss_extent_bl);
- /* Consume these bytes in the read miss bufferlist */
- miss_bl_offset += extent.second;
- }
- }
- }
- ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << " bl=" << *m_out_bl << dendl;
- utime_t now = ceph_clock_now();
- ceph_assert((int)m_out_bl->length() == hit_bytes + miss_bytes);
- m_on_finish->complete(r);
- m_perfcounter->inc(l_librbd_rwl_rd_bytes, hit_bytes + miss_bytes);
- m_perfcounter->inc(l_librbd_rwl_rd_hit_bytes, hit_bytes);
- m_perfcounter->tinc(l_librbd_rwl_rd_latency, now - m_arrived_time);
- if (!misses) {
- m_perfcounter->inc(l_librbd_rwl_rd_hit_req, 1);
- m_perfcounter->tinc(l_librbd_rwl_rd_hit_latency, now - m_arrived_time);
- } else {
- if (hits) {
- m_perfcounter->inc(l_librbd_rwl_rd_part_hit_req, 1);
- }
- }
-}
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H
-#define CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H
-
-#include "include/Context.h"
-#include "librbd/cache/rwl/Types.h"
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-typedef std::vector<rwl::ImageExtentBuf> ImageExtentBufs;
-
-class C_ReadRequest : public Context {
-public:
- io::Extents miss_extents; // move back to caller
- ImageExtentBufs read_extents;
- bufferlist miss_bl;
-
- C_ReadRequest(CephContext *cct, utime_t arrived, PerfCounters *perfcounter, bufferlist *out_bl, Context *on_finish)
- : m_cct(cct), m_on_finish(on_finish), m_out_bl(out_bl),
- m_arrived_time(arrived), m_perfcounter(perfcounter) {}
- ~C_ReadRequest() {}
-
- void finish(int r) override;
-
- const char *get_name() const {
- return "C_ReadRequest";
- }
-
-private:
- CephContext *m_cct;
- Context *m_on_finish;
- bufferlist *m_out_bl;
- utime_t m_arrived_time;
- PerfCounters *m_perfcounter;
-};
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-#endif // CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "Request.h"
-#include "librbd/BlockGuard.h"
-#include "librbd/cache/rwl/LogEntry.h"
-#include "librbd/cache/ReplicatedWriteLog.h"
-
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::rwl::Request: " << this << " " \
- << __func__ << ": "
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-template <typename T>
-C_BlockIORequest<T>::C_BlockIORequest(T &rwl, const utime_t arrived, io::Extents &&extents,
- bufferlist&& bl, const int fadvise_flags, Context *user_req)
- : rwl(rwl), image_extents(std::move(extents)),
- bl(std::move(bl)), fadvise_flags(fadvise_flags),
- user_req(user_req), image_extents_summary(image_extents), m_arrived_time(arrived) {
- ldout(rwl.get_context(), 99) << this << dendl;
-}
-
-template <typename T>
-C_BlockIORequest<T>::~C_BlockIORequest() {
- ldout(rwl.get_context(), 99) << this << dendl;
- ceph_assert(m_cell_released || !m_cell);
-}
-
-template <typename T>
-std::ostream &operator<<(std::ostream &os,
- const C_BlockIORequest<T> &req) {
- os << "image_extents=[" << req.image_extents << "], "
- << "image_extents_summary=[" << req.image_extents_summary << "], "
- << "bl=" << req.bl << ", "
- << "user_req=" << req.user_req << ", "
- << "m_user_req_completed=" << req.m_user_req_completed << ", "
- << "m_deferred=" << req.m_deferred << ", "
- << "detained=" << req.detained << ", "
- << "waited_lanes=" << req.waited_lanes << ", "
- << "waited_entries=" << req.waited_entries << ", "
- << "waited_buffers=" << req.waited_buffers << "";
- return os;
-}
-
-template <typename T>
-void C_BlockIORequest<T>::set_cell(BlockGuardCell *cell) {
- ldout(rwl.get_context(), 20) << this << " cell=" << cell << dendl;
- ceph_assert(cell);
- ceph_assert(!m_cell);
- m_cell = cell;
-}
-
-template <typename T>
-BlockGuardCell *C_BlockIORequest<T>::get_cell(void) {
- ldout(rwl.get_context(), 20) << this << " cell=" << m_cell << dendl;
- return m_cell;
-}
-
-template <typename T>
-void C_BlockIORequest<T>::release_cell() {
- ldout(rwl.get_context(), 20) << this << " cell=" << m_cell << dendl;
- ceph_assert(m_cell);
- bool initial = false;
- if (m_cell_released.compare_exchange_strong(initial, true)) {
- rwl.release_guarded_request(m_cell);
- } else {
- ldout(rwl.get_context(), 5) << "cell " << m_cell << " already released for " << this << dendl;
- }
-}
-
-template <typename T>
-void C_BlockIORequest<T>::complete_user_request(int r) {
- bool initial = false;
- if (m_user_req_completed.compare_exchange_strong(initial, true)) {
- ldout(rwl.get_context(), 15) << this << " completing user req" << dendl;
- m_user_req_completed_time = ceph_clock_now();
- user_req->complete(r);
-    // Clear user_req; complete() deletes the context
- user_req = nullptr;
- } else {
- ldout(rwl.get_context(), 20) << this << " user req already completed" << dendl;
- }
-}
-
-template <typename T>
-void C_BlockIORequest<T>::finish(int r) {
- ldout(rwl.get_context(), 20) << this << dendl;
-
- complete_user_request(r);
- bool initial = false;
- if (m_finish_called.compare_exchange_strong(initial, true)) {
- ldout(rwl.get_context(), 15) << this << " finishing" << dendl;
- finish_req(0);
- } else {
- ldout(rwl.get_context(), 20) << this << " already finished" << dendl;
- ceph_assert(0);
- }
-}
-
-template <typename T>
-void C_BlockIORequest<T>::deferred() {
- bool initial = false;
- if (m_deferred.compare_exchange_strong(initial, true)) {
- deferred_handler();
- }
-}
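-
-/* m_cell_released, m_user_req_completed, m_finish_called and m_deferred
- * are all flipped via compare_exchange_strong above, which makes cell
- * release, user completion, finish and deferral one-shot transitions even
- * if callers race or repeat. */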
-
-template <typename T>
-C_WriteRequest<T>::C_WriteRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents,
- bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
- PerfCounters *perfcounter, Context *user_req)
- : C_BlockIORequest<T>(rwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, user_req),
- m_perfcounter(perfcounter), m_lock(lock) {
- ldout(rwl.get_context(), 99) << this << dendl;
-}
-
-template <typename T>
-C_WriteRequest<T>::~C_WriteRequest() {
- ldout(rwl.get_context(), 99) << this << dendl;
-}
-
-template <typename T>
-std::ostream &operator<<(std::ostream &os,
- const C_WriteRequest<T> &req) {
- os << (C_BlockIORequest<T>&)req
- << " m_resources.allocated=" << req.m_resources.allocated;
- if (req.op_set) {
- os << "op_set=" << *req.op_set;
- }
- return os;
-}
-
-template <typename T>
-void C_WriteRequest<T>::blockguard_acquired(GuardedRequestFunctionContext &guard_ctx) {
- ldout(rwl.get_context(), 20) << __func__ << " write_req=" << this << " cell=" << guard_ctx.cell << dendl;
-
- ceph_assert(guard_ctx.cell);
- this->detained = guard_ctx.state.detained; /* overlapped */
- this->m_queued = guard_ctx.state.queued; /* queued behind at least one barrier */
- this->set_cell(guard_ctx.cell);
-}
-
-template <typename T>
-void C_WriteRequest<T>::finish_req(int r) {
- ldout(rwl.get_context(), 15) << "write_req=" << this << " cell=" << this->get_cell() << dendl;
-
- /* Completed to caller by here (in finish(), which calls this) */
- utime_t now = ceph_clock_now();
- rwl.release_write_lanes(this);
- ceph_assert(m_resources.allocated);
- m_resources.allocated = false;
- this->release_cell(); /* TODO: Consider doing this in appending state */
- update_req_stats(now);
-}
-
-template <typename T>
-void C_WriteRequest<T>::setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) {
-
- ceph_assert(!m_resources.allocated);
-
- auto image_extents_size = this->image_extents.size();
- m_resources.buffers.reserve(image_extents_size);
-
- bytes_cached = 0;
- bytes_allocated = 0;
- number_lanes = image_extents_size;
- number_log_entries = image_extents_size;
- number_unpublished_reserves = image_extents_size;
-
- for (auto &extent : this->image_extents) {
- m_resources.buffers.emplace_back();
- struct WriteBufferAllocation &buffer = m_resources.buffers.back();
- buffer.allocation_size = MIN_WRITE_ALLOC_SIZE;
- buffer.allocated = false;
- bytes_cached += extent.second;
- if (extent.second > buffer.allocation_size) {
- buffer.allocation_size = extent.second;
- }
- bytes_allocated += buffer.allocation_size;
- }
- bytes_dirtied = bytes_cached;
-}
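-
-/* Illustrative sizing: for two extents of 512 bytes and 64 KiB,
- * bytes_cached and bytes_dirtied are 512 + 64 KiB, while bytes_allocated
- * rounds each buffer up to max(MIN_WRITE_ALLOC_SIZE, extent length);
- * lanes, log entries and unpublished reserves are all one per extent. */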
-
-template <typename T>
-std::shared_ptr<WriteLogOperation> C_WriteRequest<T>::create_operation(uint64_t offset, uint64_t len) {
- return std::make_shared<WriteLogOperation>(*op_set, offset, len, rwl.get_context());
-}
-
-template <typename T>
-void C_WriteRequest<T>::setup_log_operations(DeferredContexts &on_exit) {
- GenericWriteLogEntries log_entries;
- {
- std::lock_guard locker(m_lock);
- std::shared_ptr<SyncPoint> current_sync_point = rwl.get_current_sync_point();
- if ((!rwl.get_persist_on_flush() && current_sync_point->log_entry->writes_completed) ||
- (current_sync_point->log_entry->writes > MAX_WRITES_PER_SYNC_POINT) ||
- (current_sync_point->log_entry->bytes > MAX_BYTES_PER_SYNC_POINT)) {
- /* Create new sync point and persist the previous one. This sequenced
- * write will bear a sync gen number shared with no already completed
- * writes. A group of sequenced writes may be safely flushed concurrently
- * if they all arrived before any of them completed. We'll insert one on
- * an aio_flush() from the application. Here we're inserting one to cap
- * the number of bytes and writes per sync point. When the application is
- * not issuing flushes, we insert sync points to record some observed
- * write concurrency information that enables us to safely issue >1 flush
- * write (for writes observed here to have been in flight simultaneously)
- * at a time in persist-on-write mode.
- */
- rwl.flush_new_sync_point(nullptr, on_exit);
- current_sync_point = rwl.get_current_sync_point();
- }
- uint64_t current_sync_gen = rwl.get_current_sync_gen();
- op_set =
- make_unique<WriteLogOperationSet>(this->m_dispatched_time,
- m_perfcounter,
- current_sync_point,
- rwl.get_persist_on_flush(),
- rwl.get_context(), this);
- ldout(rwl.get_context(), 20) << "write_req=" << *this << " op_set=" << op_set.get() << dendl;
- ceph_assert(m_resources.allocated);
- /* op_set->operations initialized differently for plain write or write same */
- auto allocation = m_resources.buffers.begin();
- uint64_t buffer_offset = 0;
- for (auto &extent : this->image_extents) {
- /* operation->on_write_persist connected to m_prior_log_entries_persisted Gather */
- auto operation = this->create_operation(extent.first, extent.second);
- this->op_set->operations.emplace_back(operation);
-
- /* A WS is also a write */
- ldout(rwl.get_context(), 20) << "write_req=" << *this << " op_set=" << op_set.get()
- << " operation=" << operation << dendl;
- log_entries.emplace_back(operation->log_entry);
- if (!op_set->persist_on_flush) {
- rwl.inc_last_op_sequence_num();
- }
- operation->init(true, allocation, current_sync_gen,
- rwl.get_last_op_sequence_num(), this->bl, buffer_offset, op_set->persist_on_flush);
- buffer_offset += operation->log_entry->write_bytes();
- ldout(rwl.get_context(), 20) << "operation=[" << *operation << "]" << dendl;
- allocation++;
- }
- }
- /* All extent ops subs created */
- op_set->extent_ops_appending->activate();
- op_set->extent_ops_persist->activate();
-
- /* Write data */
- for (auto &operation : op_set->operations) {
- operation->copy_bl_to_pmem_buffer();
- }
- rwl.add_into_log_map(log_entries);
-}
-
-template <typename T>
-bool C_WriteRequest<T>::append_write_request(std::shared_ptr<SyncPoint> sync_point) {
- std::lock_guard locker(m_lock);
- auto write_req_sp = this;
- if (sync_point->earlier_sync_point) {
- Context *schedule_append_ctx = new LambdaContext([this, write_req_sp](int r) {
- write_req_sp->schedule_append();
- });
- sync_point->earlier_sync_point->on_sync_point_appending.push_back(schedule_append_ctx);
- return true;
- }
- return false;
-}
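-
-/* If an earlier sync point is still in flight, the append of this write is
- * parked on that sync point's on_sync_point_appending list and scheduled
- * from there; returning true tells dispatch() the append was deferred. */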
-
-template <typename T>
-void C_WriteRequest<T>::schedule_append() {
- ceph_assert(++m_appended == 1);
- if (m_do_early_flush) {
- /* This caller is waiting for persist, so we'll use their thread to
- * expedite it */
- rwl.flush_pmem_buffer(this->op_set->operations);
- rwl.schedule_append(this->op_set->operations);
- } else {
- /* This is probably not still the caller's thread, so do the payload
- * flushing/replicating later. */
- rwl.schedule_flush_and_append(this->op_set->operations);
- }
-}
-
-/**
- * Attempts to allocate log resources for a write. Returns true if successful.
- *
- * Resources include 1 lane per extent, 1 log entry per extent, and the payload
- * data space for each extent.
- *
- * Lanes are released after the write persists via release_write_lanes()
- */
-template <typename T>
-bool C_WriteRequest<T>::alloc_resources() {
- this->allocated_time = ceph_clock_now();
- return rwl.alloc_resources(this);
-}
-
-/**
- * Takes custody of write_req. Resources must already be allocated.
- *
- * Locking:
- * Acquires lock
- */
-template <typename T>
-void C_WriteRequest<T>::dispatch()
-{
- CephContext *cct = rwl.get_context();
- DeferredContexts on_exit;
- utime_t now = ceph_clock_now();
- this->m_dispatched_time = now;
-
- ldout(cct, 15) << "write_req=" << this << " cell=" << this->get_cell() << dendl;
- this->setup_log_operations(on_exit);
-
- bool append_deferred = false;
- if (!op_set->persist_on_flush &&
- append_write_request(op_set->sync_point)) {
- /* In persist-on-write mode, we defer the append of this write until the
- * previous sync point is appending (meaning all the writes before it are
- * persisted and that previous sync point can now appear in the
- * log). Since we insert sync points in persist-on-write mode when writes
- * have already completed to the current sync point, this limits us to
- * one inserted sync point in flight at a time, and gives the next
- * inserted sync point some time to accumulate a few writes if they
- * arrive soon. Without this we can insert an absurd number of sync
- * points, each with one or two writes. That uses a lot of log entries,
- * and limits flushing to very few writes at a time. */
- m_do_early_flush = false;
- append_deferred = true;
- } else {
- /* The prior sync point is done, so we'll schedule append here. If this is
- * persist-on-write, and probably still the caller's thread, we'll use this
- * caller's thread to perform the persist & replication of the payload
- * buffer. */
- m_do_early_flush =
- !(this->detained || this->m_queued || this->m_deferred || op_set->persist_on_flush);
- }
- if (!append_deferred) {
- this->schedule_append();
- }
-}
-
-template <typename T>
-C_FlushRequest<T>::C_FlushRequest(T &rwl, const utime_t arrived,
- io::Extents &&image_extents,
- bufferlist&& bl, const int fadvise_flags,
- ceph::mutex &lock, PerfCounters *perfcounter,
- Context *user_req)
- : C_BlockIORequest<T>(rwl, arrived, std::move(image_extents), std::move(bl),
- fadvise_flags, user_req),
- m_lock(lock), m_perfcounter(perfcounter) {
- ldout(rwl.get_context(), 20) << this << dendl;
-}
-
-template <typename T>
-void C_FlushRequest<T>::finish_req(int r) {
- ldout(rwl.get_context(), 20) << "flush_req=" << this
- << " cell=" << this->get_cell() << dendl;
- /* Block guard already released */
- ceph_assert(!this->get_cell());
-
- /* Completed to caller by here */
- utime_t now = ceph_clock_now();
- m_perfcounter->tinc(l_librbd_rwl_aio_flush_latency, now - this->m_arrived_time);
-}
-
-template <typename T>
-bool C_FlushRequest<T>::alloc_resources() {
- ldout(rwl.get_context(), 20) << "req type=" << get_name() << " "
- << "req=[" << *this << "]" << dendl;
- return rwl.alloc_resources(this);
-}
-
-template <typename T>
-void C_FlushRequest<T>::dispatch() {
- utime_t now = ceph_clock_now();
- ldout(rwl.get_context(), 20) << "req type=" << get_name() << " "
- << "req=[" << *this << "]" << dendl;
- ceph_assert(this->m_resources.allocated);
- this->m_dispatched_time = now;
-
- op = std::make_shared<SyncPointLogOperation>(m_lock,
- to_append,
- now,
- m_perfcounter,
- rwl.get_context());
-
- m_perfcounter->inc(l_librbd_rwl_log_ops, 1);
- rwl.schedule_append(op);
-}
-
-template <typename T>
-void C_FlushRequest<T>::setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) {
- number_log_entries = 1;
-}
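-
-/* A flush consumes only a single log entry slot; it reserves no write
- * lanes and no payload buffers, so the other out-parameters are left
- * untouched. */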
-
-template <typename T>
-std::ostream &operator<<(std::ostream &os,
- const C_FlushRequest<T> &req) {
- os << (C_BlockIORequest<T>&)req
- << " m_resources.allocated=" << req.m_resources.allocated;
- return os;
-}
-
-template <typename T>
-C_DiscardRequest<T>::C_DiscardRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents,
- uint32_t discard_granularity_bytes, ceph::mutex &lock,
- PerfCounters *perfcounter, Context *user_req)
- : C_BlockIORequest<T>(rwl, arrived, std::move(image_extents), bufferlist(), 0, user_req),
- m_discard_granularity_bytes(discard_granularity_bytes),
- m_lock(lock),
- m_perfcounter(perfcounter) {
- ldout(rwl.get_context(), 20) << this << dendl;
-}
-
-template <typename T>
-C_DiscardRequest<T>::~C_DiscardRequest() {
- ldout(rwl.get_context(), 20) << this << dendl;
-}
-
-template <typename T>
-bool C_DiscardRequest<T>::alloc_resources() {
- ldout(rwl.get_context(), 20) << "req type=" << get_name() << " "
- << "req=[" << *this << "]" << dendl;
- return rwl.alloc_resources(this);
-}
-
-template <typename T>
-void C_DiscardRequest<T>::setup_log_operations() {
- std::lock_guard locker(m_lock);
- GenericWriteLogEntries log_entries;
- for (auto &extent : this->image_extents) {
- op = std::make_shared<DiscardLogOperation>(rwl.get_current_sync_point(),
- extent.first,
- extent.second,
- m_discard_granularity_bytes,
- this->m_dispatched_time,
- m_perfcounter,
- rwl.get_context());
- log_entries.emplace_back(op->log_entry);
- break;
- }
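-  /* Only the first image extent is recorded; the loop above breaks
-   * unconditionally, so a discard carries exactly one extent here. */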
- uint64_t current_sync_gen = rwl.get_current_sync_gen();
- bool persist_on_flush = rwl.get_persist_on_flush();
- if (!persist_on_flush) {
- rwl.inc_last_op_sequence_num();
- }
- auto discard_req = this;
- Context *on_write_persist = new LambdaContext(
- [this, discard_req](int r) {
- ldout(rwl.get_context(), 20) << "discard_req=" << discard_req
- << " cell=" << discard_req->get_cell() << dendl;
- ceph_assert(discard_req->get_cell());
- discard_req->complete_user_request(r);
- discard_req->release_cell();
- });
- op->init(current_sync_gen, persist_on_flush, rwl.get_last_op_sequence_num(), on_write_persist);
- rwl.add_into_log_map(log_entries);
-}
-
-template <typename T>
-void C_DiscardRequest<T>::dispatch() {
- utime_t now = ceph_clock_now();
- ldout(rwl.get_context(), 20) << "req type=" << get_name() << " "
- << "req=[" << *this << "]" << dendl;
- ceph_assert(this->m_resources.allocated);
- this->m_dispatched_time = now;
- setup_log_operations();
- m_perfcounter->inc(l_librbd_rwl_log_ops, 1);
- rwl.schedule_append(op);
-}
-
-template <typename T>
-void C_DiscardRequest<T>::setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) {
- number_log_entries = 1;
- /* No bytes are allocated for a discard, but we count the discarded bytes
- * as dirty. This means it's possible to have more bytes dirty than
- * there are bytes cached or allocated. */
- for (auto &extent : this->image_extents) {
- bytes_dirtied = extent.second;
- break;
- }
-}
-
-template <typename T>
-void C_DiscardRequest<T>::blockguard_acquired(GuardedRequestFunctionContext &guard_ctx) {
- ldout(rwl.get_context(), 20) << " cell=" << guard_ctx.cell << dendl;
-
- ceph_assert(guard_ctx.cell);
- this->detained = guard_ctx.state.detained; /* overlapped */
- this->set_cell(guard_ctx.cell);
-}
-
-template <typename T>
-std::ostream &operator<<(std::ostream &os,
- const C_DiscardRequest<T> &req) {
- os << (C_BlockIORequest<T>&)req;
- if (req.op) {
- os << " op=[" << *req.op << "]";
- } else {
- os << " op=nullptr";
- }
- return os;
-}
-
-template <typename T>
-C_WriteSameRequest<T>::C_WriteSameRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents,
- bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
- PerfCounters *perfcounter, Context *user_req)
- : C_WriteRequest<T>(rwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, lock, perfcounter, user_req) {
- ldout(rwl.get_context(), 20) << this << dendl;
-}
-
-template <typename T>
-C_WriteSameRequest<T>::~C_WriteSameRequest() {
- ldout(rwl.get_context(), 20) << this << dendl;
-}
-
-template <typename T>
-void C_WriteSameRequest<T>::update_req_stats(utime_t &now) {
- /* Write same stats excluded from most write stats
- * because the read phase will make them look like slow writes in
- * those histograms. */
- ldout(rwl.get_context(), 20) << this << dendl;
- utime_t comp_latency = now - this->m_arrived_time;
- this->m_perfcounter->tinc(l_librbd_rwl_ws_latency, comp_latency);
-}
-
-/* Write sames will allocate one buffer, the size of the repeating pattern */
-template <typename T>
-void C_WriteSameRequest<T>::setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) {
- ldout(rwl.get_context(), 20) << this << dendl;
- ceph_assert(this->image_extents.size() == 1);
- bytes_dirtied += this->image_extents[0].second;
- auto pattern_length = this->bl.length();
- this->m_resources.buffers.emplace_back();
- struct WriteBufferAllocation &buffer = this->m_resources.buffers.back();
- buffer.allocation_size = MIN_WRITE_ALLOC_SIZE;
- buffer.allocated = false;
- bytes_cached += pattern_length;
- if (pattern_length > buffer.allocation_size) {
- buffer.allocation_size = pattern_length;
- }
- bytes_allocated += buffer.allocation_size;
-}
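-
-/* Illustrative: a write-same of 64 KiB with a 512-byte pattern dirties the
- * full 64 KiB but caches only the 512-byte pattern, allocating
- * max(MIN_WRITE_ALLOC_SIZE, pattern length) bytes for its single buffer. */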
-
-template <typename T>
-std::shared_ptr<WriteLogOperation> C_WriteSameRequest<T>::create_operation(uint64_t offset, uint64_t len) {
- ceph_assert(this->image_extents.size() == 1);
- return std::make_shared<WriteSameLogOperation>(*this->op_set.get(), offset, len,
- this->bl.length(), rwl.get_context());
-}
-
-template <typename T>
-std::ostream &operator<<(std::ostream &os,
- const C_WriteSameRequest<T> &req) {
- os << (C_WriteRequest<T>&)req;
- return os;
-}
-
-template <typename T>
-C_CompAndWriteRequest<T>::C_CompAndWriteRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents,
- bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
- int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
- Context *user_req)
- : C_WriteRequest<T>(rwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, lock, perfcounter, user_req),
- mismatch_offset(mismatch_offset), cmp_bl(std::move(cmp_bl)) {
- ldout(rwl.get_context(), 20) << dendl;
-}
-
-template <typename T>
-C_CompAndWriteRequest<T>::~C_CompAndWriteRequest() {
- ldout(rwl.get_context(), 20) << dendl;
-}
-
-template <typename T>
-void C_CompAndWriteRequest<T>::finish_req(int r) {
- if (compare_succeeded) {
- C_WriteRequest<T>::finish_req(r);
- } else {
- utime_t now = ceph_clock_now();
- update_req_stats(now);
- }
-}
-
-template <typename T>
-void C_CompAndWriteRequest<T>::update_req_stats(utime_t &now) {
- /* Compare-and-write stats. Compare-and-write excluded from most write
- * stats because the read phase will make them look like slow writes in
- * those histograms. */
- if (!compare_succeeded) {
- this->m_perfcounter->inc(l_librbd_rwl_cmp_fails, 1);
- }
- utime_t comp_latency = now - this->m_arrived_time;
- this->m_perfcounter->tinc(l_librbd_rwl_cmp_latency, comp_latency);
-}
-
-template <typename T>
-std::ostream &operator<<(std::ostream &os,
- const C_CompAndWriteRequest<T> &req) {
- os << (C_WriteRequest<T>&)req
- << "cmp_bl=" << req.cmp_bl << ", "
- << "read_bl=" << req.read_bl << ", "
- << "compare_succeeded=" << req.compare_succeeded << ", "
- << "mismatch_offset=" << req.mismatch_offset;
- return os;
-}
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-template class librbd::cache::rwl::C_BlockIORequest<librbd::cache::AbstractWriteLog<librbd::ImageCtx> >;
-template class librbd::cache::rwl::C_WriteRequest<librbd::cache::AbstractWriteLog<librbd::ImageCtx> >;
-template class librbd::cache::rwl::C_FlushRequest<librbd::cache::AbstractWriteLog<librbd::ImageCtx> >;
-template class librbd::cache::rwl::C_DiscardRequest<librbd::cache::AbstractWriteLog<librbd::ImageCtx> >;
-template class librbd::cache::rwl::C_WriteSameRequest<librbd::cache::AbstractWriteLog<librbd::ImageCtx> >;
-template class librbd::cache::rwl::C_CompAndWriteRequest<librbd::cache::AbstractWriteLog<librbd::ImageCtx> >;
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_RWL_REQUEST_H
-#define CEPH_LIBRBD_CACHE_RWL_REQUEST_H
-
-#include "include/Context.h"
-#include "librbd/cache/ImageCache.h"
-#include "librbd/cache/rwl/Types.h"
-#include "librbd/cache/rwl/LogOperation.h"
-
-namespace librbd {
-class BlockGuardCell;
-
-namespace cache {
-namespace rwl {
-
-class GuardedRequestFunctionContext;
-
-struct WriteRequestResources {
- bool allocated = false;
- std::vector<WriteBufferAllocation> buffers;
-};
-
-/**
- * A request that can be deferred in a BlockGuard to sequence
- * overlapping operations.
- * This is the custodian of the BlockGuard cell for this IO, and of the
- * state information about the progress of this IO. This object lives
- * until the IO is persisted in all (live) log replicas. The user request
- * may be completed from here before the IO persists.
- */
-template <typename T>
-class C_BlockIORequest : public Context {
-public:
- T &rwl;
- io::Extents image_extents;
- bufferlist bl;
- int fadvise_flags;
- Context *user_req; /* User write request */
- ExtentsSummary<io::Extents> image_extents_summary;
- bool detained = false; /* Detained in blockguard (overlapped with a prior IO) */
- utime_t allocated_time; /* When allocation began */
- bool waited_lanes = false; /* This IO waited for free persist/replicate lanes */
- bool waited_entries = false; /* This IO waited for free log entries */
- bool waited_buffers = false; /* This IO waited for data buffers (pmemobj_reserve() failed) */
-
- C_BlockIORequest(T &rwl, const utime_t arrived, io::Extents &&extents,
- bufferlist&& bl, const int fadvise_flags, Context *user_req);
- ~C_BlockIORequest() override;
- C_BlockIORequest(const C_BlockIORequest&) = delete;
- C_BlockIORequest &operator=(const C_BlockIORequest&) = delete;
-
- void set_cell(BlockGuardCell *cell);
- BlockGuardCell *get_cell(void);
- void release_cell();
-
- void complete_user_request(int r);
- void finish(int r);
- virtual void finish_req(int r) = 0;
-
- virtual bool alloc_resources() = 0;
-
- void deferred();
-
- virtual void deferred_handler() = 0;
-
- virtual void dispatch() = 0;
-
- virtual const char *get_name() const {
- return "C_BlockIORequest";
- }
- uint64_t get_image_extents_size() {
- return image_extents.size();
- }
- void set_io_waited_for_lanes(bool waited) {
- waited_lanes = waited;
- }
- void set_io_waited_for_entries(bool waited) {
- waited_entries = waited;
- }
- void set_io_waited_for_buffers(bool waited) {
- waited_buffers = waited;
- }
- bool has_io_waited_for_buffers() {
- return waited_buffers;
- }
- std::vector<WriteBufferAllocation>& get_resources_buffers() {
- return m_resources.buffers;
- }
-
- void set_allocated(bool allocated) {
- if (allocated) {
- m_resources.allocated = true;
- } else {
- m_resources.buffers.clear();
- }
- }
-
- virtual void setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) {};
-
-protected:
- utime_t m_arrived_time;
- utime_t m_dispatched_time; /* When dispatch began */
- utime_t m_user_req_completed_time;
- std::atomic<bool> m_deferred = {false}; /* Deferred because this or a prior IO had to wait for write resources */
- WriteRequestResources m_resources;
-
-private:
- std::atomic<bool> m_user_req_completed = {false};
- std::atomic<bool> m_finish_called = {false};
- std::atomic<bool> m_cell_released = {false};
- BlockGuardCell* m_cell = nullptr;
-
- template <typename U>
- friend std::ostream &operator<<(std::ostream &os,
- const C_BlockIORequest<U> &req);
-};
-
-/**
- * This is the custodian of the BlockGuard cell for this write. Block
- * guard is not released until the write persists everywhere (this is
- * how we guarantee to each log replica that they will never see
- * overlapping writes).
- */
-template <typename T>
-class C_WriteRequest : public C_BlockIORequest<T> {
-public:
- using C_BlockIORequest<T>::rwl;
- unique_ptr<WriteLogOperationSet> op_set = nullptr;
-
- C_WriteRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents,
- bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
- PerfCounters *perfcounter, Context *user_req);
-
- ~C_WriteRequest() override;
-
- void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx);
-
- /* Common finish to plain write and compare-and-write (if it writes) */
- void finish_req(int r) override;
-
- /* Compare and write will override this */
- virtual void update_req_stats(utime_t &now) {
- // TODO: Add in later PRs
- }
- bool alloc_resources() override;
-
- void deferred_handler() override { }
-
- void dispatch() override;
-
- virtual std::shared_ptr<WriteLogOperation> create_operation(uint64_t offset, uint64_t len);
-
- virtual void setup_log_operations(DeferredContexts &on_exit);
-
- bool append_write_request(std::shared_ptr<SyncPoint> sync_point);
-
- virtual void schedule_append();
-
- const char *get_name() const override {
- return "C_WriteRequest";
- }
-
-protected:
- using C_BlockIORequest<T>::m_resources;
- PerfCounters *m_perfcounter = nullptr;
- /* Plain writes will allocate one buffer per request extent */
- void setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) override;
-
-private:
- bool m_do_early_flush = false;
- std::atomic<int> m_appended = {0};
- bool m_queued = false;
- ceph::mutex &m_lock;
- template <typename U>
- friend std::ostream &operator<<(std::ostream &os,
- const C_WriteRequest<U> &req);
-};
-
-/**
- * This is the custodian of the BlockGuard cell for this
- * aio_flush. Block guard is released as soon as the new
- * sync point (if required) is created. Subsequent IOs can
- * proceed while this flush waits for prior IOs to complete
- * and any required sync points to be persisted.
- */
-template <typename T>
-class C_FlushRequest : public C_BlockIORequest<T> {
-public:
- using C_BlockIORequest<T>::rwl;
- bool internal = false;
- std::shared_ptr<SyncPoint> to_append;
-
- C_FlushRequest(T &rwl, const utime_t arrived,
- io::Extents &&image_extents,
- bufferlist&& bl, const int fadvise_flags,
- ceph::mutex &lock, PerfCounters *perfcounter,
- Context *user_req);
-
- ~C_FlushRequest() override {}
-
- bool alloc_resources() override;
-
- void dispatch() override;
-
- const char *get_name() const override {
- return "C_FlushRequest";
- }
-
- void setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) override;
-private:
- std::shared_ptr<SyncPointLogOperation> op;
- ceph::mutex &m_lock;
- PerfCounters *m_perfcounter = nullptr;
-
- void finish_req(int r) override;
- void deferred_handler() override {
- m_perfcounter->inc(l_librbd_rwl_aio_flush_def, 1);
- }
-
- template <typename U>
- friend std::ostream &operator<<(std::ostream &os,
- const C_FlushRequest<U> &req);
-};
-
-/**
- * This is the custodian of the BlockGuard cell for this discard. As in the
- * case of write, the block guard is not released until the discard persists
- * everywhere.
- */
-template <typename T>
-class C_DiscardRequest : public C_BlockIORequest<T> {
-public:
- using C_BlockIORequest<T>::rwl;
- std::shared_ptr<DiscardLogOperation> op;
-
- C_DiscardRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents,
- uint32_t discard_granularity_bytes, ceph::mutex &lock,
- PerfCounters *perfcounter, Context *user_req);
-
- ~C_DiscardRequest() override;
- void finish_req(int r) override {}
-
- bool alloc_resources() override;
-
- void deferred_handler() override { }
-
- void setup_log_operations();
-
- void dispatch() override;
-
- void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx);
-
- const char *get_name() const override {
- return "C_DiscardRequest";
- }
- void setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) override;
-private:
- uint32_t m_discard_granularity_bytes;
- ceph::mutex &m_lock;
- PerfCounters *m_perfcounter = nullptr;
- template <typename U>
- friend std::ostream &operator<<(std::ostream &os,
- const C_DiscardRequest<U> &req);
-};
-
-/**
- * This is the custodian of the BlockGuard cell for this write same.
- *
- * A writesame allocates and persists a data buffer like a write, but the
- * data buffer is usually much shorter than the extent being written.
- */
-template <typename T>
-class C_WriteSameRequest : public C_WriteRequest<T> {
-public:
- using C_BlockIORequest<T>::rwl;
- C_WriteSameRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents,
- bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
- PerfCounters *perfcounter, Context *user_req);
-
- ~C_WriteSameRequest() override;
-
- void update_req_stats(utime_t &now) override;
-
- void setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) override;
-
- std::shared_ptr<WriteLogOperation> create_operation(uint64_t offset, uint64_t len) override;
-
- const char *get_name() const override {
- return "C_WriteSameRequest";
- }
-
- template<typename U>
- friend std::ostream &operator<<(std::ostream &os,
- const C_WriteSameRequest<U> &req);
-};
-
-/**
- * This is the custodian of the BlockGuard cell for this compare and write. The
- * block guard is acquired before the read begins to guarantee atomicity of this
- * operation. If this results in a write, the block guard will be released
- * when the write completes to all replicas.
- */
-template <typename T>
-class C_CompAndWriteRequest : public C_WriteRequest<T> {
-public:
- using C_BlockIORequest<T>::rwl;
- bool compare_succeeded = false;
- uint64_t *mismatch_offset;
- bufferlist cmp_bl;
- bufferlist read_bl;
- C_CompAndWriteRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents,
- bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
- int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
- Context *user_req);
- ~C_CompAndWriteRequest();
-
- void finish_req(int r) override;
-
- void update_req_stats(utime_t &now) override;
-
-  /*
-   * Compare and write doesn't implement alloc_resources(), deferred_handler(),
-   * or dispatch(). It reuses the C_WriteRequest implementations, which only
-   * come into play if the compare phase succeeds and a write is actually
-   * performed.
-   */
-
- const char *get_name() const override {
- return "C_CompAndWriteRequest";
- }
- template <typename U>
- friend std::ostream &operator<<(std::ostream &os,
- const C_CompAndWriteRequest<U> &req);
-};
-
-struct BlockGuardReqState {
- bool barrier = false; /* This is a barrier request */
- bool current_barrier = false; /* This is the currently active barrier */
- bool detained = false;
- bool queued = false; /* Queued for barrier */
- friend std::ostream &operator<<(std::ostream &os,
- const BlockGuardReqState &r) {
- os << "barrier=" << r.barrier << ", "
- << "current_barrier=" << r.current_barrier << ", "
- << "detained=" << r.detained << ", "
- << "queued=" << r.queued;
- return os;
- }
-};
-
-class GuardedRequestFunctionContext : public Context {
-public:
- BlockGuardCell *cell = nullptr;
- BlockGuardReqState state;
-  GuardedRequestFunctionContext(boost::function<void(GuardedRequestFunctionContext&)> &&callback)
-    : m_callback(std::move(callback)) { }
-  ~GuardedRequestFunctionContext() override { }
- GuardedRequestFunctionContext(const GuardedRequestFunctionContext&) = delete;
- GuardedRequestFunctionContext &operator=(const GuardedRequestFunctionContext&) = delete;
-
-private:
- boost::function<void(GuardedRequestFunctionContext&)> m_callback;
- void finish(int r) override {
- ceph_assert(cell);
- m_callback(*this);
- }
-};
-
-class GuardedRequest {
-public:
- const BlockExtent block_extent;
- GuardedRequestFunctionContext *guard_ctx; /* Work to do when guard on range obtained */
-
- GuardedRequest(const BlockExtent block_extent,
- GuardedRequestFunctionContext *on_guard_acquire, bool barrier = false)
- : block_extent(block_extent), guard_ctx(on_guard_acquire) {
- guard_ctx->state.barrier = barrier;
- }
- friend std::ostream &operator<<(std::ostream &os,
- const GuardedRequest &r) {
- os << "guard_ctx->state=[" << r.guard_ctx->state << "], "
- << "block_extent.block_start=" << r.block_extent.block_start << ", "
- << "block_extent.block_start=" << r.block_extent.block_end;
- return os;
- }
-};
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-#endif // CEPH_LIBRBD_CACHE_RWL_REQUEST_H
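The guarded-request classes above package deferred work as a Context that only runs once the
BlockGuard grants a cell covering the request's extent. A minimal standalone sketch of that
pattern, assuming nothing from librbd (GuardedCtx and Cell are hypothetical stand-ins for
GuardedRequestFunctionContext and BlockGuardCell):

#include <cassert>
#include <cstdint>
#include <functional>
#include <iostream>

struct Cell {               // stand-in for BlockGuardCell
  uint64_t start;
  uint64_t end;
};

struct GuardedCtx {         // stand-in for GuardedRequestFunctionContext
  Cell *cell = nullptr;
  std::function<void(GuardedCtx&)> callback;
  void finish() {
    assert(cell != nullptr);       // the guard must have been acquired first
    callback(*this);
  }
};

int main() {
  GuardedCtx ctx;
  ctx.callback = [](GuardedCtx &c) {
    std::cout << "dispatching I/O for blocks " << c.cell->start
              << ".." << c.cell->end << "\n";
  };
  Cell cell{0, 8};
  ctx.cell = &cell;   // the block guard grants a cell for the extent ...
  ctx.finish();       // ... and only then does the deferred work run
  return 0;
}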
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "librbd/cache/rwl/ShutdownRequest.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/Utils.h"
-#include "common/dout.h"
-#include "common/errno.h"
-#include "librbd/Operations.h"
-#include "librbd/asio/ContextWQ.h"
-#include "librbd/cache/ImageCache.h"
-#include "librbd/cache/Types.h"
-
-
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::rwl::ShutdownRequest: " \
- << this << " " << __func__ << ": "
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-using librbd::util::create_async_context_callback;
-using librbd::util::create_context_callback;
-
-template <typename I>
-ShutdownRequest<I>* ShutdownRequest<I>::create(I &image_ctx,
- Context *on_finish) {
- return new ShutdownRequest(image_ctx, on_finish);
-}
-
-template <typename I>
-ShutdownRequest<I>::ShutdownRequest(I &image_ctx, Context *on_finish)
- : m_image_ctx(image_ctx),
- m_on_finish(create_async_context_callback(image_ctx, on_finish)),
- m_error_result(0) {
-}
-
-template <typename I>
-void ShutdownRequest<I>::send() {
- send_shutdown_image_cache();
-}
-
-template <typename I>
-void ShutdownRequest<I>::send_shutdown_image_cache() {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 10) << dendl;
-
- if (m_image_ctx.image_cache == nullptr) {
- finish();
- return;
- }
-
- using klass = ShutdownRequest<I>;
- Context *ctx = create_context_callback<klass, &klass::handle_shutdown_image_cache>(
- this);
-
- m_image_ctx.image_cache->shut_down(ctx);
-}
-
-template <typename I>
-void ShutdownRequest<I>::handle_shutdown_image_cache(int r) {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 10) << dendl;
-
- if (r < 0) {
- lderr(cct) << "failed to shut down the image cache: " << cpp_strerror(r)
- << dendl;
- save_result(r);
- finish();
- return;
- } else {
- delete m_image_ctx.image_cache;
- m_image_ctx.image_cache = nullptr;
- }
- send_remove_feature_bit();
-}
-
-template <typename I>
-void ShutdownRequest<I>::send_remove_feature_bit() {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 10) << dendl;
-
- uint64_t new_features = m_image_ctx.features & ~RBD_FEATURE_DIRTY_CACHE;
- uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE;
- ldout(cct, 10) << "old_features=" << m_image_ctx.features
- << ", new_features=" << new_features
- << ", features_mask=" << features_mask
- << dendl;
-
- int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx, m_image_ctx.header_oid,
- new_features, features_mask);
- m_image_ctx.features &= ~RBD_FEATURE_DIRTY_CACHE;
- using klass = ShutdownRequest<I>;
- Context *ctx = create_context_callback<klass, &klass::handle_remove_feature_bit>(
- this);
- ctx->complete(r);
-}
-
-template <typename I>
-void ShutdownRequest<I>::handle_remove_feature_bit(int r) {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 10) << dendl;
-
- if (r < 0) {
- lderr(cct) << "failed to remove the feature bit: " << cpp_strerror(r)
- << dendl;
- save_result(r);
- finish();
- return;
- }
- send_remove_image_cache_state();
-}
-
-template <typename I>
-void ShutdownRequest<I>::send_remove_image_cache_state() {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 10) << dendl;
-
- using klass = ShutdownRequest<I>;
- Context *ctx = create_context_callback<klass, &klass::handle_remove_image_cache_state>(
- this);
- std::shared_lock owner_lock{m_image_ctx.owner_lock};
- m_image_ctx.operations->execute_metadata_remove(IMAGE_CACHE_STATE, ctx);
-}
-
-template <typename I>
-void ShutdownRequest<I>::handle_remove_image_cache_state(int r) {
- CephContext *cct = m_image_ctx.cct;
- ldout(cct, 10) << dendl;
-
- if (r < 0) {
- lderr(cct) << "failed to remove the image cache state: " << cpp_strerror(r)
- << dendl;
- save_result(r);
- }
- finish();
-}
-
-template <typename I>
-void ShutdownRequest<I>::finish() {
- m_on_finish->complete(m_error_result);
- delete this;
-}
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-template class librbd::cache::rwl::ShutdownRequest<librbd::ImageCtx>;
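The shutdown flow above follows the send_*/handle_* state-machine convention used throughout
librbd: each send_ step kicks off asynchronous work, and its handle_ counterpart records the
first error via save_result() and either aborts to finish() or advances to the next step. A
simplified, self-contained sketch of that convention (MiniShutdown is hypothetical, and the
steps here are synchronous placeholders rather than real async operations):

#include <iostream>

class MiniShutdown {
public:
  static MiniShutdown *create() { return new MiniShutdown(); }
  void send() { send_step_one(); }

private:
  int m_error_result = 0;

  void save_result(int r) {
    if (m_error_result == 0 && r < 0) {
      m_error_result = r;            // remember only the first error
    }
  }

  void send_step_one() { handle_step_one(0); }    // pretend the async op succeeded
  void handle_step_one(int r) {
    if (r < 0) {
      save_result(r);
      finish();                      // abort the chain on error
      return;
    }
    send_step_two();
  }

  void send_step_two() { handle_step_two(-5); }   // pretend this op failed (-EIO)
  void handle_step_two(int r) {
    if (r < 0) {
      save_result(r);                // record it, but still finish cleanly
    }
    finish();
  }

  void finish() {
    std::cout << "shutdown finished, r=" << m_error_result << "\n";
    delete this;                     // the request deletes itself, as above
  }
};

int main() {
  MiniShutdown::create()->send();
  return 0;
}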
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H
-#define CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H
-
-class Context;
-
-namespace librbd {
-
-class ImageCtx;
-
-namespace cache {
-namespace rwl {
-
-template<typename>
-class ImageCacheState;
-
-template <typename ImageCtxT = ImageCtx>
-class ShutdownRequest {
-public:
- static ShutdownRequest* create(ImageCtxT &image_ctx, Context *on_finish);
-
- void send();
-
-private:
-
- /**
- * @verbatim
- *
- * Shutdown request goes through the following state machine:
- *
- * <start>
- * |
- * v
- * SHUTDOWN_IMAGE_CACHE
- * |
- * v
- * REMOVE_IMAGE_FEATURE_BIT
- * |
- * v
- * REMOVE_IMAGE_CACHE_STATE
- * |
- * v
- * <finish>
- *
- * @endverbatim
- */
-
- ShutdownRequest(ImageCtxT &image_ctx, Context *on_finish);
-
- ImageCtxT &m_image_ctx;
- Context *m_on_finish;
-
- int m_error_result;
-
- void send_shutdown_image_cache();
- void handle_shutdown_image_cache(int r);
-
- void send_remove_feature_bit();
- void handle_remove_feature_bit(int r);
-
- void send_remove_image_cache_state();
- void handle_remove_image_cache_state(int r);
-
- void finish();
-
- void save_result(int result) {
- if (m_error_result == 0 && result < 0) {
- m_error_result = result;
- }
- }
-};
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-extern template class librbd::cache::rwl::ShutdownRequest<librbd::ImageCtx>;
-
-#endif // CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "SyncPoint.h"
-
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::rwl::SyncPoint: " << this << " " \
- << __func__ << ": "
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-SyncPoint::SyncPoint(uint64_t sync_gen_num, CephContext *cct)
- : log_entry(std::make_shared<SyncPointLogEntry>(sync_gen_num)), m_cct(cct) {
- m_prior_log_entries_persisted = new C_Gather(cct, nullptr);
- m_sync_point_persist = new C_Gather(cct, nullptr);
- on_sync_point_appending.reserve(MAX_WRITES_PER_SYNC_POINT + 2);
- on_sync_point_persisted.reserve(MAX_WRITES_PER_SYNC_POINT + 2);
- ldout(m_cct, 20) << "sync point " << sync_gen_num << dendl;
-}
-
-SyncPoint::~SyncPoint() {
- ceph_assert(on_sync_point_appending.empty());
- ceph_assert(on_sync_point_persisted.empty());
- ceph_assert(!earlier_sync_point);
-}
-
-std::ostream &operator<<(std::ostream &os,
- const SyncPoint &p) {
- os << "log_entry=[" << *p.log_entry << "], "
- << "earlier_sync_point=" << p.earlier_sync_point << ", "
- << "later_sync_point=" << p.later_sync_point << ", "
- << "m_final_op_sequence_num=" << p.m_final_op_sequence_num << ", "
- << "m_prior_log_entries_persisted=" << p.m_prior_log_entries_persisted << ", "
- << "m_prior_log_entries_persisted_complete=" << p.m_prior_log_entries_persisted_complete << ", "
- << "m_append_scheduled=" << p.m_append_scheduled << ", "
- << "appending=" << p.appending << ", "
- << "on_sync_point_appending=" << p.on_sync_point_appending.size() << ", "
- << "on_sync_point_persisted=" << p.on_sync_point_persisted.size() << "";
- return os;
-}
-
-void SyncPoint::persist_gather_set_finisher(Context *ctx) {
- m_append_scheduled = true;
- /* All prior sync points that are still in this list must already be scheduled for append */
- std::shared_ptr<SyncPoint> previous = earlier_sync_point;
- while (previous) {
- ceph_assert(previous->m_append_scheduled);
- previous = previous->earlier_sync_point;
- }
-
- m_sync_point_persist->set_finisher(ctx);
-}
-
-void SyncPoint::persist_gather_activate() {
- m_sync_point_persist->activate();
-}
-
-Context* SyncPoint::persist_gather_new_sub() {
- return m_sync_point_persist->new_sub();
-}
-
-void SyncPoint::prior_persisted_gather_activate() {
- m_prior_log_entries_persisted->activate();
-}
-
-Context* SyncPoint::prior_persisted_gather_new_sub() {
- return m_prior_log_entries_persisted->new_sub();
-}
-
-void SyncPoint::prior_persisted_gather_set_finisher() {
- Context *sync_point_persist_ready = persist_gather_new_sub();
- std::shared_ptr<SyncPoint> sp = shared_from_this();
- m_prior_log_entries_persisted->
- set_finisher(new LambdaContext([this, sp, sync_point_persist_ready](int r) {
-      ldout(m_cct, 20) << "Prior log entries persisted for sync point=["
- << sp << "]" << dendl;
- sp->m_prior_log_entries_persisted_result = r;
- sp->m_prior_log_entries_persisted_complete = true;
- sync_point_persist_ready->complete(r);
- }));
-}
-
-void SyncPoint::add_in_on_persisted_ctxs(Context* ctx) {
- on_sync_point_persisted.push_back(ctx);
-}
-
-void SyncPoint::add_in_on_appending_ctxs(Context* ctx) {
- on_sync_point_appending.push_back(ctx);
-}
-
-void SyncPoint::setup_earlier_sync_point(std::shared_ptr<SyncPoint> sync_point,
- uint64_t last_op_sequence_num) {
- earlier_sync_point = sync_point;
- log_entry->prior_sync_point_flushed = false;
- earlier_sync_point->log_entry->next_sync_point_entry = log_entry;
- earlier_sync_point->later_sync_point = shared_from_this();
- earlier_sync_point->m_final_op_sequence_num = last_op_sequence_num;
- if (!earlier_sync_point->appending) {
- /* Append of new sync point deferred until old sync point is appending */
- earlier_sync_point->add_in_on_appending_ctxs(prior_persisted_gather_new_sub());
- }
-}
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
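SyncPoint's ordering guarantees hinge on C_Gather semantics: the finisher fires only after
activate() has been called and every sub-context handed out by new_sub() has completed. A
minimal stand-in illustrating those semantics with std::function (MiniGather is hypothetical,
not Ceph's C_Gather):

#include <functional>
#include <iostream>
#include <utility>

class MiniGather {             // hypothetical stand-in for Ceph's C_Gather
  int m_pending = 0;
  bool m_activated = false;
  int m_result = 0;
  std::function<void(int)> m_finisher;

  void maybe_finish() {
    if (m_activated && m_pending == 0 && m_finisher) {
      auto finisher = std::move(m_finisher);
      m_finisher = nullptr;    // the finisher fires exactly once
      finisher(m_result);
    }
  }

public:
  void set_finisher(std::function<void(int)> f) { m_finisher = std::move(f); }

  std::function<void(int)> new_sub() {
    ++m_pending;               // one sub per in-flight prior write
    return [this](int r) {
      if (r < 0 && m_result == 0) m_result = r;
      --m_pending;
      maybe_finish();
    };
  }

  void activate() {            // no more subs will be created after this
    m_activated = true;
    maybe_finish();
  }
};

int main() {
  MiniGather prior_persisted;
  prior_persisted.set_finisher([](int r) {
    std::cout << "all prior log entries persisted, r=" << r << "\n";
  });
  auto write_a = prior_persisted.new_sub();
  auto write_b = prior_persisted.new_sub();
  prior_persisted.activate();
  write_a(0);
  write_b(0);                  // last sub completes: finisher fires here
  return 0;
}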
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H
-#define CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H
-
-#include "librbd/ImageCtx.h"
-#include "librbd/cache/rwl/LogEntry.h"
-#include "librbd/cache/rwl/Types.h"
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-class SyncPoint: public std::enable_shared_from_this<SyncPoint> {
-public:
- std::shared_ptr<SyncPointLogEntry> log_entry;
- /* Use lock for earlier/later links */
- std::shared_ptr<SyncPoint> earlier_sync_point; /* NULL if earlier has completed */
- std::shared_ptr<SyncPoint> later_sync_point;
- bool appending = false;
-  /* Signal these when this sync point is appending to the log, and its order
-   * of appearance is guaranteed. One of these is a sub-operation of the
-   * next sync point's m_prior_log_entries_persisted Gather. */
- std::vector<Context*> on_sync_point_appending;
- /* Signal these when this sync point is appended and persisted. User
- * aio_flush() calls are added to this. */
- std::vector<Context*> on_sync_point_persisted;
-
- SyncPoint(uint64_t sync_gen_num, CephContext *cct);
- ~SyncPoint();
- SyncPoint(const SyncPoint&) = delete;
- SyncPoint &operator=(const SyncPoint&) = delete;
- void persist_gather_activate();
- Context* persist_gather_new_sub();
- void persist_gather_set_finisher(Context *ctx);
- void prior_persisted_gather_activate();
- Context* prior_persisted_gather_new_sub();
- void prior_persisted_gather_set_finisher();
- void add_in_on_persisted_ctxs(Context* cxt);
- void add_in_on_appending_ctxs(Context* cxt);
- void setup_earlier_sync_point(std::shared_ptr<SyncPoint> sync_point,
- uint64_t last_op_sequence_num);
-private:
- CephContext *m_cct;
- bool m_append_scheduled = false;
- uint64_t m_final_op_sequence_num = 0;
- /* A sync point can't appear in the log until all the writes bearing
- * it and all the prior sync points have been appended and
- * persisted.
- *
- * Writes bearing this sync gen number and the prior sync point will be
- * sub-ops of this Gather. This sync point will not be appended until all
- * these complete to the point where their persist order is guaranteed. */
- C_Gather *m_prior_log_entries_persisted;
- /* The finisher for this will append the sync point to the log. The finisher
- * for m_prior_log_entries_persisted will be a sub-op of this. */
- C_Gather *m_sync_point_persist;
- int m_prior_log_entries_persisted_result = 0;
-  bool m_prior_log_entries_persisted_complete = false;
- friend std::ostream &operator<<(std::ostream &os,
- const SyncPoint &p);
-};
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-#endif // CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include <iostream>
-#include "Types.h"
-#include "common/ceph_context.h"
-#include "include/Context.h"
-
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::rwl::Types: " << this << " " \
- << __func__ << ": "
-
-namespace librbd {
-
-namespace cache {
-
-namespace rwl {
-
-DeferredContexts::~DeferredContexts() {
- finish_contexts(nullptr, contexts, 0);
-}
-
-void DeferredContexts::add(Context* ctx) {
- contexts.push_back(ctx);
-}
-
-/*
- * A BlockExtent identifies a range by first and last.
- *
- * An Extent ("image extent") identifies a range by start and length.
- *
- * The ImageCache interface is defined in terms of image extents, and
- * requires no alignment of the beginning or end of the extent. We
- * convert between image and block extents here using a "block size"
- * of 1.
- */
-BlockExtent convert_to_block_extent(const uint64_t offset_bytes, const uint64_t length_bytes)
-{
- return BlockExtent(offset_bytes,
- offset_bytes + length_bytes);
-}
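A worked example of the conversion above under the "block size of 1" convention: the
unaligned image extent {offset=4096, length=512} becomes BlockExtent(4096, 4608), and the
length can be recovered as block_end - block_start. The BlockExtent struct below is a
hypothetical stand-in for the librbd type:

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for librbd's BlockExtent (end = start + length here).
struct BlockExtent {
  uint64_t block_start;
  uint64_t block_end;
};

BlockExtent convert_to_block_extent(uint64_t offset_bytes, uint64_t length_bytes) {
  return BlockExtent{offset_bytes, offset_bytes + length_bytes};
}

int main() {
  // A 512-byte image extent at offset 4096, expressed as blocks of size 1.
  BlockExtent be = convert_to_block_extent(4096, 512);
  assert(be.block_start == 4096 && be.block_end == 4608);
  // image_extent() recovers the length as block_end - block_start == 512.
  assert(be.block_end - be.block_start == 512);
  return 0;
}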
-
-BlockExtent WriteLogPmemEntry::block_extent() {
- return convert_to_block_extent(image_offset_bytes, write_bytes);
-}
-
-uint64_t WriteLogPmemEntry::get_offset_bytes() {
- return image_offset_bytes;
-}
-
-uint64_t WriteLogPmemEntry::get_write_bytes() {
- return write_bytes;
-}
-
-std::ostream& operator<<(std::ostream& os,
- const WriteLogPmemEntry &entry) {
- os << "entry_valid=" << (bool)entry.entry_valid << ", "
- << "sync_point=" << (bool)entry.sync_point << ", "
- << "sequenced=" << (bool)entry.sequenced << ", "
- << "has_data=" << (bool)entry.has_data << ", "
- << "discard=" << (bool)entry.discard << ", "
- << "writesame=" << (bool)entry.writesame << ", "
- << "sync_gen_number=" << entry.sync_gen_number << ", "
- << "write_sequence_number=" << entry.write_sequence_number << ", "
- << "image_offset_bytes=" << entry.image_offset_bytes << ", "
- << "write_bytes=" << entry.write_bytes << ", "
- << "ws_datalen=" << entry.ws_datalen << ", "
- << "entry_index=" << entry.entry_index;
- return os;
-}
-
-template <typename ExtentsType>
-ExtentsSummary<ExtentsType>::ExtentsSummary(const ExtentsType &extents)
- : total_bytes(0), first_image_byte(0), last_image_byte(0)
-{
- if (extents.empty()) return;
- /* These extents refer to image offsets between first_image_byte
- * and last_image_byte, inclusive, but we don't guarantee here
- * that they address all of those bytes. There may be gaps. */
- first_image_byte = extents.front().first;
- last_image_byte = first_image_byte + extents.front().second;
- for (auto &extent : extents) {
- /* Ignore zero length extents */
- if (extent.second) {
- total_bytes += extent.second;
- if (extent.first < first_image_byte) {
- first_image_byte = extent.first;
- }
- if ((extent.first + extent.second) > last_image_byte) {
- last_image_byte = extent.first + extent.second;
- }
- }
- }
-}
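To make the gap behavior concrete: summarizing the extents {0,8} and {16,4} with the loop
above yields total_bytes=12 while the summary range spans bytes 0 through 20, so the range
covers 8 bytes the extents never touch. A standalone recreation of the computation:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<uint64_t, uint64_t>> extents = {{0, 8}, {16, 4}};
  uint64_t total = 0;
  uint64_t first = extents.front().first;
  uint64_t last = extents.front().first + extents.front().second;
  for (const auto &e : extents) {
    if (!e.second) continue;               // zero-length extents are ignored
    total += e.second;
    first = std::min(first, e.first);
    last = std::max(last, e.first + e.second);
  }
  // Prints total=12 first=0 last=20: bytes [8,16) are a gap the summary spans.
  std::cout << "total=" << total << " first=" << first
            << " last=" << last << std::endl;
  return 0;
}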
-
-io::Extent whole_volume_extent() {
- return io::Extent({0, std::numeric_limits<uint64_t>::max()});
-}
-
-BlockExtent block_extent(const io::Extent& image_extent) {
- return convert_to_block_extent(image_extent.first, image_extent.second);
-}
-
-Context * override_ctx(int r, Context *ctx) {
- if (r < 0) {
- /* Override next_ctx status with this error */
- return new LambdaContext(
- [r, ctx](int _r) {
- ctx->complete(r);
- });
- } else {
- return ctx;
- }
-}
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-template class librbd::cache::rwl::ExtentsSummary<librbd::io::Extents>;
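override_ctx() above wraps a downstream Context so that an earlier error takes precedence
over whatever result the wrapped context is eventually completed with. The same idea in a
self-contained form, with std::function standing in for Context (mini_override_ctx is a
hypothetical name):

#include <functional>
#include <iostream>

using MiniContext = std::function<void(int)>;

// Mirrors override_ctx(): when r carries an error, hand back a wrapper
// that completes ctx with r no matter what result it is later given.
MiniContext mini_override_ctx(int r, MiniContext ctx) {
  if (r < 0) {
    return [r, ctx](int /* ignored */) { ctx(r); };
  }
  return ctx;
}

int main() {
  MiniContext next = [](int r) { std::cout << "completed, r=" << r << "\n"; };
  mini_override_ctx(-5, next)(0);   // prints r=-5: the earlier error wins
  mini_override_ctx(0, next)(0);    // prints r=0: nothing to override
  return 0;
}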
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_RWL_TYPES_H
-#define CEPH_LIBRBD_CACHE_RWL_TYPES_H
-
-#include <vector>
-#include <libpmemobj.h>
-#include "librbd/BlockGuard.h"
-#include "librbd/io/Types.h"
-
-class Context;
-
-enum {
- l_librbd_rwl_first = 26500,
-
- // All read requests
- l_librbd_rwl_rd_req, // read requests
- l_librbd_rwl_rd_bytes, // bytes read
- l_librbd_rwl_rd_latency, // average req completion latency
-
- // Read requests completed from RWL (no misses)
- l_librbd_rwl_rd_hit_req, // read requests
- l_librbd_rwl_rd_hit_bytes, // bytes read
- l_librbd_rwl_rd_hit_latency, // average req completion latency
-
-  // Read requests with both hit and miss extents
- l_librbd_rwl_rd_part_hit_req, // read ops
-
- // Per SyncPoint's LogEntry number and write bytes distribution
- l_librbd_rwl_syncpoint_hist,
-
- // All write requests
- l_librbd_rwl_wr_req, // write requests
- l_librbd_rwl_wr_req_def, // write requests deferred for resources
- l_librbd_rwl_wr_req_def_lanes, // write requests deferred for lanes
- l_librbd_rwl_wr_req_def_log, // write requests deferred for log entries
- l_librbd_rwl_wr_req_def_buf, // write requests deferred for buffer space
- l_librbd_rwl_wr_req_overlap, // write requests detained for overlap
- l_librbd_rwl_wr_req_queued, // write requests queued for prior barrier
- l_librbd_rwl_wr_bytes, // bytes written
-
- // Write log operations (1 .. n per request that appends to the log)
- l_librbd_rwl_log_ops, // log append ops
- l_librbd_rwl_log_op_bytes, // average bytes written per log op
-
- /*
-
- Req and op average latencies to the beginning of and over various phases:
-
-  +------------------------------+------+-------------------------------+
-  | Phase                        | Name | Description                   |
-  +------------------------------+------+-------------------------------+
-  | Arrive at RWL                | arr  | Arrives as a request          |
-  +------------------------------+------+-------------------------------+
-  | Allocate resources           | all  | Time spent in block guard for |
-  |                              |      | overlap sequencing occurs     |
-  |                              |      | before this point             |
-  +------------------------------+------+-------------------------------+
-  | Dispatch                     | dis  | Op lifetime begins here. Time |
-  |                              |      | spent in allocation waiting   |
-  |                              |      | for resources occurs before   |
-  |                              |      | this point                    |
-  +------------------------------+------+-------------------------------+
-  | Payload buffer persist and   | buf  | Time spent queued for         |
-  | replicate                    |      | replication occurs before here|
-  +------------------------------+------+-------------------------------+
-  | Payload buffer persist       | bufc | bufc - buf is just the persist|
-  | complete                     |      | time                          |
-  +------------------------------+------+-------------------------------+
-  | Log append                   | app  | Time spent queued for append  |
-  |                              |      | occurs before here            |
-  +------------------------------+------+-------------------------------+
-  | Append complete              | appc | appc - app is just the time   |
-  |                              |      | spent in the append operation |
-  +------------------------------+------+-------------------------------+
-  | Complete                     | cmp  | Write persisted, replicated,  |
-  |                              |      | and globally visible          |
-  +------------------------------+------+-------------------------------+
-
- */
-
- /* Request times */
- l_librbd_rwl_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard
- l_librbd_rwl_req_arr_to_dis_t, // arrival to dispatch elapsed time
- l_librbd_rwl_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources
- l_librbd_rwl_wr_latency, // average req (persist) completion latency
- l_librbd_rwl_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written
- l_librbd_rwl_wr_caller_latency, // average req completion (to caller) latency
-
-  /* Request times for requests that never waited for space */
- l_librbd_rwl_nowait_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard
- l_librbd_rwl_nowait_req_arr_to_dis_t, // arrival to dispatch elapsed time
- l_librbd_rwl_nowait_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources
- l_librbd_rwl_nowait_wr_latency, // average req (persist) completion latency
- l_librbd_rwl_nowait_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written
- l_librbd_rwl_nowait_wr_caller_latency, // average req completion (to caller) latency
-
- /* Log operation times */
- l_librbd_rwl_log_op_alloc_t, // elapsed time of pmemobj_reserve()
- l_librbd_rwl_log_op_alloc_t_hist, // Histogram of elapsed time of pmemobj_reserve()
-
- l_librbd_rwl_log_op_dis_to_buf_t, // dispatch to buffer persist elapsed time
- l_librbd_rwl_log_op_dis_to_app_t, // dispatch to log append elapsed time
- l_librbd_rwl_log_op_dis_to_cmp_t, // dispatch to persist completion elapsed time
- l_librbd_rwl_log_op_dis_to_cmp_t_hist, // Histogram of dispatch to persist completion elapsed time
-
- l_librbd_rwl_log_op_buf_to_app_t, // data buf persist + append wait time
-  l_librbd_rwl_log_op_buf_to_bufc_t, // data buf persist / replicate elapsed time
-  l_librbd_rwl_log_op_buf_to_bufc_t_hist, // data buf persist time vs bytes histogram
- l_librbd_rwl_log_op_app_to_cmp_t, // log entry append + completion wait time
- l_librbd_rwl_log_op_app_to_appc_t, // log entry append / replicate elapsed time
- l_librbd_rwl_log_op_app_to_appc_t_hist, // log entry append time (vs. op bytes) histogram
-
- l_librbd_rwl_discard,
- l_librbd_rwl_discard_bytes,
- l_librbd_rwl_discard_latency,
-
- l_librbd_rwl_aio_flush,
- l_librbd_rwl_aio_flush_def,
- l_librbd_rwl_aio_flush_latency,
- l_librbd_rwl_ws,
- l_librbd_rwl_ws_bytes, // Bytes modified by write same, probably much larger than WS payload bytes
- l_librbd_rwl_ws_latency,
-
- l_librbd_rwl_cmp,
- l_librbd_rwl_cmp_bytes,
- l_librbd_rwl_cmp_latency,
- l_librbd_rwl_cmp_fails,
-
- l_librbd_rwl_flush,
- l_librbd_rwl_invalidate_cache,
- l_librbd_rwl_invalidate_discard_cache,
-
- l_librbd_rwl_append_tx_t,
- l_librbd_rwl_retire_tx_t,
- l_librbd_rwl_append_tx_t_hist,
- l_librbd_rwl_retire_tx_t_hist,
-
- l_librbd_rwl_last,
-};
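Counter ranges delimited by _first/_last sentinels like this one are registered elsewhere
with Ceph's PerfCountersBuilder. A hedged sketch of what that registration looks like, under
the assumption that the PerfCountersBuilder/add_u64_counter/add_time_avg API from
common/perf_counters.h is available; the counters chosen below are illustrative, not a copy
of the RWL registration code:

#include <string>
#include "common/ceph_context.h"
#include "common/perf_counters.h"

// Assumed API: PerfCountersBuilder as declared in common/perf_counters.h.
PerfCounters *make_rwl_counters(CephContext *cct, const std::string &name) {
  PerfCountersBuilder plb(cct, name, l_librbd_rwl_first, l_librbd_rwl_last);
  plb.add_u64_counter(l_librbd_rwl_rd_req, "rd", "Read requests");
  plb.add_u64_counter(l_librbd_rwl_rd_bytes, "rd_bytes", "Bytes read");
  plb.add_time_avg(l_librbd_rwl_rd_latency, "rd_latency", "Read latency");
  PerfCounters *counters = plb.create_perf_counters();
  cct->get_perfcounters_collection()->add(counters);
  return counters;
}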
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-class ImageExtentBuf;
-typedef std::vector<ImageExtentBuf> ImageExtentBufs;
-
-const int IN_FLIGHT_FLUSH_WRITE_LIMIT = 64;
-const int IN_FLIGHT_FLUSH_BYTES_LIMIT = (1 * 1024 * 1024);
-
-/* Limit work between sync points */
-const uint64_t MAX_WRITES_PER_SYNC_POINT = 256;
-const uint64_t MAX_BYTES_PER_SYNC_POINT = (1024 * 1024 * 8);
-
-const uint32_t MIN_WRITE_ALLOC_SIZE = 512;
-const uint32_t LOG_STATS_INTERVAL_SECONDS = 5;
-
-/**** Write log entries ****/
-const unsigned long int MAX_ALLOC_PER_TRANSACTION = 8;
-const unsigned long int MAX_FREE_PER_TRANSACTION = 1;
-const unsigned int MAX_CONCURRENT_WRITES = 256;
-
-const uint64_t DEFAULT_POOL_SIZE = 1u<<30;
-const uint64_t MIN_POOL_SIZE = DEFAULT_POOL_SIZE;
-constexpr double USABLE_SIZE = (7.0 / 10);
-const uint64_t BLOCK_ALLOC_OVERHEAD_BYTES = 16;
-const uint8_t RWL_POOL_VERSION = 1;
-const uint64_t MAX_LOG_ENTRIES = (1024 * 1024);
-const double AGGRESSIVE_RETIRE_HIGH_WATER = 0.75;
-const double RETIRE_HIGH_WATER = 0.50;
-const double RETIRE_LOW_WATER = 0.40;
-const int RETIRE_BATCH_TIME_LIMIT_MS = 250;
-
-/* Defer a set of Contexts until destruct/exit. Used for deferring
- * work on a given thread until a required lock is dropped. */
-class DeferredContexts {
-private:
- std::vector<Context*> contexts;
-public:
- ~DeferredContexts();
- void add(Context* ctx);
-};
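A hypothetical standalone sketch of the DeferredContexts idiom: callbacks queued while a
lock is held only run from the destructor, after the caller has dropped the lock, which
avoids re-entrant completion under the lock (MiniDeferredContexts uses std::function in
place of Context):

#include <functional>
#include <iostream>
#include <mutex>
#include <vector>

class MiniDeferredContexts {
  std::vector<std::function<void()>> contexts;
public:
  ~MiniDeferredContexts() {
    for (auto &ctx : contexts) {
      ctx();                         // run everything at scope exit
    }
  }
  void add(std::function<void()> ctx) {
    contexts.push_back(std::move(ctx));
  }
};

int main() {
  std::mutex lock;
  MiniDeferredContexts on_exit;      // outlives the locked scope below
  {
    std::lock_guard<std::mutex> locker(lock);
    // Completing user callbacks here could re-enter and try to take the
    // lock again, so they are queued instead.
    on_exit.add([] { std::cout << "completed after the lock was dropped\n"; });
  }                                  // lock released here
  return 0;                          // on_exit's destructor runs the callbacks
}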
-
-/* Pmem structures */
-POBJ_LAYOUT_BEGIN(rbd_rwl);
-POBJ_LAYOUT_ROOT(rbd_rwl, struct WriteLogPoolRoot);
-POBJ_LAYOUT_TOID(rbd_rwl, uint8_t);
-POBJ_LAYOUT_TOID(rbd_rwl, struct WriteLogPmemEntry);
-POBJ_LAYOUT_END(rbd_rwl);
-
-struct WriteLogPmemEntry {
- uint64_t sync_gen_number = 0;
- uint64_t write_sequence_number = 0;
- uint64_t image_offset_bytes;
- uint64_t write_bytes;
- TOID(uint8_t) write_data;
- struct {
- uint8_t entry_valid :1; /* if 0, this entry is free */
- uint8_t sync_point :1; /* No data. No write sequence number. Marks sync
- point for this sync gen number */
- uint8_t sequenced :1; /* write sequence number is valid */
- uint8_t has_data :1; /* write_data field is valid (else ignore) */
- uint8_t discard :1; /* has_data will be 0 if this is a discard */
- uint8_t writesame :1; /* ws_datalen indicates length of data at write_bytes */
- };
- uint32_t ws_datalen = 0; /* Length of data buffer (writesame only) */
- uint32_t entry_index = 0; /* For debug consistency check. Can be removed if
- * we need the space */
- WriteLogPmemEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
- : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes),
- entry_valid(0), sync_point(0), sequenced(0), has_data(0), discard(0), writesame(0) {
- }
- BlockExtent block_extent();
- uint64_t get_offset_bytes();
- uint64_t get_write_bytes();
- bool is_sync_point() {
- return sync_point;
- }
- bool is_discard() {
- return discard;
- }
- bool is_writesame() {
- return writesame;
- }
- bool is_write() {
- /* Log entry is a basic write */
- return !is_sync_point() && !is_discard() && !is_writesame();
- }
- bool is_writer() {
- /* Log entry is any type that writes data */
- return is_write() || is_discard() || is_writesame();
- }
- friend std::ostream& operator<<(std::ostream& os,
- const WriteLogPmemEntry &entry);
-};
-
-static_assert(sizeof(WriteLogPmemEntry) == 64);
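A quick standalone check of the flag-classification rules above: an entry with none of the
type bits set is a basic write, and write/discard/writesame all count as "writer" entries
(Flags is a detached copy of just the relevant bitfields, not the pmem struct itself):

#include <cassert>
#include <cstdint>

struct Flags {                 // detached copy of the relevant bitfields
  uint8_t sync_point : 1;
  uint8_t discard : 1;
  uint8_t writesame : 1;
  bool is_write() const { return !sync_point && !discard && !writesame; }
  bool is_writer() const { return is_write() || discard || writesame; }
};

int main() {
  Flags basic{0, 0, 0};        // plain write: no type flags set
  Flags disc{0, 1, 0};         // discard entry
  Flags sync{1, 0, 0};         // sync point entry: carries no data
  assert(basic.is_write() && basic.is_writer());
  assert(!disc.is_write() && disc.is_writer());
  assert(!sync.is_write() && !sync.is_writer());
  return 0;
}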
-
-struct WriteLogPoolRoot {
- union {
- struct {
- uint8_t layout_version; /* Version of this structure (RWL_POOL_VERSION) */
- };
- uint64_t _u64;
- } header;
- TOID(struct WriteLogPmemEntry) log_entries; /* contiguous array of log entries */
- uint64_t pool_size;
- uint64_t flushed_sync_gen; /* All writing entries with this or a lower
- * sync gen number are flushed. */
- uint32_t block_size; /* block size */
- uint32_t num_log_entries;
- uint32_t first_free_entry; /* Entry following the newest valid entry */
- uint32_t first_valid_entry; /* Index of the oldest valid entry in the log */
-};
-
-struct WriteBufferAllocation {
- unsigned int allocation_size = 0;
- pobj_action buffer_alloc_action;
- TOID(uint8_t) buffer_oid = OID_NULL;
- bool allocated = false;
- utime_t allocation_lat;
-};
-
-static inline io::Extent image_extent(const BlockExtent& block_extent) {
- return io::Extent(block_extent.block_start,
- block_extent.block_end - block_extent.block_start);
-}
-
-template <typename ExtentsType>
-class ExtentsSummary {
-public:
- uint64_t total_bytes;
- uint64_t first_image_byte;
- uint64_t last_image_byte;
- explicit ExtentsSummary(const ExtentsType &extents);
- friend std::ostream &operator<<(std::ostream &os,
- const ExtentsSummary &s) {
- os << "total_bytes=" << s.total_bytes << ", "
- << "first_image_byte=" << s.first_image_byte << ", "
- << "last_image_byte=" << s.last_image_byte << "";
- return os;
- }
- BlockExtent block_extent() {
- return BlockExtent(first_image_byte, last_image_byte);
- }
- io::Extent image_extent() {
- return librbd::cache::rwl::image_extent(block_extent());
- }
-};
-
-io::Extent whole_volume_extent();
-
-BlockExtent block_extent(const io::Extent& image_extent);
-
-Context * override_ctx(int r, Context *ctx);
-
-class ImageExtentBuf : public io::Extent {
-public:
- bufferlist m_bl;
- ImageExtentBuf(io::Extent extent)
- : io::Extent(extent) { }
- ImageExtentBuf(io::Extent extent, bufferlist bl)
- : io::Extent(extent), m_bl(bl) { }
-};
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
-
-#endif // CEPH_LIBRBD_CACHE_RWL_TYPES_H
#include "common/dout.h"
#include "common/errno.h"
#include "include/stringify.h"
-#include "librbd/cache/rwl/InitRequest.h"
-#include "librbd/cache/rwl/ShutdownRequest.h"
+#include "librbd/cache/pwl/InitRequest.h"
+#include "librbd/cache/pwl/ShutdownRequest.h"
#include "librbd/ExclusiveLock.h"
#include "librbd/ImageCtx.h"
#include "librbd/ImageState.h"
Context *ctx = create_async_context_callback(
m_image_ctx, create_context_callback<
klass, &klass::handle_open_image_cache>(this));
- cache::rwl::InitRequest<I> *req = cache::rwl::InitRequest<I>::create(
+ cache::pwl::InitRequest<I> *req = cache::pwl::InitRequest<I>::create(
m_image_ctx, ctx);
req->send();
}
using klass = PostAcquireRequest<I>;
Context *ctx = create_context_callback<klass, &klass::handle_close_image_cache>(
this);
- cache::rwl::ShutdownRequest<I> *req = cache::rwl::ShutdownRequest<I>::create(
+ cache::pwl::ShutdownRequest<I> *req = cache::pwl::ShutdownRequest<I>::create(
m_image_ctx, ctx);
req->send();
}
#include "common/AsyncOpTracker.h"
#include "common/dout.h"
#include "common/errno.h"
-#include "librbd/cache/rwl/ShutdownRequest.h"
+#include "librbd/cache/pwl/ShutdownRequest.h"
#include "librbd/ExclusiveLock.h"
#include "librbd/ImageState.h"
#include "librbd/ImageWatcher.h"
Context *ctx = create_async_context_callback(m_image_ctx, create_context_callback<
PreReleaseRequest<I>,
&PreReleaseRequest<I>::handle_shut_down_image_cache>(this));
- cache::rwl::ShutdownRequest<I> *req = cache::rwl::ShutdownRequest<I>::create(
+ cache::pwl::ShutdownRequest<I> *req = cache::pwl::ShutdownRequest<I>::create(
m_image_ctx, ctx);
req->send();
}
set(unittest_librbd_srcs
${unittest_librbd_srcs}
cache/test_mock_ReplicatedWriteLog.cc
- cache/rwl/test_WriteLogMap.cc)
+ cache/pwl/test_WriteLogMap.cc)
endif(WITH_RBD_RWL)
add_executable(unittest_librbd
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_fixture.h"
+#include "test/librbd/test_support.h"
+
+#include "librbd/cache/pwl/LogMap.cc"
+
+void register_test_write_log_map() {
+}
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+struct TestLogEntry {
+ uint64_t image_offset_bytes;
+ uint64_t write_bytes;
+ uint32_t referring_map_entries = 0;
+ TestLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+ : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes) {
+ }
+ uint64_t get_offset_bytes() {
+ return image_offset_bytes;
+ }
+ uint64_t get_write_bytes() {
+ return write_bytes;
+ }
+ BlockExtent block_extent() {
+ return BlockExtent(image_offset_bytes, image_offset_bytes + write_bytes);
+ }
+ uint32_t get_map_ref() {
+ return referring_map_entries;
+ }
+ void inc_map_ref() {
+ referring_map_entries++;
+ }
+ void dec_map_ref() {
+ referring_map_entries--;
+ }
+ friend std::ostream &operator<<(std::ostream &os,
+ const TestLogEntry &entry) {
+ os << "referring_map_entries=" << entry.referring_map_entries << ", "
+ << "image_offset_bytes=" << entry.image_offset_bytes << ", "
+ << "write_bytes=" << entry.write_bytes;
+ return os;
+ };
+};
+
+typedef std::list<std::shared_ptr<TestLogEntry>> TestLogEntries;
+typedef LogMapEntry<TestLogEntry> TestMapEntry;
+typedef LogMapEntries<TestLogEntry> TestLogMapEntries;
+typedef LogMap<TestLogEntry> TestLogMap;
+
+class TestWriteLogMap : public TestFixture {
+public:
+ void SetUp() override {
+ TestFixture::SetUp();
+ m_cct = reinterpret_cast<CephContext*>(m_ioctx.cct());
+ }
+
+ CephContext *m_cct;
+};
+
+TEST_F(TestWriteLogMap, Simple) {
+ TestLogEntries es;
+ TestLogMapEntries lme;
+ TestLogMap map(m_cct);
+
+ /* LogEntry takes offset, length, in bytes */
+ auto e1 = make_shared<TestLogEntry>(4, 8);
+ TestLogEntry *e1_ptr = e1.get();
+ ASSERT_EQ(4, e1_ptr->get_offset_bytes());
+ ASSERT_EQ(8, e1_ptr->get_write_bytes());
+ map.add_log_entry(e1);
+
+ /* BlockExtent takes first, last, in blocks */
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100));
+ int numfound = found0.size();
+ /* Written range includes the single write above */
+ ASSERT_EQ(1, numfound);
+ ASSERT_EQ(e1, found0.front().log_entry);
+
+ /* Nothing before that */
+ found0 = map.find_map_entries(BlockExtent(0, 3));
+ numfound = found0.size();
+ ASSERT_EQ(0, numfound);
+
+ /* Nothing after that */
+ found0 = map.find_map_entries(BlockExtent(12, 99));
+ numfound = found0.size();
+ ASSERT_EQ(0, numfound);
+
+ /* 4-11 will be e1 */
+ for (int i=4; i<12; i++) {
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
+ int numfound = found0.size();
+ ASSERT_EQ(1, numfound);
+ ASSERT_EQ(e1, found0.front().log_entry);
+ }
+
+ map.remove_log_entry(e1);
+ /* Nothing should be found */
+ for (int i=4; i<12; i++) {
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
+ int numfound = found0.size();
+ ASSERT_EQ(0, numfound);
+ }
+}
+
+TEST_F(TestWriteLogMap, OverlapFront) {
+ TestLogMap map(m_cct);
+
+ auto e0 = make_shared<TestLogEntry>(4, 8);
+ map.add_log_entry(e0);
+ /* replaces block 4-7 of e0 */
+ auto e1 = make_shared<TestLogEntry>(0, 8);
+ map.add_log_entry(e1);
+
+ /* Written range includes the two writes above */
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100));
+ int numfound = found0.size();
+ ASSERT_EQ(2, numfound);
+ ASSERT_EQ(e1, found0.front().log_entry);
+ ASSERT_EQ(0, found0.front().block_extent.block_start);
+ ASSERT_EQ(8, found0.front().block_extent.block_end);
+ found0.pop_front();
+ ASSERT_EQ(e0, found0.front().log_entry);
+ ASSERT_EQ(8, found0.front().block_extent.block_start);
+ ASSERT_EQ(12, found0.front().block_extent.block_end);
+
+ /* 0-7 will be e1 */
+ for (int i=0; i<8; i++) {
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
+ int numfound = found0.size();
+ ASSERT_EQ(1, numfound);
+ ASSERT_EQ(e1, found0.front().log_entry);
+ }
+
+ /* 8-11 will be e0 */
+ for (int i=8; i<12; i++) {
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
+ int numfound = found0.size();
+ ASSERT_EQ(1, numfound);
+ ASSERT_EQ(e0, found0.front().log_entry);
+ }
+}
+
+TEST_F(TestWriteLogMap, OverlapBack) {
+ TestLogMap map(m_cct);
+
+ auto e0 = make_shared<TestLogEntry>(0, 8);
+ map.add_log_entry(e0);
+ /* replaces block 4-7 of e0 */
+ auto e1 = make_shared<TestLogEntry>(4, 8);
+ map.add_log_entry(e1);
+
+ /* Written range includes the two writes above */
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100));
+ int numfound = found0.size();
+ ASSERT_EQ(2, numfound);
+ ASSERT_EQ(e0, found0.front().log_entry);
+ ASSERT_EQ(0, found0.front().block_extent.block_start);
+ ASSERT_EQ(4, found0.front().block_extent.block_end);
+ found0.pop_front();
+ ASSERT_EQ(e1, found0.front().log_entry);
+ ASSERT_EQ(4, found0.front().block_extent.block_start);
+ ASSERT_EQ(12, found0.front().block_extent.block_end);
+
+ /* 0-3 will be e0 */
+ for (int i=0; i<4; i++) {
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
+ int numfound = found0.size();
+ ASSERT_EQ(1, numfound);
+ ASSERT_EQ(e0, found0.front().log_entry);
+ }
+
+ /* 4-11 will be e1 */
+ for (int i=4; i<12; i++) {
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
+ int numfound = found0.size();
+ ASSERT_EQ(1, numfound);
+ ASSERT_EQ(e1, found0.front().log_entry);
+ }
+
+ map.remove_log_entry(e0);
+
+ /* 0-3 will find nothing */
+ for (int i=0; i<4; i++) {
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
+ int numfound = found0.size();
+ ASSERT_EQ(0, numfound);
+ }
+
+ /* 4-11 will still be e1 */
+ for (int i=4; i<12; i++) {
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
+ int numfound = found0.size();
+ ASSERT_EQ(1, numfound);
+ ASSERT_EQ(e1, found0.front().log_entry);
+ }
+
+}
+
+TEST_F(TestWriteLogMap, OverlapMiddle) {
+ TestLogMap map(m_cct);
+
+ auto e0 = make_shared<TestLogEntry>(0, 1);
+ map.add_log_entry(e0);
+
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 1));
+ int numfound = found0.size();
+ ASSERT_EQ(1, numfound);
+ ASSERT_EQ(e0, found0.front().log_entry);
+ TestLogEntries entries = map.find_log_entries(BlockExtent(0, 1));
+ int entriesfound = entries.size();
+ ASSERT_EQ(1, entriesfound);
+ ASSERT_EQ(e0, entries.front());
+
+ auto e1 = make_shared<TestLogEntry>(1, 1);
+ map.add_log_entry(e1);
+
+ found0 = map.find_map_entries(BlockExtent(1, 2));
+ numfound = found0.size();
+ ASSERT_EQ(1, numfound);
+ ASSERT_EQ(e1, found0.front().log_entry);
+ entries = map.find_log_entries(BlockExtent(1, 2));
+ entriesfound = entries.size();
+ ASSERT_EQ(1, entriesfound);
+ ASSERT_EQ(e1, entries.front());
+
+ auto e2 = make_shared<TestLogEntry>(2, 1);
+ map.add_log_entry(e2);
+
+ found0 = map.find_map_entries(BlockExtent(2, 3));
+ numfound = found0.size();
+ ASSERT_EQ(1, numfound);
+ ASSERT_EQ(e2, found0.front().log_entry);
+ entries = map.find_log_entries(BlockExtent(2, 3));
+ entriesfound = entries.size();
+ ASSERT_EQ(1, entriesfound);
+ ASSERT_EQ(e2, entries.front());
+
+ /* replaces e1 */
+ auto e3 = make_shared<TestLogEntry>(1, 1);
+ map.add_log_entry(e3);
+
+ found0 = map.find_map_entries(BlockExtent(1, 2));
+ numfound = found0.size();
+ ASSERT_EQ(1, numfound);
+ ASSERT_EQ(e3, found0.front().log_entry);
+ entries = map.find_log_entries(BlockExtent(1, 2));
+ entriesfound = entries.size();
+ ASSERT_EQ(1, entriesfound);
+ ASSERT_EQ(e3, entries.front());
+
+ found0 = map.find_map_entries(BlockExtent(0, 100));
+ numfound = found0.size();
+ ASSERT_EQ(3, numfound);
+ ASSERT_EQ(e0, found0.front().log_entry);
+ found0.pop_front();
+ ASSERT_EQ(e3, found0.front().log_entry);
+ found0.pop_front();
+ ASSERT_EQ(e2, found0.front().log_entry);
+ entries = map.find_log_entries(BlockExtent(0, 100));
+ entriesfound = entries.size();
+ ASSERT_EQ(3, entriesfound);
+ ASSERT_EQ(e0, entries.front());
+ entries.pop_front();
+ ASSERT_EQ(e3, entries.front());
+ entries.pop_front();
+ ASSERT_EQ(e2, entries.front());
+
+ entries.clear();
+ entries.emplace_back(e0);
+ entries.emplace_back(e1);
+ map.remove_log_entries(entries);
+
+ found0 = map.find_map_entries(BlockExtent(0, 100));
+ numfound = found0.size();
+ ASSERT_EQ(2, numfound);
+ ASSERT_EQ(e3, found0.front().log_entry);
+ found0.pop_front();
+ ASSERT_EQ(e2, found0.front().log_entry);
+}
+
+TEST_F(TestWriteLogMap, OverlapSplit) {
+ TestLogMap map(m_cct);
+
+ auto e0 = make_shared<TestLogEntry>(0, 8);
+ map.add_log_entry(e0);
+
+ /* Splits e0 at 1 */
+ auto e1 = make_shared<TestLogEntry>(1, 1);
+ map.add_log_entry(e1);
+
+ /* Splits e0 again at 4 */
+ auto e2 = make_shared<TestLogEntry>(4, 2);
+ map.add_log_entry(e2);
+
+ /* Replaces one block of e2, and one of e0 */
+ auto e3 = make_shared<TestLogEntry>(5, 2);
+ map.add_log_entry(e3);
+
+ /* Expecting: 0:e0, 1:e1, 2..3:e0, 4:e2, 5..6:e3, 7:e0 */
+ TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100));
+ int numfound = found0.size();
+ ASSERT_EQ(6, numfound);
+ ASSERT_EQ(e0, found0.front().log_entry);
+ ASSERT_EQ(0, found0.front().block_extent.block_start);
+ ASSERT_EQ(1, found0.front().block_extent.block_end);
+ found0.pop_front();
+ ASSERT_EQ(e1, found0.front().log_entry);
+ ASSERT_EQ(1, found0.front().block_extent.block_start);
+ ASSERT_EQ(2, found0.front().block_extent.block_end);
+ found0.pop_front();
+ ASSERT_EQ(e0, found0.front().log_entry);
+ ASSERT_EQ(2, found0.front().block_extent.block_start);
+ ASSERT_EQ(4, found0.front().block_extent.block_end);
+ found0.pop_front();
+ ASSERT_EQ(e2, found0.front().log_entry);
+ ASSERT_EQ(4, found0.front().block_extent.block_start);
+ ASSERT_EQ(5, found0.front().block_extent.block_end);
+ found0.pop_front();
+ ASSERT_EQ(e3, found0.front().log_entry);
+ ASSERT_EQ(5, found0.front().block_extent.block_start);
+ ASSERT_EQ(7, found0.front().block_extent.block_end);
+ found0.pop_front();
+ ASSERT_EQ(e0, found0.front().log_entry);
+ ASSERT_EQ(7, found0.front().block_extent.block_start);
+ ASSERT_EQ(8, found0.front().block_extent.block_end);
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "test/librbd/test_fixture.h"
-#include "test/librbd/test_support.h"
-
-#include "librbd/cache/rwl/LogMap.cc"
-
-void register_test_write_log_map() {
-}
-
-namespace librbd {
-namespace cache {
-namespace rwl {
-
-struct TestLogEntry {
- uint64_t image_offset_bytes;
- uint64_t write_bytes;
- uint32_t referring_map_entries = 0;
- TestLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
- : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes) {
- }
- uint64_t get_offset_bytes() {
- return image_offset_bytes;
- }
- uint64_t get_write_bytes() {
- return write_bytes;
- }
- BlockExtent block_extent() {
- return BlockExtent(image_offset_bytes, image_offset_bytes + write_bytes);
- }
- uint32_t get_map_ref() {
- return referring_map_entries;
- }
- void inc_map_ref() {
- referring_map_entries++;
- }
- void dec_map_ref() {
- referring_map_entries--;
- }
- friend std::ostream &operator<<(std::ostream &os,
- const TestLogEntry &entry) {
- os << "referring_map_entries=" << entry.referring_map_entries << ", "
- << "image_offset_bytes=" << entry.image_offset_bytes << ", "
- << "write_bytes=" << entry.write_bytes;
- return os;
- };
-};
-
-typedef std::list<std::shared_ptr<TestLogEntry>> TestLogEntries;
-typedef LogMapEntry<TestLogEntry> TestMapEntry;
-typedef LogMapEntries<TestLogEntry> TestLogMapEntries;
-typedef LogMap<TestLogEntry> TestLogMap;
-
-class TestWriteLogMap : public TestFixture {
-public:
- void SetUp() override {
- TestFixture::SetUp();
- m_cct = reinterpret_cast<CephContext*>(m_ioctx.cct());
- }
-
- CephContext *m_cct;
-};
-
-TEST_F(TestWriteLogMap, Simple) {
- TestLogEntries es;
- TestLogMapEntries lme;
- TestLogMap map(m_cct);
-
- /* LogEntry takes offset, length, in bytes */
- auto e1 = make_shared<TestLogEntry>(4, 8);
- TestLogEntry *e1_ptr = e1.get();
- ASSERT_EQ(4, e1_ptr->get_offset_bytes());
- ASSERT_EQ(8, e1_ptr->get_write_bytes());
- map.add_log_entry(e1);
-
- /* BlockExtent takes first, last, in blocks */
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100));
- int numfound = found0.size();
- /* Written range includes the single write above */
- ASSERT_EQ(1, numfound);
- ASSERT_EQ(e1, found0.front().log_entry);
-
- /* Nothing before that */
- found0 = map.find_map_entries(BlockExtent(0, 3));
- numfound = found0.size();
- ASSERT_EQ(0, numfound);
-
- /* Nothing after that */
- found0 = map.find_map_entries(BlockExtent(12, 99));
- numfound = found0.size();
- ASSERT_EQ(0, numfound);
-
- /* 4-11 will be e1 */
- for (int i=4; i<12; i++) {
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
- int numfound = found0.size();
- ASSERT_EQ(1, numfound);
- ASSERT_EQ(e1, found0.front().log_entry);
- }
-
- map.remove_log_entry(e1);
- /* Nothing should be found */
- for (int i=4; i<12; i++) {
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
- int numfound = found0.size();
- ASSERT_EQ(0, numfound);
- }
-}
-
-TEST_F(TestWriteLogMap, OverlapFront) {
- TestLogMap map(m_cct);
-
- auto e0 = make_shared<TestLogEntry>(4, 8);
- map.add_log_entry(e0);
- /* replaces block 4-7 of e0 */
- auto e1 = make_shared<TestLogEntry>(0, 8);
- map.add_log_entry(e1);
-
- /* Written range includes the two writes above */
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100));
- int numfound = found0.size();
- ASSERT_EQ(2, numfound);
- ASSERT_EQ(e1, found0.front().log_entry);
- ASSERT_EQ(0, found0.front().block_extent.block_start);
- ASSERT_EQ(8, found0.front().block_extent.block_end);
- found0.pop_front();
- ASSERT_EQ(e0, found0.front().log_entry);
- ASSERT_EQ(8, found0.front().block_extent.block_start);
- ASSERT_EQ(12, found0.front().block_extent.block_end);
-
- /* 0-7 will be e1 */
- for (int i=0; i<8; i++) {
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
- int numfound = found0.size();
- ASSERT_EQ(1, numfound);
- ASSERT_EQ(e1, found0.front().log_entry);
- }
-
- /* 8-11 will be e0 */
- for (int i=8; i<12; i++) {
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
- int numfound = found0.size();
- ASSERT_EQ(1, numfound);
- ASSERT_EQ(e0, found0.front().log_entry);
- }
-}
-
-TEST_F(TestWriteLogMap, OverlapBack) {
- TestLogMap map(m_cct);
-
- auto e0 = make_shared<TestLogEntry>(0, 8);
- map.add_log_entry(e0);
- /* replaces block 4-7 of e0 */
- auto e1 = make_shared<TestLogEntry>(4, 8);
- map.add_log_entry(e1);
-
- /* Written range includes the two writes above */
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100));
- int numfound = found0.size();
- ASSERT_EQ(2, numfound);
- ASSERT_EQ(e0, found0.front().log_entry);
- ASSERT_EQ(0, found0.front().block_extent.block_start);
- ASSERT_EQ(4, found0.front().block_extent.block_end);
- found0.pop_front();
- ASSERT_EQ(e1, found0.front().log_entry);
- ASSERT_EQ(4, found0.front().block_extent.block_start);
- ASSERT_EQ(12, found0.front().block_extent.block_end);
-
- /* 0-3 will be e0 */
- for (int i=0; i<4; i++) {
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
- int numfound = found0.size();
- ASSERT_EQ(1, numfound);
- ASSERT_EQ(e0, found0.front().log_entry);
- }
-
- /* 4-11 will be e1 */
- for (int i=4; i<12; i++) {
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
- int numfound = found0.size();
- ASSERT_EQ(1, numfound);
- ASSERT_EQ(e1, found0.front().log_entry);
- }
-
- map.remove_log_entry(e0);
-
- /* 0-3 will find nothing */
- for (int i=0; i<4; i++) {
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
- int numfound = found0.size();
- ASSERT_EQ(0, numfound);
- }
-
- /* 4-11 will still be e1 */
- for (int i=4; i<12; i++) {
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1));
- int numfound = found0.size();
- ASSERT_EQ(1, numfound);
- ASSERT_EQ(e1, found0.front().log_entry);
- }
-
-}
-
-TEST_F(TestWriteLogMap, OverlapMiddle) {
- TestLogMap map(m_cct);
-
- auto e0 = make_shared<TestLogEntry>(0, 1);
- map.add_log_entry(e0);
-
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 1));
- int numfound = found0.size();
- ASSERT_EQ(1, numfound);
- ASSERT_EQ(e0, found0.front().log_entry);
- TestLogEntries entries = map.find_log_entries(BlockExtent(0, 1));
- int entriesfound = entries.size();
- ASSERT_EQ(1, entriesfound);
- ASSERT_EQ(e0, entries.front());
-
- auto e1 = make_shared<TestLogEntry>(1, 1);
- map.add_log_entry(e1);
-
- found0 = map.find_map_entries(BlockExtent(1, 2));
- numfound = found0.size();
- ASSERT_EQ(1, numfound);
- ASSERT_EQ(e1, found0.front().log_entry);
- entries = map.find_log_entries(BlockExtent(1, 2));
- entriesfound = entries.size();
- ASSERT_EQ(1, entriesfound);
- ASSERT_EQ(e1, entries.front());
-
- auto e2 = make_shared<TestLogEntry>(2, 1);
- map.add_log_entry(e2);
-
- found0 = map.find_map_entries(BlockExtent(2, 3));
- numfound = found0.size();
- ASSERT_EQ(1, numfound);
- ASSERT_EQ(e2, found0.front().log_entry);
- entries = map.find_log_entries(BlockExtent(2, 3));
- entriesfound = entries.size();
- ASSERT_EQ(1, entriesfound);
- ASSERT_EQ(e2, entries.front());
-
- /* replaces e1 */
- auto e3 = make_shared<TestLogEntry>(1, 1);
- map.add_log_entry(e3);
-
- found0 = map.find_map_entries(BlockExtent(1, 2));
- numfound = found0.size();
- ASSERT_EQ(1, numfound);
- ASSERT_EQ(e3, found0.front().log_entry);
- entries = map.find_log_entries(BlockExtent(1, 2));
- entriesfound = entries.size();
- ASSERT_EQ(1, entriesfound);
- ASSERT_EQ(e3, entries.front());
-
- found0 = map.find_map_entries(BlockExtent(0, 100));
- numfound = found0.size();
- ASSERT_EQ(3, numfound);
- ASSERT_EQ(e0, found0.front().log_entry);
- found0.pop_front();
- ASSERT_EQ(e3, found0.front().log_entry);
- found0.pop_front();
- ASSERT_EQ(e2, found0.front().log_entry);
- entries = map.find_log_entries(BlockExtent(0, 100));
- entriesfound = entries.size();
- ASSERT_EQ(3, entriesfound);
- ASSERT_EQ(e0, entries.front());
- entries.pop_front();
- ASSERT_EQ(e3, entries.front());
- entries.pop_front();
- ASSERT_EQ(e2, entries.front());
-
- entries.clear();
- entries.emplace_back(e0);
- entries.emplace_back(e1);
- map.remove_log_entries(entries);
-
- found0 = map.find_map_entries(BlockExtent(0, 100));
- numfound = found0.size();
- ASSERT_EQ(2, numfound);
- ASSERT_EQ(e3, found0.front().log_entry);
- found0.pop_front();
- ASSERT_EQ(e2, found0.front().log_entry);
-}
-
-TEST_F(TestWriteLogMap, OverlapSplit) {
- TestLogMap map(m_cct);
-
- auto e0 = make_shared<TestLogEntry>(0, 8);
- map.add_log_entry(e0);
-
- /* Splits e0 at 1 */
- auto e1 = make_shared<TestLogEntry>(1, 1);
- map.add_log_entry(e1);
-
- /* Splits e0 again at 4 */
- auto e2 = make_shared<TestLogEntry>(4, 2);
- map.add_log_entry(e2);
-
- /* Replaces one block of e2, and one of e0 */
- auto e3 = make_shared<TestLogEntry>(5, 2);
- map.add_log_entry(e3);
-
- /* Expecting: 0:e0, 1:e1, 2..3:e0, 4:e2, 5..6:e3, 7:e0 */
- TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100));
- int numfound = found0.size();
- ASSERT_EQ(6, numfound);
- ASSERT_EQ(e0, found0.front().log_entry);
- ASSERT_EQ(0, found0.front().block_extent.block_start);
- ASSERT_EQ(1, found0.front().block_extent.block_end);
- found0.pop_front();
- ASSERT_EQ(e1, found0.front().log_entry);
- ASSERT_EQ(1, found0.front().block_extent.block_start);
- ASSERT_EQ(2, found0.front().block_extent.block_end);
- found0.pop_front();
- ASSERT_EQ(e0, found0.front().log_entry);
- ASSERT_EQ(2, found0.front().block_extent.block_start);
- ASSERT_EQ(4, found0.front().block_extent.block_end);
- found0.pop_front();
- ASSERT_EQ(e2, found0.front().log_entry);
- ASSERT_EQ(4, found0.front().block_extent.block_start);
- ASSERT_EQ(5, found0.front().block_extent.block_end);
- found0.pop_front();
- ASSERT_EQ(e3, found0.front().log_entry);
- ASSERT_EQ(5, found0.front().block_extent.block_start);
- ASSERT_EQ(7, found0.front().block_extent.block_end);
- found0.pop_front();
- ASSERT_EQ(e0, found0.front().log_entry);
- ASSERT_EQ(7, found0.front().block_extent.block_start);
- ASSERT_EQ(8, found0.front().block_extent.block_end);
-}
-
-} // namespace rwl
-} // namespace cache
-} // namespace librbd
#include "test/librbd/test_support.h"
#include "test/librbd/mock/MockImageCtx.h"
#include "include/rbd/librbd.hpp"
-#include "librbd/cache/rwl/ImageCacheState.h"
-#include "librbd/cache/rwl/Types.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/Types.h"
#include "librbd/cache/ImageWriteback.h"
#include "librbd/cache/WriteLogCache.h"
} // namespace librbd
#include "librbd/cache/WriteLogCache.cc"
-#include "librbd/cache/AbstractWriteLog.cc"
-#include "librbd/cache/ReplicatedWriteLog.cc"
+#include "librbd/cache/pwl/AbstractWriteLog.cc"
+#include "librbd/cache/pwl/ReplicatedWriteLog.cc"
// template definitions
#include "librbd/cache/ImageWriteback.cc"
-#include "librbd/cache/rwl/ImageCacheState.cc"
-#include "librbd/cache/rwl/Request.cc"
+#include "librbd/cache/pwl/ImageCacheState.cc"
+#include "librbd/cache/pwl/Request.cc"
namespace librbd {
namespace cache {
struct TestMockCacheReplicatedWriteLog : public TestMockFixture {
typedef WriteLogCache<librbd::MockImageCtx> MockReplicatedWriteLog;
- typedef librbd::cache::rwl::ImageCacheState<librbd::MockImageCtx> MockImageCacheStateRWL;
+ typedef librbd::cache::pwl::ImageCacheState<librbd::MockImageCtx> MockImageCacheStateRWL;
MockImageCacheStateRWL *get_cache_state(MockImageCtx& mock_image_ctx) {
MockImageCacheStateRWL *rwl_state = new MockImageCacheStateRWL(&mock_image_ctx);
#include "test/librbd/mock/MockObjectMap.h"
#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
#include "test/librados_test_stub/MockTestMemRadosClient.h"
-#include "librbd/cache/rwl/InitRequest.h"
-#include "librbd/cache/rwl/ShutdownRequest.h"
+#include "librbd/cache/pwl/InitRequest.h"
+#include "librbd/cache/pwl/ShutdownRequest.h"
#include "librbd/exclusive_lock/PostAcquireRequest.h"
#include "librbd/image/RefreshRequest.h"
} // namespace image
namespace cache {
-namespace rwl {
+namespace pwl {
template<>
struct InitRequest<librbd::MockTestImageCtx> {
ShutdownRequest<librbd::MockTestImageCtx> *ShutdownRequest<librbd::MockTestImageCtx>::s_instance = nullptr;
-} // namespace rwl
+} // namespace pwl
} // namespace cache
} // namespace librbd
public:
typedef PostAcquireRequest<MockTestImageCtx> MockPostAcquireRequest;
typedef librbd::image::RefreshRequest<MockTestImageCtx> MockRefreshRequest;
- typedef librbd::cache::rwl::InitRequest<MockTestImageCtx> MockInitRequest;
- typedef librbd::cache::rwl::ShutdownRequest<MockTestImageCtx> MockShutdownRequest;
+ typedef librbd::cache::pwl::InitRequest<MockTestImageCtx> MockInitRequest;
+ typedef librbd::cache::pwl::ShutdownRequest<MockTestImageCtx> MockShutdownRequest;
void expect_test_features(MockTestImageCtx &mock_image_ctx, uint64_t features,
bool enabled) {
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#include "librbd/cache/rwl/ShutdownRequest.h"
+#include "librbd/cache/pwl/ShutdownRequest.h"
#include "test/librbd/test_mock_fixture.h"
#include "test/librbd/test_support.h"
#include "test/librbd/mock/cache/MockImageCache.h"
} // namespace exclusive_lock
namespace cache {
-namespace rwl {
+namespace pwl {
template<>
struct ShutdownRequest<librbd::MockTestImageCtx> {
static ShutdownRequest *s_instance;
ShutdownRequest<librbd::MockTestImageCtx> *ShutdownRequest<librbd::MockTestImageCtx>::s_instance = nullptr;
-} // namespace rwl
+} // namespace pwl
} // namespace cache
} // namespace librbd
public:
typedef ImageDispatch<MockTestImageCtx> MockImageDispatch;
typedef PreReleaseRequest<MockTestImageCtx> MockPreReleaseRequest;
- typedef librbd::cache::rwl::ShutdownRequest<MockTestImageCtx> MockShutdownRequest;
+ typedef librbd::cache::pwl::ShutdownRequest<MockTestImageCtx> MockShutdownRequest;
void expect_complete_context(MockContext &mock_context, int r) {
EXPECT_CALL(mock_context, complete(r));