From: Mahati Chamarthy
Date: Wed, 2 Sep 2020 09:53:06 +0000 (+0530)
Subject: librbd/cache: Rename namespaces and move files
X-Git-Tag: v16.1.0~1069^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F36586%2Fhead;p=ceph.git

librbd/cache: Rename namespaces and move files

Signed-off-by: Lisa Li
Signed-off-by: Mahati Chamarthy
Signed-off-by: Changcheng Liu
---

diff --git a/src/common/subsys.h b/src/common/subsys.h
index e61b478bfae95..d01abd0f38b85 100644
--- a/src/common/subsys.h
+++ b/src/common/subsys.h
@@ -37,7 +37,7 @@ SUBSYS(rados, 0, 5)
 SUBSYS(rbd, 0, 5)
 SUBSYS(rbd_mirror, 0, 5)
 SUBSYS(rbd_replay, 0, 5)
-SUBSYS(rbd_rwl, 0, 5)
+SUBSYS(rbd_pwl, 0, 5)
 SUBSYS(journaler, 0, 5)
 SUBSYS(objectcacher, 0, 5)
 SUBSYS(immutable_obj_cache, 0, 5)
diff --git a/src/librbd/CMakeLists.txt b/src/librbd/CMakeLists.txt
index 7540032522339..06e37bd39590a 100644
--- a/src/librbd/CMakeLists.txt
+++ b/src/librbd/CMakeLists.txt
@@ -43,8 +43,8 @@ set(librbd_internal_srcs
   cache/ObjectCacherObjectDispatch.cc
   cache/ObjectCacherWriteback.cc
   cache/PassthroughImageCache.cc
-  cache/rwl/InitRequest.cc
-  cache/rwl/ShutdownRequest.cc
+  cache/pwl/InitRequest.cc
+  cache/pwl/ShutdownRequest.cc
   cache/WriteAroundObjectDispatch.cc
   crypto/CryptoObjectDispatch.cc
   deep_copy/ImageCopyRequest.cc
@@ -196,16 +196,16 @@ endif()
 
 if(WITH_RBD_RWL)
   set(librbd_internal_srcs
     ${librbd_internal_srcs}
-    cache/rwl/ImageCacheState.cc
-    cache/rwl/LogEntry.cc
-    cache/rwl/LogMap.cc
-    cache/rwl/LogOperation.cc
-    cache/rwl/ReadRequest.cc
-    cache/rwl/Request.cc
-    cache/rwl/SyncPoint.cc
-    cache/rwl/Types.cc
-    cache/ReplicatedWriteLog.cc
-    cache/AbstractWriteLog.cc
+    cache/pwl/ImageCacheState.cc
+    cache/pwl/LogEntry.cc
+    cache/pwl/LogMap.cc
+    cache/pwl/LogOperation.cc
+    cache/pwl/ReadRequest.cc
+    cache/pwl/Request.cc
+    cache/pwl/SyncPoint.cc
+    cache/pwl/Types.cc
+    cache/pwl/ReplicatedWriteLog.cc
+    cache/pwl/AbstractWriteLog.cc
     cache/WriteLogCache.cc)
 endif()
diff --git a/src/librbd/ExclusiveLock.cc b/src/librbd/ExclusiveLock.cc
index 00496c765b5f5..c5963b07c036f 100644
--- a/src/librbd/ExclusiveLock.cc
+++ b/src/librbd/ExclusiveLock.cc
@@ -205,7 +205,7 @@ void ExclusiveLock::handle_init_complete(int r, uint64_t features,
     on_finish->complete(r);
   });
 
-  bool rwl_enabled = cache::util::is_rwl_enabled(m_image_ctx);
+  bool rwl_enabled = cache::util::is_pwl_enabled(m_image_ctx);
   if (m_image_ctx.clone_copy_on_read ||
       (features & RBD_FEATURE_JOURNALING) != 0 ||
       rwl_enabled) {
diff --git a/src/librbd/cache/AbstractWriteLog.cc b/src/librbd/cache/AbstractWriteLog.cc
deleted file mode 100644
index 7b4ffed2e1aa4..0000000000000
--- a/src/librbd/cache/AbstractWriteLog.cc
+++ /dev/null
@@ -1,2769 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include 
-#include "AbstractWriteLog.h"
-#include "include/buffer.h"
-#include "include/Context.h"
-#include "include/ceph_assert.h"
-#include "common/deleter.h"
-#include "common/dout.h"
-#include "common/environment.h"
-#include "common/errno.h"
-#include "common/WorkQueue.h"
-#include "common/Timer.h"
-#include "common/perf_counters.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/asio/ContextWQ.h"
-#include "librbd/cache/rwl/ImageCacheState.h"
-#include "librbd/cache/rwl/LogEntry.h"
-#include "librbd/cache/rwl/ReadRequest.h"
-#include "librbd/cache/rwl/Types.h"
-#include 
-#include 
-
-#undef dout_subsys
-#define dout_subsys ceph_subsys_rbd_rwl
-#undef dout_prefix
-#define dout_prefix *_dout << 
"librbd::cache::AbstractWriteLog: " << this << " " \ - << __func__ << ": " - -namespace librbd { -namespace cache { - -using namespace librbd::cache::rwl; - -typedef AbstractWriteLog::Extent Extent; -typedef AbstractWriteLog::Extents Extents; - -const unsigned long int OPS_APPENDED_TOGETHER = MAX_ALLOC_PER_TRANSACTION; - -template -AbstractWriteLog::AbstractWriteLog(I &image_ctx, librbd::cache::rwl::ImageCacheState* cache_state) - : m_cache_state(cache_state), - m_rwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_rwl)), - m_image_ctx(image_ctx), - m_log_pool_config_size(DEFAULT_POOL_SIZE), - m_image_writeback(image_ctx), m_write_log_guard(image_ctx.cct), - m_log_retire_lock(ceph::make_mutex(util::unique_lock_name( - "librbd::cache::AbstractWriteLog::m_log_retire_lock", this))), - m_entry_reader_lock("librbd::cache::AbstractWriteLog::m_entry_reader_lock"), - m_deferred_dispatch_lock(ceph::make_mutex(util::unique_lock_name( - "librbd::cache::AbstractWriteLog::m_deferred_dispatch_lock", this))), - m_log_append_lock(ceph::make_mutex(util::unique_lock_name( - "librbd::cache::AbstractWriteLog::m_log_append_lock", this))), - m_lock(ceph::make_mutex(util::unique_lock_name( - "librbd::cache::AbstractWriteLog::m_lock", this))), - m_blockguard_lock(ceph::make_mutex(util::unique_lock_name( - "librbd::cache::AbstractWriteLog::m_blockguard_lock", this))), - m_blocks_to_log_entries(image_ctx.cct), - m_thread_pool(image_ctx.cct, "librbd::cache::AbstractWriteLog::thread_pool", "tp_rwl", - 4, - ""), - m_work_queue("librbd::cache::ReplicatedWriteLog::work_queue", - ceph::make_timespan( - image_ctx.config.template get_val( - "rbd_op_thread_timeout")), - &m_thread_pool) -{ - CephContext *cct = m_image_ctx.cct; - ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock); -} - -template -AbstractWriteLog::~AbstractWriteLog() { - ldout(m_image_ctx.cct, 15) << "enter" << dendl; - { - std::lock_guard timer_locker(*m_timer_lock); - std::lock_guard locker(m_lock); - m_timer->cancel_event(m_timer_ctx); - m_thread_pool.stop(); - ceph_assert(m_deferred_ios.size() == 0); - ceph_assert(m_ops_to_flush.size() == 0); - ceph_assert(m_ops_to_append.size() == 0); - ceph_assert(m_flush_ops_in_flight == 0); - - m_log_pool = nullptr; - delete m_cache_state; - m_cache_state = nullptr; - } - ldout(m_image_ctx.cct, 15) << "exit" << dendl; -} - -template -void AbstractWriteLog::perf_start(std::string name) { - PerfCountersBuilder plb(m_image_ctx.cct, name, l_librbd_rwl_first, l_librbd_rwl_last); - - // Latency axis configuration for op histograms, values are in nanoseconds - PerfHistogramCommon::axis_config_d op_hist_x_axis_config{ - "Latency (nsec)", - PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale - 0, ///< Start at 0 - 5000, ///< Quantization unit is 5usec - 16, ///< Ranges into the mS - }; - - // Syncpoint logentry number x-axis configuration for op histograms - PerfHistogramCommon::axis_config_d sp_logentry_number_config{ - "logentry number", - PerfHistogramCommon::SCALE_LINEAR, // log entry number in linear scale - 0, // Start at 0 - 1, // Quantization unit is 1 - 260, // Up to 260 > (MAX_WRITES_PER_SYNC_POINT) - }; - - // Syncpoint bytes number y-axis configuration for op histogram - PerfHistogramCommon::axis_config_d sp_bytes_number_config{ - "Number of SyncPoint", - PerfHistogramCommon::SCALE_LOG2, // Request size in logarithmic scale - 0, // Start at 0 - 512, // Quantization unit is 512 - 17, // Writes up to 8M >= MAX_BYTES_PER_SYNC_POINT - }; - - // Op size axis configuration for op histogram y axis, values are 
in bytes - PerfHistogramCommon::axis_config_d op_hist_y_axis_config{ - "Request size (bytes)", - PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale - 0, ///< Start at 0 - 512, ///< Quantization unit is 512 bytes - 16, ///< Writes up to >32k - }; - - // Num items configuration for op histogram y axis, values are in items - PerfHistogramCommon::axis_config_d op_hist_y_axis_count_config{ - "Number of items", - PerfHistogramCommon::SCALE_LINEAR, ///< Request size in linear scale - 0, ///< Start at 0 - 1, ///< Quantization unit is 1 - 32, ///< Writes up to >32k - }; - - plb.add_u64_counter(l_librbd_rwl_rd_req, "rd", "Reads"); - plb.add_u64_counter(l_librbd_rwl_rd_bytes, "rd_bytes", "Data size in reads"); - plb.add_time_avg(l_librbd_rwl_rd_latency, "rd_latency", "Latency of reads"); - - plb.add_u64_counter(l_librbd_rwl_rd_hit_req, "hit_rd", "Reads completely hitting RWL"); - plb.add_u64_counter(l_librbd_rwl_rd_hit_bytes, "rd_hit_bytes", "Bytes read from RWL"); - plb.add_time_avg(l_librbd_rwl_rd_hit_latency, "hit_rd_latency", "Latency of read hits"); - - plb.add_u64_counter(l_librbd_rwl_rd_part_hit_req, "part_hit_rd", "reads partially hitting RWL"); - - plb.add_u64_counter_histogram( - l_librbd_rwl_syncpoint_hist, "syncpoint_logentry_bytes_histogram", - sp_logentry_number_config, sp_bytes_number_config, - "Histogram of syncpoint's logentry numbers vs bytes number"); - - plb.add_u64_counter(l_librbd_rwl_wr_req, "wr", "Writes"); - plb.add_u64_counter(l_librbd_rwl_wr_req_def, "wr_def", "Writes deferred for resources"); - plb.add_u64_counter(l_librbd_rwl_wr_req_def_lanes, "wr_def_lanes", "Writes deferred for lanes"); - plb.add_u64_counter(l_librbd_rwl_wr_req_def_log, "wr_def_log", "Writes deferred for log entries"); - plb.add_u64_counter(l_librbd_rwl_wr_req_def_buf, "wr_def_buf", "Writes deferred for buffers"); - plb.add_u64_counter(l_librbd_rwl_wr_req_overlap, "wr_overlap", "Writes overlapping with prior in-progress writes"); - plb.add_u64_counter(l_librbd_rwl_wr_req_queued, "wr_q_barrier", "Writes queued for prior barriers (aio_flush)"); - plb.add_u64_counter(l_librbd_rwl_wr_bytes, "wr_bytes", "Data size in writes"); - - plb.add_u64_counter(l_librbd_rwl_log_ops, "log_ops", "Log appends"); - plb.add_u64_avg(l_librbd_rwl_log_op_bytes, "log_op_bytes", "Average log append bytes"); - - plb.add_time_avg( - l_librbd_rwl_req_arr_to_all_t, "req_arr_to_all_t", - "Average arrival to allocation time (time deferred for overlap)"); - plb.add_time_avg( - l_librbd_rwl_req_arr_to_dis_t, "req_arr_to_dis_t", - "Average arrival to dispatch time (includes time deferred for overlaps and allocation)"); - plb.add_time_avg( - l_librbd_rwl_req_all_to_dis_t, "req_all_to_dis_t", - "Average allocation to dispatch time (time deferred for log resources)"); - plb.add_time_avg( - l_librbd_rwl_wr_latency, "wr_latency", - "Latency of writes (persistent completion)"); - plb.add_u64_counter_histogram( - l_librbd_rwl_wr_latency_hist, "wr_latency_bytes_histogram", - op_hist_x_axis_config, op_hist_y_axis_config, - "Histogram of write request latency (nanoseconds) vs. 
bytes written"); - plb.add_time_avg( - l_librbd_rwl_wr_caller_latency, "caller_wr_latency", - "Latency of write completion to caller"); - plb.add_time_avg( - l_librbd_rwl_nowait_req_arr_to_all_t, "req_arr_to_all_nw_t", - "Average arrival to allocation time (time deferred for overlap)"); - plb.add_time_avg( - l_librbd_rwl_nowait_req_arr_to_dis_t, "req_arr_to_dis_nw_t", - "Average arrival to dispatch time (includes time deferred for overlaps and allocation)"); - plb.add_time_avg( - l_librbd_rwl_nowait_req_all_to_dis_t, "req_all_to_dis_nw_t", - "Average allocation to dispatch time (time deferred for log resources)"); - plb.add_time_avg( - l_librbd_rwl_nowait_wr_latency, "wr_latency_nw", - "Latency of writes (persistent completion) not deferred for free space"); - plb.add_u64_counter_histogram( - l_librbd_rwl_nowait_wr_latency_hist, "wr_latency_nw_bytes_histogram", - op_hist_x_axis_config, op_hist_y_axis_config, - "Histogram of write request latency (nanoseconds) vs. bytes written for writes not deferred for free space"); - plb.add_time_avg( - l_librbd_rwl_nowait_wr_caller_latency, "caller_wr_latency_nw", - "Latency of write completion to callerfor writes not deferred for free space"); - plb.add_time_avg(l_librbd_rwl_log_op_alloc_t, "op_alloc_t", "Average buffer pmemobj_reserve() time"); - plb.add_u64_counter_histogram( - l_librbd_rwl_log_op_alloc_t_hist, "op_alloc_t_bytes_histogram", - op_hist_x_axis_config, op_hist_y_axis_config, - "Histogram of buffer pmemobj_reserve() time (nanoseconds) vs. bytes written"); - plb.add_time_avg(l_librbd_rwl_log_op_dis_to_buf_t, "op_dis_to_buf_t", "Average dispatch to buffer persist time"); - plb.add_time_avg(l_librbd_rwl_log_op_dis_to_app_t, "op_dis_to_app_t", "Average dispatch to log append time"); - plb.add_time_avg(l_librbd_rwl_log_op_dis_to_cmp_t, "op_dis_to_cmp_t", "Average dispatch to persist completion time"); - plb.add_u64_counter_histogram( - l_librbd_rwl_log_op_dis_to_cmp_t_hist, "op_dis_to_cmp_t_bytes_histogram", - op_hist_x_axis_config, op_hist_y_axis_config, - "Histogram of op dispatch to persist complete time (nanoseconds) vs. bytes written"); - - plb.add_time_avg( - l_librbd_rwl_log_op_buf_to_app_t, "op_buf_to_app_t", - "Average buffer persist to log append time (write data persist/replicate + wait for append time)"); - plb.add_time_avg( - l_librbd_rwl_log_op_buf_to_bufc_t, "op_buf_to_bufc_t", - "Average buffer persist time (write data persist/replicate time)"); - plb.add_u64_counter_histogram( - l_librbd_rwl_log_op_buf_to_bufc_t_hist, "op_buf_to_bufc_t_bytes_histogram", - op_hist_x_axis_config, op_hist_y_axis_config, - "Histogram of write buffer persist time (nanoseconds) vs. bytes written"); - plb.add_time_avg( - l_librbd_rwl_log_op_app_to_cmp_t, "op_app_to_cmp_t", - "Average log append to persist complete time (log entry append/replicate + wait for complete time)"); - plb.add_time_avg( - l_librbd_rwl_log_op_app_to_appc_t, "op_app_to_appc_t", - "Average log append to persist complete time (log entry append/replicate time)"); - plb.add_u64_counter_histogram( - l_librbd_rwl_log_op_app_to_appc_t_hist, "op_app_to_appc_t_bytes_histogram", - op_hist_x_axis_config, op_hist_y_axis_config, - "Histogram of log append persist time (nanoseconds) (vs. 
op bytes)"); - - plb.add_u64_counter(l_librbd_rwl_discard, "discard", "Discards"); - plb.add_u64_counter(l_librbd_rwl_discard_bytes, "discard_bytes", "Bytes discarded"); - plb.add_time_avg(l_librbd_rwl_discard_latency, "discard_lat", "Discard latency"); - - plb.add_u64_counter(l_librbd_rwl_aio_flush, "aio_flush", "AIO flush (flush to RWL)"); - plb.add_u64_counter(l_librbd_rwl_aio_flush_def, "aio_flush_def", "AIO flushes deferred for resources"); - plb.add_time_avg(l_librbd_rwl_aio_flush_latency, "aio_flush_lat", "AIO flush latency"); - - plb.add_u64_counter(l_librbd_rwl_ws,"ws", "Write Sames"); - plb.add_u64_counter(l_librbd_rwl_ws_bytes, "ws_bytes", "Write Same bytes to image"); - plb.add_time_avg(l_librbd_rwl_ws_latency, "ws_lat", "Write Same latency"); - - plb.add_u64_counter(l_librbd_rwl_cmp, "cmp", "Compare and Write requests"); - plb.add_u64_counter(l_librbd_rwl_cmp_bytes, "cmp_bytes", "Compare and Write bytes compared/written"); - plb.add_time_avg(l_librbd_rwl_cmp_latency, "cmp_lat", "Compare and Write latecy"); - plb.add_u64_counter(l_librbd_rwl_cmp_fails, "cmp_fails", "Compare and Write compare fails"); - - plb.add_u64_counter(l_librbd_rwl_flush, "flush", "Flush (flush RWL)"); - plb.add_u64_counter(l_librbd_rwl_invalidate_cache, "invalidate", "Invalidate RWL"); - plb.add_u64_counter(l_librbd_rwl_invalidate_discard_cache, "discard", "Discard and invalidate RWL"); - - plb.add_time_avg(l_librbd_rwl_append_tx_t, "append_tx_lat", "Log append transaction latency"); - plb.add_u64_counter_histogram( - l_librbd_rwl_append_tx_t_hist, "append_tx_lat_histogram", - op_hist_x_axis_config, op_hist_y_axis_count_config, - "Histogram of log append transaction time (nanoseconds) vs. entries appended"); - plb.add_time_avg(l_librbd_rwl_retire_tx_t, "retire_tx_lat", "Log retire transaction latency"); - plb.add_u64_counter_histogram( - l_librbd_rwl_retire_tx_t_hist, "retire_tx_lat_histogram", - op_hist_x_axis_config, op_hist_y_axis_count_config, - "Histogram of log retire transaction time (nanoseconds) vs. 
entries retired"); - - m_perfcounter = plb.create_perf_counters(); - m_image_ctx.cct->get_perfcounters_collection()->add(m_perfcounter); -} - -template -void AbstractWriteLog::perf_stop() { - ceph_assert(m_perfcounter); - m_image_ctx.cct->get_perfcounters_collection()->remove(m_perfcounter); - delete m_perfcounter; -} - -template -void AbstractWriteLog::log_perf() { - bufferlist bl; - Formatter *f = Formatter::create("json-pretty"); - bl.append("Perf dump follows\n--- Begin perf dump ---\n"); - bl.append("{\n"); - stringstream ss; - utime_t now = ceph_clock_now(); - ss << "\"test_time\": \"" << now << "\","; - ss << "\"image\": \"" << m_image_ctx.name << "\","; - bl.append(ss); - bl.append("\"stats\": "); - m_image_ctx.cct->get_perfcounters_collection()->dump_formatted(f, 0); - f->flush(bl); - bl.append(",\n\"histograms\": "); - m_image_ctx.cct->get_perfcounters_collection()->dump_formatted_histograms(f, 0); - f->flush(bl); - delete f; - bl.append("}\n--- End perf dump ---\n"); - bl.append('\0'); - ldout(m_image_ctx.cct, 1) << bl.c_str() << dendl; -} - -template -void AbstractWriteLog::periodic_stats() { - std::lock_guard locker(m_lock); - ldout(m_image_ctx.cct, 1) << "STATS: " - << "m_free_log_entries=" << m_free_log_entries << ", " - << "m_log_entries=" << m_log_entries.size() << ", " - << "m_dirty_log_entries=" << m_dirty_log_entries.size() << ", " - << "m_bytes_allocated=" << m_bytes_allocated << ", " - << "m_bytes_cached=" << m_bytes_cached << ", " - << "m_bytes_dirty=" << m_bytes_dirty << ", " - << "bytes available=" << m_bytes_allocated_cap - m_bytes_allocated << ", " - << "m_current_sync_gen=" << m_current_sync_gen << ", " - << "m_flushed_sync_gen=" << m_flushed_sync_gen << ", " - << dendl; -} - -template -void AbstractWriteLog::arm_periodic_stats() { - ceph_assert(ceph_mutex_is_locked(*m_timer_lock)); - if (m_periodic_stats_enabled) { - m_timer_ctx = new LambdaContext( - [this](int r) { - /* m_timer_lock is held */ - periodic_stats(); - arm_periodic_stats(); - }); - m_timer->add_event_after(LOG_STATS_INTERVAL_SECONDS, m_timer_ctx); - } -} - -/* - * Loads the log entries from an existing log. - * - * Creates the in-memory structures to represent the state of the - * re-opened log. - * - * Finds the last appended sync point, and any sync points referred to - * in log entries, but missing from the log. These missing sync points - * are created and scheduled for append. Some rudimentary consistency - * checking is done. - * - * Rebuilds the m_blocks_to_log_entries map, to make log entries - * readable. - * - * Places all writes on the dirty entries list, which causes them all - * to be flushed. - * - */ -template -void AbstractWriteLog::load_existing_entries(DeferredContexts &later) { - TOID(struct WriteLogPoolRoot) pool_root; - pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); - struct WriteLogPmemEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries); - uint64_t entry_index = m_first_valid_entry; - /* The map below allows us to find sync point log entries by sync - * gen number, which is necessary so write entries can be linked to - * their sync points. */ - std::map> sync_point_entries; - /* The map below tracks sync points referred to in writes but not - * appearing in the sync_point_entries map. We'll use this to - * determine which sync points are missing and need to be - * created. */ - std::map missing_sync_points; - - /* - * Read the existing log entries. Construct an in-memory log entry - * object of the appropriate type for each. 
Add these to the global - * log entries list. - * - * Write entries will not link to their sync points yet. We'll do - * that in the next pass. Here we'll accumulate a map of sync point - * gen numbers that are referred to in writes but do not appearing in - * the log. - */ - while (entry_index != m_first_free_entry) { - WriteLogPmemEntry *pmem_entry = &pmem_log_entries[entry_index]; - std::shared_ptr log_entry = nullptr; - bool writer = pmem_entry->is_writer(); - - ceph_assert(pmem_entry->entry_index == entry_index); - if (pmem_entry->is_sync_point()) { - ldout(m_image_ctx.cct, 20) << "Entry " << entry_index - << " is a sync point. pmem_entry=[" << *pmem_entry << "]" << dendl; - auto sync_point_entry = std::make_shared(pmem_entry->sync_gen_number); - log_entry = sync_point_entry; - sync_point_entries[pmem_entry->sync_gen_number] = sync_point_entry; - missing_sync_points.erase(pmem_entry->sync_gen_number); - m_current_sync_gen = pmem_entry->sync_gen_number; - } else if (pmem_entry->is_write()) { - ldout(m_image_ctx.cct, 20) << "Entry " << entry_index - << " is a write. pmem_entry=[" << *pmem_entry << "]" << dendl; - auto write_entry = - std::make_shared(nullptr, pmem_entry->image_offset_bytes, pmem_entry->write_bytes); - write_entry->pmem_buffer = D_RW(pmem_entry->write_data); - log_entry = write_entry; - } else if (pmem_entry->is_writesame()) { - ldout(m_image_ctx.cct, 20) << "Entry " << entry_index - << " is a write same. pmem_entry=[" << *pmem_entry << "]" << dendl; - auto ws_entry = - std::make_shared(nullptr, pmem_entry->image_offset_bytes, - pmem_entry->write_bytes, pmem_entry->ws_datalen); - ws_entry->pmem_buffer = D_RW(pmem_entry->write_data); - log_entry = ws_entry; - } else if (pmem_entry->is_discard()) { - ldout(m_image_ctx.cct, 20) << "Entry " << entry_index - << " is a discard. pmem_entry=[" << *pmem_entry << "]" << dendl; - auto discard_entry = - std::make_shared(nullptr, pmem_entry->image_offset_bytes, pmem_entry->write_bytes, - m_discard_granularity_bytes); - log_entry = discard_entry; - } else { - lderr(m_image_ctx.cct) << "Unexpected entry type in entry " << entry_index - << ", pmem_entry=[" << *pmem_entry << "]" << dendl; - } - - if (writer) { - ldout(m_image_ctx.cct, 20) << "Entry " << entry_index - << " writes. pmem_entry=[" << *pmem_entry << "]" << dendl; - if (!sync_point_entries[pmem_entry->sync_gen_number]) { - missing_sync_points[pmem_entry->sync_gen_number] = true; - } - } - - log_entry->ram_entry = *pmem_entry; - log_entry->pmem_entry = pmem_entry; - log_entry->log_entry_index = entry_index; - log_entry->completed = true; - - m_log_entries.push_back(log_entry); - - entry_index = (entry_index + 1) % m_total_log_entries; - } - - /* Create missing sync points. These must not be appended until the - * entry reload is complete and the write map is up to - * date. Currently this is handled by the deferred contexts object - * passed to new_sync_point(). These contexts won't be completed - * until this function returns. */ - for (auto &kv : missing_sync_points) { - ldout(m_image_ctx.cct, 5) << "Adding sync point " << kv.first << dendl; - if (0 == m_current_sync_gen) { - /* The unlikely case where the log contains writing entries, but no sync - * points (e.g. 
because they were all retired) */ - m_current_sync_gen = kv.first-1; - } - ceph_assert(kv.first == m_current_sync_gen+1); - init_flush_new_sync_point(later); - ceph_assert(kv.first == m_current_sync_gen); - sync_point_entries[kv.first] = m_current_sync_point->log_entry;; - } - - /* - * Iterate over the log entries again (this time via the global - * entries list), connecting write entries to their sync points and - * updating the sync point stats. - * - * Add writes to the write log map. - */ - std::shared_ptr previous_sync_point_entry = nullptr; - for (auto &log_entry : m_log_entries) { - if ((log_entry->write_bytes() > 0) || (log_entry->bytes_dirty() > 0)) { - /* This entry is one of the types that write */ - auto gen_write_entry = static_pointer_cast(log_entry); - if (gen_write_entry) { - auto sync_point_entry = sync_point_entries[gen_write_entry->ram_entry.sync_gen_number]; - if (!sync_point_entry) { - lderr(m_image_ctx.cct) << "Sync point missing for entry=[" << *gen_write_entry << "]" << dendl; - ceph_assert(false); - } else { - gen_write_entry->sync_point_entry = sync_point_entry; - sync_point_entry->writes++; - sync_point_entry->bytes += gen_write_entry->ram_entry.write_bytes; - sync_point_entry->writes_completed++; - m_blocks_to_log_entries.add_log_entry(gen_write_entry); - /* This entry is only dirty if its sync gen number is > the flushed - * sync gen number from the root object. */ - if (gen_write_entry->ram_entry.sync_gen_number > m_flushed_sync_gen) { - m_dirty_log_entries.push_back(log_entry); - m_bytes_dirty += gen_write_entry->bytes_dirty(); - } else { - gen_write_entry->set_flushed(true); - sync_point_entry->writes_flushed++; - } - if (log_entry->write_bytes() == log_entry->bytes_dirty()) { - /* This entry is a basic write */ - uint64_t bytes_allocated = MIN_WRITE_ALLOC_SIZE; - if (gen_write_entry->ram_entry.write_bytes > bytes_allocated) { - bytes_allocated = gen_write_entry->ram_entry.write_bytes; - } - m_bytes_allocated += bytes_allocated; - m_bytes_cached += gen_write_entry->ram_entry.write_bytes; - } - } - } - } else { - /* This entry is sync point entry */ - auto sync_point_entry = static_pointer_cast(log_entry); - if (sync_point_entry) { - if (previous_sync_point_entry) { - previous_sync_point_entry->next_sync_point_entry = sync_point_entry; - if (previous_sync_point_entry->ram_entry.sync_gen_number > m_flushed_sync_gen) { - sync_point_entry->prior_sync_point_flushed = false; - ceph_assert(!previous_sync_point_entry->prior_sync_point_flushed || - (0 == previous_sync_point_entry->writes) || - (previous_sync_point_entry->writes >= previous_sync_point_entry->writes_flushed)); - } else { - sync_point_entry->prior_sync_point_flushed = true; - ceph_assert(previous_sync_point_entry->prior_sync_point_flushed); - ceph_assert(previous_sync_point_entry->writes == previous_sync_point_entry->writes_flushed); - } - previous_sync_point_entry = sync_point_entry; - } else { - /* There are no previous sync points, so we'll consider them flushed */ - sync_point_entry->prior_sync_point_flushed = true; - } - ldout(m_image_ctx.cct, 10) << "Loaded to sync point=[" << *sync_point_entry << dendl; - } - } - } - if (0 == m_current_sync_gen) { - /* If a re-opened log was completely flushed, we'll have found no sync point entries here, - * and not advanced m_current_sync_gen. Here we ensure it starts past the last flushed sync - * point recorded in the log. 
*/ - m_current_sync_gen = m_flushed_sync_gen; - } -} - -template -void AbstractWriteLog::rwl_init(Context *on_finish, DeferredContexts &later) { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 20) << dendl; - TOID(struct WriteLogPoolRoot) pool_root; - ceph_assert(m_cache_state); - std::lock_guard locker(m_lock); - ceph_assert(!m_initialized); - ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl; - ldout(cct,5) << "rwl_size: " << m_cache_state->size << dendl; - std::string rwl_path = m_cache_state->path; - ldout(cct,5) << "rwl_path: " << rwl_path << dendl; - - std::string pool_name = m_image_ctx.md_ctx.get_pool_name(); - std::string log_pool_name = rwl_path + "/rbd-rwl." + pool_name + "." + m_image_ctx.id + ".pool"; - std::string log_poolset_name = rwl_path + "/rbd-rwl." + pool_name + "." + m_image_ctx.id + ".poolset"; - m_log_pool_config_size = max(m_cache_state->size, MIN_POOL_SIZE); - - if (access(log_poolset_name.c_str(), F_OK) == 0) { - m_log_pool_name = log_poolset_name; - m_log_is_poolset = true; - } else { - m_log_pool_name = log_pool_name; - ldout(cct, 5) << "Poolset file " << log_poolset_name - << " not present (or can't open). Using unreplicated pool" << dendl; - } - - if ((!m_cache_state->present) && - (access(m_log_pool_name.c_str(), F_OK) == 0)) { - ldout(cct, 5) << "There's an existing pool/poolset file " << m_log_pool_name - << ", While there's no cache in the image metatata." << dendl; - if (remove(m_log_pool_name.c_str()) != 0) { - lderr(cct) << "Failed to remove the pool/poolset file " << m_log_pool_name - << dendl; - on_finish->complete(-errno); - return; - } else { - ldout(cct, 5) << "Removed the existing pool/poolset file." << dendl; - } - } - - if (access(m_log_pool_name.c_str(), F_OK) != 0) { - if ((m_log_pool = - pmemobj_create(m_log_pool_name.c_str(), - m_rwl_pool_layout_name, - m_log_pool_config_size, - (S_IWUSR | S_IRUSR))) == NULL) { - lderr(cct) << "failed to create pool (" << m_log_pool_name << ")" - << pmemobj_errormsg() << dendl; - m_cache_state->present = false; - m_cache_state->clean = true; - m_cache_state->empty = true; - /* TODO: filter/replace errnos that are meaningless to the caller */ - on_finish->complete(-errno); - return; - } - m_cache_state->present = true; - m_cache_state->clean = true; - m_cache_state->empty = true; - pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); - - /* new pool, calculate and store metadata */ - size_t effective_pool_size = (size_t)(m_log_pool_config_size * USABLE_SIZE); - size_t small_write_size = MIN_WRITE_ALLOC_SIZE + BLOCK_ALLOC_OVERHEAD_BYTES + sizeof(struct WriteLogPmemEntry); - uint64_t num_small_writes = (uint64_t)(effective_pool_size / small_write_size); - if (num_small_writes > MAX_LOG_ENTRIES) { - num_small_writes = MAX_LOG_ENTRIES; - } - if (num_small_writes <= 2) { - lderr(cct) << "num_small_writes needs to > 2" << dendl; - on_finish->complete(-EINVAL); - return; - } - m_log_pool_actual_size = m_log_pool_config_size; - m_bytes_allocated_cap = effective_pool_size; - /* Log ring empty */ - m_first_free_entry = 0; - m_first_valid_entry = 0; - TX_BEGIN(m_log_pool) { - TX_ADD(pool_root); - D_RW(pool_root)->header.layout_version = RWL_POOL_VERSION; - D_RW(pool_root)->log_entries = - TX_ZALLOC(struct WriteLogPmemEntry, - sizeof(struct WriteLogPmemEntry) * num_small_writes); - D_RW(pool_root)->pool_size = m_log_pool_actual_size; - D_RW(pool_root)->flushed_sync_gen = m_flushed_sync_gen; - D_RW(pool_root)->block_size = MIN_WRITE_ALLOC_SIZE; - 
D_RW(pool_root)->num_log_entries = num_small_writes; - D_RW(pool_root)->first_free_entry = m_first_free_entry; - D_RW(pool_root)->first_valid_entry = m_first_valid_entry; - } TX_ONCOMMIT { - m_total_log_entries = D_RO(pool_root)->num_log_entries; - m_free_log_entries = D_RO(pool_root)->num_log_entries - 1; // leave one free - } TX_ONABORT { - m_total_log_entries = 0; - m_free_log_entries = 0; - lderr(cct) << "failed to initialize pool (" << m_log_pool_name << ")" << dendl; - on_finish->complete(-pmemobj_tx_errno()); - return; - } TX_FINALLY { - } TX_END; - } else { - m_cache_state->present = true; - /* Open existing pool */ - if ((m_log_pool = - pmemobj_open(m_log_pool_name.c_str(), - m_rwl_pool_layout_name)) == NULL) { - lderr(cct) << "failed to open pool (" << m_log_pool_name << "): " - << pmemobj_errormsg() << dendl; - on_finish->complete(-errno); - return; - } - pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); - if (D_RO(pool_root)->header.layout_version != RWL_POOL_VERSION) { - // TODO: will handle upgrading version in the future - lderr(cct) << "Pool layout version is " << D_RO(pool_root)->header.layout_version - << " expected " << RWL_POOL_VERSION << dendl; - on_finish->complete(-EINVAL); - return; - } - if (D_RO(pool_root)->block_size != MIN_WRITE_ALLOC_SIZE) { - lderr(cct) << "Pool block size is " << D_RO(pool_root)->block_size - << " expected " << MIN_WRITE_ALLOC_SIZE << dendl; - on_finish->complete(-EINVAL); - return; - } - m_log_pool_actual_size = D_RO(pool_root)->pool_size; - m_flushed_sync_gen = D_RO(pool_root)->flushed_sync_gen; - m_total_log_entries = D_RO(pool_root)->num_log_entries; - m_first_free_entry = D_RO(pool_root)->first_free_entry; - m_first_valid_entry = D_RO(pool_root)->first_valid_entry; - if (m_first_free_entry < m_first_valid_entry) { - /* Valid entries wrap around the end of the ring, so first_free is lower - * than first_valid. If first_valid was == first_free+1, the entry at - * first_free would be empty. The last entry is never used, so in - * that case there would be zero free log entries. */ - m_free_log_entries = m_total_log_entries - (m_first_valid_entry - m_first_free_entry) -1; - } else { - /* first_valid is <= first_free. If they are == we have zero valid log - * entries, and n-1 free log entries */ - m_free_log_entries = m_total_log_entries - (m_first_free_entry - m_first_valid_entry) -1; - } - size_t effective_pool_size = (size_t)(m_log_pool_config_size * USABLE_SIZE); - m_bytes_allocated_cap = effective_pool_size; - load_existing_entries(later); - m_cache_state->clean = m_dirty_log_entries.empty(); - m_cache_state->empty = m_log_entries.empty(); - } - - ldout(cct,1) << "pool " << m_log_pool_name << " has " << m_total_log_entries - << " log entries, " << m_free_log_entries << " of which are free." - << " first_valid=" << m_first_valid_entry - << ", first_free=" << m_first_free_entry - << ", flushed_sync_gen=" << m_flushed_sync_gen - << ", m_current_sync_gen=" << m_current_sync_gen << dendl; - if (m_first_free_entry == m_first_valid_entry) { - ldout(cct,1) << "write log is empty" << dendl; - m_cache_state->empty = true; - } - - /* Start the sync point following the last one seen in the - * log. Flush the last sync point created during the loading of the - * existing log entries. 
*/ - init_flush_new_sync_point(later); - ldout(cct,20) << "new sync point = [" << m_current_sync_point << "]" << dendl; - - m_initialized = true; - // Start the thread - m_thread_pool.start(); - - m_periodic_stats_enabled = m_cache_state->log_periodic_stats; - /* Do these after we drop lock */ - later.add(new LambdaContext([this](int r) { - if (m_periodic_stats_enabled) { - /* Log stats for the first time */ - periodic_stats(); - /* Arm periodic stats logging for the first time */ - std::lock_guard timer_locker(*m_timer_lock); - arm_periodic_stats(); - } - })); - m_image_ctx.op_work_queue->queue(on_finish, 0); -} - -template -void AbstractWriteLog::update_image_cache_state(Context *on_finish) { - m_cache_state->write_image_cache_state(on_finish); -} - -template -void AbstractWriteLog::init(Context *on_finish) { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 20) << dendl; - perf_start(m_image_ctx.id); - - ceph_assert(!m_initialized); - - Context *ctx = new LambdaContext( - [this, on_finish](int r) { - if (r >= 0) { - update_image_cache_state(on_finish); - } else { - on_finish->complete(r); - } - }); - - DeferredContexts later; - rwl_init(ctx, later); -} - -template -void AbstractWriteLog::shut_down(Context *on_finish) { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 20) << dendl; - - ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl; - - Context *ctx = new LambdaContext( - [this, on_finish](int r) { - ldout(m_image_ctx.cct, 6) << "shutdown complete" << dendl; - m_image_ctx.op_work_queue->queue(on_finish, r); - }); - ctx = new LambdaContext( - [this, ctx](int r) { - Context *next_ctx = override_ctx(r, ctx); - bool periodic_stats_enabled = m_periodic_stats_enabled; - m_periodic_stats_enabled = false; - - if (periodic_stats_enabled) { - /* Log stats one last time if they were enabled */ - periodic_stats(); - } - { - std::lock_guard locker(m_lock); - ceph_assert(m_dirty_log_entries.size() == 0); - m_wake_up_enabled = false; - m_cache_state->clean = true; - m_log_entries.clear(); - if (m_log_pool) { - ldout(m_image_ctx.cct, 6) << "closing pmem pool" << dendl; - pmemobj_close(m_log_pool); - } - if (m_cache_state->clean) { - if (m_log_is_poolset) { - ldout(m_image_ctx.cct, 5) << "Not removing poolset " << m_log_pool_name << dendl; - } else { - ldout(m_image_ctx.cct, 5) << "Removing empty pool file: " << m_log_pool_name << dendl; - if (remove(m_log_pool_name.c_str()) != 0) { - lderr(m_image_ctx.cct) << "failed to remove empty pool \"" << m_log_pool_name << "\": " - << pmemobj_errormsg() << dendl; - } else { - m_cache_state->clean = true; - m_cache_state->empty = true; - m_cache_state->present = false; - } - } - } else { - if (m_log_is_poolset) { - ldout(m_image_ctx.cct, 5) << "Not removing poolset " << m_log_pool_name << dendl; - } else { - ldout(m_image_ctx.cct, 5) << "Not removing pool file: " << m_log_pool_name << dendl; - } - } - if (m_perfcounter) { - perf_stop(); - } - } - update_image_cache_state(next_ctx); - }); - ctx = new LambdaContext( - [this, ctx](int r) { - Context *next_ctx = override_ctx(r, ctx); - { - /* Sync with process_writeback_dirty_entries() */ - RWLock::WLocker entry_reader_wlocker(m_entry_reader_lock); - m_shutting_down = true; - /* Flush all writes to OSDs (unless disabled) and wait for all - in-progress flush writes to complete */ - ldout(m_image_ctx.cct, 6) << "flushing" << dendl; - if (m_periodic_stats_enabled) { - periodic_stats(); - } - } - flush_dirty_entries(next_ctx); - }); - ctx = new LambdaContext( - [this, ctx](int 
r) { - Context *next_ctx = override_ctx(r, ctx); - ldout(m_image_ctx.cct, 6) << "waiting for in flight operations" << dendl; - // Wait for in progress IOs to complete - next_ctx = util::create_async_context_callback(m_image_ctx, next_ctx); - m_async_op_tracker.wait_for_ops(next_ctx); - }); - ctx = new LambdaContext( - [this, ctx](int r) { - ldout(m_image_ctx.cct, 6) << "Done internal_flush in shutdown" << dendl; - m_work_queue.queue(ctx, r); - }); - /* Complete all in-flight writes before shutting down */ - ldout(m_image_ctx.cct, 6) << "internal_flush in shutdown" << dendl; - internal_flush(false, ctx); -} - -template -void AbstractWriteLog::read(Extents&& image_extents, - ceph::bufferlist* bl, - int fadvise_flags, Context *on_finish) { - // TODO: handle writesame and discard case in later PRs - CephContext *cct = m_image_ctx.cct; - utime_t now = ceph_clock_now(); - C_ReadRequest *read_ctx = new C_ReadRequest(cct, now, m_perfcounter, bl, on_finish); - ldout(cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id - << "image_extents=" << image_extents << ", " - << "bl=" << bl << ", " - << "on_finish=" << on_finish << dendl; - - ceph_assert(m_initialized); - bl->clear(); - m_perfcounter->inc(l_librbd_rwl_rd_req, 1); - - /* - * The strategy here is to look up all the WriteLogMapEntries that overlap - * this read, and iterate through those to separate this read into hits and - * misses. A new Extents object is produced here with Extents for each miss - * region. The miss Extents is then passed on to the read cache below RWL. We - * also produce an ImageExtentBufs for all the extents (hit or miss) in this - * read. When the read from the lower cache layer completes, we iterate - * through the ImageExtentBufs and insert buffers for each cache hit at the - * appropriate spot in the bufferlist returned from below for the miss - * read. The buffers we insert here refer directly to regions of various - * write log entry data buffers. - * - * Locking: These buffer objects hold a reference on the write log entries - * they refer to. Log entries can't be retired until there are no references. - * The GenericWriteLogEntry references are released by the buffer destructor. - */ - for (auto &extent : image_extents) { - uint64_t extent_offset = 0; - RWLock::RLocker entry_reader_locker(m_entry_reader_lock); - WriteLogMapEntries map_entries = m_blocks_to_log_entries.find_map_entries(block_extent(extent)); - for (auto &map_entry : map_entries) { - Extent entry_image_extent(rwl::image_extent(map_entry.block_extent)); - /* If this map entry starts after the current image extent offset ... */ - if (entry_image_extent.first > extent.first + extent_offset) { - /* ... add range before map_entry to miss extents */ - uint64_t miss_extent_start = extent.first + extent_offset; - uint64_t miss_extent_length = entry_image_extent.first - miss_extent_start; - Extent miss_extent(miss_extent_start, miss_extent_length); - read_ctx->miss_extents.push_back(miss_extent); - /* Add miss range to read extents */ - ImageExtentBuf miss_extent_buf(miss_extent); - read_ctx->read_extents.push_back(miss_extent_buf); - extent_offset += miss_extent_length; - } - ceph_assert(entry_image_extent.first <= extent.first + extent_offset); - uint64_t entry_offset = 0; - /* If this map entry starts before the current image extent offset ... */ - if (entry_image_extent.first < extent.first + extent_offset) { - /* ... 
compute offset into log entry for this read extent */ - entry_offset = (extent.first + extent_offset) - entry_image_extent.first; - } - /* This read hit ends at the end of the extent or the end of the log - entry, whichever is less. */ - uint64_t entry_hit_length = min(entry_image_extent.second - entry_offset, - extent.second - extent_offset); - Extent hit_extent(entry_image_extent.first, entry_hit_length); - if (0 == map_entry.log_entry->write_bytes() && 0 < map_entry.log_entry->bytes_dirty()) { - /* discard log entry */ - auto discard_entry = map_entry.log_entry; - ldout(cct, 20) << "read hit on discard entry: log_entry=" << *discard_entry << dendl; - /* Discards read as zero, so we'll construct a bufferlist of zeros */ - bufferlist zero_bl; - zero_bl.append_zero(entry_hit_length); - /* Add hit extent to read extents */ - ImageExtentBuf hit_extent_buf(hit_extent, zero_bl); - read_ctx->read_extents.push_back(hit_extent_buf); - } else { - /* write and writesame log entry */ - /* Offset of the map entry into the log entry's buffer */ - uint64_t map_entry_buffer_offset = entry_image_extent.first - map_entry.log_entry->ram_entry.image_offset_bytes; - /* Offset into the log entry buffer of this read hit */ - uint64_t read_buffer_offset = map_entry_buffer_offset + entry_offset; - /* Create buffer object referring to pmem pool for this read hit */ - auto write_entry = map_entry.log_entry; - - /* Make a bl for this hit extent. This will add references to the write_entry->pmem_bp */ - buffer::list hit_bl; - - buffer::list entry_bl_copy; - write_entry->copy_pmem_bl(&entry_bl_copy); - entry_bl_copy.begin(read_buffer_offset).copy(entry_hit_length, hit_bl); - - ceph_assert(hit_bl.length() == entry_hit_length); - - /* Add hit extent to read extents */ - ImageExtentBuf hit_extent_buf(hit_extent, hit_bl); - read_ctx->read_extents.push_back(hit_extent_buf); - } - /* Exclude RWL hit range from buffer and extent */ - extent_offset += entry_hit_length; - ldout(cct, 20) << map_entry << dendl; - } - /* If the last map entry didn't consume the entire image extent ... */ - if (extent.second > extent_offset) { - /* ... 
add the rest of this extent to miss extents */ - uint64_t miss_extent_start = extent.first + extent_offset; - uint64_t miss_extent_length = extent.second - extent_offset; - Extent miss_extent(miss_extent_start, miss_extent_length); - read_ctx->miss_extents.push_back(miss_extent); - /* Add miss range to read extents */ - ImageExtentBuf miss_extent_buf(miss_extent); - read_ctx->read_extents.push_back(miss_extent_buf); - extent_offset += miss_extent_length; - } - } - - ldout(cct, 20) << "miss_extents=" << read_ctx->miss_extents << ", " - << "miss_bl=" << read_ctx->miss_bl << dendl; - - if (read_ctx->miss_extents.empty()) { - /* All of this read comes from RWL */ - read_ctx->complete(0); - } else { - /* Pass the read misses on to the layer below RWL */ - m_image_writeback.aio_read(std::move(read_ctx->miss_extents), &read_ctx->miss_bl, fadvise_flags, read_ctx); - } -} - -template -void AbstractWriteLog::write(Extents &&image_extents, - bufferlist&& bl, - int fadvise_flags, - Context *on_finish) { - CephContext *cct = m_image_ctx.cct; - - ldout(cct, 20) << "aio_write" << dendl; - - utime_t now = ceph_clock_now(); - m_perfcounter->inc(l_librbd_rwl_wr_req, 1); - - ceph_assert(m_initialized); - - auto *write_req = - new C_WriteRequestT(*this, now, std::move(image_extents), std::move(bl), fadvise_flags, - m_lock, m_perfcounter, on_finish); - m_perfcounter->inc(l_librbd_rwl_wr_bytes, write_req->image_extents_summary.total_bytes); - - /* The lambda below will be called when the block guard for all - * blocks affected by this write is obtained */ - GuardedRequestFunctionContext *guarded_ctx = - new GuardedRequestFunctionContext([this, write_req](GuardedRequestFunctionContext &guard_ctx) { - write_req->blockguard_acquired(guard_ctx); - alloc_and_dispatch_io_req(write_req); - }); - - detain_guarded_request(write_req, guarded_ctx, false); -} - -template -void AbstractWriteLog::discard(uint64_t offset, uint64_t length, - uint32_t discard_granularity_bytes, - Context *on_finish) { - CephContext *cct = m_image_ctx.cct; - - ldout(cct, 20) << dendl; - - utime_t now = ceph_clock_now(); - m_perfcounter->inc(l_librbd_rwl_discard, 1); - Extents discard_extents = {{offset, length}}; - m_discard_granularity_bytes = discard_granularity_bytes; - - ceph_assert(m_initialized); - - auto *discard_req = - new C_DiscardRequestT(*this, now, std::move(discard_extents), discard_granularity_bytes, - m_lock, m_perfcounter, on_finish); - - /* The lambda below will be called when the block guard for all - * blocks affected by this write is obtained */ - GuardedRequestFunctionContext *guarded_ctx = - new GuardedRequestFunctionContext([this, discard_req](GuardedRequestFunctionContext &guard_ctx) { - discard_req->blockguard_acquired(guard_ctx); - alloc_and_dispatch_io_req(discard_req); - }); - - detain_guarded_request(discard_req, guarded_ctx, false); -} - -/** - * Aio_flush completes when all previously completed writes are - * flushed to persistent cache. We make a best-effort attempt to also - * defer until all in-progress writes complete, but we may not know - * about all of the writes the application considers in-progress yet, - * due to uncertainty in the IO submission workq (multiple WQ threads - * may allow out-of-order submission). - * - * This flush operation will not wait for writes deferred for overlap - * in the block guard. 
- */ -template -void AbstractWriteLog::flush(io::FlushSource flush_source, Context *on_finish) { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 20) << "on_finish=" << on_finish << " flush_source=" << flush_source << dendl; - - if (io::FLUSH_SOURCE_SHUTDOWN == flush_source || io::FLUSH_SOURCE_INTERNAL == flush_source) { - internal_flush(false, on_finish); - return; - } - m_perfcounter->inc(l_librbd_rwl_aio_flush, 1); - - /* May be called even if initialization fails */ - if (!m_initialized) { - ldout(cct, 05) << "never initialized" << dendl; - /* Deadlock if completed here */ - m_image_ctx.op_work_queue->queue(on_finish, 0); - return; - } - - { - std::shared_lock image_locker(m_image_ctx.image_lock); - if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only) { - on_finish->complete(-EROFS); - return; - } - } - - auto flush_req = make_flush_req(on_finish); - - GuardedRequestFunctionContext *guarded_ctx = - new GuardedRequestFunctionContext([this, flush_req](GuardedRequestFunctionContext &guard_ctx) { - ldout(m_image_ctx.cct, 20) << "flush_req=" << flush_req << " cell=" << guard_ctx.cell << dendl; - ceph_assert(guard_ctx.cell); - flush_req->detained = guard_ctx.state.detained; - /* We don't call flush_req->set_cell(), because the block guard will be released here */ - { - DeferredContexts post_unlock; /* Do these when the lock below is released */ - std::lock_guard locker(m_lock); - - if (!m_persist_on_flush && m_persist_on_write_until_flush) { - m_persist_on_flush = true; - ldout(m_image_ctx.cct, 5) << "now persisting on flush" << dendl; - } - - /* - * Create a new sync point if there have been writes since the last - * one. - * - * We do not flush the caches below the RWL here. - */ - flush_new_sync_point_if_needed(flush_req, post_unlock); - } - - release_guarded_request(guard_ctx.cell); - }); - - detain_guarded_request(flush_req, guarded_ctx, true); -} - -template -void AbstractWriteLog::writesame(uint64_t offset, uint64_t length, - bufferlist&& bl, int fadvise_flags, - Context *on_finish) { - CephContext *cct = m_image_ctx.cct; - - ldout(cct, 20) << "aio_writesame" << dendl; - - utime_t now = ceph_clock_now(); - Extents ws_extents = {{offset, length}}; - m_perfcounter->inc(l_librbd_rwl_ws, 1); - ceph_assert(m_initialized); - - /* A write same request is also a write request. The key difference is the - * write same data buffer is shorter than the extent of the request. The full - * extent will be used in the block guard, and appear in - * m_blocks_to_log_entries_map. The data buffer allocated for the WS is only - * as long as the length of the bl here, which is the pattern that's repeated - * in the image for the entire length of this WS. Read hits and flushing of - * write sames are different than normal writes. 
*/ - auto *ws_req = - new C_WriteSameRequestT(*this, now, std::move(ws_extents), std::move(bl), - fadvise_flags, m_lock, m_perfcounter, on_finish); - m_perfcounter->inc(l_librbd_rwl_ws_bytes, ws_req->image_extents_summary.total_bytes); - - /* The lambda below will be called when the block guard for all - * blocks affected by this write is obtained */ - GuardedRequestFunctionContext *guarded_ctx = - new GuardedRequestFunctionContext([this, ws_req](GuardedRequestFunctionContext &guard_ctx) { - ws_req->blockguard_acquired(guard_ctx); - alloc_and_dispatch_io_req(ws_req); - }); - - detain_guarded_request(ws_req, guarded_ctx, false); -} - -template -void AbstractWriteLog::compare_and_write(Extents &&image_extents, - bufferlist&& cmp_bl, - bufferlist&& bl, - uint64_t *mismatch_offset, - int fadvise_flags, - Context *on_finish) { - ldout(m_image_ctx.cct, 20) << dendl; - - utime_t now = ceph_clock_now(); - m_perfcounter->inc(l_librbd_rwl_cmp, 1); - ceph_assert(m_initialized); - - /* A compare and write request is also a write request. We only allocate - * resources and dispatch this write request if the compare phase - * succeeds. */ - auto *cw_req = - new C_CompAndWriteRequestT(*this, now, std::move(image_extents), std::move(cmp_bl), std::move(bl), - mismatch_offset, fadvise_flags, m_lock, m_perfcounter, on_finish); - m_perfcounter->inc(l_librbd_rwl_cmp_bytes, cw_req->image_extents_summary.total_bytes); - - /* The lambda below will be called when the block guard for all - * blocks affected by this write is obtained */ - GuardedRequestFunctionContext *guarded_ctx = - new GuardedRequestFunctionContext([this, cw_req](GuardedRequestFunctionContext &guard_ctx) { - cw_req->blockguard_acquired(guard_ctx); - - auto read_complete_ctx = new LambdaContext( - [this, cw_req](int r) { - ldout(m_image_ctx.cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id - << "cw_req=" << cw_req << dendl; - - /* Compare read_bl to cmp_bl to determine if this will produce a write */ - buffer::list aligned_read_bl; - if (cw_req->cmp_bl.length() < cw_req->read_bl.length()) { - aligned_read_bl.substr_of(cw_req->read_bl, 0, cw_req->cmp_bl.length()); - } - if (cw_req->cmp_bl.contents_equal(cw_req->read_bl) || - cw_req->cmp_bl.contents_equal(aligned_read_bl)) { - /* Compare phase succeeds. Begin write */ - ldout(m_image_ctx.cct, 5) << " cw_req=" << cw_req << " compare matched" << dendl; - cw_req->compare_succeeded = true; - *cw_req->mismatch_offset = 0; - /* Continue with this request as a write. Blockguard release and - * user request completion handled as if this were a plain - * write. */ - alloc_and_dispatch_io_req(cw_req); - } else { - /* Compare phase fails. Comp-and write ends now. 
*/ - ldout(m_image_ctx.cct, 15) << " cw_req=" << cw_req << " compare failed" << dendl; - /* Bufferlist doesn't tell us where they differed, so we'll have to determine that here */ - uint64_t bl_index = 0; - for (bl_index = 0; bl_index < cw_req->cmp_bl.length(); bl_index++) { - if (cw_req->cmp_bl[bl_index] != cw_req->read_bl[bl_index]) { - ldout(m_image_ctx.cct, 15) << " cw_req=" << cw_req << " mismatch at " << bl_index << dendl; - break; - } - } - cw_req->compare_succeeded = false; - *cw_req->mismatch_offset = bl_index; - cw_req->complete_user_request(-EILSEQ); - cw_req->release_cell(); - cw_req->complete(0); - } - }); - - /* Read phase of comp-and-write must read through RWL */ - Extents image_extents_copy = cw_req->image_extents; - read(std::move(image_extents_copy), &cw_req->read_bl, cw_req->fadvise_flags, read_complete_ctx); - }); - - detain_guarded_request(cw_req, guarded_ctx, false); -} - -template -void AbstractWriteLog::flush(Context *on_finish) { - internal_flush(false, on_finish); -} - -template -void AbstractWriteLog::invalidate(Context *on_finish) { - internal_flush(true, on_finish); -} - -template -CephContext *AbstractWriteLog::get_context() { - return m_image_ctx.cct; -} - -template -BlockGuardCell* AbstractWriteLog::detain_guarded_request_helper(GuardedRequest &req) -{ - CephContext *cct = m_image_ctx.cct; - BlockGuardCell *cell; - - ceph_assert(ceph_mutex_is_locked_by_me(m_blockguard_lock)); - ldout(cct, 20) << dendl; - - int r = m_write_log_guard.detain(req.block_extent, &req, &cell); - ceph_assert(r>=0); - if (r > 0) { - ldout(cct, 20) << "detaining guarded request due to in-flight requests: " - << "req=" << req << dendl; - return nullptr; - } - - ldout(cct, 20) << "in-flight request cell: " << cell << dendl; - return cell; -} - -template -BlockGuardCell* AbstractWriteLog::detain_guarded_request_barrier_helper( - GuardedRequest &req) -{ - BlockGuardCell *cell = nullptr; - - ceph_assert(ceph_mutex_is_locked_by_me(m_blockguard_lock)); - ldout(m_image_ctx.cct, 20) << dendl; - - if (m_barrier_in_progress) { - req.guard_ctx->state.queued = true; - m_awaiting_barrier.push_back(req); - } else { - bool barrier = req.guard_ctx->state.barrier; - if (barrier) { - m_barrier_in_progress = true; - req.guard_ctx->state.current_barrier = true; - } - cell = detain_guarded_request_helper(req); - if (barrier) { - /* Only non-null if the barrier acquires the guard now */ - m_barrier_cell = cell; - } - } - - return cell; -} - -template -void AbstractWriteLog::detain_guarded_request( - C_BlockIORequestT *request, - GuardedRequestFunctionContext *guarded_ctx, - bool is_barrier) -{ - BlockExtent extent; - if (request) { - extent = request->image_extents_summary.block_extent(); - } else { - extent = block_extent(whole_volume_extent()); - } - auto req = GuardedRequest(extent, guarded_ctx, is_barrier); - BlockGuardCell *cell = nullptr; - - ldout(m_image_ctx.cct, 20) << dendl; - { - std::lock_guard locker(m_blockguard_lock); - cell = detain_guarded_request_barrier_helper(req); - } - if (cell) { - req.guard_ctx->cell = cell; - req.guard_ctx->complete(0); - } -} - -template -void AbstractWriteLog::release_guarded_request(BlockGuardCell *released_cell) -{ - CephContext *cct = m_image_ctx.cct; - WriteLogGuard::BlockOperations block_reqs; - ldout(cct, 20) << "released_cell=" << released_cell << dendl; - - { - std::lock_guard locker(m_blockguard_lock); - m_write_log_guard.release(released_cell, &block_reqs); - - for (auto &req : block_reqs) { - req.guard_ctx->state.detained = true; - BlockGuardCell 
*detained_cell = detain_guarded_request_helper(req); - if (detained_cell) { - if (req.guard_ctx->state.current_barrier) { - /* The current barrier is acquiring the block guard, so now we know its cell */ - m_barrier_cell = detained_cell; - /* detained_cell could be == released_cell here */ - ldout(cct, 20) << "current barrier cell=" << detained_cell << " req=" << req << dendl; - } - req.guard_ctx->cell = detained_cell; - m_work_queue.queue(req.guard_ctx); - } - } - - if (m_barrier_in_progress && (released_cell == m_barrier_cell)) { - ldout(cct, 20) << "current barrier released cell=" << released_cell << dendl; - /* The released cell is the current barrier request */ - m_barrier_in_progress = false; - m_barrier_cell = nullptr; - /* Move waiting requests into the blockguard. Stop if there's another barrier */ - while (!m_barrier_in_progress && !m_awaiting_barrier.empty()) { - auto &req = m_awaiting_barrier.front(); - ldout(cct, 20) << "submitting queued request to blockguard: " << req << dendl; - BlockGuardCell *detained_cell = detain_guarded_request_barrier_helper(req); - if (detained_cell) { - req.guard_ctx->cell = detained_cell; - m_work_queue.queue(req.guard_ctx); - } - m_awaiting_barrier.pop_front(); - } - } - } - - ldout(cct, 20) << "exit" << dendl; -} - -/* - * Performs the log event append operation for all of the scheduled - * events. - */ -template -void AbstractWriteLog::append_scheduled_ops(void) -{ - GenericLogOperations ops; - int append_result = 0; - bool ops_remain = false; - bool appending = false; /* true if we set m_appending */ - ldout(m_image_ctx.cct, 20) << dendl; - do { - ops.clear(); - - { - std::lock_guard locker(m_lock); - if (!appending && m_appending) { - /* Another thread is appending */ - ldout(m_image_ctx.cct, 15) << "Another thread is appending" << dendl; - return; - } - if (m_ops_to_append.size()) { - appending = true; - m_appending = true; - auto last_in_batch = m_ops_to_append.begin(); - unsigned int ops_to_append = m_ops_to_append.size(); - if (ops_to_append > OPS_APPENDED_TOGETHER) { - ops_to_append = OPS_APPENDED_TOGETHER; - } - std::advance(last_in_batch, ops_to_append); - ops.splice(ops.end(), m_ops_to_append, m_ops_to_append.begin(), last_in_batch); - ops_remain = true; /* Always check again before leaving */ - ldout(m_image_ctx.cct, 20) << "appending " << ops.size() << ", " - << m_ops_to_append.size() << " remain" << dendl; - } else { - ops_remain = false; - if (appending) { - appending = false; - m_appending = false; - } - } - } - - if (ops.size()) { - std::lock_guard locker(m_log_append_lock); - alloc_op_log_entries(ops); - append_result = append_op_log_entries(ops); - } - - int num_ops = ops.size(); - if (num_ops) { - /* New entries may be flushable. Completion will wake up flusher. */ - complete_op_log_entries(std::move(ops), append_result); - } - } while (ops_remain); -} - -template -void AbstractWriteLog::enlist_op_appender() -{ - m_async_append_ops++; - m_async_op_tracker.start_op(); - Context *append_ctx = new LambdaContext([this](int r) { - append_scheduled_ops(); - m_async_append_ops--; - m_async_op_tracker.finish_op(); - }); - m_work_queue.queue(append_ctx); -} - -/* - * Takes custody of ops. They'll all get their log entries appended, - * and have their on_write_persist contexts completed once they and - * all prior log entries are persisted everywhere. 
- */ -template -void AbstractWriteLog::schedule_append(GenericLogOperations &ops) -{ - bool need_finisher; - GenericLogOperationsVector appending; - - std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending)); - { - std::lock_guard locker(m_lock); - - need_finisher = m_ops_to_append.empty() && !m_appending; - m_ops_to_append.splice(m_ops_to_append.end(), ops); - } - - if (need_finisher) { - enlist_op_appender(); - } - - for (auto &op : appending) { - op->appending(); - } -} - -template -void AbstractWriteLog::schedule_append(GenericLogOperationsVector &ops) -{ - GenericLogOperations to_append(ops.begin(), ops.end()); - - schedule_append(to_append); -} - -template -void AbstractWriteLog::schedule_append(GenericLogOperationSharedPtr op) -{ - GenericLogOperations to_append { op }; - - schedule_append(to_append); -} - -const unsigned long int ops_flushed_together = 4; -/* - * Performs the pmem buffer flush on all scheduled ops, then schedules - * the log event append operation for all of them. - */ -template -void AbstractWriteLog::flush_then_append_scheduled_ops(void) -{ - GenericLogOperations ops; - bool ops_remain = false; - ldout(m_image_ctx.cct, 20) << dendl; - do { - { - ops.clear(); - std::lock_guard locker(m_lock); - if (m_ops_to_flush.size()) { - auto last_in_batch = m_ops_to_flush.begin(); - unsigned int ops_to_flush = m_ops_to_flush.size(); - if (ops_to_flush > ops_flushed_together) { - ops_to_flush = ops_flushed_together; - } - ldout(m_image_ctx.cct, 20) << "should flush " << ops_to_flush << dendl; - std::advance(last_in_batch, ops_to_flush); - ops.splice(ops.end(), m_ops_to_flush, m_ops_to_flush.begin(), last_in_batch); - ops_remain = !m_ops_to_flush.empty(); - ldout(m_image_ctx.cct, 20) << "flushing " << ops.size() << ", " - << m_ops_to_flush.size() << " remain" << dendl; - } else { - ops_remain = false; - } - } - if (ops_remain) { - enlist_op_flusher(); - } - - /* Ops subsequently scheduled for flush may finish before these, - * which is fine. We're unconcerned with completion order until we - * get to the log message append step. */ - if (ops.size()) { - flush_pmem_buffer(ops); - schedule_append(ops); - } - } while (ops_remain); - append_scheduled_ops(); -} - -template -void AbstractWriteLog::enlist_op_flusher() -{ - m_async_flush_ops++; - m_async_op_tracker.start_op(); - Context *flush_ctx = new LambdaContext([this](int r) { - flush_then_append_scheduled_ops(); - m_async_flush_ops--; - m_async_op_tracker.finish_op(); - }); - m_work_queue.queue(flush_ctx); -} - -/* - * Takes custody of ops. They'll all get their pmem blocks flushed, - * then get their log entries appended. 
- */ -template -void AbstractWriteLog::schedule_flush_and_append(GenericLogOperationsVector &ops) -{ - GenericLogOperations to_flush(ops.begin(), ops.end()); - bool need_finisher; - ldout(m_image_ctx.cct, 20) << dendl; - { - std::lock_guard locker(m_lock); - - need_finisher = m_ops_to_flush.empty(); - m_ops_to_flush.splice(m_ops_to_flush.end(), to_flush); - } - - if (need_finisher) { - enlist_op_flusher(); - } -} - -/* - * Flush the pmem regions for the data blocks of a set of operations - * - * V is expected to be GenericLogOperations, or GenericLogOperationsVector - */ -template -template -void AbstractWriteLog::flush_pmem_buffer(V& ops) -{ - for (auto &operation : ops) { - operation->flush_pmem_buf_to_cache(m_log_pool); - } - - /* Drain once for all */ - pmemobj_drain(m_log_pool); - - utime_t now = ceph_clock_now(); - for (auto &operation : ops) { - if (operation->reserved_allocated()) { - operation->buf_persist_comp_time = now; - } else { - ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl; - } - } -} - -/* - * Allocate the (already reserved) write log entries for a set of operations. - * - * Locking: - * Acquires lock - */ -template -void AbstractWriteLog::alloc_op_log_entries(GenericLogOperations &ops) -{ - TOID(struct WriteLogPoolRoot) pool_root; - pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); - struct WriteLogPmemEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries); - - ceph_assert(ceph_mutex_is_locked_by_me(m_log_append_lock)); - - /* Allocate the (already reserved) log entries */ - std::lock_guard locker(m_lock); - - for (auto &operation : ops) { - uint32_t entry_index = m_first_free_entry; - m_first_free_entry = (m_first_free_entry + 1) % m_total_log_entries; - auto &log_entry = operation->get_log_entry(); - log_entry->log_entry_index = entry_index; - log_entry->ram_entry.entry_index = entry_index; - log_entry->pmem_entry = &pmem_log_entries[entry_index]; - log_entry->ram_entry.entry_valid = 1; - m_log_entries.push_back(log_entry); - ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl; - } -} - -/* - * Flush the persistent write log entries set of ops. The entries must - * be contiguous in persistent memory. - */ -template -void AbstractWriteLog::flush_op_log_entries(GenericLogOperationsVector &ops) -{ - if (ops.empty()) { - return; - } - - if (ops.size() > 1) { - ceph_assert(ops.front()->get_log_entry()->pmem_entry < ops.back()->get_log_entry()->pmem_entry); - } - - ldout(m_image_ctx.cct, 20) << "entry count=" << ops.size() << " " - << "start address=" - << ops.front()->get_log_entry()->pmem_entry << " " - << "bytes=" - << ops.size() * sizeof(*(ops.front()->get_log_entry()->pmem_entry)) - << dendl; - pmemobj_flush(m_log_pool, - ops.front()->get_log_entry()->pmem_entry, - ops.size() * sizeof(*(ops.front()->get_log_entry()->pmem_entry))); -} - -/* - * Write and persist the (already allocated) write log entries and - * data buffer allocations for a set of ops. The data buffer for each - * of these must already have been persisted to its reserved area. 
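The append path that follows copies each ram_entry into its slot in the on-media ring and flushes contiguous runs; a run ends exactly when the next entry's index is smaller than its predecessor's, meaning the batch wrapped past the end of the ring. A compact sketch of that wrap rule, with hypothetical names:

#include <cstdint>
#include <vector>

// Split a batch of ring slot indices into contiguous runs: a new run
// starts whenever the index wraps, i.e. is smaller than its predecessor.
// Each run can then be flushed as one contiguous range.
std::vector<std::vector<uint32_t>> split_contiguous_runs(
    const std::vector<uint32_t>& indices) {
  std::vector<std::vector<uint32_t>> runs;
  for (uint32_t idx : indices) {
    if (runs.empty() || idx < runs.back().back()) {
      runs.emplace_back();           // wrapped past the end of the ring
    }
    runs.back().push_back(idx);
  }
  return runs;
}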
- */ -template -int AbstractWriteLog::append_op_log_entries(GenericLogOperations &ops) -{ - CephContext *cct = m_image_ctx.cct; - GenericLogOperationsVector entries_to_flush; - TOID(struct WriteLogPoolRoot) pool_root; - pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); - int ret = 0; - - ceph_assert(ceph_mutex_is_locked_by_me(m_log_append_lock)); - - if (ops.empty()) { - return 0; - } - entries_to_flush.reserve(OPS_APPENDED_TOGETHER); - - /* Write log entries to ring and persist */ - utime_t now = ceph_clock_now(); - for (auto &operation : ops) { - if (!entries_to_flush.empty()) { - /* Flush these and reset the list if the current entry wraps to the - * tail of the ring */ - if (entries_to_flush.back()->get_log_entry()->log_entry_index > - operation->get_log_entry()->log_entry_index) { - ldout(m_image_ctx.cct, 20) << "entries to flush wrap around the end of the ring at " - << "operation=[" << *operation << "]" << dendl; - flush_op_log_entries(entries_to_flush); - entries_to_flush.clear(); - now = ceph_clock_now(); - } - } - ldout(m_image_ctx.cct, 20) << "Copying entry for operation at index=" - << operation->get_log_entry()->log_entry_index << " " - << "from " << &operation->get_log_entry()->ram_entry << " " - << "to " << operation->get_log_entry()->pmem_entry << " " - << "operation=[" << *operation << "]" << dendl; - ldout(m_image_ctx.cct, 05) << "APPENDING: index=" - << operation->get_log_entry()->log_entry_index << " " - << "operation=[" << *operation << "]" << dendl; - operation->log_append_time = now; - *operation->get_log_entry()->pmem_entry = operation->get_log_entry()->ram_entry; - ldout(m_image_ctx.cct, 20) << "APPENDING: index=" - << operation->get_log_entry()->log_entry_index << " " - << "pmem_entry=[" << *operation->get_log_entry()->pmem_entry - << "]" << dendl; - entries_to_flush.push_back(operation); - } - flush_op_log_entries(entries_to_flush); - - /* Drain once for all */ - pmemobj_drain(m_log_pool); - - /* - * Atomically advance the log head pointer and publish the - * allocations for all the data buffers they refer to. - */ - utime_t tx_start = ceph_clock_now(); - TX_BEGIN(m_log_pool) { - D_RW(pool_root)->first_free_entry = m_first_free_entry; - for (auto &operation : ops) { - if (operation->reserved_allocated()) { - auto write_op = (std::shared_ptr&) operation; - pmemobj_tx_publish(&write_op->buffer_alloc->buffer_alloc_action, 1); - } else { - ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl; - } - } - } TX_ONCOMMIT { - } TX_ONABORT { - lderr(cct) << "failed to commit " << ops.size() - << " log entries (" << m_log_pool_name << ")" << dendl; - ceph_assert(false); - ret = -EIO; - } TX_FINALLY { - } TX_END; - - utime_t tx_end = ceph_clock_now(); - m_perfcounter->tinc(l_librbd_rwl_append_tx_t, tx_end - tx_start); - m_perfcounter->hinc( - l_librbd_rwl_append_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), ops.size()); - for (auto &operation : ops) { - operation->log_append_comp_time = tx_end; - } - - return ret; -} - -/* - * Complete a set of write ops with the result of append_op_entries. 
- */ -template -void AbstractWriteLog::complete_op_log_entries(GenericLogOperations &&ops, - const int result) -{ - GenericLogEntries dirty_entries; - int published_reserves = 0; - ldout(m_image_ctx.cct, 20) << __func__ << ": completing" << dendl; - for (auto &op : ops) { - utime_t now = ceph_clock_now(); - auto log_entry = op->get_log_entry(); - log_entry->completed = true; - if (op->is_writing_op()) { - op->mark_log_entry_completed(); - dirty_entries.push_back(log_entry); - } - if (op->reserved_allocated()) { - published_reserves++; - } - op->complete(result); - m_perfcounter->tinc(l_librbd_rwl_log_op_dis_to_app_t, - op->log_append_time - op->dispatch_time); - m_perfcounter->tinc(l_librbd_rwl_log_op_dis_to_cmp_t, now - op->dispatch_time); - m_perfcounter->hinc(l_librbd_rwl_log_op_dis_to_cmp_t_hist, - utime_t(now - op->dispatch_time).to_nsec(), - log_entry->ram_entry.write_bytes); - utime_t app_lat = op->log_append_comp_time - op->log_append_time; - m_perfcounter->tinc(l_librbd_rwl_log_op_app_to_appc_t, app_lat); - m_perfcounter->hinc(l_librbd_rwl_log_op_app_to_appc_t_hist, app_lat.to_nsec(), - log_entry->ram_entry.write_bytes); - m_perfcounter->tinc(l_librbd_rwl_log_op_app_to_cmp_t, now - op->log_append_time); - } - - { - std::lock_guard locker(m_lock); - m_unpublished_reserves -= published_reserves; - m_dirty_log_entries.splice(m_dirty_log_entries.end(), dirty_entries); - - /* New entries may be flushable */ - wake_up(); - } -} - -/** - * Dispatch as many deferred writes as possible - */ -template -void AbstractWriteLog::dispatch_deferred_writes(void) -{ - C_BlockIORequestT *front_req = nullptr; /* req still on front of deferred list */ - C_BlockIORequestT *allocated_req = nullptr; /* req that was allocated, and is now off the list */ - bool allocated = false; /* front_req allocate succeeded */ - bool cleared_dispatching_flag = false; - - /* If we can't become the dispatcher, we'll exit */ - { - std::lock_guard locker(m_lock); - if (m_dispatching_deferred_ops || - !m_deferred_ios.size()) { - return; - } - m_dispatching_deferred_ops = true; - } - - /* There are ops to dispatch, and this should be the only thread dispatching them */ - { - std::lock_guard deferred_dispatch(m_deferred_dispatch_lock); - do { - { - std::lock_guard locker(m_lock); - ceph_assert(m_dispatching_deferred_ops); - if (allocated) { - /* On the 2..n-1 th time we get lock, front_req->alloc_resources() will - * have succeeded, and we'll need to pop it off the deferred ops list - * here. */ - ceph_assert(front_req); - ceph_assert(!allocated_req); - m_deferred_ios.pop_front(); - allocated_req = front_req; - front_req = nullptr; - allocated = false; - } - ceph_assert(!allocated); - if (!allocated && front_req) { - /* front_req->alloc_resources() failed on the last iteration. We'll stop dispatching. 
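Deferred writes follow a single-dispatcher convention: whichever thread sets m_dispatching_deferred_ops walks the front of the list, dispatching requests until one fails to allocate resources, and only that thread ever pops the list. A rough sketch of the convention (illustrative types and names, not the code above):

#include <list>
#include <mutex>

// Single-dispatcher convention: one thread at a time owns the loop and is
// the only one that pops the deferred list; it stops as soon as the front
// request cannot get resources, leaving it (and everything behind it)
// deferred for a later pass.
template <typename Req>
void dispatch_deferred(std::list<Req*>& deferred, std::mutex& lock,
                       bool& dispatching) {
  {
    std::lock_guard<std::mutex> locker(lock);
    if (dispatching || deferred.empty()) {
      return;                        // someone else owns the loop, or no work
    }
    dispatching = true;
  }
  for (;;) {
    Req* front = nullptr;
    {
      std::lock_guard<std::mutex> locker(lock);
      if (deferred.empty()) {
        dispatching = false;
        return;
      }
      front = deferred.front();
    }
    if (!front->alloc_resources()) { // allocation attempted without the lock
      std::lock_guard<std::mutex> locker(lock);
      dispatching = false;           // give up; request stays deferred
      return;
    }
    {
      std::lock_guard<std::mutex> locker(lock);
      deferred.pop_front();          // only the dispatcher ever pops
    }
    front->dispatch();
  }
}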
*/ - front_req = nullptr; - ceph_assert(!cleared_dispatching_flag); - m_dispatching_deferred_ops = false; - cleared_dispatching_flag = true; - } else { - ceph_assert(!front_req); - if (m_deferred_ios.size()) { - /* New allocation candidate */ - front_req = m_deferred_ios.front(); - } else { - ceph_assert(!cleared_dispatching_flag); - m_dispatching_deferred_ops = false; - cleared_dispatching_flag = true; - } - } - } - /* Try allocating for front_req before we decide what to do with allocated_req - * (if any) */ - if (front_req) { - ceph_assert(!cleared_dispatching_flag); - allocated = front_req->alloc_resources(); - } - if (allocated_req && front_req && allocated) { - /* Push dispatch of the first allocated req to a wq */ - m_work_queue.queue(new LambdaContext( - [this, allocated_req](int r) { - allocated_req->dispatch(); - }), 0); - allocated_req = nullptr; - } - ceph_assert(!(allocated_req && front_req && allocated)); - - /* Continue while we're still considering the front of the deferred ops list */ - } while (front_req); - ceph_assert(!allocated); - } - ceph_assert(cleared_dispatching_flag); - - /* If any deferred requests were allocated, the last one will still be in allocated_req */ - if (allocated_req) { - allocated_req->dispatch(); - } -} - -/** - * Returns the lanes used by this write, and attempts to dispatch the next - * deferred write - */ -template -void AbstractWriteLog::release_write_lanes(C_BlockIORequestT *req) -{ - { - std::lock_guard locker(m_lock); - m_free_lanes += req->image_extents.size(); - } - dispatch_deferred_writes(); -} - -/** - * Attempts to allocate log resources for a write. Write is dispatched if - * resources are available, or queued if they aren't. - */ -template -void AbstractWriteLog::alloc_and_dispatch_io_req(C_BlockIORequestT *req) -{ - bool dispatch_here = false; - - { - /* If there are already deferred writes, queue behind them for resources */ - { - std::lock_guard locker(m_lock); - dispatch_here = m_deferred_ios.empty(); - } - if (dispatch_here) { - dispatch_here = req->alloc_resources(); - } - if (dispatch_here) { - ldout(m_image_ctx.cct, 20) << "dispatching" << dendl; - req->dispatch(); - } else { - req->deferred(); - { - std::lock_guard locker(m_lock); - m_deferred_ios.push_back(req); - } - ldout(m_image_ctx.cct, 20) << "deferred IOs: " << m_deferred_ios.size() << dendl; - dispatch_deferred_writes(); - } - } -} - -template -bool AbstractWriteLog::alloc_resources(C_BlockIORequestT *req) { - bool alloc_succeeds = true; - bool no_space = false; - uint64_t bytes_allocated = 0; - uint64_t bytes_cached = 0; - uint64_t bytes_dirtied = 0; - uint64_t num_lanes = 0; - uint64_t num_unpublished_reserves = 0; - uint64_t num_log_entries = 0; - - // Setup buffer, and get all the number of required resources - req->setup_buffer_resources(bytes_cached, bytes_dirtied, bytes_allocated, - num_lanes, num_log_entries, num_unpublished_reserves); - - { - std::lock_guard locker(m_lock); - if (m_free_lanes < num_lanes) { - req->set_io_waited_for_lanes(true); - ldout(m_image_ctx.cct, 20) << "not enough free lanes (need " - << num_lanes - << ", have " << m_free_lanes << ") " - << *req << dendl; - alloc_succeeds = false; - /* This isn't considered a "no space" alloc fail. Lanes are a throttling mechanism. 
*/ - } - if (m_free_log_entries < num_log_entries) { - req->set_io_waited_for_entries(true); - ldout(m_image_ctx.cct, 20) << "not enough free entries (need " - << num_log_entries - << ", have " << m_free_log_entries << ") " - << *req << dendl; - alloc_succeeds = false; - no_space = true; /* Entries must be retired */ - } - /* Don't attempt buffer allocate if we've exceeded the "full" threshold */ - if (m_bytes_allocated + bytes_allocated > m_bytes_allocated_cap) { - if (!req->has_io_waited_for_buffers()) { - req->set_io_waited_for_entries(true); - ldout(m_image_ctx.cct, 1) << "Waiting for allocation cap (cap=" - << m_bytes_allocated_cap - << ", allocated=" << m_bytes_allocated - << ") in write [" << *req << "]" << dendl; - } - alloc_succeeds = false; - no_space = true; /* Entries must be retired */ - } - } - - std::vector& buffers = req->get_resources_buffers(); - if (alloc_succeeds) { - for (auto &buffer : buffers) { - utime_t before_reserve = ceph_clock_now(); - buffer.buffer_oid = pmemobj_reserve(m_log_pool, - &buffer.buffer_alloc_action, - buffer.allocation_size, - 0 /* Object type */); - buffer.allocation_lat = ceph_clock_now() - before_reserve; - if (TOID_IS_NULL(buffer.buffer_oid)) { - if (!req->has_io_waited_for_buffers()) { - req->set_io_waited_for_entries(true); - } - ldout(m_image_ctx.cct, 5) << "can't allocate all data buffers: " - << pmemobj_errormsg() << ". " - << *req << dendl; - alloc_succeeds = false; - no_space = true; /* Entries need to be retired */ - break; - } else { - buffer.allocated = true; - } - ldout(m_image_ctx.cct, 20) << "Allocated " << buffer.buffer_oid.oid.pool_uuid_lo - << "." << buffer.buffer_oid.oid.off - << ", size=" << buffer.allocation_size << dendl; - } - } - - if (alloc_succeeds) { - std::lock_guard locker(m_lock); - /* We need one free log entry per extent (each is a separate entry), and - * one free "lane" for remote replication. 
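Admission here reduces to three bounded counters: free write lanes, free log entries, and a cap on allocated buffer bytes; a request is either charged against all of them or deferred and retried after entries are retired. A minimal sketch with assumed names (the buffer-reservation step is left out):

#include <cstdint>

// A request states how many lanes, log entries and buffer bytes it needs;
// it is admitted and charged against all three limits, or rejected so the
// caller can defer it.
struct CacheLimits {
  uint64_t free_lanes;
  uint64_t free_log_entries;
  uint64_t bytes_allocated;
  uint64_t bytes_allocated_cap;

  bool try_admit(uint64_t lanes, uint64_t entries, uint64_t bytes) {
    if (lanes > free_lanes || entries > free_log_entries ||
        bytes_allocated + bytes > bytes_allocated_cap) {
      return false;                  // caller defers the request
    }
    free_lanes -= lanes;
    free_log_entries -= entries;
    bytes_allocated += bytes;
    return true;
  }
};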
*/ - if ((m_free_lanes >= num_lanes) && - (m_free_log_entries >= num_log_entries)) { - m_free_lanes -= num_lanes; - m_free_log_entries -= num_log_entries; - m_unpublished_reserves += num_unpublished_reserves; - m_bytes_allocated += bytes_allocated; - m_bytes_cached += bytes_cached; - m_bytes_dirty += bytes_dirtied; - } else { - alloc_succeeds = false; - } - } - - if (!alloc_succeeds) { - /* On alloc failure, free any buffers we did allocate */ - for (auto &buffer : buffers) { - if (buffer.allocated) { - pmemobj_cancel(m_log_pool, &buffer.buffer_alloc_action, 1); - } - } - if (no_space) { - /* Expedite flushing and/or retiring */ - std::lock_guard locker(m_lock); - m_alloc_failed_since_retire = true; - m_last_alloc_fail = ceph_clock_now(); - } - } - - req->set_allocated(alloc_succeeds); - - return alloc_succeeds; -} - -template -C_FlushRequest>* AbstractWriteLog::make_flush_req(Context *on_finish) { - utime_t flush_begins = ceph_clock_now(); - bufferlist bl; - auto *flush_req = - new C_FlushRequestT(*this, flush_begins, Extents({whole_volume_extent()}), - std::move(bl), 0, m_lock, m_perfcounter, on_finish); - - return flush_req; -} - -template -void AbstractWriteLog::wake_up() { - CephContext *cct = m_image_ctx.cct; - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - - if (!m_wake_up_enabled) { - // wake_up is disabled during shutdown after flushing completes - ldout(m_image_ctx.cct, 6) << "deferred processing disabled" << dendl; - return; - } - - if (m_wake_up_requested && m_wake_up_scheduled) { - return; - } - - ldout(cct, 20) << dendl; - - /* Wake-up can be requested while it's already scheduled */ - m_wake_up_requested = true; - - /* Wake-up cannot be scheduled if it's already scheduled */ - if (m_wake_up_scheduled) { - return; - } - m_wake_up_scheduled = true; - m_async_process_work++; - m_async_op_tracker.start_op(); - m_work_queue.queue(new LambdaContext( - [this](int r) { - process_work(); - m_async_op_tracker.finish_op(); - m_async_process_work--; - }), 0); -} - -template -void AbstractWriteLog::process_work() { - CephContext *cct = m_image_ctx.cct; - int max_iterations = 4; - bool wake_up_requested = false; - uint64_t aggressive_high_water_bytes = m_bytes_allocated_cap * AGGRESSIVE_RETIRE_HIGH_WATER; - uint64_t high_water_bytes = m_bytes_allocated_cap * RETIRE_HIGH_WATER; - uint64_t low_water_bytes = m_bytes_allocated_cap * RETIRE_LOW_WATER; - uint64_t aggressive_high_water_entries = m_total_log_entries * AGGRESSIVE_RETIRE_HIGH_WATER; - uint64_t high_water_entries = m_total_log_entries * RETIRE_HIGH_WATER; - uint64_t low_water_entries = m_total_log_entries * RETIRE_LOW_WATER; - - ldout(cct, 20) << dendl; - - do { - { - std::lock_guard locker(m_lock); - m_wake_up_requested = false; - } - if (m_alloc_failed_since_retire || m_invalidating || - m_bytes_allocated > high_water_bytes || - (m_log_entries.size() > high_water_entries)) { - int retired = 0; - utime_t started = ceph_clock_now(); - ldout(m_image_ctx.cct, 10) << "alloc_fail=" << m_alloc_failed_since_retire - << ", allocated > high_water=" - << (m_bytes_allocated > high_water_bytes) - << ", allocated_entries > high_water=" - << (m_log_entries.size() > high_water_entries) - << dendl; - while (m_alloc_failed_since_retire || m_invalidating || - (m_bytes_allocated > high_water_bytes) || - (m_log_entries.size() > high_water_entries) || - (((m_bytes_allocated > low_water_bytes) || (m_log_entries.size() > low_water_entries)) && - (utime_t(ceph_clock_now() - started).to_msec() < RETIRE_BATCH_TIME_LIMIT_MS))) { - if 
(!retire_entries((m_shutting_down || m_invalidating || - (m_bytes_allocated > aggressive_high_water_bytes) || - (m_log_entries.size() > aggressive_high_water_entries)) - ? MAX_ALLOC_PER_TRANSACTION - : MAX_FREE_PER_TRANSACTION)) { - break; - } - retired++; - dispatch_deferred_writes(); - process_writeback_dirty_entries(); - } - ldout(m_image_ctx.cct, 10) << "Retired " << retired << " times" << dendl; - } - dispatch_deferred_writes(); - process_writeback_dirty_entries(); - - { - std::lock_guard locker(m_lock); - wake_up_requested = m_wake_up_requested; - } - } while (wake_up_requested && --max_iterations > 0); - - { - std::lock_guard locker(m_lock); - m_wake_up_scheduled = false; - /* Reschedule if it's still requested */ - if (m_wake_up_requested) { - wake_up(); - } - } -} - -template -bool AbstractWriteLog::can_flush_entry(std::shared_ptr log_entry) { - CephContext *cct = m_image_ctx.cct; - - ldout(cct, 20) << "" << dendl; - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - - if (m_invalidating) { - return true; - } - - /* For OWB we can flush entries with the same sync gen number (write between - * aio_flush() calls) concurrently. Here we'll consider an entry flushable if - * its sync gen number is <= the lowest sync gen number carried by all the - * entries currently flushing. - * - * If the entry considered here bears a sync gen number lower than a - * previously flushed entry, the application had to have submitted the write - * bearing the higher gen number before the write with the lower gen number - * completed. So, flushing these concurrently is OK. - * - * If the entry considered here bears a sync gen number higher than a - * currently flushing entry, the write with the lower gen number may have - * completed to the application before the write with the higher sync gen - * number was submitted, and the application may rely on that completion - * order for volume consistency. In this case the entry will not be - * considered flushable until all the entries bearing lower sync gen numbers - * finish flushing. 
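The flushability rule spelled out above reduces to a small predicate over sync generation numbers (the in-flight op and byte limits are left out here); a sketch with hypothetical names:

#include <cstdint>

// While any writeback is in flight, only entries whose sync generation is
// <= the lowest generation currently being flushed may start; when the
// cache is being invalidated, ordering no longer matters.
struct FlushGate {
  int      ops_in_flight = 0;
  uint64_t lowest_flushing_gen = 0;

  bool can_flush(uint64_t entry_gen, bool invalidating) const {
    if (invalidating) {
      return true;
    }
    return ops_in_flight == 0 || entry_gen <= lowest_flushing_gen;
  }
};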
- */ - - if (m_flush_ops_in_flight && - (log_entry->ram_entry.sync_gen_number > m_lowest_flushing_sync_gen)) { - return false; - } - - return (log_entry->can_writeback() && - (m_flush_ops_in_flight <= IN_FLIGHT_FLUSH_WRITE_LIMIT) && - (m_flush_bytes_in_flight <= IN_FLIGHT_FLUSH_BYTES_LIMIT)); -} - -template -Context* AbstractWriteLog::construct_flush_entry_ctx(std::shared_ptr log_entry) { - CephContext *cct = m_image_ctx.cct; - bool invalidating = m_invalidating; // snapshot so we behave consistently - - ldout(cct, 20) << "" << dendl; - ceph_assert(m_entry_reader_lock.is_locked()); - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - if (!m_flush_ops_in_flight || - (log_entry->ram_entry.sync_gen_number < m_lowest_flushing_sync_gen)) { - m_lowest_flushing_sync_gen = log_entry->ram_entry.sync_gen_number; - } - m_flush_ops_in_flight += 1; - /* For write same this is the bytes affected bt the flush op, not the bytes transferred */ - m_flush_bytes_in_flight += log_entry->ram_entry.write_bytes; - - /* Flush write completion action */ - Context *ctx = new LambdaContext( - [this, log_entry, invalidating](int r) { - { - std::lock_guard locker(m_lock); - if (r < 0) { - lderr(m_image_ctx.cct) << "failed to flush log entry" - << cpp_strerror(r) << dendl; - m_dirty_log_entries.push_front(log_entry); - } else { - ceph_assert(m_bytes_dirty >= log_entry->bytes_dirty()); - log_entry->set_flushed(true); - m_bytes_dirty -= log_entry->bytes_dirty(); - sync_point_writer_flushed(log_entry->get_sync_point_entry()); - ldout(m_image_ctx.cct, 20) << "flushed: " << log_entry - << " invalidating=" << invalidating - << dendl; - } - m_flush_ops_in_flight -= 1; - m_flush_bytes_in_flight -= log_entry->ram_entry.write_bytes; - wake_up(); - } - }); - /* Flush through lower cache before completing */ - ctx = new LambdaContext( - [this, ctx](int r) { - if (r < 0) { - lderr(m_image_ctx.cct) << "failed to flush log entry" - << cpp_strerror(r) << dendl; - ctx->complete(r); - } else { - m_image_writeback.aio_flush(io::FLUSH_SOURCE_WRITEBACK, ctx); - } - }); - - if (invalidating) { - return ctx; - } - return new LambdaContext( - [this, log_entry, ctx](int r) { - m_image_ctx.op_work_queue->queue(new LambdaContext( - [this, log_entry, ctx](int r) { - ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry - << " " << *log_entry << dendl; - log_entry->writeback(m_image_writeback, ctx); - }), 0); - }); -} - -template -void AbstractWriteLog::process_writeback_dirty_entries() { - CephContext *cct = m_image_ctx.cct; - bool all_clean = false; - int flushed = 0; - - ldout(cct, 20) << "Look for dirty entries" << dendl; - { - DeferredContexts post_unlock; - std::shared_lock entry_reader_locker(m_entry_reader_lock); - while (flushed < IN_FLIGHT_FLUSH_WRITE_LIMIT) { - std::lock_guard locker(m_lock); - if (m_shutting_down) { - ldout(cct, 5) << "Flush during shutdown supressed" << dendl; - /* Do flush complete only when all flush ops are finished */ - all_clean = !m_flush_ops_in_flight; - break; - } - if (m_dirty_log_entries.empty()) { - ldout(cct, 20) << "Nothing new to flush" << dendl; - /* Do flush complete only when all flush ops are finished */ - all_clean = !m_flush_ops_in_flight; - break; - } - auto candidate = m_dirty_log_entries.front(); - bool flushable = can_flush_entry(candidate); - if (flushable) { - post_unlock.add(construct_flush_entry_ctx(candidate)); - flushed++; - m_dirty_log_entries.pop_front(); - } else { - ldout(cct, 20) << "Next dirty entry isn't flushable yet" << dendl; - break; - } - } - } - - if (all_clean) { - /* All 
flushing complete, drain outside lock */ - Contexts flush_contexts; - { - std::lock_guard locker(m_lock); - flush_contexts.swap(m_flush_complete_contexts); - } - finish_contexts(m_image_ctx.cct, flush_contexts, 0); - } -} - -/** - * Update/persist the last flushed sync point in the log - */ -template -void AbstractWriteLog::persist_last_flushed_sync_gen() -{ - TOID(struct WriteLogPoolRoot) pool_root; - pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); - uint64_t flushed_sync_gen; - - std::lock_guard append_locker(m_log_append_lock); - { - std::lock_guard locker(m_lock); - flushed_sync_gen = m_flushed_sync_gen; - } - - if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) { - ldout(m_image_ctx.cct, 15) << "flushed_sync_gen in log updated from " - << D_RO(pool_root)->flushed_sync_gen << " to " - << flushed_sync_gen << dendl; - TX_BEGIN(m_log_pool) { - D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen; - } TX_ONCOMMIT { - } TX_ONABORT { - lderr(m_image_ctx.cct) << "failed to commit update of flushed sync point" << dendl; - ceph_assert(false); - } TX_FINALLY { - } TX_END; - } -} - -/* Returns true if the specified SyncPointLogEntry is considered flushed, and - * the log will be updated to reflect this. */ -template -bool AbstractWriteLog::handle_flushed_sync_point(std::shared_ptr log_entry) -{ - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - ceph_assert(log_entry); - - if ((log_entry->writes_flushed == log_entry->writes) && - log_entry->completed && log_entry->prior_sync_point_flushed && - log_entry->next_sync_point_entry) { - ldout(m_image_ctx.cct, 20) << "All writes flushed up to sync point=" - << *log_entry << dendl; - log_entry->next_sync_point_entry->prior_sync_point_flushed = true; - /* Don't move the flushed sync gen num backwards. 
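The sync-point bookkeeping around this point can be restated compactly: a sync point counts as flushed once all of its writes are written back, it has completed, and every earlier sync point is flushed; flushing it unblocks the next sync point and advances the flushed generation, which never moves backwards. An illustrative sketch, not the code itself:

#include <cstdint>
#include <memory>

struct SyncPointState {
  uint64_t gen = 0;
  uint32_t writes = 0;
  uint32_t writes_flushed = 0;
  bool completed = false;
  bool prior_flushed = false;
  std::shared_ptr<SyncPointState> next;
};

// Returns true if this sync point is now fully flushed, propagating the
// fact to the next sync point and advancing the flushed generation.
inline bool mark_if_flushed(SyncPointState& sp, uint64_t& flushed_gen) {
  if (sp.writes_flushed == sp.writes && sp.completed &&
      sp.prior_flushed && sp.next) {
    sp.next->prior_flushed = true;
    if (flushed_gen < sp.gen) {
      flushed_gen = sp.gen;          // monotonic advance only
    }
    return true;
  }
  return false;
}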
*/ - if (m_flushed_sync_gen < log_entry->ram_entry.sync_gen_number) { - m_flushed_sync_gen = log_entry->ram_entry.sync_gen_number; - } - m_async_op_tracker.start_op(); - m_work_queue.queue(new LambdaContext( - [this, log_entry](int r) { - bool handled_by_next; - { - std::lock_guard locker(m_lock); - handled_by_next = handle_flushed_sync_point(log_entry->next_sync_point_entry); - } - if (!handled_by_next) { - persist_last_flushed_sync_gen(); - } - m_async_op_tracker.finish_op(); - })); - return true; - } - return false; -} - -template -void AbstractWriteLog::sync_point_writer_flushed(std::shared_ptr log_entry) -{ - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - ceph_assert(log_entry); - log_entry->writes_flushed++; - - /* If this entry might be completely flushed, look closer */ - if ((log_entry->writes_flushed == log_entry->writes) && log_entry->completed) { - ldout(m_image_ctx.cct, 15) << "All writes flushed for sync point=" - << *log_entry << dendl; - handle_flushed_sync_point(log_entry); - } -} - -/* Make a new sync point and flush the previous during initialization, when there may or may - * not be a previous sync point */ -template -void AbstractWriteLog::init_flush_new_sync_point(DeferredContexts &later) { - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - ceph_assert(!m_initialized); /* Don't use this after init */ - - if (!m_current_sync_point) { - /* First sync point since start */ - new_sync_point(later); - } else { - flush_new_sync_point(nullptr, later); - } -} - -/** - * Begin a new sync point - */ -template -void AbstractWriteLog::new_sync_point(DeferredContexts &later) { - CephContext *cct = m_image_ctx.cct; - std::shared_ptr old_sync_point = m_current_sync_point; - std::shared_ptr new_sync_point; - ldout(cct, 20) << dendl; - - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - - /* The first time this is called, if this is a newly created log, - * this makes the first sync gen number we'll use 1. On the first - * call for a re-opened log m_current_sync_gen will be the highest - * gen number from all the sync point entries found in the re-opened - * log, and this advances to the next sync gen number. */ - ++m_current_sync_gen; - - new_sync_point = std::make_shared(m_current_sync_gen, cct); - m_current_sync_point = new_sync_point; - - /* If this log has been re-opened, old_sync_point will initially be - * nullptr, but m_current_sync_gen may not be zero. */ - if (old_sync_point) { - new_sync_point->setup_earlier_sync_point(old_sync_point, m_last_op_sequence_num); - m_perfcounter->hinc(l_librbd_rwl_syncpoint_hist, - old_sync_point->log_entry->writes, - old_sync_point->log_entry->bytes); - /* This sync point will acquire no more sub-ops. 
Activation needs - * to acquire m_lock, so defer to later*/ - later.add(new LambdaContext( - [this, old_sync_point](int r) { - old_sync_point->prior_persisted_gather_activate(); - })); - } - - new_sync_point->prior_persisted_gather_set_finisher(); - - if (old_sync_point) { - ldout(cct,6) << "new sync point = [" << *m_current_sync_point - << "], prior = [" << *old_sync_point << "]" << dendl; - } else { - ldout(cct,6) << "first sync point = [" << *m_current_sync_point - << "]" << dendl; - } -} - -template -void AbstractWriteLog::flush_new_sync_point(C_FlushRequestT *flush_req, - DeferredContexts &later) { - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - - if (!flush_req) { - m_async_null_flush_finish++; - m_async_op_tracker.start_op(); - Context *flush_ctx = new LambdaContext([this](int r) { - m_async_null_flush_finish--; - m_async_op_tracker.finish_op(); - }); - flush_req = make_flush_req(flush_ctx); - flush_req->internal = true; - } - - /* Add a new sync point. */ - new_sync_point(later); - std::shared_ptr to_append = m_current_sync_point->earlier_sync_point; - ceph_assert(to_append); - - /* This flush request will append/persist the (now) previous sync point */ - flush_req->to_append = to_append; - - /* When the m_sync_point_persist Gather completes this sync point can be - * appended. The only sub for this Gather is the finisher Context for - * m_prior_log_entries_persisted, which records the result of the Gather in - * the sync point, and completes. TODO: Do we still need both of these - * Gathers?*/ - Context * ctx = new LambdaContext([this, flush_req](int r) { - ldout(m_image_ctx.cct, 20) << "Flush req=" << flush_req - << " sync point =" << flush_req->to_append - << ". Ready to persist." << dendl; - alloc_and_dispatch_io_req(flush_req); - }); - to_append->persist_gather_set_finisher(ctx); - - /* The m_sync_point_persist Gather has all the subs it will ever have, and - * now has its finisher. If the sub is already complete, activation will - * complete the Gather. The finisher will acquire m_lock, so we'll activate - * this when we release m_lock.*/ - later.add(new LambdaContext([this, to_append](int r) { - to_append->persist_gather_activate(); - })); - - /* The flush request completes when the sync point persists */ - to_append->add_in_on_persisted_ctxs(flush_req); -} - -template -void AbstractWriteLog::flush_new_sync_point_if_needed(C_FlushRequestT *flush_req, - DeferredContexts &later) { - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - - /* If there have been writes since the last sync point ... */ - if (m_current_sync_point->log_entry->writes) { - flush_new_sync_point(flush_req, later); - } else { - /* There have been no writes to the current sync point. */ - if (m_current_sync_point->earlier_sync_point) { - /* If previous sync point hasn't completed, complete this flush - * with the earlier sync point. No alloc or dispatch needed. */ - m_current_sync_point->earlier_sync_point->on_sync_point_persisted.push_back(flush_req); - } else { - /* The previous sync point has already completed and been - * appended. The current sync point has no writes, so this flush - * has nothing to wait for. This flush completes now. */ - later.add(flush_req); - } - } -} - -/* - * RWL internal flush - will actually flush the RWL. - * - * User flushes should arrive at aio_flush(), and only flush prior - * writes to all log replicas. - * - * Librbd internal flushes will arrive at flush(invalidate=false, - * discard=false), and traverse the block guard to ensure in-flight writes are - * flushed. 
- */ -template -void AbstractWriteLog::flush_dirty_entries(Context *on_finish) { - CephContext *cct = m_image_ctx.cct; - bool all_clean; - bool flushing; - bool stop_flushing; - - { - std::lock_guard locker(m_lock); - flushing = (0 != m_flush_ops_in_flight); - all_clean = m_dirty_log_entries.empty(); - stop_flushing = (m_shutting_down); - } - - if (!flushing && (all_clean || stop_flushing)) { - /* Complete without holding m_lock */ - if (all_clean) { - ldout(cct, 20) << "no dirty entries" << dendl; - } else { - ldout(cct, 5) << "flush during shutdown suppressed" << dendl; - } - on_finish->complete(0); - } else { - if (all_clean) { - ldout(cct, 5) << "flush ops still in progress" << dendl; - } else { - ldout(cct, 20) << "dirty entries remain" << dendl; - } - std::lock_guard locker(m_lock); - /* on_finish can't be completed yet */ - m_flush_complete_contexts.push_back(new LambdaContext( - [this, on_finish](int r) { - flush_dirty_entries(on_finish); - })); - wake_up(); - } -} - -template -void AbstractWriteLog::internal_flush(bool invalidate, Context *on_finish) { - ldout(m_image_ctx.cct, 20) << "invalidate=" << invalidate << dendl; - - if (m_perfcounter) { - if (invalidate) { - m_perfcounter->inc(l_librbd_rwl_invalidate_cache, 1); - } else { - m_perfcounter->inc(l_librbd_rwl_flush, 1); - } - } - - /* May be called even if initialization fails */ - if (!m_initialized) { - ldout(m_image_ctx.cct, 05) << "never initialized" << dendl; - /* Deadlock if completed here */ - m_image_ctx.op_work_queue->queue(on_finish, 0); - return; - } - - /* Flush/invalidate must pass through block guard to ensure all layers of - * cache are consistently flush/invalidated. This ensures no in-flight write leaves - * some layers with valid regions, which may later produce inconsistent read - * results. 
*/ - GuardedRequestFunctionContext *guarded_ctx = - new GuardedRequestFunctionContext( - [this, on_finish, invalidate](GuardedRequestFunctionContext &guard_ctx) { - DeferredContexts on_exit; - ldout(m_image_ctx.cct, 20) << "cell=" << guard_ctx.cell << dendl; - ceph_assert(guard_ctx.cell); - - Context *ctx = new LambdaContext( - [this, cell=guard_ctx.cell, invalidate, on_finish](int r) { - std::lock_guard locker(m_lock); - m_invalidating = false; - ldout(m_image_ctx.cct, 6) << "Done flush/invalidating (invalidate=" - << invalidate << ")" << dendl; - if (m_log_entries.size()) { - ldout(m_image_ctx.cct, 1) << "m_log_entries.size()=" - << m_log_entries.size() << ", " - << "front()=" << *m_log_entries.front() - << dendl; - } - if (invalidate) { - ceph_assert(m_log_entries.size() == 0); - } - ceph_assert(m_dirty_log_entries.size() == 0); - m_image_ctx.op_work_queue->queue(on_finish, r); - release_guarded_request(cell); - }); - ctx = new LambdaContext( - [this, ctx, invalidate](int r) { - Context *next_ctx = ctx; - if (r < 0) { - /* Override on_finish status with this error */ - next_ctx = new LambdaContext([r, ctx](int _r) { - ctx->complete(r); - }); - } - if (invalidate) { - { - std::lock_guard locker(m_lock); - ceph_assert(m_dirty_log_entries.size() == 0); - ceph_assert(!m_invalidating); - ldout(m_image_ctx.cct, 6) << "Invalidating" << dendl; - m_invalidating = true; - } - /* Discards all RWL entries */ - while (retire_entries(MAX_ALLOC_PER_TRANSACTION)) { } - next_ctx->complete(0); - } else { - { - std::lock_guard locker(m_lock); - ceph_assert(m_dirty_log_entries.size() == 0); - ceph_assert(!m_invalidating); - } - m_image_writeback.aio_flush(io::FLUSH_SOURCE_WRITEBACK, next_ctx); - } - }); - ctx = new LambdaContext( - [this, ctx](int r) { - flush_dirty_entries(ctx); - }); - std::lock_guard locker(m_lock); - /* Even if we're throwing everything away, but we want the last entry to - * be a sync point so we can cleanly resume. - * - * Also, the blockguard only guarantees the replication of this op - * can't overlap with prior ops. It doesn't guarantee those are all - * completed and eligible for flush & retire, which we require here. - */ - auto flush_req = make_flush_req(ctx); - flush_new_sync_point_if_needed(flush_req, on_exit); - }); - detain_guarded_request(nullptr, guarded_ctx, true); -} - -template -void AbstractWriteLog::add_into_log_map(GenericWriteLogEntries &log_entries) { - m_blocks_to_log_entries.add_log_entries(log_entries); -} - -template -bool AbstractWriteLog::can_retire_entry(std::shared_ptr log_entry) { - CephContext *cct = m_image_ctx.cct; - - ldout(cct, 20) << dendl; - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - return log_entry->can_retire(); -} - -/** - * Retire up to MAX_ALLOC_PER_TRANSACTION of the oldest log entries - * that are eligible to be retired. Returns true if anything was - * retired. 
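Retirement, described below, amounts to advancing first_valid_entry around a fixed-size ring while the oldest entries qualify; the live range is always [first_valid_entry, first_free_entry). A toy sketch of that ring arithmetic (names are assumptions):

#include <cstdint>

// Toy model of the entry ring: live entries are [first_valid, first_free)
// modulo 'total', and retiring simply advances first_valid while the
// oldest entry qualifies.
struct LogRing {
  explicit LogRing(uint32_t n) : total(n) {}

  uint32_t first_valid = 0;
  uint32_t first_free = 0;
  uint32_t total;

  uint32_t live() const {
    return (first_free + total - first_valid) % total;
  }

  // Retire up to max_frees of the oldest entries the caller says are done.
  template <typename Fn>
  uint32_t retire(uint32_t max_frees, Fn can_retire) {
    uint32_t freed = 0;
    while (freed < max_frees && live() > 0 && can_retire(first_valid)) {
      first_valid = (first_valid + 1) % total;
      ++freed;
    }
    return freed;
  }
};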
- */ -template -bool AbstractWriteLog::retire_entries(const unsigned long int frees_per_tx) { - CephContext *cct = m_image_ctx.cct; - GenericLogEntriesVector retiring_entries; - uint32_t initial_first_valid_entry; - uint32_t first_valid_entry; - - std::lock_guard retire_locker(m_log_retire_lock); - ldout(cct, 20) << "Look for entries to retire" << dendl; - { - /* Entry readers can't be added while we hold m_entry_reader_lock */ - RWLock::WLocker entry_reader_locker(m_entry_reader_lock); - std::lock_guard locker(m_lock); - initial_first_valid_entry = m_first_valid_entry; - first_valid_entry = m_first_valid_entry; - auto entry = m_log_entries.front(); - while (!m_log_entries.empty() && - retiring_entries.size() < frees_per_tx && - can_retire_entry(entry)) { - if (entry->log_entry_index != first_valid_entry) { - lderr(cct) << "Retiring entry index (" << entry->log_entry_index - << ") and first valid log entry index (" << first_valid_entry - << ") must be ==." << dendl; - } - ceph_assert(entry->log_entry_index == first_valid_entry); - first_valid_entry = (first_valid_entry + 1) % m_total_log_entries; - m_log_entries.pop_front(); - retiring_entries.push_back(entry); - /* Remove entry from map so there will be no more readers */ - if ((entry->write_bytes() > 0) || (entry->bytes_dirty() > 0)) { - auto gen_write_entry = static_pointer_cast(entry); - if (gen_write_entry) { - m_blocks_to_log_entries.remove_log_entry(gen_write_entry); - } - } - entry = m_log_entries.front(); - } - } - - if (retiring_entries.size()) { - ldout(cct, 20) << "Retiring " << retiring_entries.size() << " entries" << dendl; - TOID(struct WriteLogPoolRoot) pool_root; - pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); - - utime_t tx_start; - utime_t tx_end; - /* Advance first valid entry and release buffers */ - { - uint64_t flushed_sync_gen; - std::lock_guard append_locker(m_log_append_lock); - { - std::lock_guard locker(m_lock); - flushed_sync_gen = m_flushed_sync_gen; - } - - tx_start = ceph_clock_now(); - TX_BEGIN(m_log_pool) { - if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) { - ldout(m_image_ctx.cct, 20) << "flushed_sync_gen in log updated from " - << D_RO(pool_root)->flushed_sync_gen << " to " - << flushed_sync_gen << dendl; - D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen; - } - D_RW(pool_root)->first_valid_entry = first_valid_entry; - for (auto &entry: retiring_entries) { - if (entry->write_bytes()) { - ldout(cct, 20) << "Freeing " << entry->ram_entry.write_data.oid.pool_uuid_lo - << "." 
<< entry->ram_entry.write_data.oid.off << dendl; - TX_FREE(entry->ram_entry.write_data); - } else { - ldout(cct, 20) << "Retiring non-write: " << *entry << dendl; - } - } - } TX_ONCOMMIT { - } TX_ONABORT { - lderr(cct) << "failed to commit free of" << retiring_entries.size() << " log entries (" << m_log_pool_name << ")" << dendl; - ceph_assert(false); - } TX_FINALLY { - } TX_END; - tx_end = ceph_clock_now(); - } - m_perfcounter->tinc(l_librbd_rwl_retire_tx_t, tx_end - tx_start); - m_perfcounter->hinc(l_librbd_rwl_retire_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), retiring_entries.size()); - - /* Update runtime copy of first_valid, and free entries counts */ - { - std::lock_guard locker(m_lock); - - ceph_assert(m_first_valid_entry == initial_first_valid_entry); - m_first_valid_entry = first_valid_entry; - m_free_log_entries += retiring_entries.size(); - for (auto &entry: retiring_entries) { - if (entry->write_bytes()) { - ceph_assert(m_bytes_cached >= entry->write_bytes()); - m_bytes_cached -= entry->write_bytes(); - uint64_t entry_allocation_size = entry->write_bytes(); - if (entry_allocation_size < MIN_WRITE_ALLOC_SIZE) { - entry_allocation_size = MIN_WRITE_ALLOC_SIZE; - } - ceph_assert(m_bytes_allocated >= entry_allocation_size); - m_bytes_allocated -= entry_allocation_size; - } - } - m_alloc_failed_since_retire = false; - wake_up(); - } - } else { - ldout(cct, 20) << "Nothing to retire" << dendl; - return false; - } - return true; -} - -} // namespace cache -} // namespace librbd - -template class librbd::cache::AbstractWriteLog; -template void librbd::cache::AbstractWriteLog:: \ - flush_pmem_buffer(std::vector>&); diff --git a/src/librbd/cache/AbstractWriteLog.h b/src/librbd/cache/AbstractWriteLog.h deleted file mode 100644 index 4951d0f9dd6ae..0000000000000 --- a/src/librbd/cache/AbstractWriteLog.h +++ /dev/null @@ -1,315 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG -#define CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG - -#include "common/RWLock.h" -#include "common/WorkQueue.h" -#include "common/AsyncOpTracker.h" -#include "librbd/cache/ImageCache.h" -#include "librbd/cache/ImageWriteback.h" -#include "librbd/Utils.h" -#include "librbd/BlockGuard.h" -#include "librbd/cache/Types.h" -#include "librbd/cache/rwl/LogOperation.h" -#include "librbd/cache/rwl/Request.h" -#include "librbd/cache/rwl/LogMap.h" -#include -#include - -class Context; -class SafeTimer; - -namespace librbd { - -struct ImageCtx; - -namespace cache { - -namespace rwl { - -class SyncPointLogEntry; -class GenericWriteLogEntry; -class WriteLogEntry; -class GenericLogEntry; - -typedef std::list> WriteLogEntries; -typedef std::list> GenericLogEntries; -typedef std::list> GenericWriteLogEntries; -typedef std::vector> GenericLogEntriesVector; - -typedef LogMapEntries WriteLogMapEntries; -typedef LogMap WriteLogMap; - -/**** Write log entries end ****/ - -typedef librbd::BlockGuard WriteLogGuard; - -class DeferredContexts; -template class ImageCacheState; - -template -struct C_BlockIORequest; - -template -struct C_WriteRequest; - -using GenericLogOperations = std::list; - -} // namespace rwl - - -template -class AbstractWriteLog { -public: - typedef io::Extent Extent; - typedef io::Extents Extents; - - AbstractWriteLog(ImageCtxT &image_ctx, librbd::cache::rwl::ImageCacheState* cache_state); - ~AbstractWriteLog(); - AbstractWriteLog(const AbstractWriteLog&) = delete; - AbstractWriteLog &operator=(const AbstractWriteLog&) = 
delete; - - /// IO methods - void read(Extents&& image_extents, ceph::bufferlist *bl, - int fadvise_flags, Context *on_finish); - void write(Extents&& image_extents, ceph::bufferlist&& bl, - int fadvise_flags, - Context *on_finish); - void discard(uint64_t offset, uint64_t length, - uint32_t discard_granularity_bytes, - Context *on_finish); - void flush(io::FlushSource flush_source, Context *on_finish); - void writesame(uint64_t offset, uint64_t length, - ceph::bufferlist&& bl, - int fadvise_flags, Context *on_finish); - void compare_and_write(Extents&& image_extents, - ceph::bufferlist&& cmp_bl, ceph::bufferlist&& bl, - uint64_t *mismatch_offset,int fadvise_flags, - Context *on_finish); - - /// internal state methods - void init(Context *on_finish); - void shut_down(Context *on_finish); - void invalidate(Context *on_finish); - void flush(Context *on_finish); - - using This = AbstractWriteLog; - using C_WriteRequestT = rwl::C_WriteRequest; - using C_BlockIORequestT = rwl::C_BlockIORequest; - using C_FlushRequestT = rwl::C_FlushRequest; - using C_DiscardRequestT = rwl::C_DiscardRequest; - using C_WriteSameRequestT = rwl::C_WriteSameRequest; - using C_CompAndWriteRequestT = rwl::C_CompAndWriteRequest; - - CephContext * get_context(); - void release_guarded_request(BlockGuardCell *cell); - void release_write_lanes(C_BlockIORequestT *req); - bool alloc_resources(C_BlockIORequestT *req); - template - void flush_pmem_buffer(V& ops); - void schedule_append(rwl::GenericLogOperationsVector &ops); - void schedule_append(rwl::GenericLogOperationSharedPtr op); - void schedule_flush_and_append(rwl::GenericLogOperationsVector &ops); - void flush_new_sync_point(C_FlushRequestT *flush_req, rwl::DeferredContexts &later); - std::shared_ptr get_current_sync_point() { - return m_current_sync_point; - } - bool get_persist_on_flush() { - return m_persist_on_flush; - } - void inc_last_op_sequence_num() { - m_perfcounter->inc(l_librbd_rwl_log_ops, 1); - ++m_last_op_sequence_num; - } - uint64_t get_last_op_sequence_num() { - return m_last_op_sequence_num; - } - uint64_t get_current_sync_gen() { - return m_current_sync_gen; - } - unsigned int get_free_lanes() { - return m_free_lanes; - } - uint32_t get_free_log_entries() { - return m_free_log_entries; - } - void add_into_log_map(rwl::GenericWriteLogEntries &log_entries); -protected: - typedef std::list *> C_WriteRequests; - typedef std::list *> C_BlockIORequests; - - BlockGuardCell* detain_guarded_request_helper(rwl::GuardedRequest &req); - BlockGuardCell* detain_guarded_request_barrier_helper(rwl::GuardedRequest &req); - void detain_guarded_request(C_BlockIORequestT *request, - rwl::GuardedRequestFunctionContext *guarded_ctx, - bool is_barrier); - - librbd::cache::rwl::ImageCacheState* m_cache_state = nullptr; - - std::atomic m_initialized = {false}; - std::atomic m_shutting_down = {false}; - std::atomic m_invalidating = {false}; - PMEMobjpool *m_log_pool = nullptr; - const char* m_rwl_pool_layout_name; - - ImageCtxT &m_image_ctx; - - std::string m_log_pool_name; - bool m_log_is_poolset = false; - uint64_t m_log_pool_config_size; /* Configured size of RWL */ - uint64_t m_log_pool_actual_size = 0; /* Actual size of RWL pool */ - - uint32_t m_total_log_entries = 0; - uint32_t m_free_log_entries = 0; - - std::atomic m_bytes_allocated = {0}; /* Total bytes allocated in write buffers */ - uint64_t m_bytes_cached = 0; /* Total bytes used in write buffers */ - uint64_t m_bytes_dirty = 0; /* Total bytes yet to flush to RBD */ - uint64_t m_bytes_allocated_cap = 0; - - 
utime_t m_last_alloc_fail; /* Entry or buffer allocation fail seen */ - std::atomic m_alloc_failed_since_retire = {false}; - - ImageWriteback m_image_writeback; - rwl::WriteLogGuard m_write_log_guard; - /* - * When m_first_free_entry == m_first_valid_entry, the log is - * empty. There is always at least one free entry, which can't be - * used. - */ - uint64_t m_first_free_entry = 0; /* Entries from here to m_first_valid_entry-1 are free */ - uint64_t m_first_valid_entry = 0; /* Entries from here to m_first_free_entry-1 are valid */ - - /* Starts at 0 for a new write log. Incremented on every flush. */ - uint64_t m_current_sync_gen = 0; - /* Starts at 0 on each sync gen increase. Incremented before applied - to an operation */ - uint64_t m_last_op_sequence_num = 0; - /* All writes bearing this and all prior sync gen numbers are flushed */ - uint64_t m_flushed_sync_gen = 0; - - bool m_persist_on_write_until_flush = true; - - AsyncOpTracker m_async_op_tracker; - /* Debug counters for the places m_async_op_tracker is used */ - std::atomic m_async_flush_ops = {0}; - std::atomic m_async_append_ops = {0}; - std::atomic m_async_complete_ops = {0}; - std::atomic m_async_null_flush_finish = {0}; - std::atomic m_async_process_work = {0}; - - /* Acquire locks in order declared here */ - - mutable ceph::mutex m_log_retire_lock; - /* Hold a read lock on m_entry_reader_lock to add readers to log entry - * bufs. Hold a write lock to prevent readers from being added (e.g. when - * removing log entrys from the map). No lock required to remove readers. */ - mutable RWLock m_entry_reader_lock; - /* Hold m_deferred_dispatch_lock while consuming from m_deferred_ios. */ - mutable ceph::mutex m_deferred_dispatch_lock; - /* Hold m_log_append_lock while appending or retiring log entries. */ - mutable ceph::mutex m_log_append_lock; - /* Used for most synchronization */ - mutable ceph::mutex m_lock; - - /* Used in release/detain to make BlockGuard preserve submission order */ - mutable ceph::mutex m_blockguard_lock; - - /* Use m_blockguard_lock for the following 3 things */ - rwl::WriteLogGuard::BlockOperations m_awaiting_barrier; - bool m_barrier_in_progress = false; - BlockGuardCell *m_barrier_cell = nullptr; - - bool m_wake_up_requested = false; - bool m_wake_up_scheduled = false; - bool m_wake_up_enabled = true; - bool m_appending = false; - bool m_dispatching_deferred_ops = false; - - Contexts m_flush_complete_contexts; - - rwl::GenericLogOperations m_ops_to_flush; /* Write ops needing flush in local log */ - rwl::GenericLogOperations m_ops_to_append; /* Write ops needing event append in local log */ - - rwl::WriteLogMap m_blocks_to_log_entries; - - /* New entries are at the back. 
Oldest at the front */ - rwl::GenericLogEntries m_log_entries; - rwl::GenericLogEntries m_dirty_log_entries; - - PerfCounters *m_perfcounter = nullptr; - - std::shared_ptr m_current_sync_point = nullptr; - bool m_persist_on_flush = false; /* If false, persist each write before completion */ - - int m_flush_ops_in_flight = 0; - int m_flush_bytes_in_flight = 0; - uint64_t m_lowest_flushing_sync_gen = 0; - - /* Writes that have left the block guard, but are waiting for resources */ - C_BlockIORequests m_deferred_ios; - /* Throttle writes concurrently allocating & replicating */ - unsigned int m_free_lanes = rwl::MAX_CONCURRENT_WRITES; - unsigned int m_unpublished_reserves = 0; - - /* Initialized from config, then set false during shutdown */ - std::atomic m_periodic_stats_enabled = {false}; - SafeTimer *m_timer = nullptr; /* Used with m_timer_lock */ - mutable ceph::mutex *m_timer_lock = nullptr; /* Used with and by m_timer */ - Context *m_timer_ctx = nullptr; - - ThreadPool m_thread_pool; - ContextWQ m_work_queue; - - uint32_t m_discard_granularity_bytes; - - void perf_start(const std::string name); - void perf_stop(); - void log_perf(); - void periodic_stats(); - void arm_periodic_stats(); - - void rwl_init(Context *on_finish, rwl::DeferredContexts &later); - void update_image_cache_state(Context *on_finish); - void load_existing_entries(rwl::DeferredContexts &later); - void wake_up(); - void process_work(); - - void flush_dirty_entries(Context *on_finish); - bool can_flush_entry(const std::shared_ptr log_entry); - Context *construct_flush_entry_ctx(const std::shared_ptr log_entry); - void persist_last_flushed_sync_gen(); - bool handle_flushed_sync_point(std::shared_ptr log_entry); - void sync_point_writer_flushed(std::shared_ptr log_entry); - void process_writeback_dirty_entries(); - bool can_retire_entry(const std::shared_ptr log_entry); - bool retire_entries(const unsigned long int frees_per_tx); - - void init_flush_new_sync_point(rwl::DeferredContexts &later); - void new_sync_point(rwl::DeferredContexts &later); - rwl::C_FlushRequest>* make_flush_req(Context *on_finish); - void flush_new_sync_point_if_needed(C_FlushRequestT *flush_req, rwl::DeferredContexts &later); - - void dispatch_deferred_writes(void); - void alloc_and_dispatch_io_req(C_BlockIORequestT *write_req); - void append_scheduled_ops(void); - void enlist_op_appender(); - void schedule_append(rwl::GenericLogOperations &ops); - void flush_then_append_scheduled_ops(void); - void enlist_op_flusher(); - void alloc_op_log_entries(rwl::GenericLogOperations &ops); - void flush_op_log_entries(rwl::GenericLogOperationsVector &ops); - int append_op_log_entries(rwl::GenericLogOperations &ops); - void complete_op_log_entries(rwl::GenericLogOperations &&ops, const int r); - void schedule_complete_op_log_entries(rwl::GenericLogOperations &&ops, const int r); - void internal_flush(bool invalidate, Context *on_finish); -}; - -} // namespace cache -} // namespace librbd - -extern template class librbd::cache::AbstractWriteLog; - -#endif // CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG diff --git a/src/librbd/cache/ReplicatedWriteLog.cc b/src/librbd/cache/ReplicatedWriteLog.cc deleted file mode 100644 index bb67d96f4ba40..0000000000000 --- a/src/librbd/cache/ReplicatedWriteLog.cc +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// // vim: ts=8 sw=2 smarttab - -#include "ReplicatedWriteLog.h" -#include "include/buffer.h" -#include "include/Context.h" -#include "include/ceph_assert.h" -#include 
"common/deleter.h" -#include "common/dout.h" -#include "common/environment.h" -#include "common/errno.h" -#include "common/WorkQueue.h" -#include "common/Timer.h" -#include "common/perf_counters.h" -#include "librbd/ImageCtx.h" -#include "librbd/cache/rwl/ImageCacheState.h" -#include "librbd/cache/rwl/LogEntry.h" -#include -#include - -#undef dout_subsys -#define dout_subsys ceph_subsys_rbd_rwl -#undef dout_prefix -#define dout_prefix *_dout << "librbd::cache::ReplicatedWriteLog: " << this << " " \ - << __func__ << ": " - -namespace librbd { - namespace cache { - - using namespace librbd::cache::rwl; - - template - ReplicatedWriteLog::ReplicatedWriteLog(I &image_ctx, librbd::cache::rwl::ImageCacheState* cache_state) - : AbstractWriteLog(image_ctx, cache_state) - { - } - - - } // namespace cache -} // namespace librbd - -template class librbd::cache::ReplicatedWriteLog; diff --git a/src/librbd/cache/ReplicatedWriteLog.h b/src/librbd/cache/ReplicatedWriteLog.h deleted file mode 100644 index cd1963fe0acb6..0000000000000 --- a/src/librbd/cache/ReplicatedWriteLog.h +++ /dev/null @@ -1,58 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG -#define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG - -#include "common/RWLock.h" -#include "common/WorkQueue.h" -#include "common/AsyncOpTracker.h" -#include "librbd/cache/ImageCache.h" -#include "librbd/cache/ImageWriteback.h" -#include "librbd/Utils.h" -#include "librbd/BlockGuard.h" -#include "librbd/cache/Types.h" -#include "librbd/cache/rwl/LogOperation.h" -#include "librbd/cache/rwl/Request.h" -#include "librbd/cache/rwl/LogMap.h" -#include "AbstractWriteLog.h" -#include -#include - -class Context; -class SafeTimer; - -namespace librbd { - -struct ImageCtx; - -namespace cache { - -template -class ReplicatedWriteLog : public AbstractWriteLog { -public: - typedef io::Extent Extent; - typedef io::Extents Extents; - - ReplicatedWriteLog(ImageCtxT &image_ctx, librbd::cache::rwl::ImageCacheState* cache_state); - ~ReplicatedWriteLog(); - ReplicatedWriteLog(const ReplicatedWriteLog&) = delete; - ReplicatedWriteLog &operator=(const ReplicatedWriteLog&) = delete; - -private: - using This = AbstractWriteLog; - using C_WriteRequestT = rwl::C_WriteRequest; - using C_BlockIORequestT = rwl::C_BlockIORequest; - using C_FlushRequestT = rwl::C_FlushRequest; - using C_DiscardRequestT = rwl::C_DiscardRequest; - using C_WriteSameRequestT = rwl::C_WriteSameRequest; - using C_CompAndWriteRequestT = rwl::C_CompAndWriteRequest; - -}; - -} // namespace cache -} // namespace librbd - -extern template class librbd::cache::ReplicatedWriteLog; - -#endif // CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG diff --git a/src/librbd/cache/Utils.h b/src/librbd/cache/Utils.h index e69b09b3aaefe..827dcc5daa6a5 100644 --- a/src/librbd/cache/Utils.h +++ b/src/librbd/cache/Utils.h @@ -12,7 +12,7 @@ namespace cache { namespace util { template -bool is_rwl_enabled(T& image_ctx) { +bool is_pwl_enabled(T& image_ctx) { #if defined(WITH_RBD_RWL) return image_ctx.config.template get_val("rbd_rwl_enabled"); #else diff --git a/src/librbd/cache/WriteLogCache.cc b/src/librbd/cache/WriteLogCache.cc index 310bb5fb1503f..97b4125041d8e 100644 --- a/src/librbd/cache/WriteLogCache.cc +++ b/src/librbd/cache/WriteLogCache.cc @@ -2,11 +2,11 @@ // vim: ts=8 sw=2 smarttab #include "WriteLogCache.h" -#include "ReplicatedWriteLog.h" -#include "librbd/cache/rwl/ImageCacheState.h" +#include 
"librbd/cache/pwl/ReplicatedWriteLog.h" +#include "librbd/cache/pwl/ImageCacheState.h" #undef dout_subsys -#define dout_subsys ceph_subsys_rbd_rwl +#define dout_subsys ceph_subsys_rbd_pwl #undef dout_prefix #define dout_prefix *_dout << "librbd::cache::WriteLogCache: " << this << " " \ << __func__ << ": " @@ -14,14 +14,14 @@ namespace librbd { namespace cache { -using namespace librbd::cache::rwl; +using namespace librbd::cache::pwl; typedef WriteLogCache::Extent Extent; typedef WriteLogCache::Extents Extents; template -WriteLogCache::WriteLogCache(I &image_ctx, librbd::cache::rwl::ImageCacheState* cache_state) { - m_write_log = new ReplicatedWriteLog(image_ctx, cache_state); +WriteLogCache::WriteLogCache(I &image_ctx, librbd::cache::pwl::ImageCacheState* cache_state) { + m_write_log = new librbd::cache::pwl::ReplicatedWriteLog(image_ctx, cache_state); } template diff --git a/src/librbd/cache/WriteLogCache.h b/src/librbd/cache/WriteLogCache.h index 0f41955a4940e..5fe77a396be73 100644 --- a/src/librbd/cache/WriteLogCache.h +++ b/src/librbd/cache/WriteLogCache.h @@ -6,21 +6,14 @@ #include "librbd/cache/ImageCache.h" -class Context; -class SafeTimer; - -class Context; -class SafeTimer; - namespace librbd { struct ImageCtx; namespace cache { +namespace pwl { template class AbstractWriteLog; - -namespace rwl { template class ImageCacheState; } @@ -30,7 +23,7 @@ public: using typename ImageCache::Extent; using typename ImageCache::Extents; - WriteLogCache(ImageCtxT &image_ctx, librbd::cache::rwl::ImageCacheState* cache_state); + WriteLogCache(ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState* cache_state); ~WriteLogCache(); WriteLogCache(const WriteLogCache&) = delete; WriteLogCache &operator=(const WriteLogCache&) = delete; @@ -58,7 +51,7 @@ public: void invalidate(Context *on_finish) override; void flush(Context *on_finish) override; - AbstractWriteLog *m_write_log; + librbd::cache::pwl::AbstractWriteLog *m_write_log; }; } // namespace cache diff --git a/src/librbd/cache/pwl/AbstractWriteLog.cc b/src/librbd/cache/pwl/AbstractWriteLog.cc new file mode 100644 index 0000000000000..8bb111ed5c5fb --- /dev/null +++ b/src/librbd/cache/pwl/AbstractWriteLog.cc @@ -0,0 +1,2771 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include "AbstractWriteLog.h" +#include "include/buffer.h" +#include "include/Context.h" +#include "include/ceph_assert.h" +#include "common/deleter.h" +#include "common/dout.h" +#include "common/environment.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "common/Timer.h" +#include "common/perf_counters.h" +#include "librbd/ImageCtx.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/cache/pwl/ImageCacheState.h" +#include "librbd/cache/pwl/LogEntry.h" +#include "librbd/cache/pwl/ReadRequest.h" +#include "librbd/cache/pwl/Types.h" +#include +#include + +#undef dout_subsys +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::AbstractWriteLog: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +using namespace librbd::cache::pwl; + +typedef AbstractWriteLog::Extent Extent; +typedef AbstractWriteLog::Extents Extents; + +const unsigned long int OPS_APPENDED_TOGETHER = MAX_ALLOC_PER_TRANSACTION; + +template +AbstractWriteLog::AbstractWriteLog(I &image_ctx, librbd::cache::pwl::ImageCacheState* cache_state) + : m_cache_state(cache_state), + 
m_pwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_pwl)), + m_image_ctx(image_ctx), + m_log_pool_config_size(DEFAULT_POOL_SIZE), + m_image_writeback(image_ctx), m_write_log_guard(image_ctx.cct), + m_log_retire_lock(ceph::make_mutex(util::unique_lock_name( + "librbd::cache::pwl::AbstractWriteLog::m_log_retire_lock", this))), + m_entry_reader_lock("librbd::cache::pwl::AbstractWriteLog::m_entry_reader_lock"), + m_deferred_dispatch_lock(ceph::make_mutex(util::unique_lock_name( + "librbd::cache::pwl::AbstractWriteLog::m_deferred_dispatch_lock", this))), + m_log_append_lock(ceph::make_mutex(util::unique_lock_name( + "librbd::cache::pwl::AbstractWriteLog::m_log_append_lock", this))), + m_lock(ceph::make_mutex(util::unique_lock_name( + "librbd::cache::pwl::AbstractWriteLog::m_lock", this))), + m_blockguard_lock(ceph::make_mutex(util::unique_lock_name( + "librbd::cache::pwl::AbstractWriteLog::m_blockguard_lock", this))), + m_blocks_to_log_entries(image_ctx.cct), + m_thread_pool(image_ctx.cct, "librbd::cache::pwl::AbstractWriteLog::thread_pool", "tp_pwl", + 4, + ""), + m_work_queue("librbd::cache::pwl::ReplicatedWriteLog::work_queue", + ceph::make_timespan( + image_ctx.config.template get_val( + "rbd_op_thread_timeout")), + &m_thread_pool) +{ + CephContext *cct = m_image_ctx.cct; + ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock); +} + +template +AbstractWriteLog::~AbstractWriteLog() { + ldout(m_image_ctx.cct, 15) << "enter" << dendl; + { + std::lock_guard timer_locker(*m_timer_lock); + std::lock_guard locker(m_lock); + m_timer->cancel_event(m_timer_ctx); + m_thread_pool.stop(); + ceph_assert(m_deferred_ios.size() == 0); + ceph_assert(m_ops_to_flush.size() == 0); + ceph_assert(m_ops_to_append.size() == 0); + ceph_assert(m_flush_ops_in_flight == 0); + + m_log_pool = nullptr; + delete m_cache_state; + m_cache_state = nullptr; + } + ldout(m_image_ctx.cct, 15) << "exit" << dendl; +} + +template +void AbstractWriteLog::perf_start(std::string name) { + PerfCountersBuilder plb(m_image_ctx.cct, name, l_librbd_pwl_first, l_librbd_pwl_last); + + // Latency axis configuration for op histograms, values are in nanoseconds + PerfHistogramCommon::axis_config_d op_hist_x_axis_config{ + "Latency (nsec)", + PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale + 0, ///< Start at 0 + 5000, ///< Quantization unit is 5usec + 16, ///< Ranges into the mS + }; + + // Syncpoint logentry number x-axis configuration for op histograms + PerfHistogramCommon::axis_config_d sp_logentry_number_config{ + "logentry number", + PerfHistogramCommon::SCALE_LINEAR, // log entry number in linear scale + 0, // Start at 0 + 1, // Quantization unit is 1 + 260, // Up to 260 > (MAX_WRITES_PER_SYNC_POINT) + }; + + // Syncpoint bytes number y-axis configuration for op histogram + PerfHistogramCommon::axis_config_d sp_bytes_number_config{ + "Number of SyncPoint", + PerfHistogramCommon::SCALE_LOG2, // Request size in logarithmic scale + 0, // Start at 0 + 512, // Quantization unit is 512 + 17, // Writes up to 8M >= MAX_BYTES_PER_SYNC_POINT + }; + + // Op size axis configuration for op histogram y axis, values are in bytes + PerfHistogramCommon::axis_config_d op_hist_y_axis_config{ + "Request size (bytes)", + PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale + 0, ///< Start at 0 + 512, ///< Quantization unit is 512 bytes + 16, ///< Writes up to >32k + }; + + // Num items configuration for op histogram y axis, values are in items + PerfHistogramCommon::axis_config_d op_hist_y_axis_count_config{ + "Number of 
items", + PerfHistogramCommon::SCALE_LINEAR, ///< Request size in linear scale + 0, ///< Start at 0 + 1, ///< Quantization unit is 1 + 32, ///< Writes up to >32k + }; + + plb.add_u64_counter(l_librbd_pwl_rd_req, "rd", "Reads"); + plb.add_u64_counter(l_librbd_pwl_rd_bytes, "rd_bytes", "Data size in reads"); + plb.add_time_avg(l_librbd_pwl_rd_latency, "rd_latency", "Latency of reads"); + + plb.add_u64_counter(l_librbd_pwl_rd_hit_req, "hit_rd", "Reads completely hitting RWL"); + plb.add_u64_counter(l_librbd_pwl_rd_hit_bytes, "rd_hit_bytes", "Bytes read from RWL"); + plb.add_time_avg(l_librbd_pwl_rd_hit_latency, "hit_rd_latency", "Latency of read hits"); + + plb.add_u64_counter(l_librbd_pwl_rd_part_hit_req, "part_hit_rd", "reads partially hitting RWL"); + + plb.add_u64_counter_histogram( + l_librbd_pwl_syncpoint_hist, "syncpoint_logentry_bytes_histogram", + sp_logentry_number_config, sp_bytes_number_config, + "Histogram of syncpoint's logentry numbers vs bytes number"); + + plb.add_u64_counter(l_librbd_pwl_wr_req, "wr", "Writes"); + plb.add_u64_counter(l_librbd_pwl_wr_req_def, "wr_def", "Writes deferred for resources"); + plb.add_u64_counter(l_librbd_pwl_wr_req_def_lanes, "wr_def_lanes", "Writes deferred for lanes"); + plb.add_u64_counter(l_librbd_pwl_wr_req_def_log, "wr_def_log", "Writes deferred for log entries"); + plb.add_u64_counter(l_librbd_pwl_wr_req_def_buf, "wr_def_buf", "Writes deferred for buffers"); + plb.add_u64_counter(l_librbd_pwl_wr_req_overlap, "wr_overlap", "Writes overlapping with prior in-progress writes"); + plb.add_u64_counter(l_librbd_pwl_wr_req_queued, "wr_q_barrier", "Writes queued for prior barriers (aio_flush)"); + plb.add_u64_counter(l_librbd_pwl_wr_bytes, "wr_bytes", "Data size in writes"); + + plb.add_u64_counter(l_librbd_pwl_log_ops, "log_ops", "Log appends"); + plb.add_u64_avg(l_librbd_pwl_log_op_bytes, "log_op_bytes", "Average log append bytes"); + + plb.add_time_avg( + l_librbd_pwl_req_arr_to_all_t, "req_arr_to_all_t", + "Average arrival to allocation time (time deferred for overlap)"); + plb.add_time_avg( + l_librbd_pwl_req_arr_to_dis_t, "req_arr_to_dis_t", + "Average arrival to dispatch time (includes time deferred for overlaps and allocation)"); + plb.add_time_avg( + l_librbd_pwl_req_all_to_dis_t, "req_all_to_dis_t", + "Average allocation to dispatch time (time deferred for log resources)"); + plb.add_time_avg( + l_librbd_pwl_wr_latency, "wr_latency", + "Latency of writes (persistent completion)"); + plb.add_u64_counter_histogram( + l_librbd_pwl_wr_latency_hist, "wr_latency_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of write request latency (nanoseconds) vs. 
bytes written"); + plb.add_time_avg( + l_librbd_pwl_wr_caller_latency, "caller_wr_latency", + "Latency of write completion to caller"); + plb.add_time_avg( + l_librbd_pwl_nowait_req_arr_to_all_t, "req_arr_to_all_nw_t", + "Average arrival to allocation time (time deferred for overlap)"); + plb.add_time_avg( + l_librbd_pwl_nowait_req_arr_to_dis_t, "req_arr_to_dis_nw_t", + "Average arrival to dispatch time (includes time deferred for overlaps and allocation)"); + plb.add_time_avg( + l_librbd_pwl_nowait_req_all_to_dis_t, "req_all_to_dis_nw_t", + "Average allocation to dispatch time (time deferred for log resources)"); + plb.add_time_avg( + l_librbd_pwl_nowait_wr_latency, "wr_latency_nw", + "Latency of writes (persistent completion) not deferred for free space"); + plb.add_u64_counter_histogram( + l_librbd_pwl_nowait_wr_latency_hist, "wr_latency_nw_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of write request latency (nanoseconds) vs. bytes written for writes not deferred for free space"); + plb.add_time_avg( + l_librbd_pwl_nowait_wr_caller_latency, "caller_wr_latency_nw", + "Latency of write completion to callerfor writes not deferred for free space"); + plb.add_time_avg(l_librbd_pwl_log_op_alloc_t, "op_alloc_t", "Average buffer pmemobj_reserve() time"); + plb.add_u64_counter_histogram( + l_librbd_pwl_log_op_alloc_t_hist, "op_alloc_t_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of buffer pmemobj_reserve() time (nanoseconds) vs. bytes written"); + plb.add_time_avg(l_librbd_pwl_log_op_dis_to_buf_t, "op_dis_to_buf_t", "Average dispatch to buffer persist time"); + plb.add_time_avg(l_librbd_pwl_log_op_dis_to_app_t, "op_dis_to_app_t", "Average dispatch to log append time"); + plb.add_time_avg(l_librbd_pwl_log_op_dis_to_cmp_t, "op_dis_to_cmp_t", "Average dispatch to persist completion time"); + plb.add_u64_counter_histogram( + l_librbd_pwl_log_op_dis_to_cmp_t_hist, "op_dis_to_cmp_t_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of op dispatch to persist complete time (nanoseconds) vs. bytes written"); + + plb.add_time_avg( + l_librbd_pwl_log_op_buf_to_app_t, "op_buf_to_app_t", + "Average buffer persist to log append time (write data persist/replicate + wait for append time)"); + plb.add_time_avg( + l_librbd_pwl_log_op_buf_to_bufc_t, "op_buf_to_bufc_t", + "Average buffer persist time (write data persist/replicate time)"); + plb.add_u64_counter_histogram( + l_librbd_pwl_log_op_buf_to_bufc_t_hist, "op_buf_to_bufc_t_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of write buffer persist time (nanoseconds) vs. bytes written"); + plb.add_time_avg( + l_librbd_pwl_log_op_app_to_cmp_t, "op_app_to_cmp_t", + "Average log append to persist complete time (log entry append/replicate + wait for complete time)"); + plb.add_time_avg( + l_librbd_pwl_log_op_app_to_appc_t, "op_app_to_appc_t", + "Average log append to persist complete time (log entry append/replicate time)"); + plb.add_u64_counter_histogram( + l_librbd_pwl_log_op_app_to_appc_t_hist, "op_app_to_appc_t_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of log append persist time (nanoseconds) (vs. 
op bytes)"); + + plb.add_u64_counter(l_librbd_pwl_discard, "discard", "Discards"); + plb.add_u64_counter(l_librbd_pwl_discard_bytes, "discard_bytes", "Bytes discarded"); + plb.add_time_avg(l_librbd_pwl_discard_latency, "discard_lat", "Discard latency"); + + plb.add_u64_counter(l_librbd_pwl_aio_flush, "aio_flush", "AIO flush (flush to RWL)"); + plb.add_u64_counter(l_librbd_pwl_aio_flush_def, "aio_flush_def", "AIO flushes deferred for resources"); + plb.add_time_avg(l_librbd_pwl_aio_flush_latency, "aio_flush_lat", "AIO flush latency"); + + plb.add_u64_counter(l_librbd_pwl_ws,"ws", "Write Sames"); + plb.add_u64_counter(l_librbd_pwl_ws_bytes, "ws_bytes", "Write Same bytes to image"); + plb.add_time_avg(l_librbd_pwl_ws_latency, "ws_lat", "Write Same latency"); + + plb.add_u64_counter(l_librbd_pwl_cmp, "cmp", "Compare and Write requests"); + plb.add_u64_counter(l_librbd_pwl_cmp_bytes, "cmp_bytes", "Compare and Write bytes compared/written"); + plb.add_time_avg(l_librbd_pwl_cmp_latency, "cmp_lat", "Compare and Write latecy"); + plb.add_u64_counter(l_librbd_pwl_cmp_fails, "cmp_fails", "Compare and Write compare fails"); + + plb.add_u64_counter(l_librbd_pwl_flush, "flush", "Flush (flush RWL)"); + plb.add_u64_counter(l_librbd_pwl_invalidate_cache, "invalidate", "Invalidate RWL"); + plb.add_u64_counter(l_librbd_pwl_invalidate_discard_cache, "discard", "Discard and invalidate RWL"); + + plb.add_time_avg(l_librbd_pwl_append_tx_t, "append_tx_lat", "Log append transaction latency"); + plb.add_u64_counter_histogram( + l_librbd_pwl_append_tx_t_hist, "append_tx_lat_histogram", + op_hist_x_axis_config, op_hist_y_axis_count_config, + "Histogram of log append transaction time (nanoseconds) vs. entries appended"); + plb.add_time_avg(l_librbd_pwl_retire_tx_t, "retire_tx_lat", "Log retire transaction latency"); + plb.add_u64_counter_histogram( + l_librbd_pwl_retire_tx_t_hist, "retire_tx_lat_histogram", + op_hist_x_axis_config, op_hist_y_axis_count_config, + "Histogram of log retire transaction time (nanoseconds) vs. 
entries retired"); + + m_perfcounter = plb.create_perf_counters(); + m_image_ctx.cct->get_perfcounters_collection()->add(m_perfcounter); +} + +template +void AbstractWriteLog::perf_stop() { + ceph_assert(m_perfcounter); + m_image_ctx.cct->get_perfcounters_collection()->remove(m_perfcounter); + delete m_perfcounter; +} + +template +void AbstractWriteLog::log_perf() { + bufferlist bl; + Formatter *f = Formatter::create("json-pretty"); + bl.append("Perf dump follows\n--- Begin perf dump ---\n"); + bl.append("{\n"); + stringstream ss; + utime_t now = ceph_clock_now(); + ss << "\"test_time\": \"" << now << "\","; + ss << "\"image\": \"" << m_image_ctx.name << "\","; + bl.append(ss); + bl.append("\"stats\": "); + m_image_ctx.cct->get_perfcounters_collection()->dump_formatted(f, 0); + f->flush(bl); + bl.append(",\n\"histograms\": "); + m_image_ctx.cct->get_perfcounters_collection()->dump_formatted_histograms(f, 0); + f->flush(bl); + delete f; + bl.append("}\n--- End perf dump ---\n"); + bl.append('\0'); + ldout(m_image_ctx.cct, 1) << bl.c_str() << dendl; +} + +template +void AbstractWriteLog::periodic_stats() { + std::lock_guard locker(m_lock); + ldout(m_image_ctx.cct, 1) << "STATS: " + << "m_free_log_entries=" << m_free_log_entries << ", " + << "m_log_entries=" << m_log_entries.size() << ", " + << "m_dirty_log_entries=" << m_dirty_log_entries.size() << ", " + << "m_bytes_allocated=" << m_bytes_allocated << ", " + << "m_bytes_cached=" << m_bytes_cached << ", " + << "m_bytes_dirty=" << m_bytes_dirty << ", " + << "bytes available=" << m_bytes_allocated_cap - m_bytes_allocated << ", " + << "m_current_sync_gen=" << m_current_sync_gen << ", " + << "m_flushed_sync_gen=" << m_flushed_sync_gen << ", " + << dendl; +} + +template +void AbstractWriteLog::arm_periodic_stats() { + ceph_assert(ceph_mutex_is_locked(*m_timer_lock)); + if (m_periodic_stats_enabled) { + m_timer_ctx = new LambdaContext( + [this](int r) { + /* m_timer_lock is held */ + periodic_stats(); + arm_periodic_stats(); + }); + m_timer->add_event_after(LOG_STATS_INTERVAL_SECONDS, m_timer_ctx); + } +} + +/* + * Loads the log entries from an existing log. + * + * Creates the in-memory structures to represent the state of the + * re-opened log. + * + * Finds the last appended sync point, and any sync points referred to + * in log entries, but missing from the log. These missing sync points + * are created and scheduled for append. Some rudimentary consistency + * checking is done. + * + * Rebuilds the m_blocks_to_log_entries map, to make log entries + * readable. + * + * Places all writes on the dirty entries list, which causes them all + * to be flushed. + * + */ +template +void AbstractWriteLog::load_existing_entries(DeferredContexts &later) { + TOID(struct WriteLogPoolRoot) pool_root; + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + struct WriteLogPmemEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries); + uint64_t entry_index = m_first_valid_entry; + /* The map below allows us to find sync point log entries by sync + * gen number, which is necessary so write entries can be linked to + * their sync points. */ + std::map> sync_point_entries; + /* The map below tracks sync points referred to in writes but not + * appearing in the sync_point_entries map. We'll use this to + * determine which sync points are missing and need to be + * created. */ + std::map missing_sync_points; + + /* + * Read the existing log entries. Construct an in-memory log entry + * object of the appropriate type for each. 
Add these to the global + * log entries list. + * + * Write entries will not link to their sync points yet. We'll do + * that in the next pass. Here we'll accumulate a map of sync point + * gen numbers that are referred to in writes but do not appearing in + * the log. + */ + while (entry_index != m_first_free_entry) { + WriteLogPmemEntry *pmem_entry = &pmem_log_entries[entry_index]; + std::shared_ptr log_entry = nullptr; + bool writer = pmem_entry->is_writer(); + + ceph_assert(pmem_entry->entry_index == entry_index); + if (pmem_entry->is_sync_point()) { + ldout(m_image_ctx.cct, 20) << "Entry " << entry_index + << " is a sync point. pmem_entry=[" << *pmem_entry << "]" << dendl; + auto sync_point_entry = std::make_shared(pmem_entry->sync_gen_number); + log_entry = sync_point_entry; + sync_point_entries[pmem_entry->sync_gen_number] = sync_point_entry; + missing_sync_points.erase(pmem_entry->sync_gen_number); + m_current_sync_gen = pmem_entry->sync_gen_number; + } else if (pmem_entry->is_write()) { + ldout(m_image_ctx.cct, 20) << "Entry " << entry_index + << " is a write. pmem_entry=[" << *pmem_entry << "]" << dendl; + auto write_entry = + std::make_shared(nullptr, pmem_entry->image_offset_bytes, pmem_entry->write_bytes); + write_entry->pmem_buffer = D_RW(pmem_entry->write_data); + log_entry = write_entry; + } else if (pmem_entry->is_writesame()) { + ldout(m_image_ctx.cct, 20) << "Entry " << entry_index + << " is a write same. pmem_entry=[" << *pmem_entry << "]" << dendl; + auto ws_entry = + std::make_shared(nullptr, pmem_entry->image_offset_bytes, + pmem_entry->write_bytes, pmem_entry->ws_datalen); + ws_entry->pmem_buffer = D_RW(pmem_entry->write_data); + log_entry = ws_entry; + } else if (pmem_entry->is_discard()) { + ldout(m_image_ctx.cct, 20) << "Entry " << entry_index + << " is a discard. pmem_entry=[" << *pmem_entry << "]" << dendl; + auto discard_entry = + std::make_shared(nullptr, pmem_entry->image_offset_bytes, pmem_entry->write_bytes, + m_discard_granularity_bytes); + log_entry = discard_entry; + } else { + lderr(m_image_ctx.cct) << "Unexpected entry type in entry " << entry_index + << ", pmem_entry=[" << *pmem_entry << "]" << dendl; + } + + if (writer) { + ldout(m_image_ctx.cct, 20) << "Entry " << entry_index + << " writes. pmem_entry=[" << *pmem_entry << "]" << dendl; + if (!sync_point_entries[pmem_entry->sync_gen_number]) { + missing_sync_points[pmem_entry->sync_gen_number] = true; + } + } + + log_entry->ram_entry = *pmem_entry; + log_entry->pmem_entry = pmem_entry; + log_entry->log_entry_index = entry_index; + log_entry->completed = true; + + m_log_entries.push_back(log_entry); + + entry_index = (entry_index + 1) % m_total_log_entries; + } + + /* Create missing sync points. These must not be appended until the + * entry reload is complete and the write map is up to + * date. Currently this is handled by the deferred contexts object + * passed to new_sync_point(). These contexts won't be completed + * until this function returns. */ + for (auto &kv : missing_sync_points) { + ldout(m_image_ctx.cct, 5) << "Adding sync point " << kv.first << dendl; + if (0 == m_current_sync_gen) { + /* The unlikely case where the log contains writing entries, but no sync + * points (e.g. 
because they were all retired) */ + m_current_sync_gen = kv.first-1; + } + ceph_assert(kv.first == m_current_sync_gen+1); + init_flush_new_sync_point(later); + ceph_assert(kv.first == m_current_sync_gen); + sync_point_entries[kv.first] = m_current_sync_point->log_entry;; + } + + /* + * Iterate over the log entries again (this time via the global + * entries list), connecting write entries to their sync points and + * updating the sync point stats. + * + * Add writes to the write log map. + */ + std::shared_ptr previous_sync_point_entry = nullptr; + for (auto &log_entry : m_log_entries) { + if ((log_entry->write_bytes() > 0) || (log_entry->bytes_dirty() > 0)) { + /* This entry is one of the types that write */ + auto gen_write_entry = static_pointer_cast(log_entry); + if (gen_write_entry) { + auto sync_point_entry = sync_point_entries[gen_write_entry->ram_entry.sync_gen_number]; + if (!sync_point_entry) { + lderr(m_image_ctx.cct) << "Sync point missing for entry=[" << *gen_write_entry << "]" << dendl; + ceph_assert(false); + } else { + gen_write_entry->sync_point_entry = sync_point_entry; + sync_point_entry->writes++; + sync_point_entry->bytes += gen_write_entry->ram_entry.write_bytes; + sync_point_entry->writes_completed++; + m_blocks_to_log_entries.add_log_entry(gen_write_entry); + /* This entry is only dirty if its sync gen number is > the flushed + * sync gen number from the root object. */ + if (gen_write_entry->ram_entry.sync_gen_number > m_flushed_sync_gen) { + m_dirty_log_entries.push_back(log_entry); + m_bytes_dirty += gen_write_entry->bytes_dirty(); + } else { + gen_write_entry->set_flushed(true); + sync_point_entry->writes_flushed++; + } + if (log_entry->write_bytes() == log_entry->bytes_dirty()) { + /* This entry is a basic write */ + uint64_t bytes_allocated = MIN_WRITE_ALLOC_SIZE; + if (gen_write_entry->ram_entry.write_bytes > bytes_allocated) { + bytes_allocated = gen_write_entry->ram_entry.write_bytes; + } + m_bytes_allocated += bytes_allocated; + m_bytes_cached += gen_write_entry->ram_entry.write_bytes; + } + } + } + } else { + /* This entry is sync point entry */ + auto sync_point_entry = static_pointer_cast(log_entry); + if (sync_point_entry) { + if (previous_sync_point_entry) { + previous_sync_point_entry->next_sync_point_entry = sync_point_entry; + if (previous_sync_point_entry->ram_entry.sync_gen_number > m_flushed_sync_gen) { + sync_point_entry->prior_sync_point_flushed = false; + ceph_assert(!previous_sync_point_entry->prior_sync_point_flushed || + (0 == previous_sync_point_entry->writes) || + (previous_sync_point_entry->writes >= previous_sync_point_entry->writes_flushed)); + } else { + sync_point_entry->prior_sync_point_flushed = true; + ceph_assert(previous_sync_point_entry->prior_sync_point_flushed); + ceph_assert(previous_sync_point_entry->writes == previous_sync_point_entry->writes_flushed); + } + previous_sync_point_entry = sync_point_entry; + } else { + /* There are no previous sync points, so we'll consider them flushed */ + sync_point_entry->prior_sync_point_flushed = true; + } + ldout(m_image_ctx.cct, 10) << "Loaded to sync point=[" << *sync_point_entry << dendl; + } + } + } + if (0 == m_current_sync_gen) { + /* If a re-opened log was completely flushed, we'll have found no sync point entries here, + * and not advanced m_current_sync_gen. Here we ensure it starts past the last flushed sync + * point recorded in the log. 
*/ + m_current_sync_gen = m_flushed_sync_gen; + } +} + +template +void AbstractWriteLog::pwl_init(Context *on_finish, DeferredContexts &later) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + TOID(struct WriteLogPoolRoot) pool_root; + ceph_assert(m_cache_state); + std::lock_guard locker(m_lock); + ceph_assert(!m_initialized); + ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl; + ldout(cct,5) << "pwl_size: " << m_cache_state->size << dendl; + std::string pwl_path = m_cache_state->path; + ldout(cct,5) << "pwl_path: " << pwl_path << dendl; + + std::string pool_name = m_image_ctx.md_ctx.get_pool_name(); + std::string log_pool_name = pwl_path + "/rbd-pwl." + pool_name + "." + m_image_ctx.id + ".pool"; + std::string log_poolset_name = pwl_path + "/rbd-pwl." + pool_name + "." + m_image_ctx.id + ".poolset"; + m_log_pool_config_size = max(m_cache_state->size, MIN_POOL_SIZE); + + if (access(log_poolset_name.c_str(), F_OK) == 0) { + m_log_pool_name = log_poolset_name; + m_log_is_poolset = true; + } else { + m_log_pool_name = log_pool_name; + ldout(cct, 5) << "Poolset file " << log_poolset_name + << " not present (or can't open). Using unreplicated pool" << dendl; + } + + if ((!m_cache_state->present) && + (access(m_log_pool_name.c_str(), F_OK) == 0)) { + ldout(cct, 5) << "There's an existing pool/poolset file " << m_log_pool_name + << ", While there's no cache in the image metatata." << dendl; + if (remove(m_log_pool_name.c_str()) != 0) { + lderr(cct) << "Failed to remove the pool/poolset file " << m_log_pool_name + << dendl; + on_finish->complete(-errno); + return; + } else { + ldout(cct, 5) << "Removed the existing pool/poolset file." << dendl; + } + } + + if (access(m_log_pool_name.c_str(), F_OK) != 0) { + if ((m_log_pool = + pmemobj_create(m_log_pool_name.c_str(), + m_pwl_pool_layout_name, + m_log_pool_config_size, + (S_IWUSR | S_IRUSR))) == NULL) { + lderr(cct) << "failed to create pool (" << m_log_pool_name << ")" + << pmemobj_errormsg() << dendl; + m_cache_state->present = false; + m_cache_state->clean = true; + m_cache_state->empty = true; + /* TODO: filter/replace errnos that are meaningless to the caller */ + on_finish->complete(-errno); + return; + } + m_cache_state->present = true; + m_cache_state->clean = true; + m_cache_state->empty = true; + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + + /* new pool, calculate and store metadata */ + size_t effective_pool_size = (size_t)(m_log_pool_config_size * USABLE_SIZE); + size_t small_write_size = MIN_WRITE_ALLOC_SIZE + BLOCK_ALLOC_OVERHEAD_BYTES + sizeof(struct WriteLogPmemEntry); + uint64_t num_small_writes = (uint64_t)(effective_pool_size / small_write_size); + if (num_small_writes > MAX_LOG_ENTRIES) { + num_small_writes = MAX_LOG_ENTRIES; + } + if (num_small_writes <= 2) { + lderr(cct) << "num_small_writes needs to > 2" << dendl; + on_finish->complete(-EINVAL); + return; + } + m_log_pool_actual_size = m_log_pool_config_size; + m_bytes_allocated_cap = effective_pool_size; + /* Log ring empty */ + m_first_free_entry = 0; + m_first_valid_entry = 0; + TX_BEGIN(m_log_pool) { + TX_ADD(pool_root); + D_RW(pool_root)->header.layout_version = RWL_POOL_VERSION; + D_RW(pool_root)->log_entries = + TX_ZALLOC(struct WriteLogPmemEntry, + sizeof(struct WriteLogPmemEntry) * num_small_writes); + D_RW(pool_root)->pool_size = m_log_pool_actual_size; + D_RW(pool_root)->flushed_sync_gen = m_flushed_sync_gen; + D_RW(pool_root)->block_size = MIN_WRITE_ALLOC_SIZE; + 
D_RW(pool_root)->num_log_entries = num_small_writes; + D_RW(pool_root)->first_free_entry = m_first_free_entry; + D_RW(pool_root)->first_valid_entry = m_first_valid_entry; + } TX_ONCOMMIT { + m_total_log_entries = D_RO(pool_root)->num_log_entries; + m_free_log_entries = D_RO(pool_root)->num_log_entries - 1; // leave one free + } TX_ONABORT { + m_total_log_entries = 0; + m_free_log_entries = 0; + lderr(cct) << "failed to initialize pool (" << m_log_pool_name << ")" << dendl; + on_finish->complete(-pmemobj_tx_errno()); + return; + } TX_FINALLY { + } TX_END; + } else { + m_cache_state->present = true; + /* Open existing pool */ + if ((m_log_pool = + pmemobj_open(m_log_pool_name.c_str(), + m_pwl_pool_layout_name)) == NULL) { + lderr(cct) << "failed to open pool (" << m_log_pool_name << "): " + << pmemobj_errormsg() << dendl; + on_finish->complete(-errno); + return; + } + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + if (D_RO(pool_root)->header.layout_version != RWL_POOL_VERSION) { + // TODO: will handle upgrading version in the future + lderr(cct) << "Pool layout version is " << D_RO(pool_root)->header.layout_version + << " expected " << RWL_POOL_VERSION << dendl; + on_finish->complete(-EINVAL); + return; + } + if (D_RO(pool_root)->block_size != MIN_WRITE_ALLOC_SIZE) { + lderr(cct) << "Pool block size is " << D_RO(pool_root)->block_size + << " expected " << MIN_WRITE_ALLOC_SIZE << dendl; + on_finish->complete(-EINVAL); + return; + } + m_log_pool_actual_size = D_RO(pool_root)->pool_size; + m_flushed_sync_gen = D_RO(pool_root)->flushed_sync_gen; + m_total_log_entries = D_RO(pool_root)->num_log_entries; + m_first_free_entry = D_RO(pool_root)->first_free_entry; + m_first_valid_entry = D_RO(pool_root)->first_valid_entry; + if (m_first_free_entry < m_first_valid_entry) { + /* Valid entries wrap around the end of the ring, so first_free is lower + * than first_valid. If first_valid was == first_free+1, the entry at + * first_free would be empty. The last entry is never used, so in + * that case there would be zero free log entries. */ + m_free_log_entries = m_total_log_entries - (m_first_valid_entry - m_first_free_entry) -1; + } else { + /* first_valid is <= first_free. If they are == we have zero valid log + * entries, and n-1 free log entries */ + m_free_log_entries = m_total_log_entries - (m_first_free_entry - m_first_valid_entry) -1; + } + size_t effective_pool_size = (size_t)(m_log_pool_config_size * USABLE_SIZE); + m_bytes_allocated_cap = effective_pool_size; + load_existing_entries(later); + m_cache_state->clean = m_dirty_log_entries.empty(); + m_cache_state->empty = m_log_entries.empty(); + } + + ldout(cct,1) << "pool " << m_log_pool_name << " has " << m_total_log_entries + << " log entries, " << m_free_log_entries << " of which are free." + << " first_valid=" << m_first_valid_entry + << ", first_free=" << m_first_free_entry + << ", flushed_sync_gen=" << m_flushed_sync_gen + << ", m_current_sync_gen=" << m_current_sync_gen << dendl; + if (m_first_free_entry == m_first_valid_entry) { + ldout(cct,1) << "write log is empty" << dendl; + m_cache_state->empty = true; + } + + /* Start the sync point following the last one seen in the + * log. Flush the last sync point created during the loading of the + * existing log entries. 
*/ + init_flush_new_sync_point(later); + ldout(cct,20) << "new sync point = [" << m_current_sync_point << "]" << dendl; + + m_initialized = true; + // Start the thread + m_thread_pool.start(); + + m_periodic_stats_enabled = m_cache_state->log_periodic_stats; + /* Do these after we drop lock */ + later.add(new LambdaContext([this](int r) { + if (m_periodic_stats_enabled) { + /* Log stats for the first time */ + periodic_stats(); + /* Arm periodic stats logging for the first time */ + std::lock_guard timer_locker(*m_timer_lock); + arm_periodic_stats(); + } + })); + m_image_ctx.op_work_queue->queue(on_finish, 0); +} + +template +void AbstractWriteLog::update_image_cache_state(Context *on_finish) { + m_cache_state->write_image_cache_state(on_finish); +} + +template +void AbstractWriteLog::init(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + perf_start(m_image_ctx.id); + + ceph_assert(!m_initialized); + + Context *ctx = new LambdaContext( + [this, on_finish](int r) { + if (r >= 0) { + update_image_cache_state(on_finish); + } else { + on_finish->complete(r); + } + }); + + DeferredContexts later; + pwl_init(ctx, later); +} + +template +void AbstractWriteLog::shut_down(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl; + + Context *ctx = new LambdaContext( + [this, on_finish](int r) { + ldout(m_image_ctx.cct, 6) << "shutdown complete" << dendl; + m_image_ctx.op_work_queue->queue(on_finish, r); + }); + ctx = new LambdaContext( + [this, ctx](int r) { + Context *next_ctx = override_ctx(r, ctx); + bool periodic_stats_enabled = m_periodic_stats_enabled; + m_periodic_stats_enabled = false; + + if (periodic_stats_enabled) { + /* Log stats one last time if they were enabled */ + periodic_stats(); + } + { + std::lock_guard locker(m_lock); + ceph_assert(m_dirty_log_entries.size() == 0); + m_wake_up_enabled = false; + m_cache_state->clean = true; + m_log_entries.clear(); + if (m_log_pool) { + ldout(m_image_ctx.cct, 6) << "closing pmem pool" << dendl; + pmemobj_close(m_log_pool); + } + if (m_cache_state->clean) { + if (m_log_is_poolset) { + ldout(m_image_ctx.cct, 5) << "Not removing poolset " << m_log_pool_name << dendl; + } else { + ldout(m_image_ctx.cct, 5) << "Removing empty pool file: " << m_log_pool_name << dendl; + if (remove(m_log_pool_name.c_str()) != 0) { + lderr(m_image_ctx.cct) << "failed to remove empty pool \"" << m_log_pool_name << "\": " + << pmemobj_errormsg() << dendl; + } else { + m_cache_state->clean = true; + m_cache_state->empty = true; + m_cache_state->present = false; + } + } + } else { + if (m_log_is_poolset) { + ldout(m_image_ctx.cct, 5) << "Not removing poolset " << m_log_pool_name << dendl; + } else { + ldout(m_image_ctx.cct, 5) << "Not removing pool file: " << m_log_pool_name << dendl; + } + } + if (m_perfcounter) { + perf_stop(); + } + } + update_image_cache_state(next_ctx); + }); + ctx = new LambdaContext( + [this, ctx](int r) { + Context *next_ctx = override_ctx(r, ctx); + { + /* Sync with process_writeback_dirty_entries() */ + RWLock::WLocker entry_reader_wlocker(m_entry_reader_lock); + m_shutting_down = true; + /* Flush all writes to OSDs (unless disabled) and wait for all + in-progress flush writes to complete */ + ldout(m_image_ctx.cct, 6) << "flushing" << dendl; + if (m_periodic_stats_enabled) { + periodic_stats(); + } + } + flush_dirty_entries(next_ctx); + }); + ctx = new LambdaContext( + [this, ctx](int 
r) { + Context *next_ctx = override_ctx(r, ctx); + ldout(m_image_ctx.cct, 6) << "waiting for in flight operations" << dendl; + // Wait for in progress IOs to complete + next_ctx = util::create_async_context_callback(m_image_ctx, next_ctx); + m_async_op_tracker.wait_for_ops(next_ctx); + }); + ctx = new LambdaContext( + [this, ctx](int r) { + ldout(m_image_ctx.cct, 6) << "Done internal_flush in shutdown" << dendl; + m_work_queue.queue(ctx, r); + }); + /* Complete all in-flight writes before shutting down */ + ldout(m_image_ctx.cct, 6) << "internal_flush in shutdown" << dendl; + internal_flush(false, ctx); +} + +template +void AbstractWriteLog::read(Extents&& image_extents, + ceph::bufferlist* bl, + int fadvise_flags, Context *on_finish) { + // TODO: handle writesame and discard case in later PRs + CephContext *cct = m_image_ctx.cct; + utime_t now = ceph_clock_now(); + C_ReadRequest *read_ctx = new C_ReadRequest(cct, now, m_perfcounter, bl, on_finish); + ldout(cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id + << "image_extents=" << image_extents << ", " + << "bl=" << bl << ", " + << "on_finish=" << on_finish << dendl; + + ceph_assert(m_initialized); + bl->clear(); + m_perfcounter->inc(l_librbd_pwl_rd_req, 1); + + /* + * The strategy here is to look up all the WriteLogMapEntries that overlap + * this read, and iterate through those to separate this read into hits and + * misses. A new Extents object is produced here with Extents for each miss + * region. The miss Extents is then passed on to the read cache below RWL. We + * also produce an ImageExtentBufs for all the extents (hit or miss) in this + * read. When the read from the lower cache layer completes, we iterate + * through the ImageExtentBufs and insert buffers for each cache hit at the + * appropriate spot in the bufferlist returned from below for the miss + * read. The buffers we insert here refer directly to regions of various + * write log entry data buffers. + * + * Locking: These buffer objects hold a reference on the write log entries + * they refer to. Log entries can't be retired until there are no references. + * The GenericWriteLogEntry references are released by the buffer destructor. + */ + for (auto &extent : image_extents) { + uint64_t extent_offset = 0; + RWLock::RLocker entry_reader_locker(m_entry_reader_lock); + WriteLogMapEntries map_entries = m_blocks_to_log_entries.find_map_entries(block_extent(extent)); + for (auto &map_entry : map_entries) { + Extent entry_image_extent(pwl::image_extent(map_entry.block_extent)); + /* If this map entry starts after the current image extent offset ... */ + if (entry_image_extent.first > extent.first + extent_offset) { + /* ... add range before map_entry to miss extents */ + uint64_t miss_extent_start = extent.first + extent_offset; + uint64_t miss_extent_length = entry_image_extent.first - miss_extent_start; + Extent miss_extent(miss_extent_start, miss_extent_length); + read_ctx->miss_extents.push_back(miss_extent); + /* Add miss range to read extents */ + ImageExtentBuf miss_extent_buf(miss_extent); + read_ctx->read_extents.push_back(miss_extent_buf); + extent_offset += miss_extent_length; + } + ceph_assert(entry_image_extent.first <= extent.first + extent_offset); + uint64_t entry_offset = 0; + /* If this map entry starts before the current image extent offset ... */ + if (entry_image_extent.first < extent.first + extent_offset) { + /* ... 
compute offset into log entry for this read extent */ + entry_offset = (extent.first + extent_offset) - entry_image_extent.first; + } + /* This read hit ends at the end of the extent or the end of the log + entry, whichever is less. */ + uint64_t entry_hit_length = min(entry_image_extent.second - entry_offset, + extent.second - extent_offset); + Extent hit_extent(entry_image_extent.first, entry_hit_length); + if (0 == map_entry.log_entry->write_bytes() && 0 < map_entry.log_entry->bytes_dirty()) { + /* discard log entry */ + auto discard_entry = map_entry.log_entry; + ldout(cct, 20) << "read hit on discard entry: log_entry=" << *discard_entry << dendl; + /* Discards read as zero, so we'll construct a bufferlist of zeros */ + bufferlist zero_bl; + zero_bl.append_zero(entry_hit_length); + /* Add hit extent to read extents */ + ImageExtentBuf hit_extent_buf(hit_extent, zero_bl); + read_ctx->read_extents.push_back(hit_extent_buf); + } else { + /* write and writesame log entry */ + /* Offset of the map entry into the log entry's buffer */ + uint64_t map_entry_buffer_offset = entry_image_extent.first - map_entry.log_entry->ram_entry.image_offset_bytes; + /* Offset into the log entry buffer of this read hit */ + uint64_t read_buffer_offset = map_entry_buffer_offset + entry_offset; + /* Create buffer object referring to pmem pool for this read hit */ + auto write_entry = map_entry.log_entry; + + /* Make a bl for this hit extent. This will add references to the write_entry->pmem_bp */ + buffer::list hit_bl; + + buffer::list entry_bl_copy; + write_entry->copy_pmem_bl(&entry_bl_copy); + entry_bl_copy.begin(read_buffer_offset).copy(entry_hit_length, hit_bl); + + ceph_assert(hit_bl.length() == entry_hit_length); + + /* Add hit extent to read extents */ + ImageExtentBuf hit_extent_buf(hit_extent, hit_bl); + read_ctx->read_extents.push_back(hit_extent_buf); + } + /* Exclude RWL hit range from buffer and extent */ + extent_offset += entry_hit_length; + ldout(cct, 20) << map_entry << dendl; + } + /* If the last map entry didn't consume the entire image extent ... */ + if (extent.second > extent_offset) { + /* ... 
add the rest of this extent to miss extents */ + uint64_t miss_extent_start = extent.first + extent_offset; + uint64_t miss_extent_length = extent.second - extent_offset; + Extent miss_extent(miss_extent_start, miss_extent_length); + read_ctx->miss_extents.push_back(miss_extent); + /* Add miss range to read extents */ + ImageExtentBuf miss_extent_buf(miss_extent); + read_ctx->read_extents.push_back(miss_extent_buf); + extent_offset += miss_extent_length; + } + } + + ldout(cct, 20) << "miss_extents=" << read_ctx->miss_extents << ", " + << "miss_bl=" << read_ctx->miss_bl << dendl; + + if (read_ctx->miss_extents.empty()) { + /* All of this read comes from RWL */ + read_ctx->complete(0); + } else { + /* Pass the read misses on to the layer below RWL */ + m_image_writeback.aio_read(std::move(read_ctx->miss_extents), &read_ctx->miss_bl, fadvise_flags, read_ctx); + } +} + +template +void AbstractWriteLog::write(Extents &&image_extents, + bufferlist&& bl, + int fadvise_flags, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 20) << "aio_write" << dendl; + + utime_t now = ceph_clock_now(); + m_perfcounter->inc(l_librbd_pwl_wr_req, 1); + + ceph_assert(m_initialized); + + auto *write_req = + new C_WriteRequestT(*this, now, std::move(image_extents), std::move(bl), fadvise_flags, + m_lock, m_perfcounter, on_finish); + m_perfcounter->inc(l_librbd_pwl_wr_bytes, write_req->image_extents_summary.total_bytes); + + /* The lambda below will be called when the block guard for all + * blocks affected by this write is obtained */ + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext([this, write_req](GuardedRequestFunctionContext &guard_ctx) { + write_req->blockguard_acquired(guard_ctx); + alloc_and_dispatch_io_req(write_req); + }); + + detain_guarded_request(write_req, guarded_ctx, false); +} + +template +void AbstractWriteLog::discard(uint64_t offset, uint64_t length, + uint32_t discard_granularity_bytes, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 20) << dendl; + + utime_t now = ceph_clock_now(); + m_perfcounter->inc(l_librbd_pwl_discard, 1); + Extents discard_extents = {{offset, length}}; + m_discard_granularity_bytes = discard_granularity_bytes; + + ceph_assert(m_initialized); + + auto *discard_req = + new C_DiscardRequestT(*this, now, std::move(discard_extents), discard_granularity_bytes, + m_lock, m_perfcounter, on_finish); + + /* The lambda below will be called when the block guard for all + * blocks affected by this write is obtained */ + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext([this, discard_req](GuardedRequestFunctionContext &guard_ctx) { + discard_req->blockguard_acquired(guard_ctx); + alloc_and_dispatch_io_req(discard_req); + }); + + detain_guarded_request(discard_req, guarded_ctx, false); +} + +/** + * Aio_flush completes when all previously completed writes are + * flushed to persistent cache. We make a best-effort attempt to also + * defer until all in-progress writes complete, but we may not know + * about all of the writes the application considers in-progress yet, + * due to uncertainty in the IO submission workq (multiple WQ threads + * may allow out-of-order submission). + * + * This flush operation will not wait for writes deferred for overlap + * in the block guard. 
+ */ +template +void AbstractWriteLog::flush(io::FlushSource flush_source, Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "on_finish=" << on_finish << " flush_source=" << flush_source << dendl; + + if (io::FLUSH_SOURCE_SHUTDOWN == flush_source || io::FLUSH_SOURCE_INTERNAL == flush_source) { + internal_flush(false, on_finish); + return; + } + m_perfcounter->inc(l_librbd_pwl_aio_flush, 1); + + /* May be called even if initialization fails */ + if (!m_initialized) { + ldout(cct, 05) << "never initialized" << dendl; + /* Deadlock if completed here */ + m_image_ctx.op_work_queue->queue(on_finish, 0); + return; + } + + { + std::shared_lock image_locker(m_image_ctx.image_lock); + if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only) { + on_finish->complete(-EROFS); + return; + } + } + + auto flush_req = make_flush_req(on_finish); + + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext([this, flush_req](GuardedRequestFunctionContext &guard_ctx) { + ldout(m_image_ctx.cct, 20) << "flush_req=" << flush_req << " cell=" << guard_ctx.cell << dendl; + ceph_assert(guard_ctx.cell); + flush_req->detained = guard_ctx.state.detained; + /* We don't call flush_req->set_cell(), because the block guard will be released here */ + { + DeferredContexts post_unlock; /* Do these when the lock below is released */ + std::lock_guard locker(m_lock); + + if (!m_persist_on_flush && m_persist_on_write_until_flush) { + m_persist_on_flush = true; + ldout(m_image_ctx.cct, 5) << "now persisting on flush" << dendl; + } + + /* + * Create a new sync point if there have been writes since the last + * one. + * + * We do not flush the caches below the RWL here. + */ + flush_new_sync_point_if_needed(flush_req, post_unlock); + } + + release_guarded_request(guard_ctx.cell); + }); + + detain_guarded_request(flush_req, guarded_ctx, true); +} + +template +void AbstractWriteLog::writesame(uint64_t offset, uint64_t length, + bufferlist&& bl, int fadvise_flags, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 20) << "aio_writesame" << dendl; + + utime_t now = ceph_clock_now(); + Extents ws_extents = {{offset, length}}; + m_perfcounter->inc(l_librbd_pwl_ws, 1); + ceph_assert(m_initialized); + + /* A write same request is also a write request. The key difference is the + * write same data buffer is shorter than the extent of the request. The full + * extent will be used in the block guard, and appear in + * m_blocks_to_log_entries_map. The data buffer allocated for the WS is only + * as long as the length of the bl here, which is the pattern that's repeated + * in the image for the entire length of this WS. Read hits and flushing of + * write sames are different than normal writes. 
*/ + auto *ws_req = + new C_WriteSameRequestT(*this, now, std::move(ws_extents), std::move(bl), + fadvise_flags, m_lock, m_perfcounter, on_finish); + m_perfcounter->inc(l_librbd_pwl_ws_bytes, ws_req->image_extents_summary.total_bytes); + + /* The lambda below will be called when the block guard for all + * blocks affected by this write is obtained */ + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext([this, ws_req](GuardedRequestFunctionContext &guard_ctx) { + ws_req->blockguard_acquired(guard_ctx); + alloc_and_dispatch_io_req(ws_req); + }); + + detain_guarded_request(ws_req, guarded_ctx, false); +} + +template +void AbstractWriteLog::compare_and_write(Extents &&image_extents, + bufferlist&& cmp_bl, + bufferlist&& bl, + uint64_t *mismatch_offset, + int fadvise_flags, + Context *on_finish) { + ldout(m_image_ctx.cct, 20) << dendl; + + utime_t now = ceph_clock_now(); + m_perfcounter->inc(l_librbd_pwl_cmp, 1); + ceph_assert(m_initialized); + + /* A compare and write request is also a write request. We only allocate + * resources and dispatch this write request if the compare phase + * succeeds. */ + auto *cw_req = + new C_CompAndWriteRequestT(*this, now, std::move(image_extents), std::move(cmp_bl), std::move(bl), + mismatch_offset, fadvise_flags, m_lock, m_perfcounter, on_finish); + m_perfcounter->inc(l_librbd_pwl_cmp_bytes, cw_req->image_extents_summary.total_bytes); + + /* The lambda below will be called when the block guard for all + * blocks affected by this write is obtained */ + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext([this, cw_req](GuardedRequestFunctionContext &guard_ctx) { + cw_req->blockguard_acquired(guard_ctx); + + auto read_complete_ctx = new LambdaContext( + [this, cw_req](int r) { + ldout(m_image_ctx.cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id + << "cw_req=" << cw_req << dendl; + + /* Compare read_bl to cmp_bl to determine if this will produce a write */ + buffer::list aligned_read_bl; + if (cw_req->cmp_bl.length() < cw_req->read_bl.length()) { + aligned_read_bl.substr_of(cw_req->read_bl, 0, cw_req->cmp_bl.length()); + } + if (cw_req->cmp_bl.contents_equal(cw_req->read_bl) || + cw_req->cmp_bl.contents_equal(aligned_read_bl)) { + /* Compare phase succeeds. Begin write */ + ldout(m_image_ctx.cct, 5) << " cw_req=" << cw_req << " compare matched" << dendl; + cw_req->compare_succeeded = true; + *cw_req->mismatch_offset = 0; + /* Continue with this request as a write. Blockguard release and + * user request completion handled as if this were a plain + * write. */ + alloc_and_dispatch_io_req(cw_req); + } else { + /* Compare phase fails. Comp-and write ends now. 
*/ + ldout(m_image_ctx.cct, 15) << " cw_req=" << cw_req << " compare failed" << dendl; + /* Bufferlist doesn't tell us where they differed, so we'll have to determine that here */ + uint64_t bl_index = 0; + for (bl_index = 0; bl_index < cw_req->cmp_bl.length(); bl_index++) { + if (cw_req->cmp_bl[bl_index] != cw_req->read_bl[bl_index]) { + ldout(m_image_ctx.cct, 15) << " cw_req=" << cw_req << " mismatch at " << bl_index << dendl; + break; + } + } + cw_req->compare_succeeded = false; + *cw_req->mismatch_offset = bl_index; + cw_req->complete_user_request(-EILSEQ); + cw_req->release_cell(); + cw_req->complete(0); + } + }); + + /* Read phase of comp-and-write must read through RWL */ + Extents image_extents_copy = cw_req->image_extents; + read(std::move(image_extents_copy), &cw_req->read_bl, cw_req->fadvise_flags, read_complete_ctx); + }); + + detain_guarded_request(cw_req, guarded_ctx, false); +} + +template +void AbstractWriteLog::flush(Context *on_finish) { + internal_flush(false, on_finish); +} + +template +void AbstractWriteLog::invalidate(Context *on_finish) { + internal_flush(true, on_finish); +} + +template +CephContext *AbstractWriteLog::get_context() { + return m_image_ctx.cct; +} + +template +BlockGuardCell* AbstractWriteLog::detain_guarded_request_helper(GuardedRequest &req) +{ + CephContext *cct = m_image_ctx.cct; + BlockGuardCell *cell; + + ceph_assert(ceph_mutex_is_locked_by_me(m_blockguard_lock)); + ldout(cct, 20) << dendl; + + int r = m_write_log_guard.detain(req.block_extent, &req, &cell); + ceph_assert(r>=0); + if (r > 0) { + ldout(cct, 20) << "detaining guarded request due to in-flight requests: " + << "req=" << req << dendl; + return nullptr; + } + + ldout(cct, 20) << "in-flight request cell: " << cell << dendl; + return cell; +} + +template +BlockGuardCell* AbstractWriteLog::detain_guarded_request_barrier_helper( + GuardedRequest &req) +{ + BlockGuardCell *cell = nullptr; + + ceph_assert(ceph_mutex_is_locked_by_me(m_blockguard_lock)); + ldout(m_image_ctx.cct, 20) << dendl; + + if (m_barrier_in_progress) { + req.guard_ctx->state.queued = true; + m_awaiting_barrier.push_back(req); + } else { + bool barrier = req.guard_ctx->state.barrier; + if (barrier) { + m_barrier_in_progress = true; + req.guard_ctx->state.current_barrier = true; + } + cell = detain_guarded_request_helper(req); + if (barrier) { + /* Only non-null if the barrier acquires the guard now */ + m_barrier_cell = cell; + } + } + + return cell; +} + +template +void AbstractWriteLog::detain_guarded_request( + C_BlockIORequestT *request, + GuardedRequestFunctionContext *guarded_ctx, + bool is_barrier) +{ + BlockExtent extent; + if (request) { + extent = request->image_extents_summary.block_extent(); + } else { + extent = block_extent(whole_volume_extent()); + } + auto req = GuardedRequest(extent, guarded_ctx, is_barrier); + BlockGuardCell *cell = nullptr; + + ldout(m_image_ctx.cct, 20) << dendl; + { + std::lock_guard locker(m_blockguard_lock); + cell = detain_guarded_request_barrier_helper(req); + } + if (cell) { + req.guard_ctx->cell = cell; + req.guard_ctx->complete(0); + } +} + +template +void AbstractWriteLog::release_guarded_request(BlockGuardCell *released_cell) +{ + CephContext *cct = m_image_ctx.cct; + WriteLogGuard::BlockOperations block_reqs; + ldout(cct, 20) << "released_cell=" << released_cell << dendl; + + { + std::lock_guard locker(m_blockguard_lock); + m_write_log_guard.release(released_cell, &block_reqs); + + for (auto &req : block_reqs) { + req.guard_ctx->state.detained = true; + BlockGuardCell 
*detained_cell = detain_guarded_request_helper(req); + if (detained_cell) { + if (req.guard_ctx->state.current_barrier) { + /* The current barrier is acquiring the block guard, so now we know its cell */ + m_barrier_cell = detained_cell; + /* detained_cell could be == released_cell here */ + ldout(cct, 20) << "current barrier cell=" << detained_cell << " req=" << req << dendl; + } + req.guard_ctx->cell = detained_cell; + m_work_queue.queue(req.guard_ctx); + } + } + + if (m_barrier_in_progress && (released_cell == m_barrier_cell)) { + ldout(cct, 20) << "current barrier released cell=" << released_cell << dendl; + /* The released cell is the current barrier request */ + m_barrier_in_progress = false; + m_barrier_cell = nullptr; + /* Move waiting requests into the blockguard. Stop if there's another barrier */ + while (!m_barrier_in_progress && !m_awaiting_barrier.empty()) { + auto &req = m_awaiting_barrier.front(); + ldout(cct, 20) << "submitting queued request to blockguard: " << req << dendl; + BlockGuardCell *detained_cell = detain_guarded_request_barrier_helper(req); + if (detained_cell) { + req.guard_ctx->cell = detained_cell; + m_work_queue.queue(req.guard_ctx); + } + m_awaiting_barrier.pop_front(); + } + } + } + + ldout(cct, 20) << "exit" << dendl; +} + +/* + * Performs the log event append operation for all of the scheduled + * events. + */ +template +void AbstractWriteLog::append_scheduled_ops(void) +{ + GenericLogOperations ops; + int append_result = 0; + bool ops_remain = false; + bool appending = false; /* true if we set m_appending */ + ldout(m_image_ctx.cct, 20) << dendl; + do { + ops.clear(); + + { + std::lock_guard locker(m_lock); + if (!appending && m_appending) { + /* Another thread is appending */ + ldout(m_image_ctx.cct, 15) << "Another thread is appending" << dendl; + return; + } + if (m_ops_to_append.size()) { + appending = true; + m_appending = true; + auto last_in_batch = m_ops_to_append.begin(); + unsigned int ops_to_append = m_ops_to_append.size(); + if (ops_to_append > OPS_APPENDED_TOGETHER) { + ops_to_append = OPS_APPENDED_TOGETHER; + } + std::advance(last_in_batch, ops_to_append); + ops.splice(ops.end(), m_ops_to_append, m_ops_to_append.begin(), last_in_batch); + ops_remain = true; /* Always check again before leaving */ + ldout(m_image_ctx.cct, 20) << "appending " << ops.size() << ", " + << m_ops_to_append.size() << " remain" << dendl; + } else { + ops_remain = false; + if (appending) { + appending = false; + m_appending = false; + } + } + } + + if (ops.size()) { + std::lock_guard locker(m_log_append_lock); + alloc_op_log_entries(ops); + append_result = append_op_log_entries(ops); + } + + int num_ops = ops.size(); + if (num_ops) { + /* New entries may be flushable. Completion will wake up flusher. */ + complete_op_log_entries(std::move(ops), append_result); + } + } while (ops_remain); +} + +template +void AbstractWriteLog::enlist_op_appender() +{ + m_async_append_ops++; + m_async_op_tracker.start_op(); + Context *append_ctx = new LambdaContext([this](int r) { + append_scheduled_ops(); + m_async_append_ops--; + m_async_op_tracker.finish_op(); + }); + m_work_queue.queue(append_ctx); +} + +/* + * Takes custody of ops. They'll all get their log entries appended, + * and have their on_write_persist contexts completed once they and + * all prior log entries are persisted everywhere. 
+ */ +template +void AbstractWriteLog::schedule_append(GenericLogOperations &ops) +{ + bool need_finisher; + GenericLogOperationsVector appending; + + std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending)); + { + std::lock_guard locker(m_lock); + + need_finisher = m_ops_to_append.empty() && !m_appending; + m_ops_to_append.splice(m_ops_to_append.end(), ops); + } + + if (need_finisher) { + enlist_op_appender(); + } + + for (auto &op : appending) { + op->appending(); + } +} + +template +void AbstractWriteLog::schedule_append(GenericLogOperationsVector &ops) +{ + GenericLogOperations to_append(ops.begin(), ops.end()); + + schedule_append(to_append); +} + +template +void AbstractWriteLog::schedule_append(GenericLogOperationSharedPtr op) +{ + GenericLogOperations to_append { op }; + + schedule_append(to_append); +} + +const unsigned long int ops_flushed_together = 4; +/* + * Performs the pmem buffer flush on all scheduled ops, then schedules + * the log event append operation for all of them. + */ +template +void AbstractWriteLog::flush_then_append_scheduled_ops(void) +{ + GenericLogOperations ops; + bool ops_remain = false; + ldout(m_image_ctx.cct, 20) << dendl; + do { + { + ops.clear(); + std::lock_guard locker(m_lock); + if (m_ops_to_flush.size()) { + auto last_in_batch = m_ops_to_flush.begin(); + unsigned int ops_to_flush = m_ops_to_flush.size(); + if (ops_to_flush > ops_flushed_together) { + ops_to_flush = ops_flushed_together; + } + ldout(m_image_ctx.cct, 20) << "should flush " << ops_to_flush << dendl; + std::advance(last_in_batch, ops_to_flush); + ops.splice(ops.end(), m_ops_to_flush, m_ops_to_flush.begin(), last_in_batch); + ops_remain = !m_ops_to_flush.empty(); + ldout(m_image_ctx.cct, 20) << "flushing " << ops.size() << ", " + << m_ops_to_flush.size() << " remain" << dendl; + } else { + ops_remain = false; + } + } + if (ops_remain) { + enlist_op_flusher(); + } + + /* Ops subsequently scheduled for flush may finish before these, + * which is fine. We're unconcerned with completion order until we + * get to the log message append step. */ + if (ops.size()) { + flush_pmem_buffer(ops); + schedule_append(ops); + } + } while (ops_remain); + append_scheduled_ops(); +} + +template +void AbstractWriteLog::enlist_op_flusher() +{ + m_async_flush_ops++; + m_async_op_tracker.start_op(); + Context *flush_ctx = new LambdaContext([this](int r) { + flush_then_append_scheduled_ops(); + m_async_flush_ops--; + m_async_op_tracker.finish_op(); + }); + m_work_queue.queue(flush_ctx); +} + +/* + * Takes custody of ops. They'll all get their pmem blocks flushed, + * then get their log entries appended. 
+ */ +template +void AbstractWriteLog::schedule_flush_and_append(GenericLogOperationsVector &ops) +{ + GenericLogOperations to_flush(ops.begin(), ops.end()); + bool need_finisher; + ldout(m_image_ctx.cct, 20) << dendl; + { + std::lock_guard locker(m_lock); + + need_finisher = m_ops_to_flush.empty(); + m_ops_to_flush.splice(m_ops_to_flush.end(), to_flush); + } + + if (need_finisher) { + enlist_op_flusher(); + } +} + +/* + * Flush the pmem regions for the data blocks of a set of operations + * + * V is expected to be GenericLogOperations, or GenericLogOperationsVector + */ +template +template +void AbstractWriteLog::flush_pmem_buffer(V& ops) +{ + for (auto &operation : ops) { + operation->flush_pmem_buf_to_cache(m_log_pool); + } + + /* Drain once for all */ + pmemobj_drain(m_log_pool); + + utime_t now = ceph_clock_now(); + for (auto &operation : ops) { + if (operation->reserved_allocated()) { + operation->buf_persist_comp_time = now; + } else { + ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl; + } + } +} + +/* + * Allocate the (already reserved) write log entries for a set of operations. + * + * Locking: + * Acquires lock + */ +template +void AbstractWriteLog::alloc_op_log_entries(GenericLogOperations &ops) +{ + TOID(struct WriteLogPoolRoot) pool_root; + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + struct WriteLogPmemEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries); + + ceph_assert(ceph_mutex_is_locked_by_me(m_log_append_lock)); + + /* Allocate the (already reserved) log entries */ + std::lock_guard locker(m_lock); + + for (auto &operation : ops) { + uint32_t entry_index = m_first_free_entry; + m_first_free_entry = (m_first_free_entry + 1) % m_total_log_entries; + auto &log_entry = operation->get_log_entry(); + log_entry->log_entry_index = entry_index; + log_entry->ram_entry.entry_index = entry_index; + log_entry->pmem_entry = &pmem_log_entries[entry_index]; + log_entry->ram_entry.entry_valid = 1; + m_log_entries.push_back(log_entry); + ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl; + } +} + +/* + * Flush the persistent write log entries set of ops. The entries must + * be contiguous in persistent memory. + */ +template +void AbstractWriteLog::flush_op_log_entries(GenericLogOperationsVector &ops) +{ + if (ops.empty()) { + return; + } + + if (ops.size() > 1) { + ceph_assert(ops.front()->get_log_entry()->pmem_entry < ops.back()->get_log_entry()->pmem_entry); + } + + ldout(m_image_ctx.cct, 20) << "entry count=" << ops.size() << " " + << "start address=" + << ops.front()->get_log_entry()->pmem_entry << " " + << "bytes=" + << ops.size() * sizeof(*(ops.front()->get_log_entry()->pmem_entry)) + << dendl; + pmemobj_flush(m_log_pool, + ops.front()->get_log_entry()->pmem_entry, + ops.size() * sizeof(*(ops.front()->get_log_entry()->pmem_entry))); +} + +/* + * Write and persist the (already allocated) write log entries and + * data buffer allocations for a set of ops. The data buffer for each + * of these must already have been persisted to its reserved area. 
+ */ +template +int AbstractWriteLog::append_op_log_entries(GenericLogOperations &ops) +{ + CephContext *cct = m_image_ctx.cct; + GenericLogOperationsVector entries_to_flush; + TOID(struct WriteLogPoolRoot) pool_root; + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + int ret = 0; + + ceph_assert(ceph_mutex_is_locked_by_me(m_log_append_lock)); + + if (ops.empty()) { + return 0; + } + entries_to_flush.reserve(OPS_APPENDED_TOGETHER); + + /* Write log entries to ring and persist */ + utime_t now = ceph_clock_now(); + for (auto &operation : ops) { + if (!entries_to_flush.empty()) { + /* Flush these and reset the list if the current entry wraps to the + * tail of the ring */ + if (entries_to_flush.back()->get_log_entry()->log_entry_index > + operation->get_log_entry()->log_entry_index) { + ldout(m_image_ctx.cct, 20) << "entries to flush wrap around the end of the ring at " + << "operation=[" << *operation << "]" << dendl; + flush_op_log_entries(entries_to_flush); + entries_to_flush.clear(); + now = ceph_clock_now(); + } + } + ldout(m_image_ctx.cct, 20) << "Copying entry for operation at index=" + << operation->get_log_entry()->log_entry_index << " " + << "from " << &operation->get_log_entry()->ram_entry << " " + << "to " << operation->get_log_entry()->pmem_entry << " " + << "operation=[" << *operation << "]" << dendl; + ldout(m_image_ctx.cct, 05) << "APPENDING: index=" + << operation->get_log_entry()->log_entry_index << " " + << "operation=[" << *operation << "]" << dendl; + operation->log_append_time = now; + *operation->get_log_entry()->pmem_entry = operation->get_log_entry()->ram_entry; + ldout(m_image_ctx.cct, 20) << "APPENDING: index=" + << operation->get_log_entry()->log_entry_index << " " + << "pmem_entry=[" << *operation->get_log_entry()->pmem_entry + << "]" << dendl; + entries_to_flush.push_back(operation); + } + flush_op_log_entries(entries_to_flush); + + /* Drain once for all */ + pmemobj_drain(m_log_pool); + + /* + * Atomically advance the log head pointer and publish the + * allocations for all the data buffers they refer to. + */ + utime_t tx_start = ceph_clock_now(); + TX_BEGIN(m_log_pool) { + D_RW(pool_root)->first_free_entry = m_first_free_entry; + for (auto &operation : ops) { + if (operation->reserved_allocated()) { + auto write_op = (std::shared_ptr&) operation; + pmemobj_tx_publish(&write_op->buffer_alloc->buffer_alloc_action, 1); + } else { + ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl; + } + } + } TX_ONCOMMIT { + } TX_ONABORT { + lderr(cct) << "failed to commit " << ops.size() + << " log entries (" << m_log_pool_name << ")" << dendl; + ceph_assert(false); + ret = -EIO; + } TX_FINALLY { + } TX_END; + + utime_t tx_end = ceph_clock_now(); + m_perfcounter->tinc(l_librbd_pwl_append_tx_t, tx_end - tx_start); + m_perfcounter->hinc( + l_librbd_pwl_append_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), ops.size()); + for (auto &operation : ops) { + operation->log_append_comp_time = tx_end; + } + + return ret; +} + +/* + * Complete a set of write ops with the result of append_op_entries. 
+ */ +template +void AbstractWriteLog::complete_op_log_entries(GenericLogOperations &&ops, + const int result) +{ + GenericLogEntries dirty_entries; + int published_reserves = 0; + ldout(m_image_ctx.cct, 20) << __func__ << ": completing" << dendl; + for (auto &op : ops) { + utime_t now = ceph_clock_now(); + auto log_entry = op->get_log_entry(); + log_entry->completed = true; + if (op->is_writing_op()) { + op->mark_log_entry_completed(); + dirty_entries.push_back(log_entry); + } + if (op->reserved_allocated()) { + published_reserves++; + } + op->complete(result); + m_perfcounter->tinc(l_librbd_pwl_log_op_dis_to_app_t, + op->log_append_time - op->dispatch_time); + m_perfcounter->tinc(l_librbd_pwl_log_op_dis_to_cmp_t, now - op->dispatch_time); + m_perfcounter->hinc(l_librbd_pwl_log_op_dis_to_cmp_t_hist, + utime_t(now - op->dispatch_time).to_nsec(), + log_entry->ram_entry.write_bytes); + utime_t app_lat = op->log_append_comp_time - op->log_append_time; + m_perfcounter->tinc(l_librbd_pwl_log_op_app_to_appc_t, app_lat); + m_perfcounter->hinc(l_librbd_pwl_log_op_app_to_appc_t_hist, app_lat.to_nsec(), + log_entry->ram_entry.write_bytes); + m_perfcounter->tinc(l_librbd_pwl_log_op_app_to_cmp_t, now - op->log_append_time); + } + + { + std::lock_guard locker(m_lock); + m_unpublished_reserves -= published_reserves; + m_dirty_log_entries.splice(m_dirty_log_entries.end(), dirty_entries); + + /* New entries may be flushable */ + wake_up(); + } +} + +/** + * Dispatch as many deferred writes as possible + */ +template +void AbstractWriteLog::dispatch_deferred_writes(void) +{ + C_BlockIORequestT *front_req = nullptr; /* req still on front of deferred list */ + C_BlockIORequestT *allocated_req = nullptr; /* req that was allocated, and is now off the list */ + bool allocated = false; /* front_req allocate succeeded */ + bool cleared_dispatching_flag = false; + + /* If we can't become the dispatcher, we'll exit */ + { + std::lock_guard locker(m_lock); + if (m_dispatching_deferred_ops || + !m_deferred_ios.size()) { + return; + } + m_dispatching_deferred_ops = true; + } + + /* There are ops to dispatch, and this should be the only thread dispatching them */ + { + std::lock_guard deferred_dispatch(m_deferred_dispatch_lock); + do { + { + std::lock_guard locker(m_lock); + ceph_assert(m_dispatching_deferred_ops); + if (allocated) { + /* On the 2..n-1 th time we get lock, front_req->alloc_resources() will + * have succeeded, and we'll need to pop it off the deferred ops list + * here. */ + ceph_assert(front_req); + ceph_assert(!allocated_req); + m_deferred_ios.pop_front(); + allocated_req = front_req; + front_req = nullptr; + allocated = false; + } + ceph_assert(!allocated); + if (!allocated && front_req) { + /* front_req->alloc_resources() failed on the last iteration. We'll stop dispatching. 
*/ + front_req = nullptr; + ceph_assert(!cleared_dispatching_flag); + m_dispatching_deferred_ops = false; + cleared_dispatching_flag = true; + } else { + ceph_assert(!front_req); + if (m_deferred_ios.size()) { + /* New allocation candidate */ + front_req = m_deferred_ios.front(); + } else { + ceph_assert(!cleared_dispatching_flag); + m_dispatching_deferred_ops = false; + cleared_dispatching_flag = true; + } + } + } + /* Try allocating for front_req before we decide what to do with allocated_req + * (if any) */ + if (front_req) { + ceph_assert(!cleared_dispatching_flag); + allocated = front_req->alloc_resources(); + } + if (allocated_req && front_req && allocated) { + /* Push dispatch of the first allocated req to a wq */ + m_work_queue.queue(new LambdaContext( + [this, allocated_req](int r) { + allocated_req->dispatch(); + }), 0); + allocated_req = nullptr; + } + ceph_assert(!(allocated_req && front_req && allocated)); + + /* Continue while we're still considering the front of the deferred ops list */ + } while (front_req); + ceph_assert(!allocated); + } + ceph_assert(cleared_dispatching_flag); + + /* If any deferred requests were allocated, the last one will still be in allocated_req */ + if (allocated_req) { + allocated_req->dispatch(); + } +} + +/** + * Returns the lanes used by this write, and attempts to dispatch the next + * deferred write + */ +template +void AbstractWriteLog::release_write_lanes(C_BlockIORequestT *req) +{ + { + std::lock_guard locker(m_lock); + m_free_lanes += req->image_extents.size(); + } + dispatch_deferred_writes(); +} + +/** + * Attempts to allocate log resources for a write. Write is dispatched if + * resources are available, or queued if they aren't. + */ +template +void AbstractWriteLog::alloc_and_dispatch_io_req(C_BlockIORequestT *req) +{ + bool dispatch_here = false; + + { + /* If there are already deferred writes, queue behind them for resources */ + { + std::lock_guard locker(m_lock); + dispatch_here = m_deferred_ios.empty(); + } + if (dispatch_here) { + dispatch_here = req->alloc_resources(); + } + if (dispatch_here) { + ldout(m_image_ctx.cct, 20) << "dispatching" << dendl; + req->dispatch(); + } else { + req->deferred(); + { + std::lock_guard locker(m_lock); + m_deferred_ios.push_back(req); + } + ldout(m_image_ctx.cct, 20) << "deferred IOs: " << m_deferred_ios.size() << dendl; + dispatch_deferred_writes(); + } + } +} + +template +bool AbstractWriteLog::alloc_resources(C_BlockIORequestT *req) { + bool alloc_succeeds = true; + bool no_space = false; + uint64_t bytes_allocated = 0; + uint64_t bytes_cached = 0; + uint64_t bytes_dirtied = 0; + uint64_t num_lanes = 0; + uint64_t num_unpublished_reserves = 0; + uint64_t num_log_entries = 0; + + // Setup buffer, and get all the number of required resources + req->setup_buffer_resources(bytes_cached, bytes_dirtied, bytes_allocated, + num_lanes, num_log_entries, num_unpublished_reserves); + + { + std::lock_guard locker(m_lock); + if (m_free_lanes < num_lanes) { + req->set_io_waited_for_lanes(true); + ldout(m_image_ctx.cct, 20) << "not enough free lanes (need " + << num_lanes + << ", have " << m_free_lanes << ") " + << *req << dendl; + alloc_succeeds = false; + /* This isn't considered a "no space" alloc fail. Lanes are a throttling mechanism. 
*/ + } + if (m_free_log_entries < num_log_entries) { + req->set_io_waited_for_entries(true); + ldout(m_image_ctx.cct, 20) << "not enough free entries (need " + << num_log_entries + << ", have " << m_free_log_entries << ") " + << *req << dendl; + alloc_succeeds = false; + no_space = true; /* Entries must be retired */ + } + /* Don't attempt buffer allocate if we've exceeded the "full" threshold */ + if (m_bytes_allocated + bytes_allocated > m_bytes_allocated_cap) { + if (!req->has_io_waited_for_buffers()) { + req->set_io_waited_for_entries(true); + ldout(m_image_ctx.cct, 1) << "Waiting for allocation cap (cap=" + << m_bytes_allocated_cap + << ", allocated=" << m_bytes_allocated + << ") in write [" << *req << "]" << dendl; + } + alloc_succeeds = false; + no_space = true; /* Entries must be retired */ + } + } + + std::vector& buffers = req->get_resources_buffers(); + if (alloc_succeeds) { + for (auto &buffer : buffers) { + utime_t before_reserve = ceph_clock_now(); + buffer.buffer_oid = pmemobj_reserve(m_log_pool, + &buffer.buffer_alloc_action, + buffer.allocation_size, + 0 /* Object type */); + buffer.allocation_lat = ceph_clock_now() - before_reserve; + if (TOID_IS_NULL(buffer.buffer_oid)) { + if (!req->has_io_waited_for_buffers()) { + req->set_io_waited_for_entries(true); + } + ldout(m_image_ctx.cct, 5) << "can't allocate all data buffers: " + << pmemobj_errormsg() << ". " + << *req << dendl; + alloc_succeeds = false; + no_space = true; /* Entries need to be retired */ + break; + } else { + buffer.allocated = true; + } + ldout(m_image_ctx.cct, 20) << "Allocated " << buffer.buffer_oid.oid.pool_uuid_lo + << "." << buffer.buffer_oid.oid.off + << ", size=" << buffer.allocation_size << dendl; + } + } + + if (alloc_succeeds) { + std::lock_guard locker(m_lock); + /* We need one free log entry per extent (each is a separate entry), and + * one free "lane" for remote replication. 
*/ + if ((m_free_lanes >= num_lanes) && + (m_free_log_entries >= num_log_entries)) { + m_free_lanes -= num_lanes; + m_free_log_entries -= num_log_entries; + m_unpublished_reserves += num_unpublished_reserves; + m_bytes_allocated += bytes_allocated; + m_bytes_cached += bytes_cached; + m_bytes_dirty += bytes_dirtied; + } else { + alloc_succeeds = false; + } + } + + if (!alloc_succeeds) { + /* On alloc failure, free any buffers we did allocate */ + for (auto &buffer : buffers) { + if (buffer.allocated) { + pmemobj_cancel(m_log_pool, &buffer.buffer_alloc_action, 1); + } + } + if (no_space) { + /* Expedite flushing and/or retiring */ + std::lock_guard locker(m_lock); + m_alloc_failed_since_retire = true; + m_last_alloc_fail = ceph_clock_now(); + } + } + + req->set_allocated(alloc_succeeds); + + return alloc_succeeds; +} + +template +C_FlushRequest>* AbstractWriteLog::make_flush_req(Context *on_finish) { + utime_t flush_begins = ceph_clock_now(); + bufferlist bl; + auto *flush_req = + new C_FlushRequestT(*this, flush_begins, Extents({whole_volume_extent()}), + std::move(bl), 0, m_lock, m_perfcounter, on_finish); + + return flush_req; +} + +template +void AbstractWriteLog::wake_up() { + CephContext *cct = m_image_ctx.cct; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + + if (!m_wake_up_enabled) { + // wake_up is disabled during shutdown after flushing completes + ldout(m_image_ctx.cct, 6) << "deferred processing disabled" << dendl; + return; + } + + if (m_wake_up_requested && m_wake_up_scheduled) { + return; + } + + ldout(cct, 20) << dendl; + + /* Wake-up can be requested while it's already scheduled */ + m_wake_up_requested = true; + + /* Wake-up cannot be scheduled if it's already scheduled */ + if (m_wake_up_scheduled) { + return; + } + m_wake_up_scheduled = true; + m_async_process_work++; + m_async_op_tracker.start_op(); + m_work_queue.queue(new LambdaContext( + [this](int r) { + process_work(); + m_async_op_tracker.finish_op(); + m_async_process_work--; + }), 0); +} + +template +void AbstractWriteLog::process_work() { + CephContext *cct = m_image_ctx.cct; + int max_iterations = 4; + bool wake_up_requested = false; + uint64_t aggressive_high_water_bytes = m_bytes_allocated_cap * AGGRESSIVE_RETIRE_HIGH_WATER; + uint64_t high_water_bytes = m_bytes_allocated_cap * RETIRE_HIGH_WATER; + uint64_t low_water_bytes = m_bytes_allocated_cap * RETIRE_LOW_WATER; + uint64_t aggressive_high_water_entries = m_total_log_entries * AGGRESSIVE_RETIRE_HIGH_WATER; + uint64_t high_water_entries = m_total_log_entries * RETIRE_HIGH_WATER; + uint64_t low_water_entries = m_total_log_entries * RETIRE_LOW_WATER; + + ldout(cct, 20) << dendl; + + do { + { + std::lock_guard locker(m_lock); + m_wake_up_requested = false; + } + if (m_alloc_failed_since_retire || m_invalidating || + m_bytes_allocated > high_water_bytes || + (m_log_entries.size() > high_water_entries)) { + int retired = 0; + utime_t started = ceph_clock_now(); + ldout(m_image_ctx.cct, 10) << "alloc_fail=" << m_alloc_failed_since_retire + << ", allocated > high_water=" + << (m_bytes_allocated > high_water_bytes) + << ", allocated_entries > high_water=" + << (m_log_entries.size() > high_water_entries) + << dendl; + while (m_alloc_failed_since_retire || m_invalidating || + (m_bytes_allocated > high_water_bytes) || + (m_log_entries.size() > high_water_entries) || + (((m_bytes_allocated > low_water_bytes) || (m_log_entries.size() > low_water_entries)) && + (utime_t(ceph_clock_now() - started).to_msec() < RETIRE_BATCH_TIME_LIMIT_MS))) { + if 
(!retire_entries((m_shutting_down || m_invalidating || + (m_bytes_allocated > aggressive_high_water_bytes) || + (m_log_entries.size() > aggressive_high_water_entries)) + ? MAX_ALLOC_PER_TRANSACTION + : MAX_FREE_PER_TRANSACTION)) { + break; + } + retired++; + dispatch_deferred_writes(); + process_writeback_dirty_entries(); + } + ldout(m_image_ctx.cct, 10) << "Retired " << retired << " times" << dendl; + } + dispatch_deferred_writes(); + process_writeback_dirty_entries(); + + { + std::lock_guard locker(m_lock); + wake_up_requested = m_wake_up_requested; + } + } while (wake_up_requested && --max_iterations > 0); + + { + std::lock_guard locker(m_lock); + m_wake_up_scheduled = false; + /* Reschedule if it's still requested */ + if (m_wake_up_requested) { + wake_up(); + } + } +} + +template +bool AbstractWriteLog::can_flush_entry(std::shared_ptr log_entry) { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 20) << "" << dendl; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + + if (m_invalidating) { + return true; + } + + /* For OWB we can flush entries with the same sync gen number (write between + * aio_flush() calls) concurrently. Here we'll consider an entry flushable if + * its sync gen number is <= the lowest sync gen number carried by all the + * entries currently flushing. + * + * If the entry considered here bears a sync gen number lower than a + * previously flushed entry, the application had to have submitted the write + * bearing the higher gen number before the write with the lower gen number + * completed. So, flushing these concurrently is OK. + * + * If the entry considered here bears a sync gen number higher than a + * currently flushing entry, the write with the lower gen number may have + * completed to the application before the write with the higher sync gen + * number was submitted, and the application may rely on that completion + * order for volume consistency. In this case the entry will not be + * considered flushable until all the entries bearing lower sync gen numbers + * finish flushing. 
+ */ + + if (m_flush_ops_in_flight && + (log_entry->ram_entry.sync_gen_number > m_lowest_flushing_sync_gen)) { + return false; + } + + return (log_entry->can_writeback() && + (m_flush_ops_in_flight <= IN_FLIGHT_FLUSH_WRITE_LIMIT) && + (m_flush_bytes_in_flight <= IN_FLIGHT_FLUSH_BYTES_LIMIT)); +} + +template +Context* AbstractWriteLog::construct_flush_entry_ctx(std::shared_ptr log_entry) { + CephContext *cct = m_image_ctx.cct; + bool invalidating = m_invalidating; // snapshot so we behave consistently + + ldout(cct, 20) << "" << dendl; + ceph_assert(m_entry_reader_lock.is_locked()); + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + if (!m_flush_ops_in_flight || + (log_entry->ram_entry.sync_gen_number < m_lowest_flushing_sync_gen)) { + m_lowest_flushing_sync_gen = log_entry->ram_entry.sync_gen_number; + } + m_flush_ops_in_flight += 1; + /* For write same this is the bytes affected by the flush op, not the bytes transferred */ + m_flush_bytes_in_flight += log_entry->ram_entry.write_bytes; + + /* Flush write completion action */ + Context *ctx = new LambdaContext( + [this, log_entry, invalidating](int r) { + { + std::lock_guard locker(m_lock); + if (r < 0) { + lderr(m_image_ctx.cct) << "failed to flush log entry" + << cpp_strerror(r) << dendl; + m_dirty_log_entries.push_front(log_entry); + } else { + ceph_assert(m_bytes_dirty >= log_entry->bytes_dirty()); + log_entry->set_flushed(true); + m_bytes_dirty -= log_entry->bytes_dirty(); + sync_point_writer_flushed(log_entry->get_sync_point_entry()); + ldout(m_image_ctx.cct, 20) << "flushed: " << log_entry + << " invalidating=" << invalidating + << dendl; + } + m_flush_ops_in_flight -= 1; + m_flush_bytes_in_flight -= log_entry->ram_entry.write_bytes; + wake_up(); + } + }); + /* Flush through lower cache before completing */ + ctx = new LambdaContext( + [this, ctx](int r) { + if (r < 0) { + lderr(m_image_ctx.cct) << "failed to flush log entry" + << cpp_strerror(r) << dendl; + ctx->complete(r); + } else { + m_image_writeback.aio_flush(io::FLUSH_SOURCE_WRITEBACK, ctx); + } + }); + + if (invalidating) { + return ctx; + } + return new LambdaContext( + [this, log_entry, ctx](int r) { + m_image_ctx.op_work_queue->queue(new LambdaContext( + [this, log_entry, ctx](int r) { + ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry + << " " << *log_entry << dendl; + log_entry->writeback(m_image_writeback, ctx); + }), 0); + }); +} + +template +void AbstractWriteLog::process_writeback_dirty_entries() { + CephContext *cct = m_image_ctx.cct; + bool all_clean = false; + int flushed = 0; + + ldout(cct, 20) << "Look for dirty entries" << dendl; + { + DeferredContexts post_unlock; + std::shared_lock entry_reader_locker(m_entry_reader_lock); + while (flushed < IN_FLIGHT_FLUSH_WRITE_LIMIT) { + std::lock_guard locker(m_lock); + if (m_shutting_down) { + ldout(cct, 5) << "Flush during shutdown suppressed" << dendl; + /* Do flush complete only when all flush ops are finished */ + all_clean = !m_flush_ops_in_flight; + break; + } + if (m_dirty_log_entries.empty()) { + ldout(cct, 20) << "Nothing new to flush" << dendl; + /* Do flush complete only when all flush ops are finished */ + all_clean = !m_flush_ops_in_flight; + break; + } + auto candidate = m_dirty_log_entries.front(); + bool flushable = can_flush_entry(candidate); + if (flushable) { + post_unlock.add(construct_flush_entry_ctx(candidate)); + flushed++; + m_dirty_log_entries.pop_front(); + } else { + ldout(cct, 20) << "Next dirty entry isn't flushable yet" << dendl; + break; + } + } + } + + if (all_clean) { + /* All
flushing complete, drain outside lock */ + Contexts flush_contexts; + { + std::lock_guard locker(m_lock); + flush_contexts.swap(m_flush_complete_contexts); + } + finish_contexts(m_image_ctx.cct, flush_contexts, 0); + } +} + +/** + * Update/persist the last flushed sync point in the log + */ +template +void AbstractWriteLog::persist_last_flushed_sync_gen() +{ + TOID(struct WriteLogPoolRoot) pool_root; + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + uint64_t flushed_sync_gen; + + std::lock_guard append_locker(m_log_append_lock); + { + std::lock_guard locker(m_lock); + flushed_sync_gen = m_flushed_sync_gen; + } + + if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) { + ldout(m_image_ctx.cct, 15) << "flushed_sync_gen in log updated from " + << D_RO(pool_root)->flushed_sync_gen << " to " + << flushed_sync_gen << dendl; + TX_BEGIN(m_log_pool) { + D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen; + } TX_ONCOMMIT { + } TX_ONABORT { + lderr(m_image_ctx.cct) << "failed to commit update of flushed sync point" << dendl; + ceph_assert(false); + } TX_FINALLY { + } TX_END; + } +} + +/* Returns true if the specified SyncPointLogEntry is considered flushed, and + * the log will be updated to reflect this. */ +template +bool AbstractWriteLog::handle_flushed_sync_point(std::shared_ptr log_entry) +{ + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + ceph_assert(log_entry); + + if ((log_entry->writes_flushed == log_entry->writes) && + log_entry->completed && log_entry->prior_sync_point_flushed && + log_entry->next_sync_point_entry) { + ldout(m_image_ctx.cct, 20) << "All writes flushed up to sync point=" + << *log_entry << dendl; + log_entry->next_sync_point_entry->prior_sync_point_flushed = true; + /* Don't move the flushed sync gen num backwards. 
*/ + if (m_flushed_sync_gen < log_entry->ram_entry.sync_gen_number) { + m_flushed_sync_gen = log_entry->ram_entry.sync_gen_number; + } + m_async_op_tracker.start_op(); + m_work_queue.queue(new LambdaContext( + [this, log_entry](int r) { + bool handled_by_next; + { + std::lock_guard locker(m_lock); + handled_by_next = handle_flushed_sync_point(log_entry->next_sync_point_entry); + } + if (!handled_by_next) { + persist_last_flushed_sync_gen(); + } + m_async_op_tracker.finish_op(); + })); + return true; + } + return false; +} + +template +void AbstractWriteLog::sync_point_writer_flushed(std::shared_ptr log_entry) +{ + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + ceph_assert(log_entry); + log_entry->writes_flushed++; + + /* If this entry might be completely flushed, look closer */ + if ((log_entry->writes_flushed == log_entry->writes) && log_entry->completed) { + ldout(m_image_ctx.cct, 15) << "All writes flushed for sync point=" + << *log_entry << dendl; + handle_flushed_sync_point(log_entry); + } +} + +/* Make a new sync point and flush the previous during initialization, when there may or may + * not be a previous sync point */ +template +void AbstractWriteLog::init_flush_new_sync_point(DeferredContexts &later) { + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + ceph_assert(!m_initialized); /* Don't use this after init */ + + if (!m_current_sync_point) { + /* First sync point since start */ + new_sync_point(later); + } else { + flush_new_sync_point(nullptr, later); + } +} + +/** + * Begin a new sync point + */ +template +void AbstractWriteLog::new_sync_point(DeferredContexts &later) { + CephContext *cct = m_image_ctx.cct; + std::shared_ptr old_sync_point = m_current_sync_point; + std::shared_ptr new_sync_point; + ldout(cct, 20) << dendl; + + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + + /* The first time this is called, if this is a newly created log, + * this makes the first sync gen number we'll use 1. On the first + * call for a re-opened log m_current_sync_gen will be the highest + * gen number from all the sync point entries found in the re-opened + * log, and this advances to the next sync gen number. */ + ++m_current_sync_gen; + + new_sync_point = std::make_shared(m_current_sync_gen, cct); + m_current_sync_point = new_sync_point; + + /* If this log has been re-opened, old_sync_point will initially be + * nullptr, but m_current_sync_gen may not be zero. */ + if (old_sync_point) { + new_sync_point->setup_earlier_sync_point(old_sync_point, m_last_op_sequence_num); + m_perfcounter->hinc(l_librbd_pwl_syncpoint_hist, + old_sync_point->log_entry->writes, + old_sync_point->log_entry->bytes); + /* This sync point will acquire no more sub-ops. 
Activation needs + * to acquire m_lock, so defer to later*/ + later.add(new LambdaContext( + [this, old_sync_point](int r) { + old_sync_point->prior_persisted_gather_activate(); + })); + } + + new_sync_point->prior_persisted_gather_set_finisher(); + + if (old_sync_point) { + ldout(cct,6) << "new sync point = [" << *m_current_sync_point + << "], prior = [" << *old_sync_point << "]" << dendl; + } else { + ldout(cct,6) << "first sync point = [" << *m_current_sync_point + << "]" << dendl; + } +} + +template +void AbstractWriteLog::flush_new_sync_point(C_FlushRequestT *flush_req, + DeferredContexts &later) { + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + + if (!flush_req) { + m_async_null_flush_finish++; + m_async_op_tracker.start_op(); + Context *flush_ctx = new LambdaContext([this](int r) { + m_async_null_flush_finish--; + m_async_op_tracker.finish_op(); + }); + flush_req = make_flush_req(flush_ctx); + flush_req->internal = true; + } + + /* Add a new sync point. */ + new_sync_point(later); + std::shared_ptr to_append = m_current_sync_point->earlier_sync_point; + ceph_assert(to_append); + + /* This flush request will append/persist the (now) previous sync point */ + flush_req->to_append = to_append; + + /* When the m_sync_point_persist Gather completes this sync point can be + * appended. The only sub for this Gather is the finisher Context for + * m_prior_log_entries_persisted, which records the result of the Gather in + * the sync point, and completes. TODO: Do we still need both of these + * Gathers?*/ + Context * ctx = new LambdaContext([this, flush_req](int r) { + ldout(m_image_ctx.cct, 20) << "Flush req=" << flush_req + << " sync point =" << flush_req->to_append + << ". Ready to persist." << dendl; + alloc_and_dispatch_io_req(flush_req); + }); + to_append->persist_gather_set_finisher(ctx); + + /* The m_sync_point_persist Gather has all the subs it will ever have, and + * now has its finisher. If the sub is already complete, activation will + * complete the Gather. The finisher will acquire m_lock, so we'll activate + * this when we release m_lock.*/ + later.add(new LambdaContext([this, to_append](int r) { + to_append->persist_gather_activate(); + })); + + /* The flush request completes when the sync point persists */ + to_append->add_in_on_persisted_ctxs(flush_req); +} + +template +void AbstractWriteLog::flush_new_sync_point_if_needed(C_FlushRequestT *flush_req, + DeferredContexts &later) { + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + + /* If there have been writes since the last sync point ... */ + if (m_current_sync_point->log_entry->writes) { + flush_new_sync_point(flush_req, later); + } else { + /* There have been no writes to the current sync point. */ + if (m_current_sync_point->earlier_sync_point) { + /* If previous sync point hasn't completed, complete this flush + * with the earlier sync point. No alloc or dispatch needed. */ + m_current_sync_point->earlier_sync_point->on_sync_point_persisted.push_back(flush_req); + } else { + /* The previous sync point has already completed and been + * appended. The current sync point has no writes, so this flush + * has nothing to wait for. This flush completes now. */ + later.add(flush_req); + } + } +} + +/* + * RWL internal flush - will actually flush the RWL. + * + * User flushes should arrive at aio_flush(), and only flush prior + * writes to all log replicas. + * + * Librbd internal flushes will arrive at flush(invalidate=false, + * discard=false), and traverse the block guard to ensure in-flight writes are + * flushed. 
+ */ +template +void AbstractWriteLog::flush_dirty_entries(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + bool all_clean; + bool flushing; + bool stop_flushing; + + { + std::lock_guard locker(m_lock); + flushing = (0 != m_flush_ops_in_flight); + all_clean = m_dirty_log_entries.empty(); + stop_flushing = (m_shutting_down); + } + + if (!flushing && (all_clean || stop_flushing)) { + /* Complete without holding m_lock */ + if (all_clean) { + ldout(cct, 20) << "no dirty entries" << dendl; + } else { + ldout(cct, 5) << "flush during shutdown suppressed" << dendl; + } + on_finish->complete(0); + } else { + if (all_clean) { + ldout(cct, 5) << "flush ops still in progress" << dendl; + } else { + ldout(cct, 20) << "dirty entries remain" << dendl; + } + std::lock_guard locker(m_lock); + /* on_finish can't be completed yet */ + m_flush_complete_contexts.push_back(new LambdaContext( + [this, on_finish](int r) { + flush_dirty_entries(on_finish); + })); + wake_up(); + } +} + +template +void AbstractWriteLog::internal_flush(bool invalidate, Context *on_finish) { + ldout(m_image_ctx.cct, 20) << "invalidate=" << invalidate << dendl; + + if (m_perfcounter) { + if (invalidate) { + m_perfcounter->inc(l_librbd_pwl_invalidate_cache, 1); + } else { + m_perfcounter->inc(l_librbd_pwl_flush, 1); + } + } + + /* May be called even if initialization fails */ + if (!m_initialized) { + ldout(m_image_ctx.cct, 05) << "never initialized" << dendl; + /* Deadlock if completed here */ + m_image_ctx.op_work_queue->queue(on_finish, 0); + return; + } + + /* Flush/invalidate must pass through block guard to ensure all layers of + * cache are consistently flush/invalidated. This ensures no in-flight write leaves + * some layers with valid regions, which may later produce inconsistent read + * results. 
*/ + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext( + [this, on_finish, invalidate](GuardedRequestFunctionContext &guard_ctx) { + DeferredContexts on_exit; + ldout(m_image_ctx.cct, 20) << "cell=" << guard_ctx.cell << dendl; + ceph_assert(guard_ctx.cell); + + Context *ctx = new LambdaContext( + [this, cell=guard_ctx.cell, invalidate, on_finish](int r) { + std::lock_guard locker(m_lock); + m_invalidating = false; + ldout(m_image_ctx.cct, 6) << "Done flush/invalidating (invalidate=" + << invalidate << ")" << dendl; + if (m_log_entries.size()) { + ldout(m_image_ctx.cct, 1) << "m_log_entries.size()=" + << m_log_entries.size() << ", " + << "front()=" << *m_log_entries.front() + << dendl; + } + if (invalidate) { + ceph_assert(m_log_entries.size() == 0); + } + ceph_assert(m_dirty_log_entries.size() == 0); + m_image_ctx.op_work_queue->queue(on_finish, r); + release_guarded_request(cell); + }); + ctx = new LambdaContext( + [this, ctx, invalidate](int r) { + Context *next_ctx = ctx; + if (r < 0) { + /* Override on_finish status with this error */ + next_ctx = new LambdaContext([r, ctx](int _r) { + ctx->complete(r); + }); + } + if (invalidate) { + { + std::lock_guard locker(m_lock); + ceph_assert(m_dirty_log_entries.size() == 0); + ceph_assert(!m_invalidating); + ldout(m_image_ctx.cct, 6) << "Invalidating" << dendl; + m_invalidating = true; + } + /* Discards all RWL entries */ + while (retire_entries(MAX_ALLOC_PER_TRANSACTION)) { } + next_ctx->complete(0); + } else { + { + std::lock_guard locker(m_lock); + ceph_assert(m_dirty_log_entries.size() == 0); + ceph_assert(!m_invalidating); + } + m_image_writeback.aio_flush(io::FLUSH_SOURCE_WRITEBACK, next_ctx); + } + }); + ctx = new LambdaContext( + [this, ctx](int r) { + flush_dirty_entries(ctx); + }); + std::lock_guard locker(m_lock); + /* Even if we're throwing everything away, but we want the last entry to + * be a sync point so we can cleanly resume. + * + * Also, the blockguard only guarantees the replication of this op + * can't overlap with prior ops. It doesn't guarantee those are all + * completed and eligible for flush & retire, which we require here. + */ + auto flush_req = make_flush_req(ctx); + flush_new_sync_point_if_needed(flush_req, on_exit); + }); + detain_guarded_request(nullptr, guarded_ctx, true); +} + +template +void AbstractWriteLog::add_into_log_map(GenericWriteLogEntries &log_entries) { + m_blocks_to_log_entries.add_log_entries(log_entries); +} + +template +bool AbstractWriteLog::can_retire_entry(std::shared_ptr log_entry) { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 20) << dendl; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + return log_entry->can_retire(); +} + +/** + * Retire up to MAX_ALLOC_PER_TRANSACTION of the oldest log entries + * that are eligible to be retired. Returns true if anything was + * retired. 
+ */ +template +bool AbstractWriteLog::retire_entries(const unsigned long int frees_per_tx) { + CephContext *cct = m_image_ctx.cct; + GenericLogEntriesVector retiring_entries; + uint32_t initial_first_valid_entry; + uint32_t first_valid_entry; + + std::lock_guard retire_locker(m_log_retire_lock); + ldout(cct, 20) << "Look for entries to retire" << dendl; + { + /* Entry readers can't be added while we hold m_entry_reader_lock */ + RWLock::WLocker entry_reader_locker(m_entry_reader_lock); + std::lock_guard locker(m_lock); + initial_first_valid_entry = m_first_valid_entry; + first_valid_entry = m_first_valid_entry; + auto entry = m_log_entries.front(); + while (!m_log_entries.empty() && + retiring_entries.size() < frees_per_tx && + can_retire_entry(entry)) { + if (entry->log_entry_index != first_valid_entry) { + lderr(cct) << "Retiring entry index (" << entry->log_entry_index + << ") and first valid log entry index (" << first_valid_entry + << ") must be ==." << dendl; + } + ceph_assert(entry->log_entry_index == first_valid_entry); + first_valid_entry = (first_valid_entry + 1) % m_total_log_entries; + m_log_entries.pop_front(); + retiring_entries.push_back(entry); + /* Remove entry from map so there will be no more readers */ + if ((entry->write_bytes() > 0) || (entry->bytes_dirty() > 0)) { + auto gen_write_entry = static_pointer_cast(entry); + if (gen_write_entry) { + m_blocks_to_log_entries.remove_log_entry(gen_write_entry); + } + } + entry = m_log_entries.front(); + } + } + + if (retiring_entries.size()) { + ldout(cct, 20) << "Retiring " << retiring_entries.size() << " entries" << dendl; + TOID(struct WriteLogPoolRoot) pool_root; + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + + utime_t tx_start; + utime_t tx_end; + /* Advance first valid entry and release buffers */ + { + uint64_t flushed_sync_gen; + std::lock_guard append_locker(m_log_append_lock); + { + std::lock_guard locker(m_lock); + flushed_sync_gen = m_flushed_sync_gen; + } + + tx_start = ceph_clock_now(); + TX_BEGIN(m_log_pool) { + if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) { + ldout(m_image_ctx.cct, 20) << "flushed_sync_gen in log updated from " + << D_RO(pool_root)->flushed_sync_gen << " to " + << flushed_sync_gen << dendl; + D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen; + } + D_RW(pool_root)->first_valid_entry = first_valid_entry; + for (auto &entry: retiring_entries) { + if (entry->write_bytes()) { + ldout(cct, 20) << "Freeing " << entry->ram_entry.write_data.oid.pool_uuid_lo + << "." 
<< entry->ram_entry.write_data.oid.off << dendl; + TX_FREE(entry->ram_entry.write_data); + } else { + ldout(cct, 20) << "Retiring non-write: " << *entry << dendl; + } + } + } TX_ONCOMMIT { + } TX_ONABORT { + lderr(cct) << "failed to commit free of" << retiring_entries.size() << " log entries (" << m_log_pool_name << ")" << dendl; + ceph_assert(false); + } TX_FINALLY { + } TX_END; + tx_end = ceph_clock_now(); + } + m_perfcounter->tinc(l_librbd_pwl_retire_tx_t, tx_end - tx_start); + m_perfcounter->hinc(l_librbd_pwl_retire_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), retiring_entries.size()); + + /* Update runtime copy of first_valid, and free entries counts */ + { + std::lock_guard locker(m_lock); + + ceph_assert(m_first_valid_entry == initial_first_valid_entry); + m_first_valid_entry = first_valid_entry; + m_free_log_entries += retiring_entries.size(); + for (auto &entry: retiring_entries) { + if (entry->write_bytes()) { + ceph_assert(m_bytes_cached >= entry->write_bytes()); + m_bytes_cached -= entry->write_bytes(); + uint64_t entry_allocation_size = entry->write_bytes(); + if (entry_allocation_size < MIN_WRITE_ALLOC_SIZE) { + entry_allocation_size = MIN_WRITE_ALLOC_SIZE; + } + ceph_assert(m_bytes_allocated >= entry_allocation_size); + m_bytes_allocated -= entry_allocation_size; + } + } + m_alloc_failed_since_retire = false; + wake_up(); + } + } else { + ldout(cct, 20) << "Nothing to retire" << dendl; + return false; + } + return true; +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::AbstractWriteLog; +template void librbd::cache::pwl::AbstractWriteLog:: \ + flush_pmem_buffer(std::vector>&); diff --git a/src/librbd/cache/pwl/AbstractWriteLog.h b/src/librbd/cache/pwl/AbstractWriteLog.h new file mode 100644 index 0000000000000..d47980cc2fd43 --- /dev/null +++ b/src/librbd/cache/pwl/AbstractWriteLog.h @@ -0,0 +1,314 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG +#define CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG + +#include "common/RWLock.h" +#include "common/WorkQueue.h" +#include "common/AsyncOpTracker.h" +#include "librbd/cache/ImageCache.h" +#include "librbd/cache/ImageWriteback.h" +#include "librbd/Utils.h" +#include "librbd/BlockGuard.h" +#include "librbd/cache/Types.h" +#include "librbd/cache/pwl/LogOperation.h" +#include "librbd/cache/pwl/Request.h" +#include "librbd/cache/pwl/LogMap.h" +#include +#include + +class Context; +class SafeTimer; + +namespace librbd { + +struct ImageCtx; + +namespace cache { + +namespace pwl { + +class SyncPointLogEntry; +class GenericWriteLogEntry; +class WriteLogEntry; +class GenericLogEntry; + +typedef std::list> WriteLogEntries; +typedef std::list> GenericLogEntries; +typedef std::list> GenericWriteLogEntries; +typedef std::vector> GenericLogEntriesVector; + +typedef LogMapEntries WriteLogMapEntries; +typedef LogMap WriteLogMap; + +/**** Write log entries end ****/ + +typedef librbd::BlockGuard WriteLogGuard; + +class DeferredContexts; +template class ImageCacheState; + +template +struct C_BlockIORequest; + +template +struct C_WriteRequest; + +using GenericLogOperations = std::list; + + +template +class AbstractWriteLog { +public: + typedef io::Extent Extent; + typedef io::Extents Extents; + + AbstractWriteLog(ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState* cache_state); + ~AbstractWriteLog(); + AbstractWriteLog(const AbstractWriteLog&) = delete; + AbstractWriteLog &operator=(const 
AbstractWriteLog&) = delete; + + /// IO methods + void read(Extents&& image_extents, ceph::bufferlist *bl, + int fadvise_flags, Context *on_finish); + void write(Extents&& image_extents, ceph::bufferlist&& bl, + int fadvise_flags, + Context *on_finish); + void discard(uint64_t offset, uint64_t length, + uint32_t discard_granularity_bytes, + Context *on_finish); + void flush(io::FlushSource flush_source, Context *on_finish); + void writesame(uint64_t offset, uint64_t length, + ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish); + void compare_and_write(Extents&& image_extents, + ceph::bufferlist&& cmp_bl, ceph::bufferlist&& bl, + uint64_t *mismatch_offset,int fadvise_flags, + Context *on_finish); + + /// internal state methods + void init(Context *on_finish); + void shut_down(Context *on_finish); + void invalidate(Context *on_finish); + void flush(Context *on_finish); + + using This = AbstractWriteLog; + using C_WriteRequestT = pwl::C_WriteRequest; + using C_BlockIORequestT = pwl::C_BlockIORequest; + using C_FlushRequestT = pwl::C_FlushRequest; + using C_DiscardRequestT = pwl::C_DiscardRequest; + using C_WriteSameRequestT = pwl::C_WriteSameRequest; + using C_CompAndWriteRequestT = pwl::C_CompAndWriteRequest; + + CephContext * get_context(); + void release_guarded_request(BlockGuardCell *cell); + void release_write_lanes(C_BlockIORequestT *req); + bool alloc_resources(C_BlockIORequestT *req); + template + void flush_pmem_buffer(V& ops); + void schedule_append(pwl::GenericLogOperationsVector &ops); + void schedule_append(pwl::GenericLogOperationSharedPtr op); + void schedule_flush_and_append(pwl::GenericLogOperationsVector &ops); + void flush_new_sync_point(C_FlushRequestT *flush_req, pwl::DeferredContexts &later); + std::shared_ptr get_current_sync_point() { + return m_current_sync_point; + } + bool get_persist_on_flush() { + return m_persist_on_flush; + } + void inc_last_op_sequence_num() { + m_perfcounter->inc(l_librbd_pwl_log_ops, 1); + ++m_last_op_sequence_num; + } + uint64_t get_last_op_sequence_num() { + return m_last_op_sequence_num; + } + uint64_t get_current_sync_gen() { + return m_current_sync_gen; + } + unsigned int get_free_lanes() { + return m_free_lanes; + } + uint32_t get_free_log_entries() { + return m_free_log_entries; + } + void add_into_log_map(pwl::GenericWriteLogEntries &log_entries); +protected: + typedef std::list *> C_WriteRequests; + typedef std::list *> C_BlockIORequests; + + BlockGuardCell* detain_guarded_request_helper(pwl::GuardedRequest &req); + BlockGuardCell* detain_guarded_request_barrier_helper(pwl::GuardedRequest &req); + void detain_guarded_request(C_BlockIORequestT *request, + pwl::GuardedRequestFunctionContext *guarded_ctx, + bool is_barrier); + + librbd::cache::pwl::ImageCacheState* m_cache_state = nullptr; + + std::atomic m_initialized = {false}; + std::atomic m_shutting_down = {false}; + std::atomic m_invalidating = {false}; + PMEMobjpool *m_log_pool = nullptr; + const char* m_pwl_pool_layout_name; + + ImageCtxT &m_image_ctx; + + std::string m_log_pool_name; + bool m_log_is_poolset = false; + uint64_t m_log_pool_config_size; /* Configured size of RWL */ + uint64_t m_log_pool_actual_size = 0; /* Actual size of RWL pool */ + + uint32_t m_total_log_entries = 0; + uint32_t m_free_log_entries = 0; + + std::atomic m_bytes_allocated = {0}; /* Total bytes allocated in write buffers */ + uint64_t m_bytes_cached = 0; /* Total bytes used in write buffers */ + uint64_t m_bytes_dirty = 0; /* Total bytes yet to flush to RBD */ + uint64_t 
m_bytes_allocated_cap = 0; + + utime_t m_last_alloc_fail; /* Entry or buffer allocation fail seen */ + std::atomic m_alloc_failed_since_retire = {false}; + + ImageWriteback m_image_writeback; + pwl::WriteLogGuard m_write_log_guard; + /* + * When m_first_free_entry == m_first_valid_entry, the log is + * empty. There is always at least one free entry, which can't be + * used. + */ + uint64_t m_first_free_entry = 0; /* Entries from here to m_first_valid_entry-1 are free */ + uint64_t m_first_valid_entry = 0; /* Entries from here to m_first_free_entry-1 are valid */ + + /* Starts at 0 for a new write log. Incremented on every flush. */ + uint64_t m_current_sync_gen = 0; + /* Starts at 0 on each sync gen increase. Incremented before applied + to an operation */ + uint64_t m_last_op_sequence_num = 0; + /* All writes bearing this and all prior sync gen numbers are flushed */ + uint64_t m_flushed_sync_gen = 0; + + bool m_persist_on_write_until_flush = true; + + AsyncOpTracker m_async_op_tracker; + /* Debug counters for the places m_async_op_tracker is used */ + std::atomic m_async_flush_ops = {0}; + std::atomic m_async_append_ops = {0}; + std::atomic m_async_complete_ops = {0}; + std::atomic m_async_null_flush_finish = {0}; + std::atomic m_async_process_work = {0}; + + /* Acquire locks in order declared here */ + + mutable ceph::mutex m_log_retire_lock; + /* Hold a read lock on m_entry_reader_lock to add readers to log entry + * bufs. Hold a write lock to prevent readers from being added (e.g. when + * removing log entries from the map). No lock required to remove readers. */ + mutable RWLock m_entry_reader_lock; + /* Hold m_deferred_dispatch_lock while consuming from m_deferred_ios. */ + mutable ceph::mutex m_deferred_dispatch_lock; + /* Hold m_log_append_lock while appending or retiring log entries. */ + mutable ceph::mutex m_log_append_lock; + /* Used for most synchronization */ + mutable ceph::mutex m_lock; + + /* Used in release/detain to make BlockGuard preserve submission order */ + mutable ceph::mutex m_blockguard_lock; + + /* Use m_blockguard_lock for the following 3 things */ + pwl::WriteLogGuard::BlockOperations m_awaiting_barrier; + bool m_barrier_in_progress = false; + BlockGuardCell *m_barrier_cell = nullptr; + + bool m_wake_up_requested = false; + bool m_wake_up_scheduled = false; + bool m_wake_up_enabled = true; + bool m_appending = false; + bool m_dispatching_deferred_ops = false; + + Contexts m_flush_complete_contexts; + + pwl::GenericLogOperations m_ops_to_flush; /* Write ops needing flush in local log */ + pwl::GenericLogOperations m_ops_to_append; /* Write ops needing event append in local log */ + + pwl::WriteLogMap m_blocks_to_log_entries; + + /* New entries are at the back.
Oldest at the front */ + pwl::GenericLogEntries m_log_entries; + pwl::GenericLogEntries m_dirty_log_entries; + + PerfCounters *m_perfcounter = nullptr; + + std::shared_ptr m_current_sync_point = nullptr; + bool m_persist_on_flush = false; /* If false, persist each write before completion */ + + int m_flush_ops_in_flight = 0; + int m_flush_bytes_in_flight = 0; + uint64_t m_lowest_flushing_sync_gen = 0; + + /* Writes that have left the block guard, but are waiting for resources */ + C_BlockIORequests m_deferred_ios; + /* Throttle writes concurrently allocating & replicating */ + unsigned int m_free_lanes = pwl::MAX_CONCURRENT_WRITES; + unsigned int m_unpublished_reserves = 0; + + /* Initialized from config, then set false during shutdown */ + std::atomic m_periodic_stats_enabled = {false}; + SafeTimer *m_timer = nullptr; /* Used with m_timer_lock */ + mutable ceph::mutex *m_timer_lock = nullptr; /* Used with and by m_timer */ + Context *m_timer_ctx = nullptr; + + ThreadPool m_thread_pool; + ContextWQ m_work_queue; + + uint32_t m_discard_granularity_bytes; + + void perf_start(const std::string name); + void perf_stop(); + void log_perf(); + void periodic_stats(); + void arm_periodic_stats(); + + void pwl_init(Context *on_finish, pwl::DeferredContexts &later); + void update_image_cache_state(Context *on_finish); + void load_existing_entries(pwl::DeferredContexts &later); + void wake_up(); + void process_work(); + + void flush_dirty_entries(Context *on_finish); + bool can_flush_entry(const std::shared_ptr log_entry); + Context *construct_flush_entry_ctx(const std::shared_ptr log_entry); + void persist_last_flushed_sync_gen(); + bool handle_flushed_sync_point(std::shared_ptr log_entry); + void sync_point_writer_flushed(std::shared_ptr log_entry); + void process_writeback_dirty_entries(); + bool can_retire_entry(const std::shared_ptr log_entry); + bool retire_entries(const unsigned long int frees_per_tx); + + void init_flush_new_sync_point(pwl::DeferredContexts &later); + void new_sync_point(pwl::DeferredContexts &later); + pwl::C_FlushRequest>* make_flush_req(Context *on_finish); + void flush_new_sync_point_if_needed(C_FlushRequestT *flush_req, pwl::DeferredContexts &later); + + void dispatch_deferred_writes(void); + void alloc_and_dispatch_io_req(C_BlockIORequestT *write_req); + void append_scheduled_ops(void); + void enlist_op_appender(); + void schedule_append(pwl::GenericLogOperations &ops); + void flush_then_append_scheduled_ops(void); + void enlist_op_flusher(); + void alloc_op_log_entries(pwl::GenericLogOperations &ops); + void flush_op_log_entries(pwl::GenericLogOperationsVector &ops); + int append_op_log_entries(pwl::GenericLogOperations &ops); + void complete_op_log_entries(pwl::GenericLogOperations &&ops, const int r); + void schedule_complete_op_log_entries(pwl::GenericLogOperations &&ops, const int r); + void internal_flush(bool invalidate, Context *on_finish); +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::pwl::AbstractWriteLog; + +#endif // CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG diff --git a/src/librbd/cache/pwl/ImageCacheState.cc b/src/librbd/cache/pwl/ImageCacheState.cc new file mode 100644 index 0000000000000..1aa9c73a7ec6b --- /dev/null +++ b/src/librbd/cache/pwl/ImageCacheState.cc @@ -0,0 +1,180 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/cache/Types.h" +#include "librbd/cache/Utils.h" +#include "librbd/cache/pwl/ImageCacheState.h" 
+#include "librbd/ImageCtx.h" +#include "librbd/Operations.h" +#include "common/environment.h" +#include "common/hostname.h" +#include "common/config_proxy.h" +#include "common/ceph_json.h" + +#undef dout_subsys +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::ImageCacheState: " \ + << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +namespace { +bool get_json_format(const std::string& s, JSONFormattable *f) { + JSONParser p; + bool success = p.parse(s.c_str(), s.size()); + if (success) { + decode_json_obj(*f, &p); + } + return success; +} +} // namespace + +template +ImageCacheState::ImageCacheState(I *image_ctx) : m_image_ctx(image_ctx) { + ldout(image_ctx->cct, 20) << "Initialize RWL cache state with config data. " + << dendl; + + ConfigProxy &config = image_ctx->config; + host = ceph_get_short_hostname(); + path = config.get_val("rbd_rwl_path"); + size = config.get_val("rbd_rwl_size"); + log_periodic_stats = config.get_val("rbd_rwl_log_periodic_stats"); +} + +template +ImageCacheState::ImageCacheState( + I *image_ctx, JSONFormattable &f) : m_image_ctx(image_ctx) { + ldout(image_ctx->cct, 20) << "Initialize RWL cache state with data from " + << "server side"<< dendl; + + present = (bool)f["present"]; + empty = (bool)f["empty"]; + clean = (bool)f["clean"]; + host = (string)f["rwl_host"]; + path = (string)f["rwl_path"]; + uint64_t pwl_size; + std::istringstream iss(f["rwl_size"]); + iss >> pwl_size; + size = pwl_size; + + // Others from config + ConfigProxy &config = image_ctx->config; + log_periodic_stats = config.get_val("rbd_rwl_log_periodic_stats"); +} + +template +void ImageCacheState::write_image_cache_state(Context *on_finish) { + std::shared_lock owner_lock{m_image_ctx->owner_lock}; + JSONFormattable f; + ::encode_json(IMAGE_CACHE_STATE.c_str(), *this, &f); + std::ostringstream oss; + f.flush(oss); + std::string image_state_json = oss.str(); + + ldout(m_image_ctx->cct, 20) << __func__ << " Store state: " + << image_state_json << dendl; + m_image_ctx->operations->execute_metadata_set(IMAGE_CACHE_STATE, + image_state_json, on_finish); +} + +template +void ImageCacheState::clear_image_cache_state(Context *on_finish) { + std::shared_lock owner_lock{m_image_ctx->owner_lock}; + ldout(m_image_ctx->cct, 20) << __func__ << " Remove state: " << dendl; + m_image_ctx->operations->execute_metadata_remove(IMAGE_CACHE_STATE, on_finish); +} + +template +void ImageCacheState::dump(ceph::Formatter *f) const { + ::encode_json("present", present, f); + ::encode_json("empty", empty, f); + ::encode_json("clean", clean, f); + ::encode_json("cache_type", (int)get_image_cache_type(), f); + ::encode_json("pwl_host", host, f); + ::encode_json("pwl_path", path, f); + ::encode_json("pwl_size", size, f); +} + +template +ImageCacheState* ImageCacheState::get_image_cache_state( + I* image_ctx, int &r) { + std::string cache_state_str; + ImageCacheState* cache_state = nullptr; + ldout(image_ctx->cct, 20) << "image_cache_state:" << cache_state_str << dendl; + + r = 0; + bool dirty_cache = image_ctx->test_features(RBD_FEATURE_DIRTY_CACHE); + if (dirty_cache) { + cls_client::metadata_get(&image_ctx->md_ctx, image_ctx->header_oid, + IMAGE_CACHE_STATE, &cache_state_str); + } + + bool pwl_enabled = cache::util::is_pwl_enabled(*image_ctx); + bool cache_desired = pwl_enabled; + cache_desired &= !image_ctx->read_only; + cache_desired &= !image_ctx->test_features(RBD_FEATURE_MIGRATING); + cache_desired &= 
!image_ctx->test_features(RBD_FEATURE_JOURNALING); + cache_desired &= !image_ctx->old_format; + + if (!dirty_cache && !cache_desired) { + ldout(image_ctx->cct, 20) << "Do not desire to use image cache." << dendl; + } else if (dirty_cache && !cache_desired) { + lderr(image_ctx->cct) << "There's a dirty cache, but RWL cache is disabled." + << dendl; + r = -EINVAL; + }else if ((!dirty_cache || cache_state_str.empty()) && cache_desired) { + cache_state = new ImageCacheState(image_ctx); + } else { + ceph_assert(!cache_state_str.empty()); + JSONFormattable f; + bool success = get_json_format(cache_state_str, &f); + if (!success) { + lderr(image_ctx->cct) << "Failed to parse cache state: " + << cache_state_str << dendl; + r = -EINVAL; + return nullptr; + } + + bool cache_exists = (bool)f["present"]; + int cache_type = (int)f["cache_type"]; + + switch (cache_type) { + case IMAGE_CACHE_TYPE_RWL: + if (!cache_exists) { + cache_state = new ImageCacheState(image_ctx); + } else { + cache_state = new ImageCacheState(image_ctx, f); + } + break; + default: + r = -EINVAL; + } + } + return cache_state; +} + +template +bool ImageCacheState::is_valid() { + if (this->present && + (host.compare(ceph_get_short_hostname()) != 0)) { + auto cleanstring = "dirty"; + if (this->clean) { + cleanstring = "clean"; + } + lderr(m_image_ctx->cct) << "An image cache (RWL) remains on another host " + << host << " which is " << cleanstring + << ". Flush/close the image there to remove the " + << "image cache" << dendl; + return false; + } + return true; +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::ImageCacheState; diff --git a/src/librbd/cache/pwl/ImageCacheState.h b/src/librbd/cache/pwl/ImageCacheState.h new file mode 100644 index 0000000000000..2bdf31ffeed71 --- /dev/null +++ b/src/librbd/cache/pwl/ImageCacheState.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H +#define CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H + +#include "librbd/ImageCtx.h" +#include "librbd/cache/Types.h" +#include + +class JSONFormattable; +namespace ceph { + class Formatter; +} + +namespace librbd { +namespace cache { +namespace pwl { + +template +class ImageCacheState { +private: + ImageCtxT* m_image_ctx; +public: + bool present = true; + bool empty = true; + bool clean = true; + std::string host; + std::string path; + uint64_t size; + bool log_periodic_stats; + + ImageCacheState(ImageCtxT* image_ctx); + + ImageCacheState(ImageCtxT* image_ctx, JSONFormattable& f); + + ~ImageCacheState() {} + + ImageCacheType get_image_cache_type() const { + return IMAGE_CACHE_TYPE_RWL; + } + + + void write_image_cache_state(Context *on_finish); + + void clear_image_cache_state(Context *on_finish); + + void dump(ceph::Formatter *f) const; + + static ImageCacheState* get_image_cache_state( + ImageCtxT* image_ctx, int &r); + + bool is_valid(); +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::pwl::ImageCacheState; + +#endif // CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H diff --git a/src/librbd/cache/pwl/InitRequest.cc b/src/librbd/cache/pwl/InitRequest.cc new file mode 100644 index 0000000000000..1519272463222 --- /dev/null +++ b/src/librbd/cache/pwl/InitRequest.cc @@ -0,0 +1,171 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/cache/pwl/InitRequest.h" 
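
[Editor's note] ImageCacheState round-trips through the image metadata key IMAGE_CACHE_STATE as a JSON blob: dump() encodes the fields and get_image_cache_state() parses them back through the get_json_format() helper shown above. Note that, as written, dump() emits pwl_-prefixed keys while the JSONFormattable constructor reads rwl_-prefixed ones; the illustration below uses the keys the parser expects. All field values here are invented, and the snippet only mirrors calls that already appear in this file:

  #include "common/ceph_json.h"
  #include <string>

  // Hypothetical example of the blob stored under IMAGE_CACHE_STATE.
  void parse_example() {
    std::string blob =
      R"({"present":"true","empty":"false","clean":"true","cache_type":"1",)"
      R"("rwl_host":"node-a","rwl_path":"/mnt/pmem/rbd-pwl.pool","rwl_size":"1073741824"})";
    JSONParser p;
    JSONFormattable f;
    if (p.parse(blob.c_str(), blob.size())) {   // same calls get_json_format() uses
      decode_json_obj(f, &p);
      bool present = (bool)f["present"];        // field access mirrors the constructor
      (void)present;
    }
  }
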
+#include "librbd/Utils.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/asio/ContextWQ.h" + +#if defined(WITH_RBD_RWL) +#include "librbd/cache/pwl/ImageCacheState.h" +#include "librbd/cache/WriteLogCache.h" +#endif // WITH_RBD_RWL + +#include "librbd/cache/Utils.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl:InitRequest " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; + +template +InitRequest* InitRequest::create(I &image_ctx, + Context *on_finish) { + return new InitRequest(image_ctx, on_finish); +} + +template +InitRequest::InitRequest(I &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), + m_on_finish(create_async_context_callback(image_ctx, on_finish)), + m_error_result(0) { +} + +template +void InitRequest::send() { +#if defined(WITH_RBD_RWL) + get_image_cache_state(); +#else + finish(); +#endif // WITH_RBD_RWL +} + +#if defined(WITH_RBD_RWL) +template +void InitRequest::get_image_cache_state() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + int r; + auto cache_state = ImageCacheState::get_image_cache_state(&m_image_ctx, r); + + if (r < 0 || !cache_state) { + save_result(r); + finish(); + return; + } else if (!cache_state->is_valid()) { + delete cache_state; + cache_state = nullptr; + lderr(cct) << "failed to get image cache state: " << cpp_strerror(r) + << dendl; + save_result(-ENOENT); + finish(); + return; + } + + auto cache_type = cache_state->get_image_cache_type(); + switch(cache_type) { + case cache::IMAGE_CACHE_TYPE_RWL: + m_image_ctx.image_cache = + new librbd::cache::WriteLogCache(m_image_ctx, + cache_state); + break; + default: + delete cache_state; + cache_state = nullptr; + save_result(-ENOENT); + finish(); + return; + } + + init_image_cache(); +} + +template +void InitRequest::init_image_cache() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = InitRequest; + Context *ctx = create_context_callback( + this); + m_image_ctx.image_cache->init(ctx); +} + +template +void InitRequest::handle_init_image_cache(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + if (r < 0) { + lderr(cct) << "failed to init image cache: " << cpp_strerror(r) + << dendl; + delete m_image_ctx.image_cache; + m_image_ctx.image_cache = nullptr; + save_result(r); + finish(); + return; + } + set_feature_bit(); +} + +template +void InitRequest::set_feature_bit() { + CephContext *cct = m_image_ctx.cct; + + uint64_t new_features = m_image_ctx.features | RBD_FEATURE_DIRTY_CACHE; + uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE; + ldout(cct, 10) << "old_features=" << m_image_ctx.features + << ", new_features=" << new_features + << ", features_mask=" << features_mask + << dendl; + + int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx, + m_image_ctx.header_oid, + new_features, features_mask); + m_image_ctx.features |= RBD_FEATURE_DIRTY_CACHE; + using klass = InitRequest; + Context *ctx = create_context_callback( + this); + ctx->complete(r); +} + +template +void InitRequest::handle_set_feature_bit(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to set feature bit: " << cpp_strerror(r) + << dendl; + save_result(r); + } else if 
(m_image_ctx.discard_granularity_bytes) { + ldout(cct, 1) << "RWL image cache is enabled and " + << "set discard_granularity_bytes = 0." << dendl; + m_image_ctx.discard_granularity_bytes = 0; + } + finish(); +} + +#endif // WITH_RBD_RWL + +template +void InitRequest::finish() { + m_on_finish->complete(m_error_result); + delete this; +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::InitRequest; diff --git a/src/librbd/cache/pwl/InitRequest.h b/src/librbd/cache/pwl/InitRequest.h new file mode 100644 index 0000000000000..a18bfa3bb6daa --- /dev/null +++ b/src/librbd/cache/pwl/InitRequest.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H +#define CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace cache { +namespace pwl { + +template +class ImageCacheState; + +template +class InitRequest { +public: + static InitRequest* create(ImageCtxT &image_ctx, Context *on_finish); + + void send(); + +private: + + /** + * @verbatim + * + * Init request goes through the following state machine: + * + * + * | + * v + * GET_IMAGE_CACHE_STATE + * | + * v + * INIT_IMAGE_CACHE + * | + * v + * SET_FEATURE_BIT + * | + * v + * + * + * @endverbatim + */ + + InitRequest(ImageCtxT &image_ctx, Context *on_finish); + + ImageCtxT &m_image_ctx; + Context *m_on_finish; + + int m_error_result; + + bool is_pwl_enabled(); + + void get_image_cache_state(); + + void init_image_cache(); + void handle_init_image_cache(int r); + + void set_feature_bit(); + void handle_set_feature_bit(int r); + + void finish(); + + void save_result(int result) { + if (m_error_result == 0 && result < 0) { + m_error_result = result; + } + } +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::pwl::InitRequest; + +#endif // CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H diff --git a/src/librbd/cache/pwl/LogEntry.cc b/src/librbd/cache/pwl/LogEntry.cc new file mode 100644 index 0000000000000..4e7612c94f043 --- /dev/null +++ b/src/librbd/cache/pwl/LogEntry.cc @@ -0,0 +1,228 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include "LogEntry.h" +#include "librbd/cache/ImageWriteback.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::LogEntry: " << this << " " \ + << __func__ << ": " + +namespace librbd { + +namespace cache { + +namespace pwl { + +std::ostream& GenericLogEntry::format(std::ostream &os) const { + os << "ram_entry=[" << ram_entry << "], " + << "pmem_entry=" << (void*)pmem_entry << ", " + << "log_entry_index=" << log_entry_index << ", " + << "completed=" << completed; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const GenericLogEntry &entry) { + return entry.format(os); +} + +std::ostream& SyncPointLogEntry::format(std::ostream &os) const { + os << "(Sync Point) "; + GenericLogEntry::format(os); + os << ", " + << "writes=" << writes << ", " + << "bytes=" << bytes << ", " + << "writes_completed=" << writes_completed << ", " + << "writes_flushed=" << writes_flushed << ", " + << "prior_sync_point_flushed=" << prior_sync_point_flushed << ", " + << "next_sync_point_entry=" << next_sync_point_entry; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const SyncPointLogEntry &entry) { + return entry.format(os); 
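
[Editor's note] InitRequest above is a small callback-driven state machine (get_image_cache_state -> init_image_cache -> set_feature_bit -> finish, as the @verbatim diagram in the header shows). Purely as a sketch of that chaining style, with hypothetical names and none of the librbd types:

  #include <functional>
  #include <iostream>

  // Hypothetical, heavily condensed version of the InitRequest flow:
  // each step runs, then hands control to the next handler, and the
  // request deletes itself once the user callback has fired.
  class MiniInitRequest {
  public:
    static void create_and_send(std::function<void(int)> on_finish) {
      (new MiniInitRequest(std::move(on_finish)))->send();
    }
  private:
    explicit MiniInitRequest(std::function<void(int)> cb)
      : m_on_finish(std::move(cb)) {}
    void send()                  { get_image_cache_state(); }
    void get_image_cache_state() { std::cout << "load persisted cache state\n"; init_image_cache(); }
    void init_image_cache()      { std::cout << "init image cache\n"; set_feature_bit(); }
    void set_feature_bit()       { std::cout << "set dirty-cache feature bit\n"; finish(0); }
    void finish(int r)           { m_on_finish(r); delete this; }  // self-deleting, like the real request

    std::function<void(int)> m_on_finish;
  };

  // Usage: MiniInitRequest::create_and_send([](int r) { /* handle r */ });
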
+} + +bool GenericWriteLogEntry::can_writeback() const { + return (this->completed && + (ram_entry.sequenced || + (sync_point_entry && + sync_point_entry->completed))); +} + +std::ostream& GenericWriteLogEntry::format(std::ostream &os) const { + GenericLogEntry::format(os); + os << ", " + << "sync_point_entry=["; + if (sync_point_entry) { + os << *sync_point_entry; + } else { + os << "nullptr"; + } + os << "], " + << "referring_map_entries=" << referring_map_entries; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const GenericWriteLogEntry &entry) { + return entry.format(os); +} + +void WriteLogEntry::init(bool has_data, std::vector::iterator allocation, + uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush) { + ram_entry.has_data = 1; + ram_entry.write_data = allocation->buffer_oid; + ceph_assert(!TOID_IS_NULL(ram_entry.write_data)); + pmem_buffer = D_RW(ram_entry.write_data); + ram_entry.sync_gen_number = current_sync_gen; + if (persist_on_flush) { + /* Persist on flush. Sequence #0 is never used. */ + ram_entry.write_sequence_number = 0; + } else { + /* Persist on write */ + ram_entry.write_sequence_number = last_op_sequence_num; + ram_entry.sequenced = 1; + } + ram_entry.sync_point = 0; + ram_entry.discard = 0; +} + +void WriteLogEntry::init_pmem_bp() { + ceph_assert(!pmem_bp.have_raw()); + pmem_bp = buffer::ptr(buffer::create_static(this->write_bytes(), (char*)pmem_buffer)); +} + +void WriteLogEntry::init_pmem_bl() { + pmem_bl.clear(); + init_pmem_bp(); + ceph_assert(pmem_bp.have_raw()); + int before_bl = pmem_bp.raw_nref(); + this->init_bl(pmem_bp, pmem_bl); + int after_bl = pmem_bp.raw_nref(); + bl_refs = after_bl - before_bl; +} + +unsigned int WriteLogEntry::reader_count() const { + if (pmem_bp.have_raw()) { + return (pmem_bp.raw_nref() - bl_refs - 1); + } else { + return 0; + } +} + +/* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */ +buffer::list& WriteLogEntry::get_pmem_bl() { + if (0 == bl_refs) { + std::lock_guard locker(m_entry_bl_lock); + if (0 == bl_refs) { + init_pmem_bl(); + } + ceph_assert(0 != bl_refs); + } + return pmem_bl; +} + +/* Constructs a new bl containing copies of pmem_bp */ +void WriteLogEntry::copy_pmem_bl(bufferlist *out_bl) { + this->get_pmem_bl(); + /* pmem_bp is now initialized */ + buffer::ptr cloned_bp(pmem_bp.clone()); + out_bl->clear(); + this->init_bl(cloned_bp, *out_bl); +} + +void WriteLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx) { + /* Pass a copy of the pmem buffer to ImageWriteback (which may hang on to the bl even after flush()). 
*/ + bufferlist entry_bl; + buffer::list entry_bl_copy; + copy_pmem_bl(&entry_bl_copy); + entry_bl_copy.begin(0).copy(write_bytes(), entry_bl); + image_writeback.aio_write({{ram_entry.image_offset_bytes, ram_entry.write_bytes}}, + std::move(entry_bl), 0, ctx); +} + +std::ostream& WriteLogEntry::format(std::ostream &os) const { + os << "(Write) "; + GenericWriteLogEntry::format(os); + os << ", " + << "pmem_buffer=" << (void*)pmem_buffer << ", "; + os << "pmem_bp=" << pmem_bp << ", "; + os << "pmem_bl=" << pmem_bl << ", "; + os << "bl_refs=" << bl_refs; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const WriteLogEntry &entry) { + return entry.format(os); +} + +void DiscardLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx) { + image_writeback.aio_discard(ram_entry.image_offset_bytes, ram_entry.write_bytes, + m_discard_granularity_bytes, ctx); +} + +void DiscardLogEntry::init(uint64_t current_sync_gen, bool persist_on_flush, uint64_t last_op_sequence_num) { + ram_entry.sync_gen_number = current_sync_gen; + if (persist_on_flush) { + /* Persist on flush. Sequence #0 is never used. */ + ram_entry.write_sequence_number = 0; + } else { + /* Persist on write */ + ram_entry.write_sequence_number = last_op_sequence_num; + ram_entry.sequenced = 1; + } +} + +std::ostream &DiscardLogEntry::format(std::ostream &os) const { + os << "(Discard) "; + GenericWriteLogEntry::format(os); + return os; +} + +std::ostream &operator<<(std::ostream &os, + const DiscardLogEntry &entry) { + return entry.format(os); +} + +void WriteSameLogEntry::init_bl(buffer::ptr &bp, buffer::list &bl) { + for (uint64_t i = 0; i < ram_entry.write_bytes / ram_entry.ws_datalen; i++) { + bl.append(bp); + } + int trailing_partial = ram_entry.write_bytes % ram_entry.ws_datalen; + if (trailing_partial) { + bl.append(bp, 0, trailing_partial); + } +} + +void WriteSameLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx) { + bufferlist entry_bl; + buffer::list entry_bl_copy; + copy_pmem_bl(&entry_bl_copy); + entry_bl_copy.begin(0).copy(write_bytes(), entry_bl); + image_writeback.aio_writesame(ram_entry.image_offset_bytes, ram_entry.write_bytes, + std::move(entry_bl), 0, ctx); +} + +std::ostream &WriteSameLogEntry::format(std::ostream &os) const { + os << "(WriteSame) "; + WriteLogEntry::format(os); + return os; +} + +std::ostream &operator<<(std::ostream &os, + const WriteSameLogEntry &entry) { + return entry.format(os); +} + +} // namespace pwl +} // namespace cache +} // namespace librbd diff --git a/src/librbd/cache/pwl/LogEntry.h b/src/librbd/cache/pwl/LogEntry.h new file mode 100644 index 0000000000000..fb0f7d3fd0b42 --- /dev/null +++ b/src/librbd/cache/pwl/LogEntry.h @@ -0,0 +1,267 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H +#define CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H + +#include "common/ceph_mutex.h" +#include "librbd/Utils.h" +#include "librbd/cache/pwl/Types.h" +#include +#include + +namespace librbd { +namespace cache { +class ImageWritebackInterface; +namespace pwl { + +class SyncPointLogEntry; +class GenericWriteLogEntry; +class WriteLogEntry; + +typedef std::list> GenericWriteLogEntries; + +class GenericLogEntry { +public: + WriteLogPmemEntry ram_entry; + WriteLogPmemEntry *pmem_entry = nullptr; + uint32_t log_entry_index = 0; + bool completed = false; + GenericLogEntry(const uint64_t image_offset_bytes = 0, const uint64_t 
write_bytes = 0) + : ram_entry(image_offset_bytes, write_bytes) { + }; + virtual ~GenericLogEntry() { }; + GenericLogEntry(const GenericLogEntry&) = delete; + GenericLogEntry &operator=(const GenericLogEntry&) = delete; + virtual bool can_writeback() const { + return false; + } + virtual bool can_retire() const { + return false; + } + virtual void set_flushed(bool flushed) { + ceph_assert(false); + } + virtual unsigned int write_bytes() const { + return 0; + }; + virtual unsigned int bytes_dirty() const { + return 0; + }; + virtual std::shared_ptr get_sync_point_entry() { + return nullptr; + } + virtual void writeback(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx) { + ceph_assert(false); + }; + virtual std::ostream& format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const GenericLogEntry &entry); +}; + +class SyncPointLogEntry : public GenericLogEntry { +public: + /* Writing entries using this sync gen number */ + std::atomic writes = {0}; + /* Total bytes for all writing entries using this sync gen number */ + std::atomic bytes = {0}; + /* Writing entries using this sync gen number that have completed */ + std::atomic writes_completed = {0}; + /* Writing entries using this sync gen number that have completed flushing to the writeback interface */ + std::atomic writes_flushed = {0}; + /* All writing entries using all prior sync gen numbers have been flushed */ + std::atomic prior_sync_point_flushed = {true}; + std::shared_ptr next_sync_point_entry = nullptr; + SyncPointLogEntry(const uint64_t sync_gen_number) { + ram_entry.sync_gen_number = sync_gen_number; + ram_entry.sync_point = 1; + }; + ~SyncPointLogEntry() override {}; + SyncPointLogEntry(const SyncPointLogEntry&) = delete; + SyncPointLogEntry &operator=(const SyncPointLogEntry&) = delete; + bool can_retire() const override { + return this->completed; + } + std::ostream& format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const SyncPointLogEntry &entry); +}; + +class GenericWriteLogEntry : public GenericLogEntry { +public: + uint32_t referring_map_entries = 0; + std::shared_ptr sync_point_entry; + GenericWriteLogEntry(std::shared_ptr sync_point_entry, + const uint64_t image_offset_bytes, const uint64_t write_bytes) + : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(sync_point_entry) { } + GenericWriteLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes) + : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(nullptr) { } + ~GenericWriteLogEntry() override {}; + GenericWriteLogEntry(const GenericWriteLogEntry&) = delete; + GenericWriteLogEntry &operator=(const GenericWriteLogEntry&) = delete; + unsigned int write_bytes() const override { + /* The valid bytes in this ops data buffer. Discard and WS override. */ + return ram_entry.write_bytes; + }; + unsigned int bytes_dirty() const override { + /* The bytes in the image this op makes dirty. Discard and WS override. 
*/ + return write_bytes(); + }; + BlockExtent block_extent() { + return ram_entry.block_extent(); + } + uint32_t get_map_ref() { + return(referring_map_entries); + } + void inc_map_ref() { referring_map_entries++; } + void dec_map_ref() { referring_map_entries--; } + bool can_writeback() const override; + std::shared_ptr get_sync_point_entry() override { + return sync_point_entry; + } + virtual void copy_pmem_bl(bufferlist *out_bl) = 0; + void set_flushed(bool flushed) override { + m_flushed = flushed; + } + bool get_flushed() const { + return m_flushed; + } + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const GenericWriteLogEntry &entry); + +private: + bool m_flushed = false; /* or invalidated */ +}; + +class WriteLogEntry : public GenericWriteLogEntry { +protected: + buffer::ptr pmem_bp; + buffer::list pmem_bl; + std::atomic bl_refs = {0}; /* The refs held on pmem_bp by pmem_bl */ + /* Used in WriteLogEntry::get_pmem_bl() to syncronize between threads making entries readable */ + mutable ceph::mutex m_entry_bl_lock; + + void init_pmem_bp(); + + /* Write same will override */ + virtual void init_bl(buffer::ptr &bp, buffer::list &bl) { + bl.append(bp); + } + + void init_pmem_bl(); + +public: + uint8_t *pmem_buffer = nullptr; + WriteLogEntry(std::shared_ptr sync_point_entry, + const uint64_t image_offset_bytes, const uint64_t write_bytes) + : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes), + m_entry_bl_lock(ceph::make_mutex(util::unique_lock_name( + "librbd::cache::pwl::WriteLogEntry::m_entry_bl_lock", this))) + { } + WriteLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes) + : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes), + m_entry_bl_lock(ceph::make_mutex(util::unique_lock_name( + "librbd::cache::pwl::WriteLogEntry::m_entry_bl_lock", this))) + { } + ~WriteLogEntry() override {}; + WriteLogEntry(const WriteLogEntry&) = delete; + WriteLogEntry &operator=(const WriteLogEntry&) = delete; + void init(bool has_data, std::vector::iterator allocation, + uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush); + BlockExtent block_extent(); + unsigned int reader_count() const; + /* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */ + buffer::list &get_pmem_bl(); + /* Constructs a new bl containing copies of pmem_bp */ + void copy_pmem_bl(bufferlist *out_bl) override; + void writeback(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx) override; + bool can_retire() const override { + return (this->completed && this->get_flushed() && (0 == reader_count())); + } + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const WriteLogEntry &entry); +}; + +class DiscardLogEntry : public GenericWriteLogEntry { +public: + DiscardLogEntry(std::shared_ptr sync_point_entry, + const uint64_t image_offset_bytes, const uint64_t write_bytes, + uint32_t discard_granularity_bytes) + : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes), + m_discard_granularity_bytes(discard_granularity_bytes) { + ram_entry.discard = 1; + }; + DiscardLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes) + : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes) { + ram_entry.discard = 1; + }; + DiscardLogEntry(const DiscardLogEntry&) = delete; + DiscardLogEntry &operator=(const DiscardLogEntry&) = delete; + unsigned int write_bytes() const override { + /* The 
valid bytes in this ops data buffer. */ + return 0; + }; + unsigned int bytes_dirty() const override { + /* The bytes in the image this op makes dirty. */ + return ram_entry.write_bytes; + }; + bool can_retire() const override { + return this->completed; + } + void copy_pmem_bl(bufferlist *out_bl) override { + ceph_assert(false); + } + void writeback(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx) override; + void init(uint64_t current_sync_gen, bool persist_on_flush, uint64_t last_op_sequence_num); + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const DiscardLogEntry &entry); +private: + uint32_t m_discard_granularity_bytes; +}; + +class WriteSameLogEntry : public WriteLogEntry { +protected: + void init_bl(buffer::ptr &bp, buffer::list &bl) override; + +public: + WriteSameLogEntry(std::shared_ptr sync_point_entry, + const uint64_t image_offset_bytes, const uint64_t write_bytes, + const uint32_t data_length) + : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) { + ram_entry.writesame = 1; + ram_entry.ws_datalen = data_length; + }; + WriteSameLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes, + const uint32_t data_length) + : WriteLogEntry(nullptr, image_offset_bytes, write_bytes) { + ram_entry.writesame = 1; + ram_entry.ws_datalen = data_length; + }; + WriteSameLogEntry(const WriteSameLogEntry&) = delete; + WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete; + unsigned int write_bytes() const override { + /* The valid bytes in this ops data buffer. */ + return ram_entry.ws_datalen; + }; + unsigned int bytes_dirty() const override { + /* The bytes in the image this op makes dirty. */ + return ram_entry.write_bytes; + }; + void writeback(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx) override; + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const WriteSameLogEntry &entry); +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H diff --git a/src/librbd/cache/pwl/LogMap.cc b/src/librbd/cache/pwl/LogMap.cc new file mode 100644 index 0000000000000..d05612ac4d31e --- /dev/null +++ b/src/librbd/cache/pwl/LogMap.cc @@ -0,0 +1,278 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "LogMap.h" +#include "include/ceph_assert.h" +#include "librbd/Utils.h" +#include "librbd/cache/pwl/LogEntry.h" + +namespace librbd { +namespace cache { +namespace pwl { + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::LogMap: " << this << " " \ + << __func__ << ": " +template +std::ostream &operator<<(std::ostream &os, + LogMapEntry &e) { + os << "block_extent=" << e.block_extent << ", " + << "log_entry=[" << e.log_entry << "]"; + return os; +} + +template +LogMapEntry::LogMapEntry(const BlockExtent block_extent, + std::shared_ptr log_entry) + : block_extent(block_extent) , log_entry(log_entry) { +} + +template +LogMapEntry::LogMapEntry(std::shared_ptr log_entry) + : block_extent(log_entry->block_extent()) , log_entry(log_entry) { +} + +template +LogMap::LogMap(CephContext *cct) + : m_cct(cct), + m_lock(ceph::make_mutex(util::unique_lock_name( + "librbd::cache::pwl::LogMap::m_lock", this))) { +} + +/** + * Add a write log entry to the map. Subsequent queries for blocks + * within this log entry's extent will find this log entry. 
Portions + * of prior write log entries overlapping with this log entry will + * be replaced in the map by this log entry. + * + * The map_entries field of the log entry object will be updated to + * contain this map entry. + * + * The map_entries fields of all log entries overlapping with this + * entry will be updated to remove the regions that overlap with + * this. + */ +template +void LogMap::add_log_entry(std::shared_ptr log_entry) { + std::lock_guard locker(m_lock); + add_log_entry_locked(log_entry); +} + +template +void LogMap::add_log_entries(std::list> &log_entries) { + std::lock_guard locker(m_lock); + ldout(m_cct, 20) << dendl; + for (auto &log_entry : log_entries) { + add_log_entry_locked(log_entry); + } +} + +/** + * Remove any map entries that refer to the supplied write log + * entry. + */ +template +void LogMap::remove_log_entry(std::shared_ptr log_entry) { + std::lock_guard locker(m_lock); + remove_log_entry_locked(log_entry); +} + +template +void LogMap::remove_log_entries(std::list> &log_entries) { + std::lock_guard locker(m_lock); + ldout(m_cct, 20) << dendl; + for (auto &log_entry : log_entries) { + remove_log_entry_locked(log_entry); + } +} + +/** + * Returns the list of all write log entries that overlap the specified block + * extent. This doesn't tell you which portions of these entries overlap the + * extent, or each other. For that, use find_map_entries(). A log entry may + * appear in the list more than once, if multiple map entries refer to it + * (e.g. the middle of that write log entry has been overwritten). + */ +template +std::list> LogMap::find_log_entries(BlockExtent block_extent) { + std::lock_guard locker(m_lock); + ldout(m_cct, 20) << dendl; + return find_log_entries_locked(block_extent); +} + +/** + * Returns the list of all write log map entries that overlap the + * specified block extent. 
+ */ +template +LogMapEntries LogMap::find_map_entries(BlockExtent block_extent) { + std::lock_guard locker(m_lock); + ldout(m_cct, 20) << dendl; + return find_map_entries_locked(block_extent); +} + +template +void LogMap::add_log_entry_locked(std::shared_ptr log_entry) { + LogMapEntry map_entry(log_entry); + ldout(m_cct, 20) << "block_extent=" << map_entry.block_extent + << dendl; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + LogMapEntries overlap_entries = find_map_entries_locked(map_entry.block_extent); + for (auto &entry : overlap_entries) { + ldout(m_cct, 20) << entry << dendl; + if (map_entry.block_extent.block_start <= entry.block_extent.block_start) { + if (map_entry.block_extent.block_end >= entry.block_extent.block_end) { + ldout(m_cct, 20) << "map entry completely occluded by new log entry" << dendl; + remove_map_entry_locked(entry); + } else { + ceph_assert(map_entry.block_extent.block_end < entry.block_extent.block_end); + /* The new entry occludes the beginning of the old entry */ + BlockExtent adjusted_extent(map_entry.block_extent.block_end, + entry.block_extent.block_end); + adjust_map_entry_locked(entry, adjusted_extent); + } + } else { + if (map_entry.block_extent.block_end >= entry.block_extent.block_end) { + /* The new entry occludes the end of the old entry */ + BlockExtent adjusted_extent(entry.block_extent.block_start, + map_entry.block_extent.block_start); + adjust_map_entry_locked(entry, adjusted_extent); + } else { + /* The new entry splits the old entry */ + split_map_entry_locked(entry, map_entry.block_extent); + } + } + } + add_map_entry_locked(map_entry); +} + +template +void LogMap::remove_log_entry_locked(std::shared_ptr log_entry) { + ldout(m_cct, 20) << "*log_entry=" << *log_entry << dendl; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + + LogMapEntries possible_hits = find_map_entries_locked(log_entry->block_extent()); + for (auto &possible_hit : possible_hits) { + if (possible_hit.log_entry == log_entry) { + /* This map entry refers to the specified log entry */ + remove_map_entry_locked(possible_hit); + } + } +} + +template +void LogMap::add_map_entry_locked(LogMapEntry &map_entry) { + ceph_assert(map_entry.log_entry); + m_block_to_log_entry_map.insert(map_entry); + map_entry.log_entry->inc_map_ref(); +} + +template +void LogMap::remove_map_entry_locked(LogMapEntry &map_entry) { + auto it = m_block_to_log_entry_map.find(map_entry); + ceph_assert(it != m_block_to_log_entry_map.end()); + + LogMapEntry erased = *it; + m_block_to_log_entry_map.erase(it); + erased.log_entry->dec_map_ref(); + if (0 == erased.log_entry->get_map_ref()) { + ldout(m_cct, 20) << "log entry has zero map entries: " << erased.log_entry << dendl; + } +} + +template +void LogMap::adjust_map_entry_locked(LogMapEntry &map_entry, BlockExtent &new_extent) { + auto it = m_block_to_log_entry_map.find(map_entry); + ceph_assert(it != m_block_to_log_entry_map.end()); + + LogMapEntry adjusted = *it; + m_block_to_log_entry_map.erase(it); + + m_block_to_log_entry_map.insert(LogMapEntry(new_extent, adjusted.log_entry)); +} + +template +void LogMap::split_map_entry_locked(LogMapEntry &map_entry, BlockExtent &removed_extent) { + auto it = m_block_to_log_entry_map.find(map_entry); + ceph_assert(it != m_block_to_log_entry_map.end()); + + LogMapEntry split = *it; + m_block_to_log_entry_map.erase(it); + + BlockExtent left_extent(split.block_extent.block_start, + removed_extent.block_start); + m_block_to_log_entry_map.insert(LogMapEntry(left_extent, split.log_entry)); + + BlockExtent 
right_extent(removed_extent.block_end, + split.block_extent.block_end); + m_block_to_log_entry_map.insert(LogMapEntry(right_extent, split.log_entry)); + + split.log_entry->inc_map_ref(); +} + +template +std::list> LogMap::find_log_entries_locked(const BlockExtent &block_extent) { + std::list> overlaps; + ldout(m_cct, 20) << "block_extent=" << block_extent << dendl; + + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + LogMapEntries map_entries = find_map_entries_locked(block_extent); + for (auto &map_entry : map_entries) { + overlaps.emplace_back(map_entry.log_entry); + } + return overlaps; +} + +/** + * TODO: Generalize this to do some arbitrary thing to each map + * extent, instead of returning a list. + */ +template +LogMapEntries LogMap::find_map_entries_locked(const BlockExtent &block_extent) { + LogMapEntries overlaps; + + ldout(m_cct, 20) << "block_extent=" << block_extent << dendl; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + auto p = m_block_to_log_entry_map.equal_range(LogMapEntry(block_extent)); + ldout(m_cct, 20) << "count=" << std::distance(p.first, p.second) << dendl; + for ( auto i = p.first; i != p.second; ++i ) { + LogMapEntry entry = *i; + overlaps.emplace_back(entry); + ldout(m_cct, 20) << entry << dendl; + } + return overlaps; +} + +/* We map block extents to write log entries, or portions of write log + * entries. These are both represented by a WriteLogMapEntry. When a + * GenericWriteLogEntry is added to this map, a WriteLogMapEntry is created to + * represent the entire block extent of the GenericWriteLogEntry, and the + * WriteLogMapEntry is added to the set. + * + * The set must not contain overlapping WriteLogMapEntrys. WriteLogMapEntrys + * in the set that overlap with one being added are adjusted (shrunk, split, + * or removed) before the new entry is added. + * + * This comparison works despite the ambiguity because we ensure the set + * contains no overlapping entries. This comparison works to find entries + * that overlap with a given block extent because equal_range() returns the + * first entry in which the extent doesn't end before the given extent + * starts, and the last entry for which the extent starts before the given + * extent ends (the first entry that the key is less than, and the last entry + * that is less than the key). 
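
[Editor's note] The comparison described above orders entries strictly by extent, and because the set never holds overlapping entries, equal_range() over a query extent brackets exactly the entries that overlap it. A small self-contained sketch of that trick using plain integer extents; Extent, ExtentCompare and find_overlaps are invented names, not the pwl classes:

  #include <set>
  #include <vector>
  #include <cstdint>

  struct Extent { uint64_t start, end; };  // half-open [start, end)

  // "Less than" only when lhs ends at or before rhs begins, i.e. when
  // the two extents do not overlap and lhs lies entirely to the left.
  struct ExtentCompare {
    bool operator()(const Extent &lhs, const Extent &rhs) const {
      return lhs.end <= rhs.start;
    }
  };

  using ExtentSet = std::set<Extent, ExtentCompare>;

  // Every stored extent that overlaps 'query' is "equivalent" to it
  // under the comparator, so equal_range() returns exactly that run.
  std::vector<Extent> find_overlaps(const ExtentSet &s, const Extent &query) {
    std::vector<Extent> out;
    auto range = s.equal_range(query);
    for (auto it = range.first; it != range.second; ++it) {
      out.push_back(*it);
    }
    return out;
  }
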
+ */ +template +bool LogMap::LogMapEntryCompare::operator()(const LogMapEntry &lhs, + const LogMapEntry &rhs) const { + if (lhs.block_extent.block_end <= rhs.block_extent.block_start) { + return true; + } + return false; +} + +} //namespace pwl +} //namespace cache +} //namespace librbd + +template class librbd::cache::pwl::LogMap; diff --git a/src/librbd/cache/pwl/LogMap.h b/src/librbd/cache/pwl/LogMap.h new file mode 100644 index 0000000000000..a053078961755 --- /dev/null +++ b/src/librbd/cache/pwl/LogMap.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H +#define CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H + +#include "librbd/BlockGuard.h" +#include + +namespace librbd { +namespace cache { +namespace pwl { + +/** + * WriteLogMap: maps block extents to GenericWriteLogEntries + * + * A WriteLogMapEntry (based on LogMapEntry) refers to a portion of a GenericWriteLogEntry + */ +template +class LogMapEntry { +public: + BlockExtent block_extent; + std::shared_ptr log_entry; + + LogMapEntry(BlockExtent block_extent, + std::shared_ptr log_entry = nullptr); + LogMapEntry(std::shared_ptr log_entry); + + template + friend std::ostream &operator<<(std::ostream &os, + LogMapEntry &e); +}; + +template +using LogMapEntries = std::list>; + +template +class LogMap { +public: + LogMap(CephContext *cct); + LogMap(const LogMap&) = delete; + LogMap &operator=(const LogMap&) = delete; + + void add_log_entry(std::shared_ptr log_entry); + void add_log_entries(std::list> &log_entries); + void remove_log_entry(std::shared_ptr log_entry); + void remove_log_entries(std::list> &log_entries); + std::list> find_log_entries(BlockExtent block_extent); + LogMapEntries find_map_entries(BlockExtent block_extent); + +private: + void add_log_entry_locked(std::shared_ptr log_entry); + void remove_log_entry_locked(std::shared_ptr log_entry); + void add_map_entry_locked(LogMapEntry &map_entry); + void remove_map_entry_locked(LogMapEntry &map_entry); + void adjust_map_entry_locked(LogMapEntry &map_entry, BlockExtent &new_extent); + void split_map_entry_locked(LogMapEntry &map_entry, BlockExtent &removed_extent); + std::list> find_log_entries_locked(const BlockExtent &block_extent); + LogMapEntries find_map_entries_locked(const BlockExtent &block_extent); + + using LogMapEntryT = LogMapEntry; + + class LogMapEntryCompare { + public: + bool operator()(const LogMapEntryT &lhs, + const LogMapEntryT &rhs) const; + }; + + using BlockExtentToLogMapEntries = std::set; + + CephContext *m_cct; + ceph::mutex m_lock; + BlockExtentToLogMapEntries m_block_to_log_entry_map; +}; + +} //namespace pwl +} //namespace cache +} //namespace librbd + +#endif //CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H diff --git a/src/librbd/cache/pwl/LogOperation.cc b/src/librbd/cache/pwl/LogOperation.cc new file mode 100644 index 0000000000000..d47eb3a066e1f --- /dev/null +++ b/src/librbd/cache/pwl/LogOperation.cc @@ -0,0 +1,338 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include "LogOperation.h" +#include "librbd/cache/pwl/Types.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::LogOperation: " << this << " " \ + << __func__ << ": " + +namespace librbd { + +namespace cache { + +namespace pwl { + +GenericLogOperation::GenericLogOperation(const utime_t dispatch_time, PerfCounters *perfcounter) + : m_perfcounter(perfcounter), 
dispatch_time(dispatch_time) { +} + +std::ostream& GenericLogOperation::format(std::ostream &os) const { + os << "dispatch_time=[" << dispatch_time << "], " + << "buf_persist_time=[" << buf_persist_time << "], " + << "buf_persist_comp_time=[" << buf_persist_comp_time << "], " + << "log_append_time=[" << log_append_time << "], " + << "log_append_comp_time=[" << log_append_comp_time << "], "; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const GenericLogOperation &op) { + return op.format(os); +} + +SyncPointLogOperation::SyncPointLogOperation(ceph::mutex &lock, + std::shared_ptr sync_point, + const utime_t dispatch_time, + PerfCounters *perfcounter, + CephContext *cct) + : GenericLogOperation(dispatch_time, perfcounter), m_cct(cct), m_lock(lock), sync_point(sync_point) { +} + +SyncPointLogOperation::~SyncPointLogOperation() { } + +std::ostream &SyncPointLogOperation::format(std::ostream &os) const { + os << "(Sync Point) "; + GenericLogOperation::format(os); + os << ", " + << "sync_point=[" << *sync_point << "]"; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const SyncPointLogOperation &op) { + return op.format(os); +} + +std::vector SyncPointLogOperation::append_sync_point() { + std::vector appending_contexts; + std::lock_guard locker(m_lock); + if (!sync_point->appending) { + sync_point->appending = true; + } + appending_contexts.swap(sync_point->on_sync_point_appending); + return appending_contexts; +} + +void SyncPointLogOperation::clear_earlier_sync_point() { + std::lock_guard locker(m_lock); + ceph_assert(sync_point->later_sync_point); + ceph_assert(sync_point->later_sync_point->earlier_sync_point == + sync_point); + sync_point->later_sync_point->earlier_sync_point = nullptr; +} + +std::vector SyncPointLogOperation::swap_on_sync_point_persisted() { + std::lock_guard locker(m_lock); + std::vector persisted_contexts; + persisted_contexts.swap(sync_point->on_sync_point_persisted); + return persisted_contexts; +} + +void SyncPointLogOperation::appending() { + ceph_assert(sync_point); + ldout(m_cct, 20) << "Sync point op=[" << *this + << "] appending" << dendl; + auto appending_contexts = append_sync_point(); + for (auto &ctx : appending_contexts) { + ctx->complete(0); + } +} + +void SyncPointLogOperation::complete(int result) { + ceph_assert(sync_point); + ldout(m_cct, 20) << "Sync point op =[" << *this + << "] completed" << dendl; + clear_earlier_sync_point(); + + /* Do append now in case completion occurred before the + * normal append callback executed, and to handle + * on_append work that was queued after the sync point + * entered the appending state. 
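
[Editor's note] The comment above describes a two-phase completion: "appending" waiters run once the log position is guaranteed, "persisted" waiters run once the entry is durable, and complete() drives appending() first so append waiters can never be skipped. A minimal sketch of that shape with a plain std::mutex and invented names (the real code swaps the vectors under m_lock in the same way):

  #include <functional>
  #include <mutex>
  #include <vector>

  // Hypothetical two-phase completion in the style described above.
  class MiniSyncCompletion {
    std::mutex m_lock;
    std::vector<std::function<void(int)>> m_on_appending;
    std::vector<std::function<void(int)>> m_on_persisted;

    std::vector<std::function<void(int)>> take(
        std::vector<std::function<void(int)>> &v) {
      std::lock_guard<std::mutex> l(m_lock);
      std::vector<std::function<void(int)>> out;
      out.swap(v);   // emptied under the lock, so a second call fires nothing
      return out;
    }
  public:
    void wait_appending(std::function<void(int)> cb) {
      std::lock_guard<std::mutex> l(m_lock);
      m_on_appending.push_back(std::move(cb));
    }
    void wait_persisted(std::function<void(int)> cb) {
      std::lock_guard<std::mutex> l(m_lock);
      m_on_persisted.push_back(std::move(cb));
    }
    void appending()     { for (auto &cb : take(m_on_appending)) cb(0); }
    void complete(int r) { appending(); for (auto &cb : take(m_on_persisted)) cb(r); }
  };
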
*/ + appending(); + auto persisted_contexts = swap_on_sync_point_persisted(); + for (auto &ctx : persisted_contexts) { + ctx->complete(result); + } +} + +GenericWriteLogOperation::GenericWriteLogOperation(std::shared_ptr sync_point, + const utime_t dispatch_time, + PerfCounters *perfcounter, + CephContext *cct) + : GenericLogOperation(dispatch_time, perfcounter), + m_lock(ceph::make_mutex(util::unique_lock_name( + "librbd::cache::pwl::GenericWriteLogOperation::m_lock", this))), + m_cct(cct), + sync_point(sync_point) { +} + +GenericWriteLogOperation::~GenericWriteLogOperation() { } + +std::ostream &GenericWriteLogOperation::format(std::ostream &os) const { + GenericLogOperation::format(os); + return os; +} + +std::ostream &operator<<(std::ostream &os, + const GenericWriteLogOperation &op) { + return op.format(os); +} + +/* Called when the write log operation is appending and its log position is guaranteed */ +void GenericWriteLogOperation::appending() { + Context *on_append = nullptr; + ldout(m_cct, 20) << __func__ << " " << this << dendl; + { + std::lock_guard locker(m_lock); + on_append = on_write_append; + on_write_append = nullptr; + } + if (on_append) { + ldout(m_cct, 20) << __func__ << " " << this << " on_append=" << on_append << dendl; + on_append->complete(0); + } +} + +/* Called when the write log operation is completed in all log replicas */ +void GenericWriteLogOperation::complete(int result) { + appending(); + Context *on_persist = nullptr; + ldout(m_cct, 20) << __func__ << " " << this << dendl; + { + std::lock_guard locker(m_lock); + on_persist = on_write_persist; + on_write_persist = nullptr; + } + if (on_persist) { + ldout(m_cct, 20) << __func__ << " " << this << " on_persist=" << on_persist << dendl; + on_persist->complete(result); + } +} + +WriteLogOperation::WriteLogOperation(WriteLogOperationSet &set, + uint64_t image_offset_bytes, uint64_t write_bytes, + CephContext *cct) + : GenericWriteLogOperation(set.sync_point, set.dispatch_time, set.perfcounter, cct), + log_entry(std::make_shared(set.sync_point->log_entry, image_offset_bytes, write_bytes)) { + on_write_append = set.extent_ops_appending->new_sub(); + on_write_persist = set.extent_ops_persist->new_sub(); + log_entry->sync_point_entry->writes++; + log_entry->sync_point_entry->bytes += write_bytes; +} + +WriteLogOperation::~WriteLogOperation() { } + +void WriteLogOperation::init(bool has_data, std::vector::iterator allocation, uint64_t current_sync_gen, + uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset, + bool persist_on_flush) { + log_entry->init(has_data, allocation, current_sync_gen, last_op_sequence_num, persist_on_flush); + buffer_alloc = &(*allocation); + bl.substr_of(write_req_bl, buffer_offset, + log_entry->write_bytes()); +} + +std::ostream &WriteLogOperation::format(std::ostream &os) const { + os << "(Write) "; + GenericWriteLogOperation::format(os); + os << ", "; + if (log_entry) { + os << "log_entry=[" << *log_entry << "], "; + } else { + os << "log_entry=nullptr, "; + } + os << "bl=[" << bl << "]," + << "buffer_alloc=" << buffer_alloc; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const WriteLogOperation &op) { + return op.format(os); +} + + +void WriteLogOperation::complete(int result) { + GenericWriteLogOperation::complete(result); + m_perfcounter->tinc(l_librbd_pwl_log_op_dis_to_buf_t, buf_persist_time - dispatch_time); + utime_t buf_lat = buf_persist_comp_time - buf_persist_time; + m_perfcounter->tinc(l_librbd_pwl_log_op_buf_to_bufc_t, buf_lat); + 
m_perfcounter->hinc(l_librbd_pwl_log_op_buf_to_bufc_t_hist, buf_lat.to_nsec(), + log_entry->ram_entry.write_bytes); + m_perfcounter->tinc(l_librbd_pwl_log_op_buf_to_app_t, log_append_time - buf_persist_time); +} + +void WriteLogOperation::copy_bl_to_pmem_buffer() { + /* operation is a shared_ptr, so write_op is only good as long as operation is in scope */ + bufferlist::iterator i(&bl); + m_perfcounter->inc(l_librbd_pwl_log_op_bytes, log_entry->write_bytes()); + ldout(m_cct, 20) << bl << dendl; + i.copy((unsigned)log_entry->write_bytes(), (char*)log_entry->pmem_buffer); +} + +void WriteLogOperation::flush_pmem_buf_to_cache(PMEMobjpool *log_pool) { + buf_persist_time = ceph_clock_now(); + pmemobj_flush(log_pool, log_entry->pmem_buffer, log_entry->write_bytes()); +} + +WriteLogOperationSet::WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr sync_point, + bool persist_on_flush, CephContext *cct, Context *on_finish) + : m_cct(cct), m_on_finish(on_finish), + persist_on_flush(persist_on_flush), + dispatch_time(dispatched), + perfcounter(perfcounter), + sync_point(sync_point) { + on_ops_appending = sync_point->prior_persisted_gather_new_sub(); + on_ops_persist = nullptr; + extent_ops_persist = + new C_Gather(m_cct, + new LambdaContext( [this](int r) { + ldout(this->m_cct,20) << __func__ << " " << this << " m_extent_ops_persist completed" << dendl; + if (on_ops_persist) { + on_ops_persist->complete(r); + } + m_on_finish->complete(r); + })); + auto appending_persist_sub = extent_ops_persist->new_sub(); + extent_ops_appending = + new C_Gather(m_cct, + new LambdaContext( [this, appending_persist_sub](int r) { + ldout(this->m_cct, 20) << __func__ << " " << this << " m_extent_ops_appending completed" << dendl; + on_ops_appending->complete(r); + appending_persist_sub->complete(r); + })); +} + +WriteLogOperationSet::~WriteLogOperationSet() { } + +std::ostream &operator<<(std::ostream &os, + const WriteLogOperationSet &s) { + os << "cell=" << (void*)s.cell << ", " + << "extent_ops_appending=[" << s.extent_ops_appending << ", " + << "extent_ops_persist=[" << s.extent_ops_persist << "]"; + return os; +} + +DiscardLogOperation::DiscardLogOperation(std::shared_ptr sync_point, + const uint64_t image_offset_bytes, + const uint64_t write_bytes, + uint32_t discard_granularity_bytes, + const utime_t dispatch_time, + PerfCounters *perfcounter, + CephContext *cct) + : GenericWriteLogOperation(sync_point, dispatch_time, perfcounter, cct), + log_entry(std::make_shared(sync_point->log_entry, + image_offset_bytes, + write_bytes, + discard_granularity_bytes)) { + on_write_append = sync_point->prior_persisted_gather_new_sub(); + on_write_persist = nullptr; + log_entry->sync_point_entry->writes++; + log_entry->sync_point_entry->bytes += write_bytes; +} + +DiscardLogOperation::~DiscardLogOperation() { } + +void DiscardLogOperation::init(uint64_t current_sync_gen, bool persist_on_flush, + uint64_t last_op_sequence_num, Context *write_persist) { + log_entry->init(current_sync_gen, persist_on_flush, last_op_sequence_num); + this->on_write_persist = write_persist; +} + +std::ostream &DiscardLogOperation::format(std::ostream &os) const { + os << "(Discard) "; + GenericWriteLogOperation::format(os); + os << ", "; + if (log_entry) { + os << "log_entry=[" << *log_entry << "], "; + } else { + os << "log_entry=nullptr, "; + } + return os; +} + +std::ostream &operator<<(std::ostream &os, + const DiscardLogOperation &op) { + return op.format(os); +} + 
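
[Editor's note] WriteLogOperationSet above wires two C_Gather completions together: every operation holds a sub-context on the "appending" gather and one on the "persist" gather, and the appending gather itself holds an extra sub of the persist gather, so the set only finishes after all appends and then all persists. A stripped-down, counter-based sketch of that dependency; MiniGather is an invented stand-in, not the Ceph Context/C_Gather classes (a real C_Gather also defers firing until activation, which this toy skips):

  #include <functional>
  #include <iostream>

  // Hypothetical miniature of a gather: fires 'on_done' when all subs complete.
  class MiniGather {
    int pending = 0;
    std::function<void()> on_done;
  public:
    explicit MiniGather(std::function<void()> cb) : on_done(std::move(cb)) {}
    std::function<void()> new_sub() {
      ++pending;
      return [this] { if (--pending == 0) on_done(); };
    }
  };

  int main() {
    MiniGather persist([] { std::cout << "all ops persisted, set finished\n"; });
    auto persist_link = persist.new_sub();
    // The appending gather feeds one sub of the persist gather, mirroring
    // extent_ops_appending -> extent_ops_persist in WriteLogOperationSet.
    MiniGather appending([persist_link] {
      std::cout << "all ops appending\n";
      persist_link();
    });

    auto op_append  = appending.new_sub();  // per-operation sub-contexts
    auto op_persist = persist.new_sub();

    op_append();   // this op's log position is guaranteed
    op_persist();  // this op's data is persisted
    return 0;
  }
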
+WriteSameLogOperation::WriteSameLogOperation(WriteLogOperationSet &set, + uint64_t image_offset_bytes, + uint64_t write_bytes, + uint32_t data_len, + CephContext *cct) + : WriteLogOperation(set, image_offset_bytes, write_bytes, cct) { + log_entry = + std::make_shared(set.sync_point->log_entry, image_offset_bytes, write_bytes, data_len); + ldout(m_cct, 20) << __func__ << " " << this << dendl; +} + +WriteSameLogOperation::~WriteSameLogOperation() { } + +std::ostream &WriteSameLogOperation::format(std::ostream &os) const { + os << "(Write Same) "; + WriteLogOperation::format(os); + return os; +} + +std::ostream &operator<<(std::ostream &os, + const WriteSameLogOperation &op) { + return op.format(os); +} + +} // namespace pwl +} // namespace cache +} // namespace librbd diff --git a/src/librbd/cache/pwl/LogOperation.h b/src/librbd/cache/pwl/LogOperation.h new file mode 100644 index 0000000000000..8ae6351cc1b9d --- /dev/null +++ b/src/librbd/cache/pwl/LogOperation.h @@ -0,0 +1,231 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H +#define CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H + +#include "include/utime.h" +#include "librbd/cache/pwl/LogEntry.h" +#include "librbd/cache/pwl/SyncPoint.h" + +namespace librbd { +namespace cache { +namespace pwl { +struct WriteBufferAllocation; + +class WriteLogOperationSet; + +class WriteLogOperation; + +class GenericWriteLogOperation; + +class SyncPointLogOperation; + +class GenericLogOperation; + +using GenericLogOperationSharedPtr = std::shared_ptr; + +using GenericLogOperationsVector = std::vector; + +class GenericLogOperation { +protected: + PerfCounters *m_perfcounter = nullptr; +public: + utime_t dispatch_time; // When op created + utime_t buf_persist_time; // When buffer persist begins + utime_t buf_persist_comp_time; // When buffer persist completes + utime_t log_append_time; // When log append begins + utime_t log_append_comp_time; // When log append completes + GenericLogOperation(const utime_t dispatch_time, PerfCounters *perfcounter); + virtual ~GenericLogOperation() { }; + GenericLogOperation(const GenericLogOperation&) = delete; + GenericLogOperation &operator=(const GenericLogOperation&) = delete; + virtual std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const GenericLogOperation &op); + virtual const std::shared_ptr get_log_entry() = 0; + virtual void appending() = 0; + virtual void complete(int r) = 0; + virtual void mark_log_entry_completed() {}; + virtual bool reserved_allocated() const { + return false; + } + virtual bool is_writing_op() const { + return false; + } + virtual void copy_bl_to_pmem_buffer() {}; + virtual void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) {}; +}; + +class SyncPointLogOperation : public GenericLogOperation { +private: + CephContext *m_cct; + ceph::mutex &m_lock; + std::vector append_sync_point(); + void clear_earlier_sync_point(); + std::vector swap_on_sync_point_persisted(); +public: + std::shared_ptr sync_point; + SyncPointLogOperation(ceph::mutex &lock, + std::shared_ptr sync_point, + const utime_t dispatch_time, + PerfCounters *perfcounter, + CephContext *cct); + ~SyncPointLogOperation() override; + SyncPointLogOperation(const SyncPointLogOperation&) = delete; + SyncPointLogOperation &operator=(const SyncPointLogOperation&) = delete; + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const 
SyncPointLogOperation &op); + const std::shared_ptr get_log_entry() override { + return sync_point->log_entry; + } + void appending() override; + void complete(int r) override; +}; + +class GenericWriteLogOperation : public GenericLogOperation { +protected: + ceph::mutex m_lock; + CephContext *m_cct; +public: + std::shared_ptr sync_point; + Context *on_write_append = nullptr; /* Completion for things waiting on this + * write's position in the log to be + * guaranteed */ + Context *on_write_persist = nullptr; /* Completion for things waiting on this + * write to persist */ + GenericWriteLogOperation(std::shared_ptr sync_point, + const utime_t dispatch_time, + PerfCounters *perfcounter, + CephContext *cct); + ~GenericWriteLogOperation() override; + GenericWriteLogOperation(const GenericWriteLogOperation&) = delete; + GenericWriteLogOperation &operator=(const GenericWriteLogOperation&) = delete; + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const GenericWriteLogOperation &op); + void mark_log_entry_completed() override{ + sync_point->log_entry->writes_completed++; + } + bool reserved_allocated() const override { + return true; + } + bool is_writing_op() const override { + return true; + } + void appending() override; + void complete(int r) override; +}; + +class WriteLogOperation : public GenericWriteLogOperation { +public: + using GenericWriteLogOperation::m_lock; + using GenericWriteLogOperation::sync_point; + using GenericWriteLogOperation::on_write_append; + using GenericWriteLogOperation::on_write_persist; + std::shared_ptr log_entry; + bufferlist bl; + WriteBufferAllocation *buffer_alloc = nullptr; + WriteLogOperation(WriteLogOperationSet &set, const uint64_t image_offset_bytes, + const uint64_t write_bytes, CephContext *cct); + ~WriteLogOperation() override; + WriteLogOperation(const WriteLogOperation&) = delete; + WriteLogOperation &operator=(const WriteLogOperation&) = delete; + void init(bool has_data, std::vector::iterator allocation, uint64_t current_sync_gen, + uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset, + bool persist_on_flush); + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const WriteLogOperation &op); + const std::shared_ptr get_log_entry() override { + return log_entry; + } + + void complete(int r) override; + void copy_bl_to_pmem_buffer() override; + void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) override; +}; + + +class WriteLogOperationSet { +private: + CephContext *m_cct; + Context *m_on_finish; +public: + bool persist_on_flush; + BlockGuardCell *cell; + C_Gather *extent_ops_appending; + Context *on_ops_appending; + C_Gather *extent_ops_persist; + Context *on_ops_persist; + GenericLogOperationsVector operations; + utime_t dispatch_time; /* When set created */ + PerfCounters *perfcounter = nullptr; + std::shared_ptr sync_point; + WriteLogOperationSet(const utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr sync_point, + const bool persist_on_flush, CephContext *cct, Context *on_finish); + ~WriteLogOperationSet(); + WriteLogOperationSet(const WriteLogOperationSet&) = delete; + WriteLogOperationSet &operator=(const WriteLogOperationSet&) = delete; + friend std::ostream &operator<<(std::ostream &os, + const WriteLogOperationSet &s); +}; + +class DiscardLogOperation : public GenericWriteLogOperation { +public: + using GenericWriteLogOperation::m_lock; + using GenericWriteLogOperation::sync_point; + using 
GenericWriteLogOperation::on_write_append; + using GenericWriteLogOperation::on_write_persist; + std::shared_ptr log_entry; + DiscardLogOperation(std::shared_ptr sync_point, + const uint64_t image_offset_bytes, + const uint64_t write_bytes, + uint32_t discard_granularity_bytes, + const utime_t dispatch_time, + PerfCounters *perfcounter, + CephContext *cct); + ~DiscardLogOperation() override; + DiscardLogOperation(const DiscardLogOperation&) = delete; + DiscardLogOperation &operator=(const DiscardLogOperation&) = delete; + const std::shared_ptr get_log_entry() override { + return log_entry; + } + bool reserved_allocated() const override { + return false; + } + void init(uint64_t current_sync_gen, bool persist_on_flush, + uint64_t last_op_sequence_num, Context *write_persist); + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const DiscardLogOperation &op); +}; + +class WriteSameLogOperation : public WriteLogOperation { +public: + using GenericWriteLogOperation::m_lock; + using GenericWriteLogOperation::sync_point; + using GenericWriteLogOperation::on_write_append; + using GenericWriteLogOperation::on_write_persist; + using WriteLogOperation::log_entry; + using WriteLogOperation::bl; + using WriteLogOperation::buffer_alloc; + WriteSameLogOperation(WriteLogOperationSet &set, + const uint64_t image_offset_bytes, + const uint64_t write_bytes, + const uint32_t data_len, + CephContext *cct); + ~WriteSameLogOperation(); + WriteSameLogOperation(const WriteSameLogOperation&) = delete; + WriteSameLogOperation &operator=(const WriteSameLogOperation&) = delete; + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const WriteSameLogOperation &op); +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H diff --git a/src/librbd/cache/pwl/ReadRequest.cc b/src/librbd/cache/pwl/ReadRequest.cc new file mode 100644 index 0000000000000..766e33febf17a --- /dev/null +++ b/src/librbd/cache/pwl/ReadRequest.cc @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ReadRequest.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::ReadRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +void C_ReadRequest::finish(int r) { + ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << dendl; + int hits = 0; + int misses = 0; + int hit_bytes = 0; + int miss_bytes = 0; + if (r >= 0) { + /* + * At this point the miss read has completed. We'll iterate through + * read_extents and produce *m_out_bl by assembling pieces of miss_bl + * and the individual hit extent bufs in the read extents that represent + * hits. + */ + uint64_t miss_bl_offset = 0; + for (auto &extent : read_extents) { + if (extent.m_bl.length()) { + /* This was a hit */ + ceph_assert(extent.second == extent.m_bl.length()); + ++hits; + hit_bytes += extent.second; + m_out_bl->claim_append(extent.m_bl); + } else { + /* This was a miss. 
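
[Editor's note] C_ReadRequest::finish() below stitches the reply together in extent order: extents that hit the cache contribute their own buffers, while misses are carved out of the single miss bufferlist by a running offset. A plain-STL sketch of that assembly; the types and function name are invented for illustration and stand in for the librbd bufferlist handling:

  #include <string>
  #include <vector>

  struct ReadExtent {
    size_t      length;
    std::string hit_data;   // non-empty when this extent was a cache hit
  };

  // Assemble the final read payload from per-extent hits plus one
  // contiguous buffer holding all of the miss data, in request order.
  std::string assemble_read(const std::vector<ReadExtent> &extents,
                            const std::string &miss_data) {
    std::string out;
    size_t miss_offset = 0;
    for (const auto &e : extents) {
      if (!e.hit_data.empty()) {
        out += e.hit_data;                               // cache hit
      } else {
        out += miss_data.substr(miss_offset, e.length);  // carve the next miss piece
        miss_offset += e.length;
      }
    }
    return out;
  }
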
*/ + ++misses; + miss_bytes += extent.second; + bufferlist miss_extent_bl; + miss_extent_bl.substr_of(miss_bl, miss_bl_offset, extent.second); + /* Add this read miss bufferlist to the output bufferlist */ + m_out_bl->claim_append(miss_extent_bl); + /* Consume these bytes in the read miss bufferlist */ + miss_bl_offset += extent.second; + } + } + } + ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << " bl=" << *m_out_bl << dendl; + utime_t now = ceph_clock_now(); + ceph_assert((int)m_out_bl->length() == hit_bytes + miss_bytes); + m_on_finish->complete(r); + m_perfcounter->inc(l_librbd_pwl_rd_bytes, hit_bytes + miss_bytes); + m_perfcounter->inc(l_librbd_pwl_rd_hit_bytes, hit_bytes); + m_perfcounter->tinc(l_librbd_pwl_rd_latency, now - m_arrived_time); + if (!misses) { + m_perfcounter->inc(l_librbd_pwl_rd_hit_req, 1); + m_perfcounter->tinc(l_librbd_pwl_rd_hit_latency, now - m_arrived_time); + } else { + if (hits) { + m_perfcounter->inc(l_librbd_pwl_rd_part_hit_req, 1); + } + } +} + +} // namespace pwl +} // namespace cache +} // namespace librbd diff --git a/src/librbd/cache/pwl/ReadRequest.h b/src/librbd/cache/pwl/ReadRequest.h new file mode 100644 index 0000000000000..7c953547c875d --- /dev/null +++ b/src/librbd/cache/pwl/ReadRequest.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H +#define CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H + +#include "include/Context.h" +#include "librbd/cache/pwl/Types.h" + +namespace librbd { +namespace cache { +namespace pwl { + +typedef std::vector ImageExtentBufs; + +class C_ReadRequest : public Context { +public: + io::Extents miss_extents; // move back to caller + ImageExtentBufs read_extents; + bufferlist miss_bl; + + C_ReadRequest(CephContext *cct, utime_t arrived, PerfCounters *perfcounter, bufferlist *out_bl, Context *on_finish) + : m_cct(cct), m_on_finish(on_finish), m_out_bl(out_bl), + m_arrived_time(arrived), m_perfcounter(perfcounter) {} + ~C_ReadRequest() {} + + void finish(int r) override; + + const char *get_name() const { + return "C_ReadRequest"; + } + +private: + CephContext *m_cct; + Context *m_on_finish; + bufferlist *m_out_bl; + utime_t m_arrived_time; + PerfCounters *m_perfcounter; +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H diff --git a/src/librbd/cache/pwl/ReplicatedWriteLog.cc b/src/librbd/cache/pwl/ReplicatedWriteLog.cc new file mode 100644 index 0000000000000..5db62a5fd50f1 --- /dev/null +++ b/src/librbd/cache/pwl/ReplicatedWriteLog.cc @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// // vim: ts=8 sw=2 smarttab + +#include "ReplicatedWriteLog.h" +#include "include/buffer.h" +#include "include/Context.h" +#include "include/ceph_assert.h" +#include "common/deleter.h" +#include "common/dout.h" +#include "common/environment.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "common/Timer.h" +#include "common/perf_counters.h" +#include "librbd/ImageCtx.h" +#include "librbd/cache/pwl/ImageCacheState.h" +#include "librbd/cache/pwl/LogEntry.h" +#include +#include + +#undef dout_subsys +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::ReplicatedWriteLog: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { +using namespace librbd::cache::pwl; + +template 
+ReplicatedWriteLog::ReplicatedWriteLog(I &image_ctx, librbd::cache::pwl::ImageCacheState* cache_state) +: AbstractWriteLog(image_ctx, cache_state) +{ +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::ReplicatedWriteLog; diff --git a/src/librbd/cache/pwl/ReplicatedWriteLog.h b/src/librbd/cache/pwl/ReplicatedWriteLog.h new file mode 100644 index 0000000000000..dc1a46a4547c1 --- /dev/null +++ b/src/librbd/cache/pwl/ReplicatedWriteLog.h @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG +#define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG + +#include "common/RWLock.h" +#include "common/WorkQueue.h" +#include "common/AsyncOpTracker.h" +#include "librbd/cache/ImageCache.h" +#include "librbd/cache/ImageWriteback.h" +#include "librbd/Utils.h" +#include "librbd/BlockGuard.h" +#include "librbd/cache/Types.h" +#include "librbd/cache/pwl/LogOperation.h" +#include "librbd/cache/pwl/Request.h" +#include "librbd/cache/pwl/LogMap.h" +#include "AbstractWriteLog.h" +#include +#include + +class Context; +class SafeTimer; + +namespace librbd { + +struct ImageCtx; + +namespace cache { + +namespace pwl { + +template +class ReplicatedWriteLog : public AbstractWriteLog { +public: + typedef io::Extent Extent; + typedef io::Extents Extents; + + ReplicatedWriteLog(ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState* cache_state); + ~ReplicatedWriteLog(); + ReplicatedWriteLog(const ReplicatedWriteLog&) = delete; + ReplicatedWriteLog &operator=(const ReplicatedWriteLog&) = delete; + +private: + using This = AbstractWriteLog; + using C_WriteRequestT = pwl::C_WriteRequest; + using C_BlockIORequestT = pwl::C_BlockIORequest; + using C_FlushRequestT = pwl::C_FlushRequest; + using C_DiscardRequestT = pwl::C_DiscardRequest; + using C_WriteSameRequestT = pwl::C_WriteSameRequest; + using C_CompAndWriteRequestT = pwl::C_CompAndWriteRequest; + +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::pwl::ReplicatedWriteLog; + +#endif // CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG diff --git a/src/librbd/cache/pwl/Request.cc b/src/librbd/cache/pwl/Request.cc new file mode 100644 index 0000000000000..c30fb2203cbe9 --- /dev/null +++ b/src/librbd/cache/pwl/Request.cc @@ -0,0 +1,633 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Request.h" +#include "librbd/BlockGuard.h" +#include "librbd/cache/pwl/LogEntry.h" +#include "librbd/cache/pwl/ReplicatedWriteLog.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::Request: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +template +C_BlockIORequest::C_BlockIORequest(T &pwl, const utime_t arrived, io::Extents &&extents, + bufferlist&& bl, const int fadvise_flags, Context *user_req) + : pwl(pwl), image_extents(std::move(extents)), + bl(std::move(bl)), fadvise_flags(fadvise_flags), + user_req(user_req), image_extents_summary(image_extents), m_arrived_time(arrived) { + ldout(pwl.get_context(), 99) << this << dendl; +} + +template +C_BlockIORequest::~C_BlockIORequest() { + ldout(pwl.get_context(), 99) << this << dendl; + ceph_assert(m_cell_released || !m_cell); +} + +template +std::ostream &operator<<(std::ostream &os, + const C_BlockIORequest &req) { + os << "image_extents=[" << 
req.image_extents << "], " + << "image_extents_summary=[" << req.image_extents_summary << "], " + << "bl=" << req.bl << ", " + << "user_req=" << req.user_req << ", " + << "m_user_req_completed=" << req.m_user_req_completed << ", " + << "m_deferred=" << req.m_deferred << ", " + << "detained=" << req.detained << ", " + << "waited_lanes=" << req.waited_lanes << ", " + << "waited_entries=" << req.waited_entries << ", " + << "waited_buffers=" << req.waited_buffers << ""; + return os; +} + +template +void C_BlockIORequest::set_cell(BlockGuardCell *cell) { + ldout(pwl.get_context(), 20) << this << " cell=" << cell << dendl; + ceph_assert(cell); + ceph_assert(!m_cell); + m_cell = cell; +} + +template +BlockGuardCell *C_BlockIORequest::get_cell(void) { + ldout(pwl.get_context(), 20) << this << " cell=" << m_cell << dendl; + return m_cell; +} + +template +void C_BlockIORequest::release_cell() { + ldout(pwl.get_context(), 20) << this << " cell=" << m_cell << dendl; + ceph_assert(m_cell); + bool initial = false; + if (m_cell_released.compare_exchange_strong(initial, true)) { + pwl.release_guarded_request(m_cell); + } else { + ldout(pwl.get_context(), 5) << "cell " << m_cell << " already released for " << this << dendl; + } +} + +template +void C_BlockIORequest::complete_user_request(int r) { + bool initial = false; + if (m_user_req_completed.compare_exchange_strong(initial, true)) { + ldout(pwl.get_context(), 15) << this << " completing user req" << dendl; + m_user_req_completed_time = ceph_clock_now(); + user_req->complete(r); + // Set user_req as null as it is deleted + user_req = nullptr; + } else { + ldout(pwl.get_context(), 20) << this << " user req already completed" << dendl; + } +} + +template +void C_BlockIORequest::finish(int r) { + ldout(pwl.get_context(), 20) << this << dendl; + + complete_user_request(r); + bool initial = false; + if (m_finish_called.compare_exchange_strong(initial, true)) { + ldout(pwl.get_context(), 15) << this << " finishing" << dendl; + finish_req(0); + } else { + ldout(pwl.get_context(), 20) << this << " already finished" << dendl; + ceph_assert(0); + } +} + +template +void C_BlockIORequest::deferred() { + bool initial = false; + if (m_deferred.compare_exchange_strong(initial, true)) { + deferred_handler(); + } +} + +template +C_WriteRequest::C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : C_BlockIORequest(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, user_req), + m_perfcounter(perfcounter), m_lock(lock) { + ldout(pwl.get_context(), 99) << this << dendl; +} + +template +C_WriteRequest::~C_WriteRequest() { + ldout(pwl.get_context(), 99) << this << dendl; +} + +template +std::ostream &operator<<(std::ostream &os, + const C_WriteRequest &req) { + os << (C_BlockIORequest&)req + << " m_resources.allocated=" << req.m_resources.allocated; + if (req.op_set) { + os << "op_set=" << *req.op_set; + } + return os; +} + +template +void C_WriteRequest::blockguard_acquired(GuardedRequestFunctionContext &guard_ctx) { + ldout(pwl.get_context(), 20) << __func__ << " write_req=" << this << " cell=" << guard_ctx.cell << dendl; + + ceph_assert(guard_ctx.cell); + this->detained = guard_ctx.state.detained; /* overlapped */ + this->m_queued = guard_ctx.state.queued; /* queued behind at least one barrier */ + this->set_cell(guard_ctx.cell); +} + +template +void C_WriteRequest::finish_req(int r) { + ldout(pwl.get_context(), 15) << 
"write_req=" << this << " cell=" << this->get_cell() << dendl; + + /* Completed to caller by here (in finish(), which calls this) */ + utime_t now = ceph_clock_now(); + pwl.release_write_lanes(this); + ceph_assert(m_resources.allocated); + m_resources.allocated = false; + this->release_cell(); /* TODO: Consider doing this in appending state */ + update_req_stats(now); +} + +template +void C_WriteRequest::setup_buffer_resources( + uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, + uint64_t &number_lanes, uint64_t &number_log_entries, + uint64_t &number_unpublished_reserves) { + + ceph_assert(!m_resources.allocated); + + auto image_extents_size = this->image_extents.size(); + m_resources.buffers.reserve(image_extents_size); + + bytes_cached = 0; + bytes_allocated = 0; + number_lanes = image_extents_size; + number_log_entries = image_extents_size; + number_unpublished_reserves = image_extents_size; + + for (auto &extent : this->image_extents) { + m_resources.buffers.emplace_back(); + struct WriteBufferAllocation &buffer = m_resources.buffers.back(); + buffer.allocation_size = MIN_WRITE_ALLOC_SIZE; + buffer.allocated = false; + bytes_cached += extent.second; + if (extent.second > buffer.allocation_size) { + buffer.allocation_size = extent.second; + } + bytes_allocated += buffer.allocation_size; + } + bytes_dirtied = bytes_cached; +} + +template +std::shared_ptr C_WriteRequest::create_operation(uint64_t offset, uint64_t len) { + return std::make_shared(*op_set, offset, len, pwl.get_context()); +} + +template +void C_WriteRequest::setup_log_operations(DeferredContexts &on_exit) { + GenericWriteLogEntries log_entries; + { + std::lock_guard locker(m_lock); + std::shared_ptr current_sync_point = pwl.get_current_sync_point(); + if ((!pwl.get_persist_on_flush() && current_sync_point->log_entry->writes_completed) || + (current_sync_point->log_entry->writes > MAX_WRITES_PER_SYNC_POINT) || + (current_sync_point->log_entry->bytes > MAX_BYTES_PER_SYNC_POINT)) { + /* Create new sync point and persist the previous one. This sequenced + * write will bear a sync gen number shared with no already completed + * writes. A group of sequenced writes may be safely flushed concurrently + * if they all arrived before any of them completed. We'll insert one on + * an aio_flush() from the application. Here we're inserting one to cap + * the number of bytes and writes per sync point. When the application is + * not issuing flushes, we insert sync points to record some observed + * write concurrency information that enables us to safely issue >1 flush + * write (for writes observed here to have been in flight simultaneously) + * at a time in persist-on-write mode. 
+ */ + pwl.flush_new_sync_point(nullptr, on_exit); + current_sync_point = pwl.get_current_sync_point(); + } + uint64_t current_sync_gen = pwl.get_current_sync_gen(); + op_set = + make_unique(this->m_dispatched_time, + m_perfcounter, + current_sync_point, + pwl.get_persist_on_flush(), + pwl.get_context(), this); + ldout(pwl.get_context(), 20) << "write_req=" << *this << " op_set=" << op_set.get() << dendl; + ceph_assert(m_resources.allocated); + /* op_set->operations initialized differently for plain write or write same */ + auto allocation = m_resources.buffers.begin(); + uint64_t buffer_offset = 0; + for (auto &extent : this->image_extents) { + /* operation->on_write_persist connected to m_prior_log_entries_persisted Gather */ + auto operation = this->create_operation(extent.first, extent.second); + this->op_set->operations.emplace_back(operation); + + /* A WS is also a write */ + ldout(pwl.get_context(), 20) << "write_req=" << *this << " op_set=" << op_set.get() + << " operation=" << operation << dendl; + log_entries.emplace_back(operation->log_entry); + if (!op_set->persist_on_flush) { + pwl.inc_last_op_sequence_num(); + } + operation->init(true, allocation, current_sync_gen, + pwl.get_last_op_sequence_num(), this->bl, buffer_offset, op_set->persist_on_flush); + buffer_offset += operation->log_entry->write_bytes(); + ldout(pwl.get_context(), 20) << "operation=[" << *operation << "]" << dendl; + allocation++; + } + } + /* All extent ops subs created */ + op_set->extent_ops_appending->activate(); + op_set->extent_ops_persist->activate(); + + /* Write data */ + for (auto &operation : op_set->operations) { + operation->copy_bl_to_pmem_buffer(); + } + pwl.add_into_log_map(log_entries); +} + +template +bool C_WriteRequest::append_write_request(std::shared_ptr sync_point) { + std::lock_guard locker(m_lock); + auto write_req_sp = this; + if (sync_point->earlier_sync_point) { + Context *schedule_append_ctx = new LambdaContext([this, write_req_sp](int r) { + write_req_sp->schedule_append(); + }); + sync_point->earlier_sync_point->on_sync_point_appending.push_back(schedule_append_ctx); + return true; + } + return false; +} + +template +void C_WriteRequest::schedule_append() { + ceph_assert(++m_appended == 1); + if (m_do_early_flush) { + /* This caller is waiting for persist, so we'll use their thread to + * expedite it */ + pwl.flush_pmem_buffer(this->op_set->operations); + pwl.schedule_append(this->op_set->operations); + } else { + /* This is probably not still the caller's thread, so do the payload + * flushing/replicating later. */ + pwl.schedule_flush_and_append(this->op_set->operations); + } +} + +/** + * Attempts to allocate log resources for a write. Returns true if successful. + * + * Resources include 1 lane per extent, 1 log entry per extent, and the payload + * data space for each extent. + * + * Lanes are released after the write persists via release_write_lanes() + */ +template +bool C_WriteRequest::alloc_resources() { + this->allocated_time = ceph_clock_now(); + return pwl.alloc_resources(this); +} + +/** + * Takes custody of write_req. Resources must already be allocated. 
+ * + * Locking: + * Acquires lock + */ +template +void C_WriteRequest::dispatch() +{ + CephContext *cct = pwl.get_context(); + DeferredContexts on_exit; + utime_t now = ceph_clock_now(); + this->m_dispatched_time = now; + + ldout(cct, 15) << "write_req=" << this << " cell=" << this->get_cell() << dendl; + this->setup_log_operations(on_exit); + + bool append_deferred = false; + if (!op_set->persist_on_flush && + append_write_request(op_set->sync_point)) { + /* In persist-on-write mode, we defer the append of this write until the + * previous sync point is appending (meaning all the writes before it are + * persisted and that previous sync point can now appear in the + * log). Since we insert sync points in persist-on-write mode when writes + * have already completed to the current sync point, this limits us to + * one inserted sync point in flight at a time, and gives the next + * inserted sync point some time to accumulate a few writes if they + * arrive soon. Without this we can insert an absurd number of sync + * points, each with one or two writes. That uses a lot of log entries, + * and limits flushing to very few writes at a time. */ + m_do_early_flush = false; + append_deferred = true; + } else { + /* The prior sync point is done, so we'll schedule append here. If this is + * persist-on-write, and probably still the caller's thread, we'll use this + * caller's thread to perform the persist & replication of the payload + * buffer. */ + m_do_early_flush = + !(this->detained || this->m_queued || this->m_deferred || op_set->persist_on_flush); + } + if (!append_deferred) { + this->schedule_append(); + } +} + +template +C_FlushRequest::C_FlushRequest(T &pwl, const utime_t arrived, + io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, + ceph::mutex &lock, PerfCounters *perfcounter, + Context *user_req) + : C_BlockIORequest(pwl, arrived, std::move(image_extents), std::move(bl), + fadvise_flags, user_req), + m_lock(lock), m_perfcounter(perfcounter) { + ldout(pwl.get_context(), 20) << this << dendl; +} + +template +void C_FlushRequest::finish_req(int r) { + ldout(pwl.get_context(), 20) << "flush_req=" << this + << " cell=" << this->get_cell() << dendl; + /* Block guard already released */ + ceph_assert(!this->get_cell()); + + /* Completed to caller by here */ + utime_t now = ceph_clock_now(); + m_perfcounter->tinc(l_librbd_pwl_aio_flush_latency, now - this->m_arrived_time); +} + +template +bool C_FlushRequest::alloc_resources() { + ldout(pwl.get_context(), 20) << "req type=" << get_name() << " " + << "req=[" << *this << "]" << dendl; + return pwl.alloc_resources(this); +} + +template +void C_FlushRequest::dispatch() { + utime_t now = ceph_clock_now(); + ldout(pwl.get_context(), 20) << "req type=" << get_name() << " " + << "req=[" << *this << "]" << dendl; + ceph_assert(this->m_resources.allocated); + this->m_dispatched_time = now; + + op = std::make_shared(m_lock, + to_append, + now, + m_perfcounter, + pwl.get_context()); + + m_perfcounter->inc(l_librbd_pwl_log_ops, 1); + pwl.schedule_append(op); +} + +template +void C_FlushRequest::setup_buffer_resources( + uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, + uint64_t &number_lanes, uint64_t &number_log_entries, + uint64_t &number_unpublished_reserves) { + number_log_entries = 1; +} + +template +std::ostream &operator<<(std::ostream &os, + const C_FlushRequest &req) { + os << (C_BlockIORequest&)req + << " m_resources.allocated=" << req.m_resources.allocated; + return os; +} + +template 
+C_DiscardRequest::C_DiscardRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + uint32_t discard_granularity_bytes, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : C_BlockIORequest(pwl, arrived, std::move(image_extents), bufferlist(), 0, user_req), + m_discard_granularity_bytes(discard_granularity_bytes), + m_lock(lock), + m_perfcounter(perfcounter) { + ldout(pwl.get_context(), 20) << this << dendl; +} + +template +C_DiscardRequest::~C_DiscardRequest() { + ldout(pwl.get_context(), 20) << this << dendl; +} + +template +bool C_DiscardRequest::alloc_resources() { + ldout(pwl.get_context(), 20) << "req type=" << get_name() << " " + << "req=[" << *this << "]" << dendl; + return pwl.alloc_resources(this); +} + +template +void C_DiscardRequest::setup_log_operations() { + std::lock_guard locker(m_lock); + GenericWriteLogEntries log_entries; + for (auto &extent : this->image_extents) { + op = std::make_shared(pwl.get_current_sync_point(), + extent.first, + extent.second, + m_discard_granularity_bytes, + this->m_dispatched_time, + m_perfcounter, + pwl.get_context()); + log_entries.emplace_back(op->log_entry); + break; + } + uint64_t current_sync_gen = pwl.get_current_sync_gen(); + bool persist_on_flush = pwl.get_persist_on_flush(); + if (!persist_on_flush) { + pwl.inc_last_op_sequence_num(); + } + auto discard_req = this; + Context *on_write_persist = new LambdaContext( + [this, discard_req](int r) { + ldout(pwl.get_context(), 20) << "discard_req=" << discard_req + << " cell=" << discard_req->get_cell() << dendl; + ceph_assert(discard_req->get_cell()); + discard_req->complete_user_request(r); + discard_req->release_cell(); + }); + op->init(current_sync_gen, persist_on_flush, pwl.get_last_op_sequence_num(), on_write_persist); + pwl.add_into_log_map(log_entries); +} + +template +void C_DiscardRequest::dispatch() { + utime_t now = ceph_clock_now(); + ldout(pwl.get_context(), 20) << "req type=" << get_name() << " " + << "req=[" << *this << "]" << dendl; + ceph_assert(this->m_resources.allocated); + this->m_dispatched_time = now; + setup_log_operations(); + m_perfcounter->inc(l_librbd_pwl_log_ops, 1); + pwl.schedule_append(op); +} + +template +void C_DiscardRequest::setup_buffer_resources( + uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, + uint64_t &number_lanes, uint64_t &number_log_entries, + uint64_t &number_unpublished_reserves) { + number_log_entries = 1; + /* No bytes are allocated for a discard, but we count the discarded bytes + * as dirty. This means it's possible to have more bytes dirty than + * there are bytes cached or allocated. 
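+ * For example, a single 1 MiB discard extent sets bytes_dirtied to 1 MiB
+ * while bytes_cached and bytes_allocated are left untouched by this
+ * request, since no data buffer is reserved for a discard.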
*/ + for (auto &extent : this->image_extents) { + bytes_dirtied = extent.second; + break; + } +} + +template +void C_DiscardRequest::blockguard_acquired(GuardedRequestFunctionContext &guard_ctx) { + ldout(pwl.get_context(), 20) << " cell=" << guard_ctx.cell << dendl; + + ceph_assert(guard_ctx.cell); + this->detained = guard_ctx.state.detained; /* overlapped */ + this->set_cell(guard_ctx.cell); +} + +template +std::ostream &operator<<(std::ostream &os, + const C_DiscardRequest &req) { + os << (C_BlockIORequest&)req; + if (req.op) { + os << " op=[" << *req.op << "]"; + } else { + os << " op=nullptr"; + } + return os; +} + +template +C_WriteSameRequest::C_WriteSameRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : C_WriteRequest(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, lock, perfcounter, user_req) { + ldout(pwl.get_context(), 20) << this << dendl; +} + +template +C_WriteSameRequest::~C_WriteSameRequest() { + ldout(pwl.get_context(), 20) << this << dendl; +} + +template +void C_WriteSameRequest::update_req_stats(utime_t &now) { + /* Write same stats excluded from most write stats + * because the read phase will make them look like slow writes in + * those histograms. */ + ldout(pwl.get_context(), 20) << this << dendl; + utime_t comp_latency = now - this->m_arrived_time; + this->m_perfcounter->tinc(l_librbd_pwl_ws_latency, comp_latency); +} + +/* Write sames will allocate one buffer, the size of the repeating pattern */ +template +void C_WriteSameRequest::setup_buffer_resources( + uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, + uint64_t &number_lanes, uint64_t &number_log_entries, + uint64_t &number_unpublished_reserves) { + ldout(pwl.get_context(), 20) << this << dendl; + ceph_assert(this->image_extents.size() == 1); + bytes_dirtied += this->image_extents[0].second; + auto pattern_length = this->bl.length(); + this->m_resources.buffers.emplace_back(); + struct WriteBufferAllocation &buffer = this->m_resources.buffers.back(); + buffer.allocation_size = MIN_WRITE_ALLOC_SIZE; + buffer.allocated = false; + bytes_cached += pattern_length; + if (pattern_length > buffer.allocation_size) { + buffer.allocation_size = pattern_length; + } + bytes_allocated += buffer.allocation_size; +} + +template +std::shared_ptr C_WriteSameRequest::create_operation(uint64_t offset, uint64_t len) { + ceph_assert(this->image_extents.size() == 1); + return std::make_shared(*this->op_set.get(), offset, len, + this->bl.length(), pwl.get_context()); +} + +template +std::ostream &operator<<(std::ostream &os, + const C_WriteSameRequest &req) { + os << (C_WriteRequest&)req; + return os; +} + +template +C_CompAndWriteRequest::C_CompAndWriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, + int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter, + Context *user_req) + : C_WriteRequest(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, lock, perfcounter, user_req), + mismatch_offset(mismatch_offset), cmp_bl(std::move(cmp_bl)) { + ldout(pwl.get_context(), 20) << dendl; +} + +template +C_CompAndWriteRequest::~C_CompAndWriteRequest() { + ldout(pwl.get_context(), 20) << dendl; +} + +template +void C_CompAndWriteRequest::finish_req(int r) { + if (compare_succeeded) { + C_WriteRequest::finish_req(r); + } else { + utime_t now = 
ceph_clock_now(); + update_req_stats(now); + } +} + +template +void C_CompAndWriteRequest::update_req_stats(utime_t &now) { + /* Compare-and-write stats. Compare-and-write excluded from most write + * stats because the read phase will make them look like slow writes in + * those histograms. */ + if (!compare_succeeded) { + this->m_perfcounter->inc(l_librbd_pwl_cmp_fails, 1); + } + utime_t comp_latency = now - this->m_arrived_time; + this->m_perfcounter->tinc(l_librbd_pwl_cmp_latency, comp_latency); +} + +template +std::ostream &operator<<(std::ostream &os, + const C_CompAndWriteRequest &req) { + os << (C_WriteRequest&)req + << "cmp_bl=" << req.cmp_bl << ", " + << "read_bl=" << req.read_bl << ", " + << "compare_succeeded=" << req.compare_succeeded << ", " + << "mismatch_offset=" << req.mismatch_offset; + return os; +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::C_BlockIORequest >; +template class librbd::cache::pwl::C_WriteRequest >; +template class librbd::cache::pwl::C_FlushRequest >; +template class librbd::cache::pwl::C_DiscardRequest >; +template class librbd::cache::pwl::C_WriteSameRequest >; +template class librbd::cache::pwl::C_CompAndWriteRequest >; diff --git a/src/librbd/cache/pwl/Request.h b/src/librbd/cache/pwl/Request.h new file mode 100644 index 0000000000000..fd6fa71b9e235 --- /dev/null +++ b/src/librbd/cache/pwl/Request.h @@ -0,0 +1,408 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_REQUEST_H +#define CEPH_LIBRBD_CACHE_RWL_REQUEST_H + +#include "include/Context.h" +#include "librbd/cache/ImageCache.h" +#include "librbd/cache/pwl/Types.h" +#include "librbd/cache/pwl/LogOperation.h" + +namespace librbd { +class BlockGuardCell; + +namespace cache { +namespace pwl { + +class GuardedRequestFunctionContext; + +struct WriteRequestResources { + bool allocated = false; + std::vector buffers; +}; + +/** + * A request that can be deferred in a BlockGuard to sequence + * overlapping operations. + * This is the custodian of the BlockGuard cell for this IO, and the + * state information about the progress of this IO. This object lives + * until the IO is persisted in all (live) log replicas. User request + * may be completed from here before the IO persists. 
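+ * A rough sketch of that lifecycle, using the members declared below
+ * (the exact flow varies by request type):
+ *
+ *   set_cell(cell)      guard cell handed over once overlap sequencing allows
+ *   alloc_resources()   lanes, log entries and data buffers reserved
+ *   dispatch()          log operations created and scheduled for append
+ *   finish(r)           calls complete_user_request(r), then finish_req(r)
+ *   release_cell()      lets overlapping requests detained behind this one proceed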
+ */ +template +class C_BlockIORequest : public Context { +public: + T &pwl; + io::Extents image_extents; + bufferlist bl; + int fadvise_flags; + Context *user_req; /* User write request */ + ExtentsSummary image_extents_summary; + bool detained = false; /* Detained in blockguard (overlapped with a prior IO) */ + utime_t allocated_time; /* When allocation began */ + bool waited_lanes = false; /* This IO waited for free persist/replicate lanes */ + bool waited_entries = false; /* This IO waited for free log entries */ + bool waited_buffers = false; /* This IO waited for data buffers (pmemobj_reserve() failed) */ + + C_BlockIORequest(T &pwl, const utime_t arrived, io::Extents &&extents, + bufferlist&& bl, const int fadvise_flags, Context *user_req); + ~C_BlockIORequest() override; + C_BlockIORequest(const C_BlockIORequest&) = delete; + C_BlockIORequest &operator=(const C_BlockIORequest&) = delete; + + void set_cell(BlockGuardCell *cell); + BlockGuardCell *get_cell(void); + void release_cell(); + + void complete_user_request(int r); + void finish(int r); + virtual void finish_req(int r) = 0; + + virtual bool alloc_resources() = 0; + + void deferred(); + + virtual void deferred_handler() = 0; + + virtual void dispatch() = 0; + + virtual const char *get_name() const { + return "C_BlockIORequest"; + } + uint64_t get_image_extents_size() { + return image_extents.size(); + } + void set_io_waited_for_lanes(bool waited) { + waited_lanes = waited; + } + void set_io_waited_for_entries(bool waited) { + waited_entries = waited; + } + void set_io_waited_for_buffers(bool waited) { + waited_buffers = waited; + } + bool has_io_waited_for_buffers() { + return waited_buffers; + } + std::vector& get_resources_buffers() { + return m_resources.buffers; + } + + void set_allocated(bool allocated) { + if (allocated) { + m_resources.allocated = true; + } else { + m_resources.buffers.clear(); + } + } + + virtual void setup_buffer_resources( + uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, + uint64_t &number_lanes, uint64_t &number_log_entries, + uint64_t &number_unpublished_reserves) {}; + +protected: + utime_t m_arrived_time; + utime_t m_dispatched_time; /* When dispatch began */ + utime_t m_user_req_completed_time; + std::atomic m_deferred = {false}; /* Deferred because this or a prior IO had to wait for write resources */ + WriteRequestResources m_resources; + +private: + std::atomic m_user_req_completed = {false}; + std::atomic m_finish_called = {false}; + std::atomic m_cell_released = {false}; + BlockGuardCell* m_cell = nullptr; + + template + friend std::ostream &operator<<(std::ostream &os, + const C_BlockIORequest &req); +}; + +/** + * This is the custodian of the BlockGuard cell for this write. Block + * guard is not released until the write persists everywhere (this is + * how we guarantee to each log replica that they will never see + * overlapping writes). 
+ */ +template +class C_WriteRequest : public C_BlockIORequest { +public: + using C_BlockIORequest::pwl; + unique_ptr op_set = nullptr; + + C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req); + + ~C_WriteRequest() override; + + void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx); + + /* Common finish to plain write and compare-and-write (if it writes) */ + void finish_req(int r) override; + + /* Compare and write will override this */ + virtual void update_req_stats(utime_t &now) { + // TODO: Add in later PRs + } + bool alloc_resources() override; + + void deferred_handler() override { } + + void dispatch() override; + + virtual std::shared_ptr create_operation(uint64_t offset, uint64_t len); + + virtual void setup_log_operations(DeferredContexts &on_exit); + + bool append_write_request(std::shared_ptr sync_point); + + virtual void schedule_append(); + + const char *get_name() const override { + return "C_WriteRequest"; + } + +protected: + using C_BlockIORequest::m_resources; + PerfCounters *m_perfcounter = nullptr; + /* Plain writes will allocate one buffer per request extent */ + void setup_buffer_resources( + uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, + uint64_t &number_lanes, uint64_t &number_log_entries, + uint64_t &number_unpublished_reserves) override; + +private: + bool m_do_early_flush = false; + std::atomic m_appended = {0}; + bool m_queued = false; + ceph::mutex &m_lock; + template + friend std::ostream &operator<<(std::ostream &os, + const C_WriteRequest &req); +}; + +/** + * This is the custodian of the BlockGuard cell for this + * aio_flush. Block guard is released as soon as the new + * sync point (if required) is created. Subsequent IOs can + * proceed while this flush waits for prior IOs to complete + * and any required sync points to be persisted. + */ +template +class C_FlushRequest : public C_BlockIORequest { +public: + using C_BlockIORequest::pwl; + bool internal = false; + std::shared_ptr to_append; + + C_FlushRequest(T &pwl, const utime_t arrived, + io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, + ceph::mutex &lock, PerfCounters *perfcounter, + Context *user_req); + + ~C_FlushRequest() override {} + + bool alloc_resources() override; + + void dispatch() override; + + const char *get_name() const override { + return "C_FlushRequest"; + } + + void setup_buffer_resources( + uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, + uint64_t &number_lanes, uint64_t &number_log_entries, + uint64_t &number_unpublished_reserves) override; +private: + std::shared_ptr op; + ceph::mutex &m_lock; + PerfCounters *m_perfcounter = nullptr; + + void finish_req(int r) override; + void deferred_handler() override { + m_perfcounter->inc(l_librbd_pwl_aio_flush_def, 1); + } + + template + friend std::ostream &operator<<(std::ostream &os, + const C_FlushRequest &req); +}; + +/** + * This is the custodian of the BlockGuard cell for this discard. As in the + * case of write, the block guard is not released until the discard persists + * everywhere. 
+ */ +template +class C_DiscardRequest : public C_BlockIORequest { +public: + using C_BlockIORequest::pwl; + std::shared_ptr op; + + C_DiscardRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + uint32_t discard_granularity_bytes, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req); + + ~C_DiscardRequest() override; + void finish_req(int r) override {} + + bool alloc_resources() override; + + void deferred_handler() override { } + + void setup_log_operations(); + + void dispatch() override; + + void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx); + + const char *get_name() const override { + return "C_DiscardRequest"; + } + void setup_buffer_resources( + uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, + uint64_t &number_lanes, uint64_t &number_log_entries, + uint64_t &number_unpublished_reserves) override; +private: + uint32_t m_discard_granularity_bytes; + ceph::mutex &m_lock; + PerfCounters *m_perfcounter = nullptr; + template + friend std::ostream &operator<<(std::ostream &os, + const C_DiscardRequest &req); +}; + +/** + * This is the custodian of the BlockGuard cell for this write same. + * + * A writesame allocates and persists a data buffer like a write, but the + * data buffer is usually much shorter than the write same. + */ +template +class C_WriteSameRequest : public C_WriteRequest { +public: + using C_BlockIORequest::pwl; + C_WriteSameRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req); + + ~C_WriteSameRequest() override; + + void update_req_stats(utime_t &now) override; + + void setup_buffer_resources( + uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, + uint64_t &number_lanes, uint64_t &number_log_entries, + uint64_t &number_unpublished_reserves) override; + + std::shared_ptr create_operation(uint64_t offset, uint64_t len) override; + + const char *get_name() const override { + return "C_WriteSameRequest"; + } + + template + friend std::ostream &operator<<(std::ostream &os, + const C_WriteSameRequest &req); +}; + +/** + * This is the custodian of the BlockGuard cell for this compare and write. The + * block guard is acquired before the read begins to guarantee atomicity of this + * operation. If this results in a write, the block guard will be released + * when the write completes to all replicas. + */ +template +class C_CompAndWriteRequest : public C_WriteRequest { +public: + using C_BlockIORequest::pwl; + bool compare_succeeded = false; + uint64_t *mismatch_offset; + bufferlist cmp_bl; + bufferlist read_bl; + C_CompAndWriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, + int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter, + Context *user_req); + ~C_CompAndWriteRequest(); + + void finish_req(int r) override; + + void update_req_stats(utime_t &now) override; + + /* + * Compare and write doesn't implement alloc_resources(), deferred_handler(), + * or dispatch(). We use the implementation in C_WriteRequest(), and only if the + * compare phase succeeds and a write is actually performed. 
+ */ + + const char *get_name() const override { + return "C_CompAndWriteRequest"; + } + template + friend std::ostream &operator<<(std::ostream &os, + const C_CompAndWriteRequest &req); +}; + +struct BlockGuardReqState { + bool barrier = false; /* This is a barrier request */ + bool current_barrier = false; /* This is the currently active barrier */ + bool detained = false; + bool queued = false; /* Queued for barrier */ + friend std::ostream &operator<<(std::ostream &os, + const BlockGuardReqState &r) { + os << "barrier=" << r.barrier << ", " + << "current_barrier=" << r.current_barrier << ", " + << "detained=" << r.detained << ", " + << "queued=" << r.queued; + return os; + } +}; + +class GuardedRequestFunctionContext : public Context { +public: + BlockGuardCell *cell = nullptr; + BlockGuardReqState state; + GuardedRequestFunctionContext(boost::function &&callback) + : m_callback(std::move(callback)){ } + ~GuardedRequestFunctionContext(void) override { }; + GuardedRequestFunctionContext(const GuardedRequestFunctionContext&) = delete; + GuardedRequestFunctionContext &operator=(const GuardedRequestFunctionContext&) = delete; + +private: + boost::function m_callback; + void finish(int r) override { + ceph_assert(cell); + m_callback(*this); + } +}; + +class GuardedRequest { +public: + const BlockExtent block_extent; + GuardedRequestFunctionContext *guard_ctx; /* Work to do when guard on range obtained */ + + GuardedRequest(const BlockExtent block_extent, + GuardedRequestFunctionContext *on_guard_acquire, bool barrier = false) + : block_extent(block_extent), guard_ctx(on_guard_acquire) { + guard_ctx->state.barrier = barrier; + } + friend std::ostream &operator<<(std::ostream &os, + const GuardedRequest &r) { + os << "guard_ctx->state=[" << r.guard_ctx->state << "], " + << "block_extent.block_start=" << r.block_extent.block_start << ", " + << "block_extent.block_start=" << r.block_extent.block_end; + return os; + } +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_RWL_REQUEST_H diff --git a/src/librbd/cache/pwl/ShutdownRequest.cc b/src/librbd/cache/pwl/ShutdownRequest.cc new file mode 100644 index 0000000000000..d3eda226be277 --- /dev/null +++ b/src/librbd/cache/pwl/ShutdownRequest.cc @@ -0,0 +1,151 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/cache/pwl/ShutdownRequest.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/Operations.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/cache/ImageCache.h" +#include "librbd/cache/Types.h" + + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl:ShutdownRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; + +template +ShutdownRequest* ShutdownRequest::create(I &image_ctx, + Context *on_finish) { + return new ShutdownRequest(image_ctx, on_finish); +} + +template +ShutdownRequest::ShutdownRequest(I &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), + m_on_finish(create_async_context_callback(image_ctx, on_finish)), + m_error_result(0) { +} + +template +void ShutdownRequest::send() { + send_shutdown_image_cache(); +} + +template +void ShutdownRequest::send_shutdown_image_cache() { + CephContext *cct = m_image_ctx.cct; + 
ldout(cct, 10) << dendl; + + if (m_image_ctx.image_cache == nullptr) { + finish(); + return; + } + + using klass = ShutdownRequest; + Context *ctx = create_context_callback( + this); + + m_image_ctx.image_cache->shut_down(ctx); +} + +template +void ShutdownRequest::handle_shutdown_image_cache(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + if (r < 0) { + lderr(cct) << "failed to shut down the image cache: " << cpp_strerror(r) + << dendl; + save_result(r); + finish(); + return; + } else { + delete m_image_ctx.image_cache; + m_image_ctx.image_cache = nullptr; + } + send_remove_feature_bit(); +} + +template +void ShutdownRequest::send_remove_feature_bit() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + uint64_t new_features = m_image_ctx.features & ~RBD_FEATURE_DIRTY_CACHE; + uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE; + ldout(cct, 10) << "old_features=" << m_image_ctx.features + << ", new_features=" << new_features + << ", features_mask=" << features_mask + << dendl; + + int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx, m_image_ctx.header_oid, + new_features, features_mask); + m_image_ctx.features &= ~RBD_FEATURE_DIRTY_CACHE; + using klass = ShutdownRequest; + Context *ctx = create_context_callback( + this); + ctx->complete(r); +} + +template +void ShutdownRequest::handle_remove_feature_bit(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + if (r < 0) { + lderr(cct) << "failed to remove the feature bit: " << cpp_strerror(r) + << dendl; + save_result(r); + finish(); + return; + } + send_remove_image_cache_state(); +} + +template +void ShutdownRequest::send_remove_image_cache_state() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = ShutdownRequest; + Context *ctx = create_context_callback( + this); + std::shared_lock owner_lock{m_image_ctx.owner_lock}; + m_image_ctx.operations->execute_metadata_remove(IMAGE_CACHE_STATE, ctx); +} + +template +void ShutdownRequest::handle_remove_image_cache_state(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + if (r < 0) { + lderr(cct) << "failed to remove the image cache state: " << cpp_strerror(r) + << dendl; + save_result(r); + } + finish(); +} + +template +void ShutdownRequest::finish() { + m_on_finish->complete(m_error_result); + delete this; +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::ShutdownRequest; diff --git a/src/librbd/cache/pwl/ShutdownRequest.h b/src/librbd/cache/pwl/ShutdownRequest.h new file mode 100644 index 0000000000000..3cacb5317bf90 --- /dev/null +++ b/src/librbd/cache/pwl/ShutdownRequest.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H +#define CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace cache { +namespace pwl { + +template +class ImageCacheState; + +template +class ShutdownRequest { +public: + static ShutdownRequest* create(ImageCtxT &image_ctx, Context *on_finish); + + void send(); + +private: + + /** + * @verbatim + * + * Shutdown request goes through the following state machine: + * + * + * | + * v + * SHUTDOWN_IMAGE_CACHE + * | + * v + * REMOVE_IMAGE_FEATURE_BIT + * | + * v + * REMOVE_IMAGE_CACHE_STATE + * | + * v + * + * + * @endverbatim + */ + + ShutdownRequest(ImageCtxT &image_ctx, Context *on_finish); + + ImageCtxT 
&m_image_ctx; + Context *m_on_finish; + + int m_error_result; + + void send_shutdown_image_cache(); + void handle_shutdown_image_cache(int r); + + void send_remove_feature_bit(); + void handle_remove_feature_bit(int r); + + void send_remove_image_cache_state(); + void handle_remove_image_cache_state(int r); + + void finish(); + + void save_result(int result) { + if (m_error_result == 0 && result < 0) { + m_error_result = result; + } + } +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::pwl::ShutdownRequest; + +#endif // CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H diff --git a/src/librbd/cache/pwl/SyncPoint.cc b/src/librbd/cache/pwl/SyncPoint.cc new file mode 100644 index 0000000000000..8fb2f82052e63 --- /dev/null +++ b/src/librbd/cache/pwl/SyncPoint.cc @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "SyncPoint.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::SyncPoint: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +SyncPoint::SyncPoint(uint64_t sync_gen_num, CephContext *cct) + : log_entry(std::make_shared(sync_gen_num)), m_cct(cct) { + m_prior_log_entries_persisted = new C_Gather(cct, nullptr); + m_sync_point_persist = new C_Gather(cct, nullptr); + on_sync_point_appending.reserve(MAX_WRITES_PER_SYNC_POINT + 2); + on_sync_point_persisted.reserve(MAX_WRITES_PER_SYNC_POINT + 2); + ldout(m_cct, 20) << "sync point " << sync_gen_num << dendl; +} + +SyncPoint::~SyncPoint() { + ceph_assert(on_sync_point_appending.empty()); + ceph_assert(on_sync_point_persisted.empty()); + ceph_assert(!earlier_sync_point); +} + +std::ostream &operator<<(std::ostream &os, + const SyncPoint &p) { + os << "log_entry=[" << *p.log_entry << "], " + << "earlier_sync_point=" << p.earlier_sync_point << ", " + << "later_sync_point=" << p.later_sync_point << ", " + << "m_final_op_sequence_num=" << p.m_final_op_sequence_num << ", " + << "m_prior_log_entries_persisted=" << p.m_prior_log_entries_persisted << ", " + << "m_prior_log_entries_persisted_complete=" << p.m_prior_log_entries_persisted_complete << ", " + << "m_append_scheduled=" << p.m_append_scheduled << ", " + << "appending=" << p.appending << ", " + << "on_sync_point_appending=" << p.on_sync_point_appending.size() << ", " + << "on_sync_point_persisted=" << p.on_sync_point_persisted.size() << ""; + return os; +} + +void SyncPoint::persist_gather_set_finisher(Context *ctx) { + m_append_scheduled = true; + /* All prior sync points that are still in this list must already be scheduled for append */ + std::shared_ptr previous = earlier_sync_point; + while (previous) { + ceph_assert(previous->m_append_scheduled); + previous = previous->earlier_sync_point; + } + + m_sync_point_persist->set_finisher(ctx); +} + +void SyncPoint::persist_gather_activate() { + m_sync_point_persist->activate(); +} + +Context* SyncPoint::persist_gather_new_sub() { + return m_sync_point_persist->new_sub(); +} + +void SyncPoint::prior_persisted_gather_activate() { + m_prior_log_entries_persisted->activate(); +} + +Context* SyncPoint::prior_persisted_gather_new_sub() { + return m_prior_log_entries_persisted->new_sub(); +} + +void SyncPoint::prior_persisted_gather_set_finisher() { + Context *sync_point_persist_ready = persist_gather_new_sub(); + std::shared_ptr sp = shared_from_this(); + m_prior_log_entries_persisted-> + set_finisher(new 
LambdaContext([this, sp, sync_point_persist_ready](int r) {
+      ldout(m_cct, 20) << "Prior log entries persisted for sync point =["
+                       << sp << "]" << dendl;
+      sp->m_prior_log_entries_persisted_result = r;
+      sp->m_prior_log_entries_persisted_complete = true;
+      sync_point_persist_ready->complete(r);
+    }));
+}
+
+void SyncPoint::add_in_on_persisted_ctxs(Context* ctx) {
+  on_sync_point_persisted.push_back(ctx);
+}
+
+void SyncPoint::add_in_on_appending_ctxs(Context* ctx) {
+  on_sync_point_appending.push_back(ctx);
+}
+
+void SyncPoint::setup_earlier_sync_point(std::shared_ptr<SyncPoint> sync_point,
+                                         uint64_t last_op_sequence_num) {
+  earlier_sync_point = sync_point;
+  log_entry->prior_sync_point_flushed = false;
+  earlier_sync_point->log_entry->next_sync_point_entry = log_entry;
+  earlier_sync_point->later_sync_point = shared_from_this();
+  earlier_sync_point->m_final_op_sequence_num = last_op_sequence_num;
+  if (!earlier_sync_point->appending) {
+    /* Append of new sync point deferred until old sync point is appending */
+    earlier_sync_point->add_in_on_appending_ctxs(prior_persisted_gather_new_sub());
+  }
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/SyncPoint.h b/src/librbd/cache/pwl/SyncPoint.h
new file mode 100644
index 0000000000000..424e3730e8a6b
--- /dev/null
+++ b/src/librbd/cache/pwl/SyncPoint.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H
+#define CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H
+
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/cache/pwl/Types.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+class SyncPoint: public std::enable_shared_from_this<SyncPoint> {
+public:
+  std::shared_ptr<SyncPointLogEntry> log_entry;
+  /* Use lock for earlier/later links */
+  std::shared_ptr<SyncPoint> earlier_sync_point; /* NULL if earlier has completed */
+  std::shared_ptr<SyncPoint> later_sync_point;
+  bool appending = false;
+  /* Signal these when this sync point is appending to the log, and its order
+   * of appearance is guaranteed. One of these is a sub-operation of the
+   * next sync point's m_prior_log_entries_persisted Gather. */
+  std::vector<Context*> on_sync_point_appending;
+  /* Signal these when this sync point is appended and persisted. User
+   * aio_flush() calls are added to this. */
+  std::vector<Context*> on_sync_point_persisted;
+
+  SyncPoint(uint64_t sync_gen_num, CephContext *cct);
+  ~SyncPoint();
+  SyncPoint(const SyncPoint&) = delete;
+  SyncPoint &operator=(const SyncPoint&) = delete;
+  void persist_gather_activate();
+  Context* persist_gather_new_sub();
+  void persist_gather_set_finisher(Context *ctx);
+  void prior_persisted_gather_activate();
+  Context* prior_persisted_gather_new_sub();
+  void prior_persisted_gather_set_finisher();
+  void add_in_on_persisted_ctxs(Context* ctx);
+  void add_in_on_appending_ctxs(Context* ctx);
+  void setup_earlier_sync_point(std::shared_ptr<SyncPoint> sync_point,
+                                uint64_t last_op_sequence_num);
+private:
+  CephContext *m_cct;
+  bool m_append_scheduled = false;
+  uint64_t m_final_op_sequence_num = 0;
+  /* A sync point can't appear in the log until all the writes bearing
+   * it and all the prior sync points have been appended and
+   * persisted.
+   *
+   * Writes bearing this sync gen number and the prior sync point will be
+   * sub-ops of this Gather. This sync point will not be appended until all
+   * these complete to the point where their persist order is guaranteed.
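+   * prior_persisted_gather_set_finisher() chains the two Gathers: the
+   * finisher of m_prior_log_entries_persisted completes a sub of
+   * m_sync_point_persist, which is what holds this sync point's append
+   * back until the prior entries have persisted. In outline:
+   *
+   *   Context *ready = persist_gather_new_sub();
+   *   m_prior_log_entries_persisted->set_finisher(
+   *       new LambdaContext([ready](int r) { ready->complete(r); }));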
*/ + C_Gather *m_prior_log_entries_persisted; + /* The finisher for this will append the sync point to the log. The finisher + * for m_prior_log_entries_persisted will be a sub-op of this. */ + C_Gather *m_sync_point_persist; + int m_prior_log_entries_persisted_result = 0; + int m_prior_log_entries_persisted_complete = false; + friend std::ostream &operator<<(std::ostream &os, + const SyncPoint &p); +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H diff --git a/src/librbd/cache/pwl/Types.cc b/src/librbd/cache/pwl/Types.cc new file mode 100644 index 0000000000000..d19ad0ed406d7 --- /dev/null +++ b/src/librbd/cache/pwl/Types.cc @@ -0,0 +1,121 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include "Types.h" +#include "common/ceph_context.h" +#include "include/Context.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::Types: " << this << " " \ + << __func__ << ": " + +namespace librbd { + +namespace cache { + +namespace pwl { + +DeferredContexts::~DeferredContexts() { + finish_contexts(nullptr, contexts, 0); +} + +void DeferredContexts::add(Context* ctx) { + contexts.push_back(ctx); +} + +/* + * A BlockExtent identifies a range by first and last. + * + * An Extent ("image extent") identifies a range by start and length. + * + * The ImageCache interface is defined in terms of image extents, and + * requires no alignment of the beginning or end of the extent. We + * convert between image and block extents here using a "block size" + * of 1. + */ +BlockExtent convert_to_block_extent(const uint64_t offset_bytes, const uint64_t length_bytes) +{ + return BlockExtent(offset_bytes, + offset_bytes + length_bytes); +} + +BlockExtent WriteLogPmemEntry::block_extent() { + return convert_to_block_extent(image_offset_bytes, write_bytes); +} + +uint64_t WriteLogPmemEntry::get_offset_bytes() { + return image_offset_bytes; +} + +uint64_t WriteLogPmemEntry::get_write_bytes() { + return write_bytes; +} + +std::ostream& operator<<(std::ostream& os, + const WriteLogPmemEntry &entry) { + os << "entry_valid=" << (bool)entry.entry_valid << ", " + << "sync_point=" << (bool)entry.sync_point << ", " + << "sequenced=" << (bool)entry.sequenced << ", " + << "has_data=" << (bool)entry.has_data << ", " + << "discard=" << (bool)entry.discard << ", " + << "writesame=" << (bool)entry.writesame << ", " + << "sync_gen_number=" << entry.sync_gen_number << ", " + << "write_sequence_number=" << entry.write_sequence_number << ", " + << "image_offset_bytes=" << entry.image_offset_bytes << ", " + << "write_bytes=" << entry.write_bytes << ", " + << "ws_datalen=" << entry.ws_datalen << ", " + << "entry_index=" << entry.entry_index; + return os; +} + +template +ExtentsSummary::ExtentsSummary(const ExtentsType &extents) + : total_bytes(0), first_image_byte(0), last_image_byte(0) +{ + if (extents.empty()) return; + /* These extents refer to image offsets between first_image_byte + * and last_image_byte, inclusive, but we don't guarantee here + * that they address all of those bytes. There may be gaps. 
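+ * For example, summarizing the extents {4096, 512} and {0, 1024} gives
+ * first_image_byte = 0, last_image_byte = 4608 (offset + length of the
+ * extent ending highest) and total_bytes = 1536; bytes 1024..4095 are a
+ * gap covered by neither extent.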
*/ + first_image_byte = extents.front().first; + last_image_byte = first_image_byte + extents.front().second; + for (auto &extent : extents) { + /* Ignore zero length extents */ + if (extent.second) { + total_bytes += extent.second; + if (extent.first < first_image_byte) { + first_image_byte = extent.first; + } + if ((extent.first + extent.second) > last_image_byte) { + last_image_byte = extent.first + extent.second; + } + } + } +} + +io::Extent whole_volume_extent() { + return io::Extent({0, std::numeric_limits::max()}); +} + +BlockExtent block_extent(const io::Extent& image_extent) { + return convert_to_block_extent(image_extent.first, image_extent.second); +} + +Context * override_ctx(int r, Context *ctx) { + if (r < 0) { + /* Override next_ctx status with this error */ + return new LambdaContext( + [r, ctx](int _r) { + ctx->complete(r); + }); + } else { + return ctx; + } +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::ExtentsSummary; diff --git a/src/librbd/cache/pwl/Types.h b/src/librbd/cache/pwl/Types.h new file mode 100644 index 0000000000000..78a2440e51f35 --- /dev/null +++ b/src/librbd/cache/pwl/Types.h @@ -0,0 +1,312 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_TYPES_H +#define CEPH_LIBRBD_CACHE_RWL_TYPES_H + +#include +#include +#include "librbd/BlockGuard.h" +#include "librbd/io/Types.h" + +class Context; + +enum { + l_librbd_pwl_first = 26500, + + // All read requests + l_librbd_pwl_rd_req, // read requests + l_librbd_pwl_rd_bytes, // bytes read + l_librbd_pwl_rd_latency, // average req completion latency + + // Read requests completed from RWL (no misses) + l_librbd_pwl_rd_hit_req, // read requests + l_librbd_pwl_rd_hit_bytes, // bytes read + l_librbd_pwl_rd_hit_latency, // average req completion latency + + // Reed requests with hit and miss extents + l_librbd_pwl_rd_part_hit_req, // read ops + + // Per SyncPoint's LogEntry number and write bytes distribution + l_librbd_pwl_syncpoint_hist, + + // All write requests + l_librbd_pwl_wr_req, // write requests + l_librbd_pwl_wr_req_def, // write requests deferred for resources + l_librbd_pwl_wr_req_def_lanes, // write requests deferred for lanes + l_librbd_pwl_wr_req_def_log, // write requests deferred for log entries + l_librbd_pwl_wr_req_def_buf, // write requests deferred for buffer space + l_librbd_pwl_wr_req_overlap, // write requests detained for overlap + l_librbd_pwl_wr_req_queued, // write requests queued for prior barrier + l_librbd_pwl_wr_bytes, // bytes written + + // Write log operations (1 .. n per request that appends to the log) + l_librbd_pwl_log_ops, // log append ops + l_librbd_pwl_log_op_bytes, // average bytes written per log op + + /* + + Req and op average latencies to the beginning of and over various phases: + + +------------------------------+------+-------------------------------+ + | Phase | Name | Description | + +------------------------------+------+-------------------------------+ + | Arrive at RWL | arr |Arrives as a request | + +------------------------------+------+-------------------------------+ + | Allocate resources | all |time spent in block guard for | + | | |overlap sequencing occurs | + | | |before this point | + +------------------------------+------+-------------------------------+ + | Dispatch | dis |Op lifetime begins here. 
time | + | | |spent in allocation waiting for| + | | |resources occurs before this | + | | |point | + +------------------------------+------+-------------------------------+ + | Payload buffer persist and | buf |time spent queued for | + |replicate | |replication occurs before here | + +------------------------------+------+-------------------------------+ + | Payload buffer persist | bufc |bufc - buf is just the persist | + |complete | |time | + +------------------------------+------+-------------------------------+ + | Log append | app |time spent queued for append | + | | |occurs before here | + +------------------------------+------+-------------------------------+ + | Append complete | appc |appc - app is just the time | + | | |spent in the append operation | + +------------------------------+------+-------------------------------+ + | Complete | cmp |write persisted, replicated, | + | | |and globally visible | + +------------------------------+------+-------------------------------+ + + */ + + /* Request times */ + l_librbd_pwl_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard + l_librbd_pwl_req_arr_to_dis_t, // arrival to dispatch elapsed time + l_librbd_pwl_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources + l_librbd_pwl_wr_latency, // average req (persist) completion latency + l_librbd_pwl_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written + l_librbd_pwl_wr_caller_latency, // average req completion (to caller) latency + + /* Request times for requests that never waited for space*/ + l_librbd_pwl_nowait_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard + l_librbd_pwl_nowait_req_arr_to_dis_t, // arrival to dispatch elapsed time + l_librbd_pwl_nowait_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources + l_librbd_pwl_nowait_wr_latency, // average req (persist) completion latency + l_librbd_pwl_nowait_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written + l_librbd_pwl_nowait_wr_caller_latency, // average req completion (to caller) latency + + /* Log operation times */ + l_librbd_pwl_log_op_alloc_t, // elapsed time of pmemobj_reserve() + l_librbd_pwl_log_op_alloc_t_hist, // Histogram of elapsed time of pmemobj_reserve() + + l_librbd_pwl_log_op_dis_to_buf_t, // dispatch to buffer persist elapsed time + l_librbd_pwl_log_op_dis_to_app_t, // dispatch to log append elapsed time + l_librbd_pwl_log_op_dis_to_cmp_t, // dispatch to persist completion elapsed time + l_librbd_pwl_log_op_dis_to_cmp_t_hist, // Histogram of dispatch to persist completion elapsed time + + l_librbd_pwl_log_op_buf_to_app_t, // data buf persist + append wait time + l_librbd_pwl_log_op_buf_to_bufc_t,// data buf persist / replicate elapsed time + l_librbd_pwl_log_op_buf_to_bufc_t_hist,// data buf persist time vs bytes histogram + l_librbd_pwl_log_op_app_to_cmp_t, // log entry append + completion wait time + l_librbd_pwl_log_op_app_to_appc_t, // log entry append / replicate elapsed time + l_librbd_pwl_log_op_app_to_appc_t_hist, // log entry append time (vs. 
op bytes) histogram + + l_librbd_pwl_discard, + l_librbd_pwl_discard_bytes, + l_librbd_pwl_discard_latency, + + l_librbd_pwl_aio_flush, + l_librbd_pwl_aio_flush_def, + l_librbd_pwl_aio_flush_latency, + l_librbd_pwl_ws, + l_librbd_pwl_ws_bytes, // Bytes modified by write same, probably much larger than WS payload bytes + l_librbd_pwl_ws_latency, + + l_librbd_pwl_cmp, + l_librbd_pwl_cmp_bytes, + l_librbd_pwl_cmp_latency, + l_librbd_pwl_cmp_fails, + + l_librbd_pwl_flush, + l_librbd_pwl_invalidate_cache, + l_librbd_pwl_invalidate_discard_cache, + + l_librbd_pwl_append_tx_t, + l_librbd_pwl_retire_tx_t, + l_librbd_pwl_append_tx_t_hist, + l_librbd_pwl_retire_tx_t_hist, + + l_librbd_pwl_last, +}; + +namespace librbd { +namespace cache { +namespace pwl { + +class ImageExtentBuf; +typedef std::vector ImageExtentBufs; + +const int IN_FLIGHT_FLUSH_WRITE_LIMIT = 64; +const int IN_FLIGHT_FLUSH_BYTES_LIMIT = (1 * 1024 * 1024); + +/* Limit work between sync points */ +const uint64_t MAX_WRITES_PER_SYNC_POINT = 256; +const uint64_t MAX_BYTES_PER_SYNC_POINT = (1024 * 1024 * 8); + +const uint32_t MIN_WRITE_ALLOC_SIZE = 512; +const uint32_t LOG_STATS_INTERVAL_SECONDS = 5; + +/**** Write log entries ****/ +const unsigned long int MAX_ALLOC_PER_TRANSACTION = 8; +const unsigned long int MAX_FREE_PER_TRANSACTION = 1; +const unsigned int MAX_CONCURRENT_WRITES = 256; + +const uint64_t DEFAULT_POOL_SIZE = 1u<<30; +const uint64_t MIN_POOL_SIZE = DEFAULT_POOL_SIZE; +constexpr double USABLE_SIZE = (7.0 / 10); +const uint64_t BLOCK_ALLOC_OVERHEAD_BYTES = 16; +const uint8_t RWL_POOL_VERSION = 1; +const uint64_t MAX_LOG_ENTRIES = (1024 * 1024); +const double AGGRESSIVE_RETIRE_HIGH_WATER = 0.75; +const double RETIRE_HIGH_WATER = 0.50; +const double RETIRE_LOW_WATER = 0.40; +const int RETIRE_BATCH_TIME_LIMIT_MS = 250; + +/* Defer a set of Contexts until destruct/exit. Used for deferring + * work on a given thread until a required lock is dropped. */ +class DeferredContexts { +private: + std::vector contexts; +public: + ~DeferredContexts(); + void add(Context* ctx); +}; + +/* Pmem structures */ +POBJ_LAYOUT_BEGIN(rbd_pwl); +POBJ_LAYOUT_ROOT(rbd_pwl, struct WriteLogPoolRoot); +POBJ_LAYOUT_TOID(rbd_pwl, uint8_t); +POBJ_LAYOUT_TOID(rbd_pwl, struct WriteLogPmemEntry); +POBJ_LAYOUT_END(rbd_pwl); + +struct WriteLogPmemEntry { + uint64_t sync_gen_number = 0; + uint64_t write_sequence_number = 0; + uint64_t image_offset_bytes; + uint64_t write_bytes; + TOID(uint8_t) write_data; + struct { + uint8_t entry_valid :1; /* if 0, this entry is free */ + uint8_t sync_point :1; /* No data. No write sequence number. Marks sync + point for this sync gen number */ + uint8_t sequenced :1; /* write sequence number is valid */ + uint8_t has_data :1; /* write_data field is valid (else ignore) */ + uint8_t discard :1; /* has_data will be 0 if this is a discard */ + uint8_t writesame :1; /* ws_datalen indicates length of data at write_bytes */ + }; + uint32_t ws_datalen = 0; /* Length of data buffer (writesame only) */ + uint32_t entry_index = 0; /* For debug consistency check. 
Can be removed if + * we need the space */ + WriteLogPmemEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes) + : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes), + entry_valid(0), sync_point(0), sequenced(0), has_data(0), discard(0), writesame(0) { + } + BlockExtent block_extent(); + uint64_t get_offset_bytes(); + uint64_t get_write_bytes(); + bool is_sync_point() { + return sync_point; + } + bool is_discard() { + return discard; + } + bool is_writesame() { + return writesame; + } + bool is_write() { + /* Log entry is a basic write */ + return !is_sync_point() && !is_discard() && !is_writesame(); + } + bool is_writer() { + /* Log entry is any type that writes data */ + return is_write() || is_discard() || is_writesame(); + } + friend std::ostream& operator<<(std::ostream& os, + const WriteLogPmemEntry &entry); +}; + +static_assert(sizeof(WriteLogPmemEntry) == 64); + +struct WriteLogPoolRoot { + union { + struct { + uint8_t layout_version; /* Version of this structure (RWL_POOL_VERSION) */ + }; + uint64_t _u64; + } header; + TOID(struct WriteLogPmemEntry) log_entries; /* contiguous array of log entries */ + uint64_t pool_size; + uint64_t flushed_sync_gen; /* All writing entries with this or a lower + * sync gen number are flushed. */ + uint32_t block_size; /* block size */ + uint32_t num_log_entries; + uint32_t first_free_entry; /* Entry following the newest valid entry */ + uint32_t first_valid_entry; /* Index of the oldest valid entry in the log */ +}; + +struct WriteBufferAllocation { + unsigned int allocation_size = 0; + pobj_action buffer_alloc_action; + TOID(uint8_t) buffer_oid = OID_NULL; + bool allocated = false; + utime_t allocation_lat; +}; + +static inline io::Extent image_extent(const BlockExtent& block_extent) { + return io::Extent(block_extent.block_start, + block_extent.block_end - block_extent.block_start); +} + +template +class ExtentsSummary { +public: + uint64_t total_bytes; + uint64_t first_image_byte; + uint64_t last_image_byte; + explicit ExtentsSummary(const ExtentsType &extents); + friend std::ostream &operator<<(std::ostream &os, + const ExtentsSummary &s) { + os << "total_bytes=" << s.total_bytes << ", " + << "first_image_byte=" << s.first_image_byte << ", " + << "last_image_byte=" << s.last_image_byte << ""; + return os; + } + BlockExtent block_extent() { + return BlockExtent(first_image_byte, last_image_byte); + } + io::Extent image_extent() { + return librbd::cache::pwl::image_extent(block_extent()); + } +}; + +io::Extent whole_volume_extent(); + +BlockExtent block_extent(const io::Extent& image_extent); + +Context * override_ctx(int r, Context *ctx); + +class ImageExtentBuf : public io::Extent { +public: + bufferlist m_bl; + ImageExtentBuf(io::Extent extent) + : io::Extent(extent) { } + ImageExtentBuf(io::Extent extent, bufferlist bl) + : io::Extent(extent), m_bl(bl) { } +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_RWL_TYPES_H diff --git a/src/librbd/cache/rwl/ImageCacheState.cc b/src/librbd/cache/rwl/ImageCacheState.cc deleted file mode 100644 index 945e39ed82e00..0000000000000 --- a/src/librbd/cache/rwl/ImageCacheState.cc +++ /dev/null @@ -1,180 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "librbd/cache/Types.h" -#include "librbd/cache/Utils.h" -#include "librbd/cache/rwl/ImageCacheState.h" -#include "librbd/ImageCtx.h" -#include "librbd/Operations.h" -#include "common/environment.h" -#include 
"common/hostname.h" -#include "common/config_proxy.h" -#include "common/ceph_json.h" - -#undef dout_subsys -#define dout_subsys ceph_subsys_rbd_rwl -#undef dout_prefix -#define dout_prefix *_dout << "librbd::cache::rwl::ImageCacheState: " \ - << __func__ << ": " - -namespace librbd { -namespace cache { -namespace rwl { - -namespace { -bool get_json_format(const std::string& s, JSONFormattable *f) { - JSONParser p; - bool success = p.parse(s.c_str(), s.size()); - if (success) { - decode_json_obj(*f, &p); - } - return success; -} -} // namespace - -template -ImageCacheState::ImageCacheState(I *image_ctx) : m_image_ctx(image_ctx) { - ldout(image_ctx->cct, 20) << "Initialize RWL cache state with config data. " - << dendl; - - ConfigProxy &config = image_ctx->config; - host = ceph_get_short_hostname(); - path = config.get_val("rbd_rwl_path"); - size = config.get_val("rbd_rwl_size"); - log_periodic_stats = config.get_val("rbd_rwl_log_periodic_stats"); -} - -template -ImageCacheState::ImageCacheState( - I *image_ctx, JSONFormattable &f) : m_image_ctx(image_ctx) { - ldout(image_ctx->cct, 20) << "Initialize RWL cache state with data from " - << "server side"<< dendl; - - present = (bool)f["present"]; - empty = (bool)f["empty"]; - clean = (bool)f["clean"]; - host = (string)f["rwl_host"]; - path = (string)f["rwl_path"]; - uint64_t rwl_size; - std::istringstream iss(f["rwl_size"]); - iss >> rwl_size; - size = rwl_size; - - // Others from config - ConfigProxy &config = image_ctx->config; - log_periodic_stats = config.get_val("rbd_rwl_log_periodic_stats"); -} - -template -void ImageCacheState::write_image_cache_state(Context *on_finish) { - std::shared_lock owner_lock{m_image_ctx->owner_lock}; - JSONFormattable f; - ::encode_json(IMAGE_CACHE_STATE.c_str(), *this, &f); - std::ostringstream oss; - f.flush(oss); - std::string image_state_json = oss.str(); - - ldout(m_image_ctx->cct, 20) << __func__ << " Store state: " - << image_state_json << dendl; - m_image_ctx->operations->execute_metadata_set(IMAGE_CACHE_STATE, - image_state_json, on_finish); -} - -template -void ImageCacheState::clear_image_cache_state(Context *on_finish) { - std::shared_lock owner_lock{m_image_ctx->owner_lock}; - ldout(m_image_ctx->cct, 20) << __func__ << " Remove state: " << dendl; - m_image_ctx->operations->execute_metadata_remove(IMAGE_CACHE_STATE, on_finish); -} - -template -void ImageCacheState::dump(ceph::Formatter *f) const { - ::encode_json("present", present, f); - ::encode_json("empty", empty, f); - ::encode_json("clean", clean, f); - ::encode_json("cache_type", (int)get_image_cache_type(), f); - ::encode_json("rwl_host", host, f); - ::encode_json("rwl_path", path, f); - ::encode_json("rwl_size", size, f); -} - -template -ImageCacheState* ImageCacheState::get_image_cache_state( - I* image_ctx, int &r) { - std::string cache_state_str; - ImageCacheState* cache_state = nullptr; - ldout(image_ctx->cct, 20) << "image_cache_state:" << cache_state_str << dendl; - - r = 0; - bool dirty_cache = image_ctx->test_features(RBD_FEATURE_DIRTY_CACHE); - if (dirty_cache) { - cls_client::metadata_get(&image_ctx->md_ctx, image_ctx->header_oid, - IMAGE_CACHE_STATE, &cache_state_str); - } - - bool rwl_enabled = cache::util::is_rwl_enabled(*image_ctx); - bool cache_desired = rwl_enabled; - cache_desired &= !image_ctx->read_only; - cache_desired &= !image_ctx->test_features(RBD_FEATURE_MIGRATING); - cache_desired &= !image_ctx->test_features(RBD_FEATURE_JOURNALING); - cache_desired &= !image_ctx->old_format; - - if (!dirty_cache && 
!cache_desired) { - ldout(image_ctx->cct, 20) << "Do not desire to use image cache." << dendl; - } else if (dirty_cache && !cache_desired) { - lderr(image_ctx->cct) << "There's a dirty cache, but RWL cache is disabled." - << dendl; - r = -EINVAL; - }else if ((!dirty_cache || cache_state_str.empty()) && cache_desired) { - cache_state = new ImageCacheState(image_ctx); - } else { - ceph_assert(!cache_state_str.empty()); - JSONFormattable f; - bool success = get_json_format(cache_state_str, &f); - if (!success) { - lderr(image_ctx->cct) << "Failed to parse cache state: " - << cache_state_str << dendl; - r = -EINVAL; - return nullptr; - } - - bool cache_exists = (bool)f["present"]; - int cache_type = (int)f["cache_type"]; - - switch (cache_type) { - case IMAGE_CACHE_TYPE_RWL: - if (!cache_exists) { - cache_state = new ImageCacheState(image_ctx); - } else { - cache_state = new ImageCacheState(image_ctx, f); - } - break; - default: - r = -EINVAL; - } - } - return cache_state; -} - -template -bool ImageCacheState::is_valid() { - if (this->present && - (host.compare(ceph_get_short_hostname()) != 0)) { - auto cleanstring = "dirty"; - if (this->clean) { - cleanstring = "clean"; - } - lderr(m_image_ctx->cct) << "An image cache (RWL) remains on another host " - << host << " which is " << cleanstring - << ". Flush/close the image there to remove the " - << "image cache" << dendl; - return false; - } - return true; -} - -} // namespace rwl -} // namespace cache -} // namespace librbd - -template class librbd::cache::rwl::ImageCacheState; diff --git a/src/librbd/cache/rwl/ImageCacheState.h b/src/librbd/cache/rwl/ImageCacheState.h deleted file mode 100644 index 751978e76ec4f..0000000000000 --- a/src/librbd/cache/rwl/ImageCacheState.h +++ /dev/null @@ -1,62 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H -#define CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H - -#include "librbd/ImageCtx.h" -#include "librbd/cache/Types.h" -#include - -class JSONFormattable; -namespace ceph { - class Formatter; -} - -namespace librbd { -namespace cache { -namespace rwl { - -template -class ImageCacheState { -private: - ImageCtxT* m_image_ctx; -public: - bool present = true; - bool empty = true; - bool clean = true; - std::string host; - std::string path; - uint64_t size; - bool log_periodic_stats; - - ImageCacheState(ImageCtxT* image_ctx); - - ImageCacheState(ImageCtxT* image_ctx, JSONFormattable& f); - - ~ImageCacheState() {} - - ImageCacheType get_image_cache_type() const { - return IMAGE_CACHE_TYPE_RWL; - } - - - void write_image_cache_state(Context *on_finish); - - void clear_image_cache_state(Context *on_finish); - - void dump(ceph::Formatter *f) const; - - static ImageCacheState* get_image_cache_state( - ImageCtxT* image_ctx, int &r); - - bool is_valid(); -}; - -} // namespace rwl -} // namespace cache -} // namespace librbd - -extern template class librbd::cache::rwl::ImageCacheState; - -#endif // CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H diff --git a/src/librbd/cache/rwl/InitRequest.cc b/src/librbd/cache/rwl/InitRequest.cc deleted file mode 100644 index 87e372af20d34..0000000000000 --- a/src/librbd/cache/rwl/InitRequest.cc +++ /dev/null @@ -1,171 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "librbd/cache/rwl/InitRequest.h" -#include "librbd/Utils.h" -#include "common/dout.h" -#include "common/errno.h" -#include "librbd/asio/ContextWQ.h" 
- -#if defined(WITH_RBD_RWL) -#include "librbd/cache/rwl/ImageCacheState.h" -#include "librbd/cache/WriteLogCache.h" -#endif // WITH_RBD_RWL - -#include "librbd/cache/Utils.h" -#include "librbd/ImageCtx.h" - -#define dout_subsys ceph_subsys_rbd_rwl -#undef dout_prefix -#define dout_prefix *_dout << "librbd::cache::rwl:InitRequest " \ - << this << " " << __func__ << ": " - -namespace librbd { -namespace cache { -namespace rwl { - -using librbd::util::create_async_context_callback; -using librbd::util::create_context_callback; - -template -InitRequest* InitRequest::create(I &image_ctx, - Context *on_finish) { - return new InitRequest(image_ctx, on_finish); -} - -template -InitRequest::InitRequest(I &image_ctx, Context *on_finish) - : m_image_ctx(image_ctx), - m_on_finish(create_async_context_callback(image_ctx, on_finish)), - m_error_result(0) { -} - -template -void InitRequest::send() { -#if defined(WITH_RBD_RWL) - get_image_cache_state(); -#else - finish(); -#endif // WITH_RBD_RWL -} - -#if defined(WITH_RBD_RWL) -template -void InitRequest::get_image_cache_state() { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 10) << dendl; - - int r; - auto cache_state = ImageCacheState::get_image_cache_state(&m_image_ctx, r); - - if (r < 0 || !cache_state) { - save_result(r); - finish(); - return; - } else if (!cache_state->is_valid()) { - delete cache_state; - cache_state = nullptr; - lderr(cct) << "failed to get image cache state: " << cpp_strerror(r) - << dendl; - save_result(-ENOENT); - finish(); - return; - } - - auto cache_type = cache_state->get_image_cache_type(); - switch(cache_type) { - case cache::IMAGE_CACHE_TYPE_RWL: - m_image_ctx.image_cache = - new librbd::cache::WriteLogCache(m_image_ctx, - cache_state); - break; - default: - delete cache_state; - cache_state = nullptr; - save_result(-ENOENT); - finish(); - return; - } - - init_image_cache(); -} - -template -void InitRequest::init_image_cache() { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 10) << dendl; - - using klass = InitRequest; - Context *ctx = create_context_callback( - this); - m_image_ctx.image_cache->init(ctx); -} - -template -void InitRequest::handle_init_image_cache(int r) { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 10) << dendl; - - if (r < 0) { - lderr(cct) << "failed to init image cache: " << cpp_strerror(r) - << dendl; - delete m_image_ctx.image_cache; - m_image_ctx.image_cache = nullptr; - save_result(r); - finish(); - return; - } - set_feature_bit(); -} - -template -void InitRequest::set_feature_bit() { - CephContext *cct = m_image_ctx.cct; - - uint64_t new_features = m_image_ctx.features | RBD_FEATURE_DIRTY_CACHE; - uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE; - ldout(cct, 10) << "old_features=" << m_image_ctx.features - << ", new_features=" << new_features - << ", features_mask=" << features_mask - << dendl; - - int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx, - m_image_ctx.header_oid, - new_features, features_mask); - m_image_ctx.features |= RBD_FEATURE_DIRTY_CACHE; - using klass = InitRequest; - Context *ctx = create_context_callback( - this); - ctx->complete(r); -} - -template -void InitRequest::handle_set_feature_bit(int r) { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 10) << "r=" << r << dendl; - - if (r < 0) { - lderr(cct) << "failed to set feature bit: " << cpp_strerror(r) - << dendl; - save_result(r); - } else if (m_image_ctx.discard_granularity_bytes) { - ldout(cct, 1) << "RWL image cache is enabled and " - << "set discard_granularity_bytes = 0." 
<< dendl; - m_image_ctx.discard_granularity_bytes = 0; - } - finish(); -} - -#endif // WITH_RBD_RWL - -template -void InitRequest::finish() { - m_on_finish->complete(m_error_result); - delete this; -} - -} // namespace rwl -} // namespace cache -} // namespace librbd - -template class librbd::cache::rwl::InitRequest; diff --git a/src/librbd/cache/rwl/InitRequest.h b/src/librbd/cache/rwl/InitRequest.h deleted file mode 100644 index 9d18e678354db..0000000000000 --- a/src/librbd/cache/rwl/InitRequest.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H -#define CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H - -class Context; - -namespace librbd { - -class ImageCtx; - -namespace cache { -namespace rwl { - -template -class ImageCacheState; - -template -class InitRequest { -public: - static InitRequest* create(ImageCtxT &image_ctx, Context *on_finish); - - void send(); - -private: - - /** - * @verbatim - * - * Init request goes through the following state machine: - * - * - * | - * v - * GET_IMAGE_CACHE_STATE - * | - * v - * INIT_IMAGE_CACHE - * | - * v - * SET_FEATURE_BIT - * | - * v - * - * - * @endverbatim - */ - - InitRequest(ImageCtxT &image_ctx, Context *on_finish); - - ImageCtxT &m_image_ctx; - Context *m_on_finish; - - int m_error_result; - - bool is_rwl_enabled(); - - void get_image_cache_state(); - - void init_image_cache(); - void handle_init_image_cache(int r); - - void set_feature_bit(); - void handle_set_feature_bit(int r); - - void finish(); - - void save_result(int result) { - if (m_error_result == 0 && result < 0) { - m_error_result = result; - } - } -}; - -} // namespace rwl -} // namespace cache -} // namespace librbd - -extern template class librbd::cache::rwl::InitRequest; - -#endif // CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H diff --git a/src/librbd/cache/rwl/LogEntry.cc b/src/librbd/cache/rwl/LogEntry.cc deleted file mode 100644 index a4fbad4d1ead4..0000000000000 --- a/src/librbd/cache/rwl/LogEntry.cc +++ /dev/null @@ -1,228 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include -#include "LogEntry.h" -#include "librbd/cache/ImageWriteback.h" - -#define dout_subsys ceph_subsys_rbd_rwl -#undef dout_prefix -#define dout_prefix *_dout << "librbd::cache::rwl::LogEntry: " << this << " " \ - << __func__ << ": " - -namespace librbd { - -namespace cache { - -namespace rwl { - -std::ostream& GenericLogEntry::format(std::ostream &os) const { - os << "ram_entry=[" << ram_entry << "], " - << "pmem_entry=" << (void*)pmem_entry << ", " - << "log_entry_index=" << log_entry_index << ", " - << "completed=" << completed; - return os; -} - -std::ostream &operator<<(std::ostream &os, - const GenericLogEntry &entry) { - return entry.format(os); -} - -std::ostream& SyncPointLogEntry::format(std::ostream &os) const { - os << "(Sync Point) "; - GenericLogEntry::format(os); - os << ", " - << "writes=" << writes << ", " - << "bytes=" << bytes << ", " - << "writes_completed=" << writes_completed << ", " - << "writes_flushed=" << writes_flushed << ", " - << "prior_sync_point_flushed=" << prior_sync_point_flushed << ", " - << "next_sync_point_entry=" << next_sync_point_entry; - return os; -} - -std::ostream &operator<<(std::ostream &os, - const SyncPointLogEntry &entry) { - return entry.format(os); -} - -bool GenericWriteLogEntry::can_writeback() const { - return (this->completed && - (ram_entry.sequenced || - 
(sync_point_entry && - sync_point_entry->completed))); -} - -std::ostream& GenericWriteLogEntry::format(std::ostream &os) const { - GenericLogEntry::format(os); - os << ", " - << "sync_point_entry=["; - if (sync_point_entry) { - os << *sync_point_entry; - } else { - os << "nullptr"; - } - os << "], " - << "referring_map_entries=" << referring_map_entries; - return os; -} - -std::ostream &operator<<(std::ostream &os, - const GenericWriteLogEntry &entry) { - return entry.format(os); -} - -void WriteLogEntry::init(bool has_data, std::vector::iterator allocation, - uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush) { - ram_entry.has_data = 1; - ram_entry.write_data = allocation->buffer_oid; - ceph_assert(!TOID_IS_NULL(ram_entry.write_data)); - pmem_buffer = D_RW(ram_entry.write_data); - ram_entry.sync_gen_number = current_sync_gen; - if (persist_on_flush) { - /* Persist on flush. Sequence #0 is never used. */ - ram_entry.write_sequence_number = 0; - } else { - /* Persist on write */ - ram_entry.write_sequence_number = last_op_sequence_num; - ram_entry.sequenced = 1; - } - ram_entry.sync_point = 0; - ram_entry.discard = 0; -} - -void WriteLogEntry::init_pmem_bp() { - ceph_assert(!pmem_bp.have_raw()); - pmem_bp = buffer::ptr(buffer::create_static(this->write_bytes(), (char*)pmem_buffer)); -} - -void WriteLogEntry::init_pmem_bl() { - pmem_bl.clear(); - init_pmem_bp(); - ceph_assert(pmem_bp.have_raw()); - int before_bl = pmem_bp.raw_nref(); - this->init_bl(pmem_bp, pmem_bl); - int after_bl = pmem_bp.raw_nref(); - bl_refs = after_bl - before_bl; -} - -unsigned int WriteLogEntry::reader_count() const { - if (pmem_bp.have_raw()) { - return (pmem_bp.raw_nref() - bl_refs - 1); - } else { - return 0; - } -} - -/* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */ -buffer::list& WriteLogEntry::get_pmem_bl() { - if (0 == bl_refs) { - std::lock_guard locker(m_entry_bl_lock); - if (0 == bl_refs) { - init_pmem_bl(); - } - ceph_assert(0 != bl_refs); - } - return pmem_bl; -} - -/* Constructs a new bl containing copies of pmem_bp */ -void WriteLogEntry::copy_pmem_bl(bufferlist *out_bl) { - this->get_pmem_bl(); - /* pmem_bp is now initialized */ - buffer::ptr cloned_bp(pmem_bp.clone()); - out_bl->clear(); - this->init_bl(cloned_bp, *out_bl); -} - -void WriteLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback, - Context *ctx) { - /* Pass a copy of the pmem buffer to ImageWriteback (which may hang on to the bl even after flush()). 
*/ - bufferlist entry_bl; - buffer::list entry_bl_copy; - copy_pmem_bl(&entry_bl_copy); - entry_bl_copy.begin(0).copy(write_bytes(), entry_bl); - image_writeback.aio_write({{ram_entry.image_offset_bytes, ram_entry.write_bytes}}, - std::move(entry_bl), 0, ctx); -} - -std::ostream& WriteLogEntry::format(std::ostream &os) const { - os << "(Write) "; - GenericWriteLogEntry::format(os); - os << ", " - << "pmem_buffer=" << (void*)pmem_buffer << ", "; - os << "pmem_bp=" << pmem_bp << ", "; - os << "pmem_bl=" << pmem_bl << ", "; - os << "bl_refs=" << bl_refs; - return os; -} - -std::ostream &operator<<(std::ostream &os, - const WriteLogEntry &entry) { - return entry.format(os); -} - -void DiscardLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback, - Context *ctx) { - image_writeback.aio_discard(ram_entry.image_offset_bytes, ram_entry.write_bytes, - m_discard_granularity_bytes, ctx); -} - -void DiscardLogEntry::init(uint64_t current_sync_gen, bool persist_on_flush, uint64_t last_op_sequence_num) { - ram_entry.sync_gen_number = current_sync_gen; - if (persist_on_flush) { - /* Persist on flush. Sequence #0 is never used. */ - ram_entry.write_sequence_number = 0; - } else { - /* Persist on write */ - ram_entry.write_sequence_number = last_op_sequence_num; - ram_entry.sequenced = 1; - } -} - -std::ostream &DiscardLogEntry::format(std::ostream &os) const { - os << "(Discard) "; - GenericWriteLogEntry::format(os); - return os; -} - -std::ostream &operator<<(std::ostream &os, - const DiscardLogEntry &entry) { - return entry.format(os); -} - -void WriteSameLogEntry::init_bl(buffer::ptr &bp, buffer::list &bl) { - for (uint64_t i = 0; i < ram_entry.write_bytes / ram_entry.ws_datalen; i++) { - bl.append(bp); - } - int trailing_partial = ram_entry.write_bytes % ram_entry.ws_datalen; - if (trailing_partial) { - bl.append(bp, 0, trailing_partial); - } -} - -void WriteSameLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback, - Context *ctx) { - bufferlist entry_bl; - buffer::list entry_bl_copy; - copy_pmem_bl(&entry_bl_copy); - entry_bl_copy.begin(0).copy(write_bytes(), entry_bl); - image_writeback.aio_writesame(ram_entry.image_offset_bytes, ram_entry.write_bytes, - std::move(entry_bl), 0, ctx); -} - -std::ostream &WriteSameLogEntry::format(std::ostream &os) const { - os << "(WriteSame) "; - WriteLogEntry::format(os); - return os; -} - -std::ostream &operator<<(std::ostream &os, - const WriteSameLogEntry &entry) { - return entry.format(os); -} - -} // namespace rwl -} // namespace cache -} // namespace librbd diff --git a/src/librbd/cache/rwl/LogEntry.h b/src/librbd/cache/rwl/LogEntry.h deleted file mode 100644 index df34d8a95d0e0..0000000000000 --- a/src/librbd/cache/rwl/LogEntry.h +++ /dev/null @@ -1,267 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H -#define CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H - -#include "common/ceph_mutex.h" -#include "librbd/Utils.h" -#include "librbd/cache/rwl/Types.h" -#include -#include - -namespace librbd { -namespace cache { -class ImageWritebackInterface; -namespace rwl { - -class SyncPointLogEntry; -class GenericWriteLogEntry; -class WriteLogEntry; - -typedef std::list> GenericWriteLogEntries; - -class GenericLogEntry { -public: - WriteLogPmemEntry ram_entry; - WriteLogPmemEntry *pmem_entry = nullptr; - uint32_t log_entry_index = 0; - bool completed = false; - GenericLogEntry(const uint64_t image_offset_bytes = 0, const uint64_t 
write_bytes = 0) - : ram_entry(image_offset_bytes, write_bytes) { - }; - virtual ~GenericLogEntry() { }; - GenericLogEntry(const GenericLogEntry&) = delete; - GenericLogEntry &operator=(const GenericLogEntry&) = delete; - virtual bool can_writeback() const { - return false; - } - virtual bool can_retire() const { - return false; - } - virtual void set_flushed(bool flushed) { - ceph_assert(false); - } - virtual unsigned int write_bytes() const { - return 0; - }; - virtual unsigned int bytes_dirty() const { - return 0; - }; - virtual std::shared_ptr get_sync_point_entry() { - return nullptr; - } - virtual void writeback(librbd::cache::ImageWritebackInterface &image_writeback, - Context *ctx) { - ceph_assert(false); - }; - virtual std::ostream& format(std::ostream &os) const; - friend std::ostream &operator<<(std::ostream &os, - const GenericLogEntry &entry); -}; - -class SyncPointLogEntry : public GenericLogEntry { -public: - /* Writing entries using this sync gen number */ - std::atomic writes = {0}; - /* Total bytes for all writing entries using this sync gen number */ - std::atomic bytes = {0}; - /* Writing entries using this sync gen number that have completed */ - std::atomic writes_completed = {0}; - /* Writing entries using this sync gen number that have completed flushing to the writeback interface */ - std::atomic writes_flushed = {0}; - /* All writing entries using all prior sync gen numbers have been flushed */ - std::atomic prior_sync_point_flushed = {true}; - std::shared_ptr next_sync_point_entry = nullptr; - SyncPointLogEntry(const uint64_t sync_gen_number) { - ram_entry.sync_gen_number = sync_gen_number; - ram_entry.sync_point = 1; - }; - ~SyncPointLogEntry() override {}; - SyncPointLogEntry(const SyncPointLogEntry&) = delete; - SyncPointLogEntry &operator=(const SyncPointLogEntry&) = delete; - bool can_retire() const override { - return this->completed; - } - std::ostream& format(std::ostream &os) const; - friend std::ostream &operator<<(std::ostream &os, - const SyncPointLogEntry &entry); -}; - -class GenericWriteLogEntry : public GenericLogEntry { -public: - uint32_t referring_map_entries = 0; - std::shared_ptr sync_point_entry; - GenericWriteLogEntry(std::shared_ptr sync_point_entry, - const uint64_t image_offset_bytes, const uint64_t write_bytes) - : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(sync_point_entry) { } - GenericWriteLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes) - : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(nullptr) { } - ~GenericWriteLogEntry() override {}; - GenericWriteLogEntry(const GenericWriteLogEntry&) = delete; - GenericWriteLogEntry &operator=(const GenericWriteLogEntry&) = delete; - unsigned int write_bytes() const override { - /* The valid bytes in this ops data buffer. Discard and WS override. */ - return ram_entry.write_bytes; - }; - unsigned int bytes_dirty() const override { - /* The bytes in the image this op makes dirty. Discard and WS override. 
*/ - return write_bytes(); - }; - BlockExtent block_extent() { - return ram_entry.block_extent(); - } - uint32_t get_map_ref() { - return(referring_map_entries); - } - void inc_map_ref() { referring_map_entries++; } - void dec_map_ref() { referring_map_entries--; } - bool can_writeback() const override; - std::shared_ptr get_sync_point_entry() override { - return sync_point_entry; - } - virtual void copy_pmem_bl(bufferlist *out_bl) = 0; - void set_flushed(bool flushed) override { - m_flushed = flushed; - } - bool get_flushed() const { - return m_flushed; - } - std::ostream &format(std::ostream &os) const; - friend std::ostream &operator<<(std::ostream &os, - const GenericWriteLogEntry &entry); - -private: - bool m_flushed = false; /* or invalidated */ -}; - -class WriteLogEntry : public GenericWriteLogEntry { -protected: - buffer::ptr pmem_bp; - buffer::list pmem_bl; - std::atomic bl_refs = {0}; /* The refs held on pmem_bp by pmem_bl */ - /* Used in WriteLogEntry::get_pmem_bl() to syncronize between threads making entries readable */ - mutable ceph::mutex m_entry_bl_lock; - - void init_pmem_bp(); - - /* Write same will override */ - virtual void init_bl(buffer::ptr &bp, buffer::list &bl) { - bl.append(bp); - } - - void init_pmem_bl(); - -public: - uint8_t *pmem_buffer = nullptr; - WriteLogEntry(std::shared_ptr sync_point_entry, - const uint64_t image_offset_bytes, const uint64_t write_bytes) - : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes), - m_entry_bl_lock(ceph::make_mutex(util::unique_lock_name( - "librbd::cache::rwl::WriteLogEntry::m_entry_bl_lock", this))) - { } - WriteLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes) - : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes), - m_entry_bl_lock(ceph::make_mutex(util::unique_lock_name( - "librbd::cache::rwl::WriteLogEntry::m_entry_bl_lock", this))) - { } - ~WriteLogEntry() override {}; - WriteLogEntry(const WriteLogEntry&) = delete; - WriteLogEntry &operator=(const WriteLogEntry&) = delete; - void init(bool has_data, std::vector::iterator allocation, - uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush); - BlockExtent block_extent(); - unsigned int reader_count() const; - /* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */ - buffer::list &get_pmem_bl(); - /* Constructs a new bl containing copies of pmem_bp */ - void copy_pmem_bl(bufferlist *out_bl) override; - void writeback(librbd::cache::ImageWritebackInterface &image_writeback, - Context *ctx) override; - bool can_retire() const override { - return (this->completed && this->get_flushed() && (0 == reader_count())); - } - std::ostream &format(std::ostream &os) const; - friend std::ostream &operator<<(std::ostream &os, - const WriteLogEntry &entry); -}; - -class DiscardLogEntry : public GenericWriteLogEntry { -public: - DiscardLogEntry(std::shared_ptr sync_point_entry, - const uint64_t image_offset_bytes, const uint64_t write_bytes, - uint32_t discard_granularity_bytes) - : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes), - m_discard_granularity_bytes(discard_granularity_bytes) { - ram_entry.discard = 1; - }; - DiscardLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes) - : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes) { - ram_entry.discard = 1; - }; - DiscardLogEntry(const DiscardLogEntry&) = delete; - DiscardLogEntry &operator=(const DiscardLogEntry&) = delete; - unsigned int write_bytes() const override { - /* The 
valid bytes in this ops data buffer. */ - return 0; - }; - unsigned int bytes_dirty() const override { - /* The bytes in the image this op makes dirty. */ - return ram_entry.write_bytes; - }; - bool can_retire() const override { - return this->completed; - } - void copy_pmem_bl(bufferlist *out_bl) override { - ceph_assert(false); - } - void writeback(librbd::cache::ImageWritebackInterface &image_writeback, - Context *ctx) override; - void init(uint64_t current_sync_gen, bool persist_on_flush, uint64_t last_op_sequence_num); - std::ostream &format(std::ostream &os) const; - friend std::ostream &operator<<(std::ostream &os, - const DiscardLogEntry &entry); -private: - uint32_t m_discard_granularity_bytes; -}; - -class WriteSameLogEntry : public WriteLogEntry { -protected: - void init_bl(buffer::ptr &bp, buffer::list &bl) override; - -public: - WriteSameLogEntry(std::shared_ptr sync_point_entry, - const uint64_t image_offset_bytes, const uint64_t write_bytes, - const uint32_t data_length) - : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) { - ram_entry.writesame = 1; - ram_entry.ws_datalen = data_length; - }; - WriteSameLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes, - const uint32_t data_length) - : WriteLogEntry(nullptr, image_offset_bytes, write_bytes) { - ram_entry.writesame = 1; - ram_entry.ws_datalen = data_length; - }; - WriteSameLogEntry(const WriteSameLogEntry&) = delete; - WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete; - unsigned int write_bytes() const override { - /* The valid bytes in this ops data buffer. */ - return ram_entry.ws_datalen; - }; - unsigned int bytes_dirty() const override { - /* The bytes in the image this op makes dirty. */ - return ram_entry.write_bytes; - }; - void writeback(librbd::cache::ImageWritebackInterface &image_writeback, - Context *ctx) override; - std::ostream &format(std::ostream &os) const; - friend std::ostream &operator<<(std::ostream &os, - const WriteSameLogEntry &entry); -}; - -} // namespace rwl -} // namespace cache -} // namespace librbd - -#endif // CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H diff --git a/src/librbd/cache/rwl/LogMap.cc b/src/librbd/cache/rwl/LogMap.cc deleted file mode 100644 index e432da7b671af..0000000000000 --- a/src/librbd/cache/rwl/LogMap.cc +++ /dev/null @@ -1,278 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "LogMap.h" -#include "include/ceph_assert.h" -#include "librbd/Utils.h" -#include "librbd/cache/rwl/LogEntry.h" - -namespace librbd { -namespace cache { -namespace rwl { - -#define dout_subsys ceph_subsys_rbd_rwl -#undef dout_prefix -#define dout_prefix *_dout << "librbd::cache::rwl::LogMap: " << this << " " \ - << __func__ << ": " -template -std::ostream &operator<<(std::ostream &os, - LogMapEntry &e) { - os << "block_extent=" << e.block_extent << ", " - << "log_entry=[" << e.log_entry << "]"; - return os; -} - -template -LogMapEntry::LogMapEntry(const BlockExtent block_extent, - std::shared_ptr log_entry) - : block_extent(block_extent) , log_entry(log_entry) { -} - -template -LogMapEntry::LogMapEntry(std::shared_ptr log_entry) - : block_extent(log_entry->block_extent()) , log_entry(log_entry) { -} - -template -LogMap::LogMap(CephContext *cct) - : m_cct(cct), - m_lock(ceph::make_mutex(util::unique_lock_name( - "librbd::cache::rwl::LogMap::m_lock", this))) { -} - -/** - * Add a write log entry to the map. 
Subsequent queries for blocks - * within this log entry's extent will find this log entry. Portions - * of prior write log entries overlapping with this log entry will - * be replaced in the map by this log entry. - * - * The map_entries field of the log entry object will be updated to - * contain this map entry. - * - * The map_entries fields of all log entries overlapping with this - * entry will be updated to remove the regions that overlap with - * this. - */ -template -void LogMap::add_log_entry(std::shared_ptr log_entry) { - std::lock_guard locker(m_lock); - add_log_entry_locked(log_entry); -} - -template -void LogMap::add_log_entries(std::list> &log_entries) { - std::lock_guard locker(m_lock); - ldout(m_cct, 20) << dendl; - for (auto &log_entry : log_entries) { - add_log_entry_locked(log_entry); - } -} - -/** - * Remove any map entries that refer to the supplied write log - * entry. - */ -template -void LogMap::remove_log_entry(std::shared_ptr log_entry) { - std::lock_guard locker(m_lock); - remove_log_entry_locked(log_entry); -} - -template -void LogMap::remove_log_entries(std::list> &log_entries) { - std::lock_guard locker(m_lock); - ldout(m_cct, 20) << dendl; - for (auto &log_entry : log_entries) { - remove_log_entry_locked(log_entry); - } -} - -/** - * Returns the list of all write log entries that overlap the specified block - * extent. This doesn't tell you which portions of these entries overlap the - * extent, or each other. For that, use find_map_entries(). A log entry may - * appear in the list more than once, if multiple map entries refer to it - * (e.g. the middle of that write log entry has been overwritten). - */ -template -std::list> LogMap::find_log_entries(BlockExtent block_extent) { - std::lock_guard locker(m_lock); - ldout(m_cct, 20) << dendl; - return find_log_entries_locked(block_extent); -} - -/** - * Returns the list of all write log map entries that overlap the - * specified block extent. 
- */ -template -LogMapEntries LogMap::find_map_entries(BlockExtent block_extent) { - std::lock_guard locker(m_lock); - ldout(m_cct, 20) << dendl; - return find_map_entries_locked(block_extent); -} - -template -void LogMap::add_log_entry_locked(std::shared_ptr log_entry) { - LogMapEntry map_entry(log_entry); - ldout(m_cct, 20) << "block_extent=" << map_entry.block_extent - << dendl; - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - LogMapEntries overlap_entries = find_map_entries_locked(map_entry.block_extent); - for (auto &entry : overlap_entries) { - ldout(m_cct, 20) << entry << dendl; - if (map_entry.block_extent.block_start <= entry.block_extent.block_start) { - if (map_entry.block_extent.block_end >= entry.block_extent.block_end) { - ldout(m_cct, 20) << "map entry completely occluded by new log entry" << dendl; - remove_map_entry_locked(entry); - } else { - ceph_assert(map_entry.block_extent.block_end < entry.block_extent.block_end); - /* The new entry occludes the beginning of the old entry */ - BlockExtent adjusted_extent(map_entry.block_extent.block_end, - entry.block_extent.block_end); - adjust_map_entry_locked(entry, adjusted_extent); - } - } else { - if (map_entry.block_extent.block_end >= entry.block_extent.block_end) { - /* The new entry occludes the end of the old entry */ - BlockExtent adjusted_extent(entry.block_extent.block_start, - map_entry.block_extent.block_start); - adjust_map_entry_locked(entry, adjusted_extent); - } else { - /* The new entry splits the old entry */ - split_map_entry_locked(entry, map_entry.block_extent); - } - } - } - add_map_entry_locked(map_entry); -} - -template -void LogMap::remove_log_entry_locked(std::shared_ptr log_entry) { - ldout(m_cct, 20) << "*log_entry=" << *log_entry << dendl; - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - - LogMapEntries possible_hits = find_map_entries_locked(log_entry->block_extent()); - for (auto &possible_hit : possible_hits) { - if (possible_hit.log_entry == log_entry) { - /* This map entry refers to the specified log entry */ - remove_map_entry_locked(possible_hit); - } - } -} - -template -void LogMap::add_map_entry_locked(LogMapEntry &map_entry) { - ceph_assert(map_entry.log_entry); - m_block_to_log_entry_map.insert(map_entry); - map_entry.log_entry->inc_map_ref(); -} - -template -void LogMap::remove_map_entry_locked(LogMapEntry &map_entry) { - auto it = m_block_to_log_entry_map.find(map_entry); - ceph_assert(it != m_block_to_log_entry_map.end()); - - LogMapEntry erased = *it; - m_block_to_log_entry_map.erase(it); - erased.log_entry->dec_map_ref(); - if (0 == erased.log_entry->get_map_ref()) { - ldout(m_cct, 20) << "log entry has zero map entries: " << erased.log_entry << dendl; - } -} - -template -void LogMap::adjust_map_entry_locked(LogMapEntry &map_entry, BlockExtent &new_extent) { - auto it = m_block_to_log_entry_map.find(map_entry); - ceph_assert(it != m_block_to_log_entry_map.end()); - - LogMapEntry adjusted = *it; - m_block_to_log_entry_map.erase(it); - - m_block_to_log_entry_map.insert(LogMapEntry(new_extent, adjusted.log_entry)); -} - -template -void LogMap::split_map_entry_locked(LogMapEntry &map_entry, BlockExtent &removed_extent) { - auto it = m_block_to_log_entry_map.find(map_entry); - ceph_assert(it != m_block_to_log_entry_map.end()); - - LogMapEntry split = *it; - m_block_to_log_entry_map.erase(it); - - BlockExtent left_extent(split.block_extent.block_start, - removed_extent.block_start); - m_block_to_log_entry_map.insert(LogMapEntry(left_extent, split.log_entry)); - - BlockExtent 
right_extent(removed_extent.block_end, - split.block_extent.block_end); - m_block_to_log_entry_map.insert(LogMapEntry(right_extent, split.log_entry)); - - split.log_entry->inc_map_ref(); -} - -template -std::list> LogMap::find_log_entries_locked(const BlockExtent &block_extent) { - std::list> overlaps; - ldout(m_cct, 20) << "block_extent=" << block_extent << dendl; - - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - LogMapEntries map_entries = find_map_entries_locked(block_extent); - for (auto &map_entry : map_entries) { - overlaps.emplace_back(map_entry.log_entry); - } - return overlaps; -} - -/** - * TODO: Generalize this to do some arbitrary thing to each map - * extent, instead of returning a list. - */ -template -LogMapEntries LogMap::find_map_entries_locked(const BlockExtent &block_extent) { - LogMapEntries overlaps; - - ldout(m_cct, 20) << "block_extent=" << block_extent << dendl; - ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - auto p = m_block_to_log_entry_map.equal_range(LogMapEntry(block_extent)); - ldout(m_cct, 20) << "count=" << std::distance(p.first, p.second) << dendl; - for ( auto i = p.first; i != p.second; ++i ) { - LogMapEntry entry = *i; - overlaps.emplace_back(entry); - ldout(m_cct, 20) << entry << dendl; - } - return overlaps; -} - -/* We map block extents to write log entries, or portions of write log - * entries. These are both represented by a WriteLogMapEntry. When a - * GenericWriteLogEntry is added to this map, a WriteLogMapEntry is created to - * represent the entire block extent of the GenericWriteLogEntry, and the - * WriteLogMapEntry is added to the set. - * - * The set must not contain overlapping WriteLogMapEntrys. WriteLogMapEntrys - * in the set that overlap with one being added are adjusted (shrunk, split, - * or removed) before the new entry is added. - * - * This comparison works despite the ambiguity because we ensure the set - * contains no overlapping entries. This comparison works to find entries - * that overlap with a given block extent because equal_range() returns the - * first entry in which the extent doesn't end before the given extent - * starts, and the last entry for which the extent starts before the given - * extent ends (the first entry that the key is less than, and the last entry - * that is less than the key). 
- */ -template -bool LogMap::LogMapEntryCompare::operator()(const LogMapEntry &lhs, - const LogMapEntry &rhs) const { - if (lhs.block_extent.block_end <= rhs.block_extent.block_start) { - return true; - } - return false; -} - -} //namespace rwl -} //namespace cache -} //namespace librbd - -template class librbd::cache::rwl::LogMap; diff --git a/src/librbd/cache/rwl/LogMap.h b/src/librbd/cache/rwl/LogMap.h deleted file mode 100644 index fcf29d07369ec..0000000000000 --- a/src/librbd/cache/rwl/LogMap.h +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H -#define CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H - -#include "librbd/BlockGuard.h" -#include - -namespace librbd { -namespace cache { -namespace rwl { - -/** - * WriteLogMap: maps block extents to GenericWriteLogEntries - * - * A WriteLogMapEntry (based on LogMapEntry) refers to a portion of a GenericWriteLogEntry - */ -template -class LogMapEntry { -public: - BlockExtent block_extent; - std::shared_ptr log_entry; - - LogMapEntry(BlockExtent block_extent, - std::shared_ptr log_entry = nullptr); - LogMapEntry(std::shared_ptr log_entry); - - template - friend std::ostream &operator<<(std::ostream &os, - LogMapEntry &e); -}; - -template -using LogMapEntries = std::list>; - -template -class LogMap { -public: - LogMap(CephContext *cct); - LogMap(const LogMap&) = delete; - LogMap &operator=(const LogMap&) = delete; - - void add_log_entry(std::shared_ptr log_entry); - void add_log_entries(std::list> &log_entries); - void remove_log_entry(std::shared_ptr log_entry); - void remove_log_entries(std::list> &log_entries); - std::list> find_log_entries(BlockExtent block_extent); - LogMapEntries find_map_entries(BlockExtent block_extent); - -private: - void add_log_entry_locked(std::shared_ptr log_entry); - void remove_log_entry_locked(std::shared_ptr log_entry); - void add_map_entry_locked(LogMapEntry &map_entry); - void remove_map_entry_locked(LogMapEntry &map_entry); - void adjust_map_entry_locked(LogMapEntry &map_entry, BlockExtent &new_extent); - void split_map_entry_locked(LogMapEntry &map_entry, BlockExtent &removed_extent); - std::list> find_log_entries_locked(const BlockExtent &block_extent); - LogMapEntries find_map_entries_locked(const BlockExtent &block_extent); - - using LogMapEntryT = LogMapEntry; - - class LogMapEntryCompare { - public: - bool operator()(const LogMapEntryT &lhs, - const LogMapEntryT &rhs) const; - }; - - using BlockExtentToLogMapEntries = std::set; - - CephContext *m_cct; - ceph::mutex m_lock; - BlockExtentToLogMapEntries m_block_to_log_entry_map; -}; - -} //namespace rwl -} //namespace cache -} //namespace librbd - -#endif //CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H diff --git a/src/librbd/cache/rwl/LogOperation.cc b/src/librbd/cache/rwl/LogOperation.cc deleted file mode 100644 index 5376f5836e7ef..0000000000000 --- a/src/librbd/cache/rwl/LogOperation.cc +++ /dev/null @@ -1,338 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include -#include "LogOperation.h" -#include "librbd/cache/rwl/Types.h" - -#define dout_subsys ceph_subsys_rbd_rwl -#undef dout_prefix -#define dout_prefix *_dout << "librbd::cache::rwl::LogOperation: " << this << " " \ - << __func__ << ": " - -namespace librbd { - -namespace cache { - -namespace rwl { - -GenericLogOperation::GenericLogOperation(const utime_t dispatch_time, PerfCounters *perfcounter) - : m_perfcounter(perfcounter), 
dispatch_time(dispatch_time) { -} - -std::ostream& GenericLogOperation::format(std::ostream &os) const { - os << "dispatch_time=[" << dispatch_time << "], " - << "buf_persist_time=[" << buf_persist_time << "], " - << "buf_persist_comp_time=[" << buf_persist_comp_time << "], " - << "log_append_time=[" << log_append_time << "], " - << "log_append_comp_time=[" << log_append_comp_time << "], "; - return os; -} - -std::ostream &operator<<(std::ostream &os, - const GenericLogOperation &op) { - return op.format(os); -} - -SyncPointLogOperation::SyncPointLogOperation(ceph::mutex &lock, - std::shared_ptr sync_point, - const utime_t dispatch_time, - PerfCounters *perfcounter, - CephContext *cct) - : GenericLogOperation(dispatch_time, perfcounter), m_cct(cct), m_lock(lock), sync_point(sync_point) { -} - -SyncPointLogOperation::~SyncPointLogOperation() { } - -std::ostream &SyncPointLogOperation::format(std::ostream &os) const { - os << "(Sync Point) "; - GenericLogOperation::format(os); - os << ", " - << "sync_point=[" << *sync_point << "]"; - return os; -} - -std::ostream &operator<<(std::ostream &os, - const SyncPointLogOperation &op) { - return op.format(os); -} - -std::vector SyncPointLogOperation::append_sync_point() { - std::vector appending_contexts; - std::lock_guard locker(m_lock); - if (!sync_point->appending) { - sync_point->appending = true; - } - appending_contexts.swap(sync_point->on_sync_point_appending); - return appending_contexts; -} - -void SyncPointLogOperation::clear_earlier_sync_point() { - std::lock_guard locker(m_lock); - ceph_assert(sync_point->later_sync_point); - ceph_assert(sync_point->later_sync_point->earlier_sync_point == - sync_point); - sync_point->later_sync_point->earlier_sync_point = nullptr; -} - -std::vector SyncPointLogOperation::swap_on_sync_point_persisted() { - std::lock_guard locker(m_lock); - std::vector persisted_contexts; - persisted_contexts.swap(sync_point->on_sync_point_persisted); - return persisted_contexts; -} - -void SyncPointLogOperation::appending() { - ceph_assert(sync_point); - ldout(m_cct, 20) << "Sync point op=[" << *this - << "] appending" << dendl; - auto appending_contexts = append_sync_point(); - for (auto &ctx : appending_contexts) { - ctx->complete(0); - } -} - -void SyncPointLogOperation::complete(int result) { - ceph_assert(sync_point); - ldout(m_cct, 20) << "Sync point op =[" << *this - << "] completed" << dendl; - clear_earlier_sync_point(); - - /* Do append now in case completion occurred before the - * normal append callback executed, and to handle - * on_append work that was queued after the sync point - * entered the appending state. 
*/ - appending(); - auto persisted_contexts = swap_on_sync_point_persisted(); - for (auto &ctx : persisted_contexts) { - ctx->complete(result); - } -} - -GenericWriteLogOperation::GenericWriteLogOperation(std::shared_ptr sync_point, - const utime_t dispatch_time, - PerfCounters *perfcounter, - CephContext *cct) - : GenericLogOperation(dispatch_time, perfcounter), - m_lock(ceph::make_mutex(util::unique_lock_name( - "librbd::cache::rwl::GenericWriteLogOperation::m_lock", this))), - m_cct(cct), - sync_point(sync_point) { -} - -GenericWriteLogOperation::~GenericWriteLogOperation() { } - -std::ostream &GenericWriteLogOperation::format(std::ostream &os) const { - GenericLogOperation::format(os); - return os; -} - -std::ostream &operator<<(std::ostream &os, - const GenericWriteLogOperation &op) { - return op.format(os); -} - -/* Called when the write log operation is appending and its log position is guaranteed */ -void GenericWriteLogOperation::appending() { - Context *on_append = nullptr; - ldout(m_cct, 20) << __func__ << " " << this << dendl; - { - std::lock_guard locker(m_lock); - on_append = on_write_append; - on_write_append = nullptr; - } - if (on_append) { - ldout(m_cct, 20) << __func__ << " " << this << " on_append=" << on_append << dendl; - on_append->complete(0); - } -} - -/* Called when the write log operation is completed in all log replicas */ -void GenericWriteLogOperation::complete(int result) { - appending(); - Context *on_persist = nullptr; - ldout(m_cct, 20) << __func__ << " " << this << dendl; - { - std::lock_guard locker(m_lock); - on_persist = on_write_persist; - on_write_persist = nullptr; - } - if (on_persist) { - ldout(m_cct, 20) << __func__ << " " << this << " on_persist=" << on_persist << dendl; - on_persist->complete(result); - } -} - -WriteLogOperation::WriteLogOperation(WriteLogOperationSet &set, - uint64_t image_offset_bytes, uint64_t write_bytes, - CephContext *cct) - : GenericWriteLogOperation(set.sync_point, set.dispatch_time, set.perfcounter, cct), - log_entry(std::make_shared(set.sync_point->log_entry, image_offset_bytes, write_bytes)) { - on_write_append = set.extent_ops_appending->new_sub(); - on_write_persist = set.extent_ops_persist->new_sub(); - log_entry->sync_point_entry->writes++; - log_entry->sync_point_entry->bytes += write_bytes; -} - -WriteLogOperation::~WriteLogOperation() { } - -void WriteLogOperation::init(bool has_data, std::vector::iterator allocation, uint64_t current_sync_gen, - uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset, - bool persist_on_flush) { - log_entry->init(has_data, allocation, current_sync_gen, last_op_sequence_num, persist_on_flush); - buffer_alloc = &(*allocation); - bl.substr_of(write_req_bl, buffer_offset, - log_entry->write_bytes()); -} - -std::ostream &WriteLogOperation::format(std::ostream &os) const { - os << "(Write) "; - GenericWriteLogOperation::format(os); - os << ", "; - if (log_entry) { - os << "log_entry=[" << *log_entry << "], "; - } else { - os << "log_entry=nullptr, "; - } - os << "bl=[" << bl << "]," - << "buffer_alloc=" << buffer_alloc; - return os; -} - -std::ostream &operator<<(std::ostream &os, - const WriteLogOperation &op) { - return op.format(os); -} - - -void WriteLogOperation::complete(int result) { - GenericWriteLogOperation::complete(result); - m_perfcounter->tinc(l_librbd_rwl_log_op_dis_to_buf_t, buf_persist_time - dispatch_time); - utime_t buf_lat = buf_persist_comp_time - buf_persist_time; - m_perfcounter->tinc(l_librbd_rwl_log_op_buf_to_bufc_t, buf_lat); - 
m_perfcounter->hinc(l_librbd_rwl_log_op_buf_to_bufc_t_hist, buf_lat.to_nsec(), - log_entry->ram_entry.write_bytes); - m_perfcounter->tinc(l_librbd_rwl_log_op_buf_to_app_t, log_append_time - buf_persist_time); -} - -void WriteLogOperation::copy_bl_to_pmem_buffer() { - /* operation is a shared_ptr, so write_op is only good as long as operation is in scope */ - bufferlist::iterator i(&bl); - m_perfcounter->inc(l_librbd_rwl_log_op_bytes, log_entry->write_bytes()); - ldout(m_cct, 20) << bl << dendl; - i.copy((unsigned)log_entry->write_bytes(), (char*)log_entry->pmem_buffer); -} - -void WriteLogOperation::flush_pmem_buf_to_cache(PMEMobjpool *log_pool) { - buf_persist_time = ceph_clock_now(); - pmemobj_flush(log_pool, log_entry->pmem_buffer, log_entry->write_bytes()); -} - -WriteLogOperationSet::WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr sync_point, - bool persist_on_flush, CephContext *cct, Context *on_finish) - : m_cct(cct), m_on_finish(on_finish), - persist_on_flush(persist_on_flush), - dispatch_time(dispatched), - perfcounter(perfcounter), - sync_point(sync_point) { - on_ops_appending = sync_point->prior_persisted_gather_new_sub(); - on_ops_persist = nullptr; - extent_ops_persist = - new C_Gather(m_cct, - new LambdaContext( [this](int r) { - ldout(this->m_cct,20) << __func__ << " " << this << " m_extent_ops_persist completed" << dendl; - if (on_ops_persist) { - on_ops_persist->complete(r); - } - m_on_finish->complete(r); - })); - auto appending_persist_sub = extent_ops_persist->new_sub(); - extent_ops_appending = - new C_Gather(m_cct, - new LambdaContext( [this, appending_persist_sub](int r) { - ldout(this->m_cct, 20) << __func__ << " " << this << " m_extent_ops_appending completed" << dendl; - on_ops_appending->complete(r); - appending_persist_sub->complete(r); - })); -} - -WriteLogOperationSet::~WriteLogOperationSet() { } - -std::ostream &operator<<(std::ostream &os, - const WriteLogOperationSet &s) { - os << "cell=" << (void*)s.cell << ", " - << "extent_ops_appending=[" << s.extent_ops_appending << ", " - << "extent_ops_persist=[" << s.extent_ops_persist << "]"; - return os; -} - -DiscardLogOperation::DiscardLogOperation(std::shared_ptr sync_point, - const uint64_t image_offset_bytes, - const uint64_t write_bytes, - uint32_t discard_granularity_bytes, - const utime_t dispatch_time, - PerfCounters *perfcounter, - CephContext *cct) - : GenericWriteLogOperation(sync_point, dispatch_time, perfcounter, cct), - log_entry(std::make_shared(sync_point->log_entry, - image_offset_bytes, - write_bytes, - discard_granularity_bytes)) { - on_write_append = sync_point->prior_persisted_gather_new_sub(); - on_write_persist = nullptr; - log_entry->sync_point_entry->writes++; - log_entry->sync_point_entry->bytes += write_bytes; -} - -DiscardLogOperation::~DiscardLogOperation() { } - -void DiscardLogOperation::init(uint64_t current_sync_gen, bool persist_on_flush, - uint64_t last_op_sequence_num, Context *write_persist) { - log_entry->init(current_sync_gen, persist_on_flush, last_op_sequence_num); - this->on_write_persist = write_persist; -} - -std::ostream &DiscardLogOperation::format(std::ostream &os) const { - os << "(Discard) "; - GenericWriteLogOperation::format(os); - os << ", "; - if (log_entry) { - os << "log_entry=[" << *log_entry << "], "; - } else { - os << "log_entry=nullptr, "; - } - return os; -} - -std::ostream &operator<<(std::ostream &os, - const DiscardLogOperation &op) { - return op.format(os); -} - 
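/* A write-same goes down the plain write path; the constructor below simply
 * swaps log_entry for a write-same log entry that also records the length of
 * the repeating pattern (data_len). A typical construction, as done by
 * C_WriteSameRequest::create_operation():
 *
 *   auto op = std::make_shared<WriteSameLogOperation>(
 *       *op_set, image_offset_bytes, write_bytes,
 *       bl.length(),   // data_len: size of the repeating pattern
 *       cct);
 */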
-WriteSameLogOperation::WriteSameLogOperation(WriteLogOperationSet &set, - uint64_t image_offset_bytes, - uint64_t write_bytes, - uint32_t data_len, - CephContext *cct) - : WriteLogOperation(set, image_offset_bytes, write_bytes, cct) { - log_entry = - std::make_shared(set.sync_point->log_entry, image_offset_bytes, write_bytes, data_len); - ldout(m_cct, 20) << __func__ << " " << this << dendl; -} - -WriteSameLogOperation::~WriteSameLogOperation() { } - -std::ostream &WriteSameLogOperation::format(std::ostream &os) const { - os << "(Write Same) "; - WriteLogOperation::format(os); - return os; -} - -std::ostream &operator<<(std::ostream &os, - const WriteSameLogOperation &op) { - return op.format(os); -} - -} // namespace rwl -} // namespace cache -} // namespace librbd diff --git a/src/librbd/cache/rwl/LogOperation.h b/src/librbd/cache/rwl/LogOperation.h deleted file mode 100644 index ad12f6aac460f..0000000000000 --- a/src/librbd/cache/rwl/LogOperation.h +++ /dev/null @@ -1,231 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H -#define CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H - -#include "include/utime.h" -#include "librbd/cache/rwl/LogEntry.h" -#include "librbd/cache/rwl/SyncPoint.h" - -namespace librbd { -namespace cache { -namespace rwl { -struct WriteBufferAllocation; - -class WriteLogOperationSet; - -class WriteLogOperation; - -class GenericWriteLogOperation; - -class SyncPointLogOperation; - -class GenericLogOperation; - -using GenericLogOperationSharedPtr = std::shared_ptr; - -using GenericLogOperationsVector = std::vector; - -class GenericLogOperation { -protected: - PerfCounters *m_perfcounter = nullptr; -public: - utime_t dispatch_time; // When op created - utime_t buf_persist_time; // When buffer persist begins - utime_t buf_persist_comp_time; // When buffer persist completes - utime_t log_append_time; // When log append begins - utime_t log_append_comp_time; // When log append completes - GenericLogOperation(const utime_t dispatch_time, PerfCounters *perfcounter); - virtual ~GenericLogOperation() { }; - GenericLogOperation(const GenericLogOperation&) = delete; - GenericLogOperation &operator=(const GenericLogOperation&) = delete; - virtual std::ostream &format(std::ostream &os) const; - friend std::ostream &operator<<(std::ostream &os, - const GenericLogOperation &op); - virtual const std::shared_ptr get_log_entry() = 0; - virtual void appending() = 0; - virtual void complete(int r) = 0; - virtual void mark_log_entry_completed() {}; - virtual bool reserved_allocated() const { - return false; - } - virtual bool is_writing_op() const { - return false; - } - virtual void copy_bl_to_pmem_buffer() {}; - virtual void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) {}; -}; - -class SyncPointLogOperation : public GenericLogOperation { -private: - CephContext *m_cct; - ceph::mutex &m_lock; - std::vector append_sync_point(); - void clear_earlier_sync_point(); - std::vector swap_on_sync_point_persisted(); -public: - std::shared_ptr sync_point; - SyncPointLogOperation(ceph::mutex &lock, - std::shared_ptr sync_point, - const utime_t dispatch_time, - PerfCounters *perfcounter, - CephContext *cct); - ~SyncPointLogOperation() override; - SyncPointLogOperation(const SyncPointLogOperation&) = delete; - SyncPointLogOperation &operator=(const SyncPointLogOperation&) = delete; - std::ostream &format(std::ostream &os) const; - friend std::ostream &operator<<(std::ostream &os, - const 
SyncPointLogOperation &op); - const std::shared_ptr get_log_entry() override { - return sync_point->log_entry; - } - void appending() override; - void complete(int r) override; -}; - -class GenericWriteLogOperation : public GenericLogOperation { -protected: - ceph::mutex m_lock; - CephContext *m_cct; -public: - std::shared_ptr sync_point; - Context *on_write_append = nullptr; /* Completion for things waiting on this - * write's position in the log to be - * guaranteed */ - Context *on_write_persist = nullptr; /* Completion for things waiting on this - * write to persist */ - GenericWriteLogOperation(std::shared_ptr sync_point, - const utime_t dispatch_time, - PerfCounters *perfcounter, - CephContext *cct); - ~GenericWriteLogOperation() override; - GenericWriteLogOperation(const GenericWriteLogOperation&) = delete; - GenericWriteLogOperation &operator=(const GenericWriteLogOperation&) = delete; - std::ostream &format(std::ostream &os) const; - friend std::ostream &operator<<(std::ostream &os, - const GenericWriteLogOperation &op); - void mark_log_entry_completed() override{ - sync_point->log_entry->writes_completed++; - } - bool reserved_allocated() const override { - return true; - } - bool is_writing_op() const override { - return true; - } - void appending() override; - void complete(int r) override; -}; - -class WriteLogOperation : public GenericWriteLogOperation { -public: - using GenericWriteLogOperation::m_lock; - using GenericWriteLogOperation::sync_point; - using GenericWriteLogOperation::on_write_append; - using GenericWriteLogOperation::on_write_persist; - std::shared_ptr log_entry; - bufferlist bl; - WriteBufferAllocation *buffer_alloc = nullptr; - WriteLogOperation(WriteLogOperationSet &set, const uint64_t image_offset_bytes, - const uint64_t write_bytes, CephContext *cct); - ~WriteLogOperation() override; - WriteLogOperation(const WriteLogOperation&) = delete; - WriteLogOperation &operator=(const WriteLogOperation&) = delete; - void init(bool has_data, std::vector::iterator allocation, uint64_t current_sync_gen, - uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset, - bool persist_on_flush); - std::ostream &format(std::ostream &os) const; - friend std::ostream &operator<<(std::ostream &os, - const WriteLogOperation &op); - const std::shared_ptr get_log_entry() override { - return log_entry; - } - - void complete(int r) override; - void copy_bl_to_pmem_buffer() override; - void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) override; -}; - - -class WriteLogOperationSet { -private: - CephContext *m_cct; - Context *m_on_finish; -public: - bool persist_on_flush; - BlockGuardCell *cell; - C_Gather *extent_ops_appending; - Context *on_ops_appending; - C_Gather *extent_ops_persist; - Context *on_ops_persist; - GenericLogOperationsVector operations; - utime_t dispatch_time; /* When set created */ - PerfCounters *perfcounter = nullptr; - std::shared_ptr sync_point; - WriteLogOperationSet(const utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr sync_point, - const bool persist_on_flush, CephContext *cct, Context *on_finish); - ~WriteLogOperationSet(); - WriteLogOperationSet(const WriteLogOperationSet&) = delete; - WriteLogOperationSet &operator=(const WriteLogOperationSet&) = delete; - friend std::ostream &operator<<(std::ostream &os, - const WriteLogOperationSet &s); -}; - -class DiscardLogOperation : public GenericWriteLogOperation { -public: - using GenericWriteLogOperation::m_lock; - using GenericWriteLogOperation::sync_point; - using 
GenericWriteLogOperation::on_write_append; - using GenericWriteLogOperation::on_write_persist; - std::shared_ptr log_entry; - DiscardLogOperation(std::shared_ptr sync_point, - const uint64_t image_offset_bytes, - const uint64_t write_bytes, - uint32_t discard_granularity_bytes, - const utime_t dispatch_time, - PerfCounters *perfcounter, - CephContext *cct); - ~DiscardLogOperation() override; - DiscardLogOperation(const DiscardLogOperation&) = delete; - DiscardLogOperation &operator=(const DiscardLogOperation&) = delete; - const std::shared_ptr get_log_entry() override { - return log_entry; - } - bool reserved_allocated() const override { - return false; - } - void init(uint64_t current_sync_gen, bool persist_on_flush, - uint64_t last_op_sequence_num, Context *write_persist); - std::ostream &format(std::ostream &os) const; - friend std::ostream &operator<<(std::ostream &os, - const DiscardLogOperation &op); -}; - -class WriteSameLogOperation : public WriteLogOperation { -public: - using GenericWriteLogOperation::m_lock; - using GenericWriteLogOperation::sync_point; - using GenericWriteLogOperation::on_write_append; - using GenericWriteLogOperation::on_write_persist; - using WriteLogOperation::log_entry; - using WriteLogOperation::bl; - using WriteLogOperation::buffer_alloc; - WriteSameLogOperation(WriteLogOperationSet &set, - const uint64_t image_offset_bytes, - const uint64_t write_bytes, - const uint32_t data_len, - CephContext *cct); - ~WriteSameLogOperation(); - WriteSameLogOperation(const WriteSameLogOperation&) = delete; - WriteSameLogOperation &operator=(const WriteSameLogOperation&) = delete; - std::ostream &format(std::ostream &os) const; - friend std::ostream &operator<<(std::ostream &os, - const WriteSameLogOperation &op); -}; - -} // namespace rwl -} // namespace cache -} // namespace librbd - -#endif // CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H diff --git a/src/librbd/cache/rwl/ReadRequest.cc b/src/librbd/cache/rwl/ReadRequest.cc deleted file mode 100644 index d9860604f7b50..0000000000000 --- a/src/librbd/cache/rwl/ReadRequest.cc +++ /dev/null @@ -1,68 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "ReadRequest.h" - -#define dout_subsys ceph_subsys_rbd_rwl -#undef dout_prefix -#define dout_prefix *_dout << "librbd::cache::rwl::ReadRequest: " << this << " " \ - << __func__ << ": " - -namespace librbd { -namespace cache { -namespace rwl { - -void C_ReadRequest::finish(int r) { - ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << dendl; - int hits = 0; - int misses = 0; - int hit_bytes = 0; - int miss_bytes = 0; - if (r >= 0) { - /* - * At this point the miss read has completed. We'll iterate through - * read_extents and produce *m_out_bl by assembling pieces of miss_bl - * and the individual hit extent bufs in the read extents that represent - * hits. - */ - uint64_t miss_bl_offset = 0; - for (auto &extent : read_extents) { - if (extent.m_bl.length()) { - /* This was a hit */ - ceph_assert(extent.second == extent.m_bl.length()); - ++hits; - hit_bytes += extent.second; - m_out_bl->claim_append(extent.m_bl); - } else { - /* This was a miss. 
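         The data for this extent is carved out of the miss bufferlist below.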
*/ - ++misses; - miss_bytes += extent.second; - bufferlist miss_extent_bl; - miss_extent_bl.substr_of(miss_bl, miss_bl_offset, extent.second); - /* Add this read miss bufferlist to the output bufferlist */ - m_out_bl->claim_append(miss_extent_bl); - /* Consume these bytes in the read miss bufferlist */ - miss_bl_offset += extent.second; - } - } - } - ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << " bl=" << *m_out_bl << dendl; - utime_t now = ceph_clock_now(); - ceph_assert((int)m_out_bl->length() == hit_bytes + miss_bytes); - m_on_finish->complete(r); - m_perfcounter->inc(l_librbd_rwl_rd_bytes, hit_bytes + miss_bytes); - m_perfcounter->inc(l_librbd_rwl_rd_hit_bytes, hit_bytes); - m_perfcounter->tinc(l_librbd_rwl_rd_latency, now - m_arrived_time); - if (!misses) { - m_perfcounter->inc(l_librbd_rwl_rd_hit_req, 1); - m_perfcounter->tinc(l_librbd_rwl_rd_hit_latency, now - m_arrived_time); - } else { - if (hits) { - m_perfcounter->inc(l_librbd_rwl_rd_part_hit_req, 1); - } - } -} - -} // namespace rwl -} // namespace cache -} // namespace librbd diff --git a/src/librbd/cache/rwl/ReadRequest.h b/src/librbd/cache/rwl/ReadRequest.h deleted file mode 100644 index 9daf7d10499b9..0000000000000 --- a/src/librbd/cache/rwl/ReadRequest.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H -#define CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H - -#include "include/Context.h" -#include "librbd/cache/rwl/Types.h" - -namespace librbd { -namespace cache { -namespace rwl { - -typedef std::vector ImageExtentBufs; - -class C_ReadRequest : public Context { -public: - io::Extents miss_extents; // move back to caller - ImageExtentBufs read_extents; - bufferlist miss_bl; - - C_ReadRequest(CephContext *cct, utime_t arrived, PerfCounters *perfcounter, bufferlist *out_bl, Context *on_finish) - : m_cct(cct), m_on_finish(on_finish), m_out_bl(out_bl), - m_arrived_time(arrived), m_perfcounter(perfcounter) {} - ~C_ReadRequest() {} - - void finish(int r) override; - - const char *get_name() const { - return "C_ReadRequest"; - } - -private: - CephContext *m_cct; - Context *m_on_finish; - bufferlist *m_out_bl; - utime_t m_arrived_time; - PerfCounters *m_perfcounter; -}; - -} // namespace rwl -} // namespace cache -} // namespace librbd - -#endif // CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H diff --git a/src/librbd/cache/rwl/Request.cc b/src/librbd/cache/rwl/Request.cc deleted file mode 100644 index 684883985d880..0000000000000 --- a/src/librbd/cache/rwl/Request.cc +++ /dev/null @@ -1,633 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "Request.h" -#include "librbd/BlockGuard.h" -#include "librbd/cache/rwl/LogEntry.h" -#include "librbd/cache/ReplicatedWriteLog.h" - -#define dout_subsys ceph_subsys_rbd_rwl -#undef dout_prefix -#define dout_prefix *_dout << "librbd::cache::rwl::Request: " << this << " " \ - << __func__ << ": " - -namespace librbd { -namespace cache { -namespace rwl { - -template -C_BlockIORequest::C_BlockIORequest(T &rwl, const utime_t arrived, io::Extents &&extents, - bufferlist&& bl, const int fadvise_flags, Context *user_req) - : rwl(rwl), image_extents(std::move(extents)), - bl(std::move(bl)), fadvise_flags(fadvise_flags), - user_req(user_req), image_extents_summary(image_extents), m_arrived_time(arrived) { - ldout(rwl.get_context(), 99) << this << dendl; -} - -template -C_BlockIORequest::~C_BlockIORequest() { - 
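  // By destruction time the block guard cell must have been released (or never acquired).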
ldout(rwl.get_context(), 99) << this << dendl; - ceph_assert(m_cell_released || !m_cell); -} - -template -std::ostream &operator<<(std::ostream &os, - const C_BlockIORequest &req) { - os << "image_extents=[" << req.image_extents << "], " - << "image_extents_summary=[" << req.image_extents_summary << "], " - << "bl=" << req.bl << ", " - << "user_req=" << req.user_req << ", " - << "m_user_req_completed=" << req.m_user_req_completed << ", " - << "m_deferred=" << req.m_deferred << ", " - << "detained=" << req.detained << ", " - << "waited_lanes=" << req.waited_lanes << ", " - << "waited_entries=" << req.waited_entries << ", " - << "waited_buffers=" << req.waited_buffers << ""; - return os; -} - -template -void C_BlockIORequest::set_cell(BlockGuardCell *cell) { - ldout(rwl.get_context(), 20) << this << " cell=" << cell << dendl; - ceph_assert(cell); - ceph_assert(!m_cell); - m_cell = cell; -} - -template -BlockGuardCell *C_BlockIORequest::get_cell(void) { - ldout(rwl.get_context(), 20) << this << " cell=" << m_cell << dendl; - return m_cell; -} - -template -void C_BlockIORequest::release_cell() { - ldout(rwl.get_context(), 20) << this << " cell=" << m_cell << dendl; - ceph_assert(m_cell); - bool initial = false; - if (m_cell_released.compare_exchange_strong(initial, true)) { - rwl.release_guarded_request(m_cell); - } else { - ldout(rwl.get_context(), 5) << "cell " << m_cell << " already released for " << this << dendl; - } -} - -template -void C_BlockIORequest::complete_user_request(int r) { - bool initial = false; - if (m_user_req_completed.compare_exchange_strong(initial, true)) { - ldout(rwl.get_context(), 15) << this << " completing user req" << dendl; - m_user_req_completed_time = ceph_clock_now(); - user_req->complete(r); - // Set user_req as null as it is deleted - user_req = nullptr; - } else { - ldout(rwl.get_context(), 20) << this << " user req already completed" << dendl; - } -} - -template -void C_BlockIORequest::finish(int r) { - ldout(rwl.get_context(), 20) << this << dendl; - - complete_user_request(r); - bool initial = false; - if (m_finish_called.compare_exchange_strong(initial, true)) { - ldout(rwl.get_context(), 15) << this << " finishing" << dendl; - finish_req(0); - } else { - ldout(rwl.get_context(), 20) << this << " already finished" << dendl; - ceph_assert(0); - } -} - -template -void C_BlockIORequest::deferred() { - bool initial = false; - if (m_deferred.compare_exchange_strong(initial, true)) { - deferred_handler(); - } -} - -template -C_WriteRequest::C_WriteRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents, - bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, - PerfCounters *perfcounter, Context *user_req) - : C_BlockIORequest(rwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, user_req), - m_perfcounter(perfcounter), m_lock(lock) { - ldout(rwl.get_context(), 99) << this << dendl; -} - -template -C_WriteRequest::~C_WriteRequest() { - ldout(rwl.get_context(), 99) << this << dendl; -} - -template -std::ostream &operator<<(std::ostream &os, - const C_WriteRequest &req) { - os << (C_BlockIORequest&)req - << " m_resources.allocated=" << req.m_resources.allocated; - if (req.op_set) { - os << "op_set=" << *req.op_set; - } - return os; -} - -template -void C_WriteRequest::blockguard_acquired(GuardedRequestFunctionContext &guard_ctx) { - ldout(rwl.get_context(), 20) << __func__ << " write_req=" << this << " cell=" << guard_ctx.cell << dendl; - - ceph_assert(guard_ctx.cell); - this->detained = guard_ctx.state.detained; /* overlapped 
*/ - this->m_queued = guard_ctx.state.queued; /* queued behind at least one barrier */ - this->set_cell(guard_ctx.cell); -} - -template -void C_WriteRequest::finish_req(int r) { - ldout(rwl.get_context(), 15) << "write_req=" << this << " cell=" << this->get_cell() << dendl; - - /* Completed to caller by here (in finish(), which calls this) */ - utime_t now = ceph_clock_now(); - rwl.release_write_lanes(this); - ceph_assert(m_resources.allocated); - m_resources.allocated = false; - this->release_cell(); /* TODO: Consider doing this in appending state */ - update_req_stats(now); -} - -template -void C_WriteRequest::setup_buffer_resources( - uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, - uint64_t &number_lanes, uint64_t &number_log_entries, - uint64_t &number_unpublished_reserves) { - - ceph_assert(!m_resources.allocated); - - auto image_extents_size = this->image_extents.size(); - m_resources.buffers.reserve(image_extents_size); - - bytes_cached = 0; - bytes_allocated = 0; - number_lanes = image_extents_size; - number_log_entries = image_extents_size; - number_unpublished_reserves = image_extents_size; - - for (auto &extent : this->image_extents) { - m_resources.buffers.emplace_back(); - struct WriteBufferAllocation &buffer = m_resources.buffers.back(); - buffer.allocation_size = MIN_WRITE_ALLOC_SIZE; - buffer.allocated = false; - bytes_cached += extent.second; - if (extent.second > buffer.allocation_size) { - buffer.allocation_size = extent.second; - } - bytes_allocated += buffer.allocation_size; - } - bytes_dirtied = bytes_cached; -} - -template -std::shared_ptr C_WriteRequest::create_operation(uint64_t offset, uint64_t len) { - return std::make_shared(*op_set, offset, len, rwl.get_context()); -} - -template -void C_WriteRequest::setup_log_operations(DeferredContexts &on_exit) { - GenericWriteLogEntries log_entries; - { - std::lock_guard locker(m_lock); - std::shared_ptr current_sync_point = rwl.get_current_sync_point(); - if ((!rwl.get_persist_on_flush() && current_sync_point->log_entry->writes_completed) || - (current_sync_point->log_entry->writes > MAX_WRITES_PER_SYNC_POINT) || - (current_sync_point->log_entry->bytes > MAX_BYTES_PER_SYNC_POINT)) { - /* Create new sync point and persist the previous one. This sequenced - * write will bear a sync gen number shared with no already completed - * writes. A group of sequenced writes may be safely flushed concurrently - * if they all arrived before any of them completed. We'll insert one on - * an aio_flush() from the application. Here we're inserting one to cap - * the number of bytes and writes per sync point. When the application is - * not issuing flushes, we insert sync points to record some observed - * write concurrency information that enables us to safely issue >1 flush - * write (for writes observed here to have been in flight simultaneously) - * at a time in persist-on-write mode. 
- */ - rwl.flush_new_sync_point(nullptr, on_exit); - current_sync_point = rwl.get_current_sync_point(); - } - uint64_t current_sync_gen = rwl.get_current_sync_gen(); - op_set = - make_unique(this->m_dispatched_time, - m_perfcounter, - current_sync_point, - rwl.get_persist_on_flush(), - rwl.get_context(), this); - ldout(rwl.get_context(), 20) << "write_req=" << *this << " op_set=" << op_set.get() << dendl; - ceph_assert(m_resources.allocated); - /* op_set->operations initialized differently for plain write or write same */ - auto allocation = m_resources.buffers.begin(); - uint64_t buffer_offset = 0; - for (auto &extent : this->image_extents) { - /* operation->on_write_persist connected to m_prior_log_entries_persisted Gather */ - auto operation = this->create_operation(extent.first, extent.second); - this->op_set->operations.emplace_back(operation); - - /* A WS is also a write */ - ldout(rwl.get_context(), 20) << "write_req=" << *this << " op_set=" << op_set.get() - << " operation=" << operation << dendl; - log_entries.emplace_back(operation->log_entry); - if (!op_set->persist_on_flush) { - rwl.inc_last_op_sequence_num(); - } - operation->init(true, allocation, current_sync_gen, - rwl.get_last_op_sequence_num(), this->bl, buffer_offset, op_set->persist_on_flush); - buffer_offset += operation->log_entry->write_bytes(); - ldout(rwl.get_context(), 20) << "operation=[" << *operation << "]" << dendl; - allocation++; - } - } - /* All extent ops subs created */ - op_set->extent_ops_appending->activate(); - op_set->extent_ops_persist->activate(); - - /* Write data */ - for (auto &operation : op_set->operations) { - operation->copy_bl_to_pmem_buffer(); - } - rwl.add_into_log_map(log_entries); -} - -template -bool C_WriteRequest::append_write_request(std::shared_ptr sync_point) { - std::lock_guard locker(m_lock); - auto write_req_sp = this; - if (sync_point->earlier_sync_point) { - Context *schedule_append_ctx = new LambdaContext([this, write_req_sp](int r) { - write_req_sp->schedule_append(); - }); - sync_point->earlier_sync_point->on_sync_point_appending.push_back(schedule_append_ctx); - return true; - } - return false; -} - -template -void C_WriteRequest::schedule_append() { - ceph_assert(++m_appended == 1); - if (m_do_early_flush) { - /* This caller is waiting for persist, so we'll use their thread to - * expedite it */ - rwl.flush_pmem_buffer(this->op_set->operations); - rwl.schedule_append(this->op_set->operations); - } else { - /* This is probably not still the caller's thread, so do the payload - * flushing/replicating later. */ - rwl.schedule_flush_and_append(this->op_set->operations); - } -} - -/** - * Attempts to allocate log resources for a write. Returns true if successful. - * - * Resources include 1 lane per extent, 1 log entry per extent, and the payload - * data space for each extent. - * - * Lanes are released after the write persists via release_write_lanes() - */ -template -bool C_WriteRequest::alloc_resources() { - this->allocated_time = ceph_clock_now(); - return rwl.alloc_resources(this); -} - -/** - * Takes custody of write_req. Resources must already be allocated. 
- * - * Locking: - * Acquires lock - */ -template -void C_WriteRequest::dispatch() -{ - CephContext *cct = rwl.get_context(); - DeferredContexts on_exit; - utime_t now = ceph_clock_now(); - this->m_dispatched_time = now; - - ldout(cct, 15) << "write_req=" << this << " cell=" << this->get_cell() << dendl; - this->setup_log_operations(on_exit); - - bool append_deferred = false; - if (!op_set->persist_on_flush && - append_write_request(op_set->sync_point)) { - /* In persist-on-write mode, we defer the append of this write until the - * previous sync point is appending (meaning all the writes before it are - * persisted and that previous sync point can now appear in the - * log). Since we insert sync points in persist-on-write mode when writes - * have already completed to the current sync point, this limits us to - * one inserted sync point in flight at a time, and gives the next - * inserted sync point some time to accumulate a few writes if they - * arrive soon. Without this we can insert an absurd number of sync - * points, each with one or two writes. That uses a lot of log entries, - * and limits flushing to very few writes at a time. */ - m_do_early_flush = false; - append_deferred = true; - } else { - /* The prior sync point is done, so we'll schedule append here. If this is - * persist-on-write, and probably still the caller's thread, we'll use this - * caller's thread to perform the persist & replication of the payload - * buffer. */ - m_do_early_flush = - !(this->detained || this->m_queued || this->m_deferred || op_set->persist_on_flush); - } - if (!append_deferred) { - this->schedule_append(); - } -} - -template -C_FlushRequest::C_FlushRequest(T &rwl, const utime_t arrived, - io::Extents &&image_extents, - bufferlist&& bl, const int fadvise_flags, - ceph::mutex &lock, PerfCounters *perfcounter, - Context *user_req) - : C_BlockIORequest(rwl, arrived, std::move(image_extents), std::move(bl), - fadvise_flags, user_req), - m_lock(lock), m_perfcounter(perfcounter) { - ldout(rwl.get_context(), 20) << this << dendl; -} - -template -void C_FlushRequest::finish_req(int r) { - ldout(rwl.get_context(), 20) << "flush_req=" << this - << " cell=" << this->get_cell() << dendl; - /* Block guard already released */ - ceph_assert(!this->get_cell()); - - /* Completed to caller by here */ - utime_t now = ceph_clock_now(); - m_perfcounter->tinc(l_librbd_rwl_aio_flush_latency, now - this->m_arrived_time); -} - -template -bool C_FlushRequest::alloc_resources() { - ldout(rwl.get_context(), 20) << "req type=" << get_name() << " " - << "req=[" << *this << "]" << dendl; - return rwl.alloc_resources(this); -} - -template -void C_FlushRequest::dispatch() { - utime_t now = ceph_clock_now(); - ldout(rwl.get_context(), 20) << "req type=" << get_name() << " " - << "req=[" << *this << "]" << dendl; - ceph_assert(this->m_resources.allocated); - this->m_dispatched_time = now; - - op = std::make_shared(m_lock, - to_append, - now, - m_perfcounter, - rwl.get_context()); - - m_perfcounter->inc(l_librbd_rwl_log_ops, 1); - rwl.schedule_append(op); -} - -template -void C_FlushRequest::setup_buffer_resources( - uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, - uint64_t &number_lanes, uint64_t &number_log_entries, - uint64_t &number_unpublished_reserves) { - number_log_entries = 1; -} - -template -std::ostream &operator<<(std::ostream &os, - const C_FlushRequest &req) { - os << (C_BlockIORequest&)req - << " m_resources.allocated=" << req.m_resources.allocated; - return os; -} - -template 
-C_DiscardRequest::C_DiscardRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents, - uint32_t discard_granularity_bytes, ceph::mutex &lock, - PerfCounters *perfcounter, Context *user_req) - : C_BlockIORequest(rwl, arrived, std::move(image_extents), bufferlist(), 0, user_req), - m_discard_granularity_bytes(discard_granularity_bytes), - m_lock(lock), - m_perfcounter(perfcounter) { - ldout(rwl.get_context(), 20) << this << dendl; -} - -template -C_DiscardRequest::~C_DiscardRequest() { - ldout(rwl.get_context(), 20) << this << dendl; -} - -template -bool C_DiscardRequest::alloc_resources() { - ldout(rwl.get_context(), 20) << "req type=" << get_name() << " " - << "req=[" << *this << "]" << dendl; - return rwl.alloc_resources(this); -} - -template -void C_DiscardRequest::setup_log_operations() { - std::lock_guard locker(m_lock); - GenericWriteLogEntries log_entries; - for (auto &extent : this->image_extents) { - op = std::make_shared(rwl.get_current_sync_point(), - extent.first, - extent.second, - m_discard_granularity_bytes, - this->m_dispatched_time, - m_perfcounter, - rwl.get_context()); - log_entries.emplace_back(op->log_entry); - break; - } - uint64_t current_sync_gen = rwl.get_current_sync_gen(); - bool persist_on_flush = rwl.get_persist_on_flush(); - if (!persist_on_flush) { - rwl.inc_last_op_sequence_num(); - } - auto discard_req = this; - Context *on_write_persist = new LambdaContext( - [this, discard_req](int r) { - ldout(rwl.get_context(), 20) << "discard_req=" << discard_req - << " cell=" << discard_req->get_cell() << dendl; - ceph_assert(discard_req->get_cell()); - discard_req->complete_user_request(r); - discard_req->release_cell(); - }); - op->init(current_sync_gen, persist_on_flush, rwl.get_last_op_sequence_num(), on_write_persist); - rwl.add_into_log_map(log_entries); -} - -template -void C_DiscardRequest::dispatch() { - utime_t now = ceph_clock_now(); - ldout(rwl.get_context(), 20) << "req type=" << get_name() << " " - << "req=[" << *this << "]" << dendl; - ceph_assert(this->m_resources.allocated); - this->m_dispatched_time = now; - setup_log_operations(); - m_perfcounter->inc(l_librbd_rwl_log_ops, 1); - rwl.schedule_append(op); -} - -template -void C_DiscardRequest::setup_buffer_resources( - uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, - uint64_t &number_lanes, uint64_t &number_log_entries, - uint64_t &number_unpublished_reserves) { - number_log_entries = 1; - /* No bytes are allocated for a discard, but we count the discarded bytes - * as dirty. This means it's possible to have more bytes dirty than - * there are bytes cached or allocated. 
*/ - for (auto &extent : this->image_extents) { - bytes_dirtied = extent.second; - break; - } -} - -template -void C_DiscardRequest::blockguard_acquired(GuardedRequestFunctionContext &guard_ctx) { - ldout(rwl.get_context(), 20) << " cell=" << guard_ctx.cell << dendl; - - ceph_assert(guard_ctx.cell); - this->detained = guard_ctx.state.detained; /* overlapped */ - this->set_cell(guard_ctx.cell); -} - -template -std::ostream &operator<<(std::ostream &os, - const C_DiscardRequest &req) { - os << (C_BlockIORequest&)req; - if (req.op) { - os << " op=[" << *req.op << "]"; - } else { - os << " op=nullptr"; - } - return os; -} - -template -C_WriteSameRequest::C_WriteSameRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents, - bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, - PerfCounters *perfcounter, Context *user_req) - : C_WriteRequest(rwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, lock, perfcounter, user_req) { - ldout(rwl.get_context(), 20) << this << dendl; -} - -template -C_WriteSameRequest::~C_WriteSameRequest() { - ldout(rwl.get_context(), 20) << this << dendl; -} - -template -void C_WriteSameRequest::update_req_stats(utime_t &now) { - /* Write same stats excluded from most write stats - * because the read phase will make them look like slow writes in - * those histograms. */ - ldout(rwl.get_context(), 20) << this << dendl; - utime_t comp_latency = now - this->m_arrived_time; - this->m_perfcounter->tinc(l_librbd_rwl_ws_latency, comp_latency); -} - -/* Write sames will allocate one buffer, the size of the repeating pattern */ -template -void C_WriteSameRequest::setup_buffer_resources( - uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, - uint64_t &number_lanes, uint64_t &number_log_entries, - uint64_t &number_unpublished_reserves) { - ldout(rwl.get_context(), 20) << this << dendl; - ceph_assert(this->image_extents.size() == 1); - bytes_dirtied += this->image_extents[0].second; - auto pattern_length = this->bl.length(); - this->m_resources.buffers.emplace_back(); - struct WriteBufferAllocation &buffer = this->m_resources.buffers.back(); - buffer.allocation_size = MIN_WRITE_ALLOC_SIZE; - buffer.allocated = false; - bytes_cached += pattern_length; - if (pattern_length > buffer.allocation_size) { - buffer.allocation_size = pattern_length; - } - bytes_allocated += buffer.allocation_size; -} - -template -std::shared_ptr C_WriteSameRequest::create_operation(uint64_t offset, uint64_t len) { - ceph_assert(this->image_extents.size() == 1); - return std::make_shared(*this->op_set.get(), offset, len, - this->bl.length(), rwl.get_context()); -} - -template -std::ostream &operator<<(std::ostream &os, - const C_WriteSameRequest &req) { - os << (C_WriteRequest&)req; - return os; -} - -template -C_CompAndWriteRequest::C_CompAndWriteRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents, - bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, - int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter, - Context *user_req) - : C_WriteRequest(rwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, lock, perfcounter, user_req), - mismatch_offset(mismatch_offset), cmp_bl(std::move(cmp_bl)) { - ldout(rwl.get_context(), 20) << dendl; -} - -template -C_CompAndWriteRequest::~C_CompAndWriteRequest() { - ldout(rwl.get_context(), 20) << dendl; -} - -template -void C_CompAndWriteRequest::finish_req(int r) { - if (compare_succeeded) { - C_WriteRequest::finish_req(r); - } else { - utime_t now = 
ceph_clock_now(); - update_req_stats(now); - } -} - -template -void C_CompAndWriteRequest::update_req_stats(utime_t &now) { - /* Compare-and-write stats. Compare-and-write excluded from most write - * stats because the read phase will make them look like slow writes in - * those histograms. */ - if (!compare_succeeded) { - this->m_perfcounter->inc(l_librbd_rwl_cmp_fails, 1); - } - utime_t comp_latency = now - this->m_arrived_time; - this->m_perfcounter->tinc(l_librbd_rwl_cmp_latency, comp_latency); -} - -template -std::ostream &operator<<(std::ostream &os, - const C_CompAndWriteRequest &req) { - os << (C_WriteRequest&)req - << "cmp_bl=" << req.cmp_bl << ", " - << "read_bl=" << req.read_bl << ", " - << "compare_succeeded=" << req.compare_succeeded << ", " - << "mismatch_offset=" << req.mismatch_offset; - return os; -} - -} // namespace rwl -} // namespace cache -} // namespace librbd - -template class librbd::cache::rwl::C_BlockIORequest >; -template class librbd::cache::rwl::C_WriteRequest >; -template class librbd::cache::rwl::C_FlushRequest >; -template class librbd::cache::rwl::C_DiscardRequest >; -template class librbd::cache::rwl::C_WriteSameRequest >; -template class librbd::cache::rwl::C_CompAndWriteRequest >; diff --git a/src/librbd/cache/rwl/Request.h b/src/librbd/cache/rwl/Request.h deleted file mode 100644 index 2ab58e51d7152..0000000000000 --- a/src/librbd/cache/rwl/Request.h +++ /dev/null @@ -1,408 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_LIBRBD_CACHE_RWL_REQUEST_H -#define CEPH_LIBRBD_CACHE_RWL_REQUEST_H - -#include "include/Context.h" -#include "librbd/cache/ImageCache.h" -#include "librbd/cache/rwl/Types.h" -#include "librbd/cache/rwl/LogOperation.h" - -namespace librbd { -class BlockGuardCell; - -namespace cache { -namespace rwl { - -class GuardedRequestFunctionContext; - -struct WriteRequestResources { - bool allocated = false; - std::vector buffers; -}; - -/** - * A request that can be deferred in a BlockGuard to sequence - * overlapping operations. - * This is the custodian of the BlockGuard cell for this IO, and the - * state information about the progress of this IO. This object lives - * until the IO is persisted in all (live) log replicas. User request - * may be completed from here before the IO persists. 
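 * Subclasses implement alloc_resources(), deferred_handler(), dispatch() and
 * finish_req(); finish() first completes the user request and then invokes
 * finish_req().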
- */ -template -class C_BlockIORequest : public Context { -public: - T &rwl; - io::Extents image_extents; - bufferlist bl; - int fadvise_flags; - Context *user_req; /* User write request */ - ExtentsSummary image_extents_summary; - bool detained = false; /* Detained in blockguard (overlapped with a prior IO) */ - utime_t allocated_time; /* When allocation began */ - bool waited_lanes = false; /* This IO waited for free persist/replicate lanes */ - bool waited_entries = false; /* This IO waited for free log entries */ - bool waited_buffers = false; /* This IO waited for data buffers (pmemobj_reserve() failed) */ - - C_BlockIORequest(T &rwl, const utime_t arrived, io::Extents &&extents, - bufferlist&& bl, const int fadvise_flags, Context *user_req); - ~C_BlockIORequest() override; - C_BlockIORequest(const C_BlockIORequest&) = delete; - C_BlockIORequest &operator=(const C_BlockIORequest&) = delete; - - void set_cell(BlockGuardCell *cell); - BlockGuardCell *get_cell(void); - void release_cell(); - - void complete_user_request(int r); - void finish(int r); - virtual void finish_req(int r) = 0; - - virtual bool alloc_resources() = 0; - - void deferred(); - - virtual void deferred_handler() = 0; - - virtual void dispatch() = 0; - - virtual const char *get_name() const { - return "C_BlockIORequest"; - } - uint64_t get_image_extents_size() { - return image_extents.size(); - } - void set_io_waited_for_lanes(bool waited) { - waited_lanes = waited; - } - void set_io_waited_for_entries(bool waited) { - waited_entries = waited; - } - void set_io_waited_for_buffers(bool waited) { - waited_buffers = waited; - } - bool has_io_waited_for_buffers() { - return waited_buffers; - } - std::vector& get_resources_buffers() { - return m_resources.buffers; - } - - void set_allocated(bool allocated) { - if (allocated) { - m_resources.allocated = true; - } else { - m_resources.buffers.clear(); - } - } - - virtual void setup_buffer_resources( - uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, - uint64_t &number_lanes, uint64_t &number_log_entries, - uint64_t &number_unpublished_reserves) {}; - -protected: - utime_t m_arrived_time; - utime_t m_dispatched_time; /* When dispatch began */ - utime_t m_user_req_completed_time; - std::atomic m_deferred = {false}; /* Deferred because this or a prior IO had to wait for write resources */ - WriteRequestResources m_resources; - -private: - std::atomic m_user_req_completed = {false}; - std::atomic m_finish_called = {false}; - std::atomic m_cell_released = {false}; - BlockGuardCell* m_cell = nullptr; - - template - friend std::ostream &operator<<(std::ostream &os, - const C_BlockIORequest &req); -}; - -/** - * This is the custodian of the BlockGuard cell for this write. Block - * guard is not released until the write persists everywhere (this is - * how we guarantee to each log replica that they will never see - * overlapping writes). 
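 * The cell is released from finish_req(), after the write lanes have been
 * returned.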
- */ -template -class C_WriteRequest : public C_BlockIORequest { -public: - using C_BlockIORequest::rwl; - unique_ptr op_set = nullptr; - - C_WriteRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents, - bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, - PerfCounters *perfcounter, Context *user_req); - - ~C_WriteRequest() override; - - void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx); - - /* Common finish to plain write and compare-and-write (if it writes) */ - void finish_req(int r) override; - - /* Compare and write will override this */ - virtual void update_req_stats(utime_t &now) { - // TODO: Add in later PRs - } - bool alloc_resources() override; - - void deferred_handler() override { } - - void dispatch() override; - - virtual std::shared_ptr create_operation(uint64_t offset, uint64_t len); - - virtual void setup_log_operations(DeferredContexts &on_exit); - - bool append_write_request(std::shared_ptr sync_point); - - virtual void schedule_append(); - - const char *get_name() const override { - return "C_WriteRequest"; - } - -protected: - using C_BlockIORequest::m_resources; - PerfCounters *m_perfcounter = nullptr; - /* Plain writes will allocate one buffer per request extent */ - void setup_buffer_resources( - uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, - uint64_t &number_lanes, uint64_t &number_log_entries, - uint64_t &number_unpublished_reserves) override; - -private: - bool m_do_early_flush = false; - std::atomic m_appended = {0}; - bool m_queued = false; - ceph::mutex &m_lock; - template - friend std::ostream &operator<<(std::ostream &os, - const C_WriteRequest &req); -}; - -/** - * This is the custodian of the BlockGuard cell for this - * aio_flush. Block guard is released as soon as the new - * sync point (if required) is created. Subsequent IOs can - * proceed while this flush waits for prior IOs to complete - * and any required sync points to be persisted. - */ -template -class C_FlushRequest : public C_BlockIORequest { -public: - using C_BlockIORequest::rwl; - bool internal = false; - std::shared_ptr to_append; - - C_FlushRequest(T &rwl, const utime_t arrived, - io::Extents &&image_extents, - bufferlist&& bl, const int fadvise_flags, - ceph::mutex &lock, PerfCounters *perfcounter, - Context *user_req); - - ~C_FlushRequest() override {} - - bool alloc_resources() override; - - void dispatch() override; - - const char *get_name() const override { - return "C_FlushRequest"; - } - - void setup_buffer_resources( - uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, - uint64_t &number_lanes, uint64_t &number_log_entries, - uint64_t &number_unpublished_reserves) override; -private: - std::shared_ptr op; - ceph::mutex &m_lock; - PerfCounters *m_perfcounter = nullptr; - - void finish_req(int r) override; - void deferred_handler() override { - m_perfcounter->inc(l_librbd_rwl_aio_flush_def, 1); - } - - template - friend std::ostream &operator<<(std::ostream &os, - const C_FlushRequest &req); -}; - -/** - * This is the custodian of the BlockGuard cell for this discard. As in the - * case of write, the block guard is not released until the discard persists - * everywhere. 
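 * Here the cell is released by the on_write_persist callback installed in
 * setup_log_operations().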
- */ -template -class C_DiscardRequest : public C_BlockIORequest { -public: - using C_BlockIORequest::rwl; - std::shared_ptr op; - - C_DiscardRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents, - uint32_t discard_granularity_bytes, ceph::mutex &lock, - PerfCounters *perfcounter, Context *user_req); - - ~C_DiscardRequest() override; - void finish_req(int r) override {} - - bool alloc_resources() override; - - void deferred_handler() override { } - - void setup_log_operations(); - - void dispatch() override; - - void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx); - - const char *get_name() const override { - return "C_DiscardRequest"; - } - void setup_buffer_resources( - uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, - uint64_t &number_lanes, uint64_t &number_log_entries, - uint64_t &number_unpublished_reserves) override; -private: - uint32_t m_discard_granularity_bytes; - ceph::mutex &m_lock; - PerfCounters *m_perfcounter = nullptr; - template - friend std::ostream &operator<<(std::ostream &os, - const C_DiscardRequest &req); -}; - -/** - * This is the custodian of the BlockGuard cell for this write same. - * - * A writesame allocates and persists a data buffer like a write, but the - * data buffer is usually much shorter than the write same. - */ -template -class C_WriteSameRequest : public C_WriteRequest { -public: - using C_BlockIORequest::rwl; - C_WriteSameRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents, - bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, - PerfCounters *perfcounter, Context *user_req); - - ~C_WriteSameRequest() override; - - void update_req_stats(utime_t &now) override; - - void setup_buffer_resources( - uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated, - uint64_t &number_lanes, uint64_t &number_log_entries, - uint64_t &number_unpublished_reserves) override; - - std::shared_ptr create_operation(uint64_t offset, uint64_t len) override; - - const char *get_name() const override { - return "C_WriteSameRequest"; - } - - template - friend std::ostream &operator<<(std::ostream &os, - const C_WriteSameRequest &req); -}; - -/** - * This is the custodian of the BlockGuard cell for this compare and write. The - * block guard is acquired before the read begins to guarantee atomicity of this - * operation. If this results in a write, the block guard will be released - * when the write completes to all replicas. - */ -template -class C_CompAndWriteRequest : public C_WriteRequest { -public: - using C_BlockIORequest::rwl; - bool compare_succeeded = false; - uint64_t *mismatch_offset; - bufferlist cmp_bl; - bufferlist read_bl; - C_CompAndWriteRequest(T &rwl, const utime_t arrived, io::Extents &&image_extents, - bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, - int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter, - Context *user_req); - ~C_CompAndWriteRequest(); - - void finish_req(int r) override; - - void update_req_stats(utime_t &now) override; - - /* - * Compare and write doesn't implement alloc_resources(), deferred_handler(), - * or dispatch(). We use the implementation in C_WriteRequest(), and only if the - * compare phase succeeds and a write is actually performed. 
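 * finish_req() and update_req_stats() are overridden so a failed compare is
 * accounted via l_librbd_rwl_cmp_fails and the compare-and-write latency
 * counters rather than the plain write statistics.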
- */ - - const char *get_name() const override { - return "C_CompAndWriteRequest"; - } - template - friend std::ostream &operator<<(std::ostream &os, - const C_CompAndWriteRequest &req); -}; - -struct BlockGuardReqState { - bool barrier = false; /* This is a barrier request */ - bool current_barrier = false; /* This is the currently active barrier */ - bool detained = false; - bool queued = false; /* Queued for barrier */ - friend std::ostream &operator<<(std::ostream &os, - const BlockGuardReqState &r) { - os << "barrier=" << r.barrier << ", " - << "current_barrier=" << r.current_barrier << ", " - << "detained=" << r.detained << ", " - << "queued=" << r.queued; - return os; - } -}; - -class GuardedRequestFunctionContext : public Context { -public: - BlockGuardCell *cell = nullptr; - BlockGuardReqState state; - GuardedRequestFunctionContext(boost::function &&callback) - : m_callback(std::move(callback)){ } - ~GuardedRequestFunctionContext(void) override { }; - GuardedRequestFunctionContext(const GuardedRequestFunctionContext&) = delete; - GuardedRequestFunctionContext &operator=(const GuardedRequestFunctionContext&) = delete; - -private: - boost::function m_callback; - void finish(int r) override { - ceph_assert(cell); - m_callback(*this); - } -}; - -class GuardedRequest { -public: - const BlockExtent block_extent; - GuardedRequestFunctionContext *guard_ctx; /* Work to do when guard on range obtained */ - - GuardedRequest(const BlockExtent block_extent, - GuardedRequestFunctionContext *on_guard_acquire, bool barrier = false) - : block_extent(block_extent), guard_ctx(on_guard_acquire) { - guard_ctx->state.barrier = barrier; - } - friend std::ostream &operator<<(std::ostream &os, - const GuardedRequest &r) { - os << "guard_ctx->state=[" << r.guard_ctx->state << "], " - << "block_extent.block_start=" << r.block_extent.block_start << ", " - << "block_extent.block_start=" << r.block_extent.block_end; - return os; - } -}; - -} // namespace rwl -} // namespace cache -} // namespace librbd - -#endif // CEPH_LIBRBD_CACHE_RWL_REQUEST_H diff --git a/src/librbd/cache/rwl/ShutdownRequest.cc b/src/librbd/cache/rwl/ShutdownRequest.cc deleted file mode 100644 index 8259662ae9731..0000000000000 --- a/src/librbd/cache/rwl/ShutdownRequest.cc +++ /dev/null @@ -1,151 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "librbd/cache/rwl/ShutdownRequest.h" -#include "librbd/ImageCtx.h" -#include "librbd/Utils.h" -#include "common/dout.h" -#include "common/errno.h" -#include "librbd/Operations.h" -#include "librbd/asio/ContextWQ.h" -#include "librbd/cache/ImageCache.h" -#include "librbd/cache/Types.h" - - -#define dout_subsys ceph_subsys_rbd_rwl -#undef dout_prefix -#define dout_prefix *_dout << "librbd::cache::rwl:ShutdownRequest: " \ - << this << " " << __func__ << ": " - -namespace librbd { -namespace cache { -namespace rwl { - -using librbd::util::create_async_context_callback; -using librbd::util::create_context_callback; - -template -ShutdownRequest* ShutdownRequest::create(I &image_ctx, - Context *on_finish) { - return new ShutdownRequest(image_ctx, on_finish); -} - -template -ShutdownRequest::ShutdownRequest(I &image_ctx, Context *on_finish) - : m_image_ctx(image_ctx), - m_on_finish(create_async_context_callback(image_ctx, on_finish)), - m_error_result(0) { -} - -template -void ShutdownRequest::send() { - send_shutdown_image_cache(); -} - -template -void ShutdownRequest::send_shutdown_image_cache() { - CephContext *cct = m_image_ctx.cct; 
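  // If no image cache was ever attached there is nothing to shut down and the
  // request finishes immediately.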
- ldout(cct, 10) << dendl; - - if (m_image_ctx.image_cache == nullptr) { - finish(); - return; - } - - using klass = ShutdownRequest; - Context *ctx = create_context_callback( - this); - - m_image_ctx.image_cache->shut_down(ctx); -} - -template -void ShutdownRequest::handle_shutdown_image_cache(int r) { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 10) << dendl; - - if (r < 0) { - lderr(cct) << "failed to shut down the image cache: " << cpp_strerror(r) - << dendl; - save_result(r); - finish(); - return; - } else { - delete m_image_ctx.image_cache; - m_image_ctx.image_cache = nullptr; - } - send_remove_feature_bit(); -} - -template -void ShutdownRequest::send_remove_feature_bit() { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 10) << dendl; - - uint64_t new_features = m_image_ctx.features & ~RBD_FEATURE_DIRTY_CACHE; - uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE; - ldout(cct, 10) << "old_features=" << m_image_ctx.features - << ", new_features=" << new_features - << ", features_mask=" << features_mask - << dendl; - - int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx, m_image_ctx.header_oid, - new_features, features_mask); - m_image_ctx.features &= ~RBD_FEATURE_DIRTY_CACHE; - using klass = ShutdownRequest; - Context *ctx = create_context_callback( - this); - ctx->complete(r); -} - -template -void ShutdownRequest::handle_remove_feature_bit(int r) { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 10) << dendl; - - if (r < 0) { - lderr(cct) << "failed to remove the feature bit: " << cpp_strerror(r) - << dendl; - save_result(r); - finish(); - return; - } - send_remove_image_cache_state(); -} - -template -void ShutdownRequest::send_remove_image_cache_state() { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 10) << dendl; - - using klass = ShutdownRequest; - Context *ctx = create_context_callback( - this); - std::shared_lock owner_lock{m_image_ctx.owner_lock}; - m_image_ctx.operations->execute_metadata_remove(IMAGE_CACHE_STATE, ctx); -} - -template -void ShutdownRequest::handle_remove_image_cache_state(int r) { - CephContext *cct = m_image_ctx.cct; - ldout(cct, 10) << dendl; - - if (r < 0) { - lderr(cct) << "failed to remove the image cache state: " << cpp_strerror(r) - << dendl; - save_result(r); - } - finish(); -} - -template -void ShutdownRequest::finish() { - m_on_finish->complete(m_error_result); - delete this; -} - -} // namespace rwl -} // namespace cache -} // namespace librbd - -template class librbd::cache::rwl::ShutdownRequest; diff --git a/src/librbd/cache/rwl/ShutdownRequest.h b/src/librbd/cache/rwl/ShutdownRequest.h deleted file mode 100644 index 635527f63c7b4..0000000000000 --- a/src/librbd/cache/rwl/ShutdownRequest.h +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H -#define CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H - -class Context; - -namespace librbd { - -class ImageCtx; - -namespace cache { -namespace rwl { - -template -class ImageCacheState; - -template -class ShutdownRequest { -public: - static ShutdownRequest* create(ImageCtxT &image_ctx, Context *on_finish); - - void send(); - -private: - - /** - * @verbatim - * - * Shutdown request goes through the following state machine: - * - * - * | - * v - * SHUTDOWN_IMAGE_CACHE - * | - * v - * REMOVE_IMAGE_FEATURE_BIT - * | - * v - * REMOVE_IMAGE_CACHE_STATE - * | - * v - * - * - * @endverbatim - */ - - ShutdownRequest(ImageCtxT &image_ctx, Context *on_finish); - - ImageCtxT 
&m_image_ctx; - Context *m_on_finish; - - int m_error_result; - - void send_shutdown_image_cache(); - void handle_shutdown_image_cache(int r); - - void send_remove_feature_bit(); - void handle_remove_feature_bit(int r); - - void send_remove_image_cache_state(); - void handle_remove_image_cache_state(int r); - - void finish(); - - void save_result(int result) { - if (m_error_result == 0 && result < 0) { - m_error_result = result; - } - } -}; - -} // namespace rwl -} // namespace cache -} // namespace librbd - -extern template class librbd::cache::rwl::ShutdownRequest; - -#endif // CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H diff --git a/src/librbd/cache/rwl/SyncPoint.cc b/src/librbd/cache/rwl/SyncPoint.cc deleted file mode 100644 index cb3f3cfeebd91..0000000000000 --- a/src/librbd/cache/rwl/SyncPoint.cc +++ /dev/null @@ -1,109 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "SyncPoint.h" - -#define dout_subsys ceph_subsys_rbd_rwl -#undef dout_prefix -#define dout_prefix *_dout << "librbd::cache::rwl::SyncPoint: " << this << " " \ - << __func__ << ": " - -namespace librbd { -namespace cache { -namespace rwl { - -SyncPoint::SyncPoint(uint64_t sync_gen_num, CephContext *cct) - : log_entry(std::make_shared(sync_gen_num)), m_cct(cct) { - m_prior_log_entries_persisted = new C_Gather(cct, nullptr); - m_sync_point_persist = new C_Gather(cct, nullptr); - on_sync_point_appending.reserve(MAX_WRITES_PER_SYNC_POINT + 2); - on_sync_point_persisted.reserve(MAX_WRITES_PER_SYNC_POINT + 2); - ldout(m_cct, 20) << "sync point " << sync_gen_num << dendl; -} - -SyncPoint::~SyncPoint() { - ceph_assert(on_sync_point_appending.empty()); - ceph_assert(on_sync_point_persisted.empty()); - ceph_assert(!earlier_sync_point); -} - -std::ostream &operator<<(std::ostream &os, - const SyncPoint &p) { - os << "log_entry=[" << *p.log_entry << "], " - << "earlier_sync_point=" << p.earlier_sync_point << ", " - << "later_sync_point=" << p.later_sync_point << ", " - << "m_final_op_sequence_num=" << p.m_final_op_sequence_num << ", " - << "m_prior_log_entries_persisted=" << p.m_prior_log_entries_persisted << ", " - << "m_prior_log_entries_persisted_complete=" << p.m_prior_log_entries_persisted_complete << ", " - << "m_append_scheduled=" << p.m_append_scheduled << ", " - << "appending=" << p.appending << ", " - << "on_sync_point_appending=" << p.on_sync_point_appending.size() << ", " - << "on_sync_point_persisted=" << p.on_sync_point_persisted.size() << ""; - return os; -} - -void SyncPoint::persist_gather_set_finisher(Context *ctx) { - m_append_scheduled = true; - /* All prior sync points that are still in this list must already be scheduled for append */ - std::shared_ptr previous = earlier_sync_point; - while (previous) { - ceph_assert(previous->m_append_scheduled); - previous = previous->earlier_sync_point; - } - - m_sync_point_persist->set_finisher(ctx); -} - -void SyncPoint::persist_gather_activate() { - m_sync_point_persist->activate(); -} - -Context* SyncPoint::persist_gather_new_sub() { - return m_sync_point_persist->new_sub(); -} - -void SyncPoint::prior_persisted_gather_activate() { - m_prior_log_entries_persisted->activate(); -} - -Context* SyncPoint::prior_persisted_gather_new_sub() { - return m_prior_log_entries_persisted->new_sub(); -} - -void SyncPoint::prior_persisted_gather_set_finisher() { - Context *sync_point_persist_ready = persist_gather_new_sub(); - std::shared_ptr sp = shared_from_this(); - m_prior_log_entries_persisted-> - set_finisher(new 
LambdaContext([this, sp, sync_point_persist_ready](int r) { - ldout(m_cct, 20) << "Prior log entries persisted for sync point =[" - << sp << "]" << dendl; - sp->m_prior_log_entries_persisted_result = r; - sp->m_prior_log_entries_persisted_complete = true; - sync_point_persist_ready->complete(r); - })); -} - -void SyncPoint::add_in_on_persisted_ctxs(Context* ctx) { - on_sync_point_persisted.push_back(ctx); -} - -void SyncPoint::add_in_on_appending_ctxs(Context* ctx) { - on_sync_point_appending.push_back(ctx); -} - -void SyncPoint::setup_earlier_sync_point(std::shared_ptr sync_point, - uint64_t last_op_sequence_num) { - earlier_sync_point = sync_point; - log_entry->prior_sync_point_flushed = false; - earlier_sync_point->log_entry->next_sync_point_entry = log_entry; - earlier_sync_point->later_sync_point = shared_from_this(); - earlier_sync_point->m_final_op_sequence_num = last_op_sequence_num; - if (!earlier_sync_point->appending) { - /* Append of new sync point deferred until old sync point is appending */ - earlier_sync_point->add_in_on_appending_ctxs(prior_persisted_gather_new_sub()); - } -} - -} // namespace rwl -} // namespace cache -} // namespace librbd diff --git a/src/librbd/cache/rwl/SyncPoint.h b/src/librbd/cache/rwl/SyncPoint.h deleted file mode 100644 index 1445146534e59..0000000000000 --- a/src/librbd/cache/rwl/SyncPoint.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H -#define CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H - -#include "librbd/ImageCtx.h" -#include "librbd/cache/rwl/LogEntry.h" -#include "librbd/cache/rwl/Types.h" - -namespace librbd { -namespace cache { -namespace rwl { - -class SyncPoint: public std::enable_shared_from_this { -public: - std::shared_ptr log_entry; - /* Use lock for earlier/later links */ - std::shared_ptr earlier_sync_point; /* NULL if earlier has completed */ - std::shared_ptr later_sync_point; - bool appending = false; - /* Signal these when this sync point is appending to the log, and its order - * of appearance is guaranteed. One of these is is a sub-operation of the - * next sync point's m_prior_log_entries_persisted Gather. */ - std::vector on_sync_point_appending; - /* Signal these when this sync point is appended and persisted. User - * aio_flush() calls are added to this. */ - std::vector on_sync_point_persisted; - - SyncPoint(uint64_t sync_gen_num, CephContext *cct); - ~SyncPoint(); - SyncPoint(const SyncPoint&) = delete; - SyncPoint &operator=(const SyncPoint&) = delete; - void persist_gather_activate(); - Context* persist_gather_new_sub(); - void persist_gather_set_finisher(Context *ctx); - void prior_persisted_gather_activate(); - Context* prior_persisted_gather_new_sub(); - void prior_persisted_gather_set_finisher(); - void add_in_on_persisted_ctxs(Context* cxt); - void add_in_on_appending_ctxs(Context* cxt); - void setup_earlier_sync_point(std::shared_ptr sync_point, - uint64_t last_op_sequence_num); -private: - CephContext *m_cct; - bool m_append_scheduled = false; - uint64_t m_final_op_sequence_num = 0; - /* A sync point can't appear in the log until all the writes bearing - * it and all the prior sync points have been appended and - * persisted. - * - * Writes bearing this sync gen number and the prior sync point will be - * sub-ops of this Gather. This sync point will not be appended until all - * these complete to the point where their persist order is guaranteed. 
*/ - C_Gather *m_prior_log_entries_persisted; - /* The finisher for this will append the sync point to the log. The finisher - * for m_prior_log_entries_persisted will be a sub-op of this. */ - C_Gather *m_sync_point_persist; - int m_prior_log_entries_persisted_result = 0; - int m_prior_log_entries_persisted_complete = false; - friend std::ostream &operator<<(std::ostream &os, - const SyncPoint &p); -}; - -} // namespace rwl -} // namespace cache -} // namespace librbd - -#endif // CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H diff --git a/src/librbd/cache/rwl/Types.cc b/src/librbd/cache/rwl/Types.cc deleted file mode 100644 index 7f46c07041009..0000000000000 --- a/src/librbd/cache/rwl/Types.cc +++ /dev/null @@ -1,121 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include -#include "Types.h" -#include "common/ceph_context.h" -#include "include/Context.h" - -#define dout_subsys ceph_subsys_rbd_rwl -#undef dout_prefix -#define dout_prefix *_dout << "librbd::cache::rwl::Types: " << this << " " \ - << __func__ << ": " - -namespace librbd { - -namespace cache { - -namespace rwl { - -DeferredContexts::~DeferredContexts() { - finish_contexts(nullptr, contexts, 0); -} - -void DeferredContexts::add(Context* ctx) { - contexts.push_back(ctx); -} - -/* - * A BlockExtent identifies a range by first and last. - * - * An Extent ("image extent") identifies a range by start and length. - * - * The ImageCache interface is defined in terms of image extents, and - * requires no alignment of the beginning or end of the extent. We - * convert between image and block extents here using a "block size" - * of 1. - */ -BlockExtent convert_to_block_extent(const uint64_t offset_bytes, const uint64_t length_bytes) -{ - return BlockExtent(offset_bytes, - offset_bytes + length_bytes); -} - -BlockExtent WriteLogPmemEntry::block_extent() { - return convert_to_block_extent(image_offset_bytes, write_bytes); -} - -uint64_t WriteLogPmemEntry::get_offset_bytes() { - return image_offset_bytes; -} - -uint64_t WriteLogPmemEntry::get_write_bytes() { - return write_bytes; -} - -std::ostream& operator<<(std::ostream& os, - const WriteLogPmemEntry &entry) { - os << "entry_valid=" << (bool)entry.entry_valid << ", " - << "sync_point=" << (bool)entry.sync_point << ", " - << "sequenced=" << (bool)entry.sequenced << ", " - << "has_data=" << (bool)entry.has_data << ", " - << "discard=" << (bool)entry.discard << ", " - << "writesame=" << (bool)entry.writesame << ", " - << "sync_gen_number=" << entry.sync_gen_number << ", " - << "write_sequence_number=" << entry.write_sequence_number << ", " - << "image_offset_bytes=" << entry.image_offset_bytes << ", " - << "write_bytes=" << entry.write_bytes << ", " - << "ws_datalen=" << entry.ws_datalen << ", " - << "entry_index=" << entry.entry_index; - return os; -} - -template -ExtentsSummary::ExtentsSummary(const ExtentsType &extents) - : total_bytes(0), first_image_byte(0), last_image_byte(0) -{ - if (extents.empty()) return; - /* These extents refer to image offsets between first_image_byte - * and last_image_byte, inclusive, but we don't guarantee here - * that they address all of those bytes. There may be gaps. 
*/ - first_image_byte = extents.front().first; - last_image_byte = first_image_byte + extents.front().second; - for (auto &extent : extents) { - /* Ignore zero length extents */ - if (extent.second) { - total_bytes += extent.second; - if (extent.first < first_image_byte) { - first_image_byte = extent.first; - } - if ((extent.first + extent.second) > last_image_byte) { - last_image_byte = extent.first + extent.second; - } - } - } -} - -io::Extent whole_volume_extent() { - return io::Extent({0, std::numeric_limits::max()}); -} - -BlockExtent block_extent(const io::Extent& image_extent) { - return convert_to_block_extent(image_extent.first, image_extent.second); -} - -Context * override_ctx(int r, Context *ctx) { - if (r < 0) { - /* Override next_ctx status with this error */ - return new LambdaContext( - [r, ctx](int _r) { - ctx->complete(r); - }); - } else { - return ctx; - } -} - -} // namespace rwl -} // namespace cache -} // namespace librbd - -template class librbd::cache::rwl::ExtentsSummary; diff --git a/src/librbd/cache/rwl/Types.h b/src/librbd/cache/rwl/Types.h deleted file mode 100644 index 61b7786219328..0000000000000 --- a/src/librbd/cache/rwl/Types.h +++ /dev/null @@ -1,312 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_LIBRBD_CACHE_RWL_TYPES_H -#define CEPH_LIBRBD_CACHE_RWL_TYPES_H - -#include -#include -#include "librbd/BlockGuard.h" -#include "librbd/io/Types.h" - -class Context; - -enum { - l_librbd_rwl_first = 26500, - - // All read requests - l_librbd_rwl_rd_req, // read requests - l_librbd_rwl_rd_bytes, // bytes read - l_librbd_rwl_rd_latency, // average req completion latency - - // Read requests completed from RWL (no misses) - l_librbd_rwl_rd_hit_req, // read requests - l_librbd_rwl_rd_hit_bytes, // bytes read - l_librbd_rwl_rd_hit_latency, // average req completion latency - - // Reed requests with hit and miss extents - l_librbd_rwl_rd_part_hit_req, // read ops - - // Per SyncPoint's LogEntry number and write bytes distribution - l_librbd_rwl_syncpoint_hist, - - // All write requests - l_librbd_rwl_wr_req, // write requests - l_librbd_rwl_wr_req_def, // write requests deferred for resources - l_librbd_rwl_wr_req_def_lanes, // write requests deferred for lanes - l_librbd_rwl_wr_req_def_log, // write requests deferred for log entries - l_librbd_rwl_wr_req_def_buf, // write requests deferred for buffer space - l_librbd_rwl_wr_req_overlap, // write requests detained for overlap - l_librbd_rwl_wr_req_queued, // write requests queued for prior barrier - l_librbd_rwl_wr_bytes, // bytes written - - // Write log operations (1 .. n per request that appends to the log) - l_librbd_rwl_log_ops, // log append ops - l_librbd_rwl_log_op_bytes, // average bytes written per log op - - /* - - Req and op average latencies to the beginning of and over various phases: - - +------------------------------+------+-------------------------------+ - | Phase | Name | Description | - +------------------------------+------+-------------------------------+ - | Arrive at RWL | arr |Arrives as a request | - +------------------------------+------+-------------------------------+ - | Allocate resources | all |time spent in block guard for | - | | |overlap sequencing occurs | - | | |before this point | - +------------------------------+------+-------------------------------+ - | Dispatch | dis |Op lifetime begins here. 
time | - | | |spent in allocation waiting for| - | | |resources occurs before this | - | | |point | - +------------------------------+------+-------------------------------+ - | Payload buffer persist and | buf |time spent queued for | - |replicate | |replication occurs before here | - +------------------------------+------+-------------------------------+ - | Payload buffer persist | bufc |bufc - buf is just the persist | - |complete | |time | - +------------------------------+------+-------------------------------+ - | Log append | app |time spent queued for append | - | | |occurs before here | - +------------------------------+------+-------------------------------+ - | Append complete | appc |appc - app is just the time | - | | |spent in the append operation | - +------------------------------+------+-------------------------------+ - | Complete | cmp |write persisted, replicated, | - | | |and globally visible | - +------------------------------+------+-------------------------------+ - - */ - - /* Request times */ - l_librbd_rwl_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard - l_librbd_rwl_req_arr_to_dis_t, // arrival to dispatch elapsed time - l_librbd_rwl_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources - l_librbd_rwl_wr_latency, // average req (persist) completion latency - l_librbd_rwl_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written - l_librbd_rwl_wr_caller_latency, // average req completion (to caller) latency - - /* Request times for requests that never waited for space*/ - l_librbd_rwl_nowait_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard - l_librbd_rwl_nowait_req_arr_to_dis_t, // arrival to dispatch elapsed time - l_librbd_rwl_nowait_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources - l_librbd_rwl_nowait_wr_latency, // average req (persist) completion latency - l_librbd_rwl_nowait_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written - l_librbd_rwl_nowait_wr_caller_latency, // average req completion (to caller) latency - - /* Log operation times */ - l_librbd_rwl_log_op_alloc_t, // elapsed time of pmemobj_reserve() - l_librbd_rwl_log_op_alloc_t_hist, // Histogram of elapsed time of pmemobj_reserve() - - l_librbd_rwl_log_op_dis_to_buf_t, // dispatch to buffer persist elapsed time - l_librbd_rwl_log_op_dis_to_app_t, // dispatch to log append elapsed time - l_librbd_rwl_log_op_dis_to_cmp_t, // dispatch to persist completion elapsed time - l_librbd_rwl_log_op_dis_to_cmp_t_hist, // Histogram of dispatch to persist completion elapsed time - - l_librbd_rwl_log_op_buf_to_app_t, // data buf persist + append wait time - l_librbd_rwl_log_op_buf_to_bufc_t,// data buf persist / replicate elapsed time - l_librbd_rwl_log_op_buf_to_bufc_t_hist,// data buf persist time vs bytes histogram - l_librbd_rwl_log_op_app_to_cmp_t, // log entry append + completion wait time - l_librbd_rwl_log_op_app_to_appc_t, // log entry append / replicate elapsed time - l_librbd_rwl_log_op_app_to_appc_t_hist, // log entry append time (vs. 
op bytes) histogram - - l_librbd_rwl_discard, - l_librbd_rwl_discard_bytes, - l_librbd_rwl_discard_latency, - - l_librbd_rwl_aio_flush, - l_librbd_rwl_aio_flush_def, - l_librbd_rwl_aio_flush_latency, - l_librbd_rwl_ws, - l_librbd_rwl_ws_bytes, // Bytes modified by write same, probably much larger than WS payload bytes - l_librbd_rwl_ws_latency, - - l_librbd_rwl_cmp, - l_librbd_rwl_cmp_bytes, - l_librbd_rwl_cmp_latency, - l_librbd_rwl_cmp_fails, - - l_librbd_rwl_flush, - l_librbd_rwl_invalidate_cache, - l_librbd_rwl_invalidate_discard_cache, - - l_librbd_rwl_append_tx_t, - l_librbd_rwl_retire_tx_t, - l_librbd_rwl_append_tx_t_hist, - l_librbd_rwl_retire_tx_t_hist, - - l_librbd_rwl_last, -}; - -namespace librbd { -namespace cache { -namespace rwl { - -class ImageExtentBuf; -typedef std::vector ImageExtentBufs; - -const int IN_FLIGHT_FLUSH_WRITE_LIMIT = 64; -const int IN_FLIGHT_FLUSH_BYTES_LIMIT = (1 * 1024 * 1024); - -/* Limit work between sync points */ -const uint64_t MAX_WRITES_PER_SYNC_POINT = 256; -const uint64_t MAX_BYTES_PER_SYNC_POINT = (1024 * 1024 * 8); - -const uint32_t MIN_WRITE_ALLOC_SIZE = 512; -const uint32_t LOG_STATS_INTERVAL_SECONDS = 5; - -/**** Write log entries ****/ -const unsigned long int MAX_ALLOC_PER_TRANSACTION = 8; -const unsigned long int MAX_FREE_PER_TRANSACTION = 1; -const unsigned int MAX_CONCURRENT_WRITES = 256; - -const uint64_t DEFAULT_POOL_SIZE = 1u<<30; -const uint64_t MIN_POOL_SIZE = DEFAULT_POOL_SIZE; -constexpr double USABLE_SIZE = (7.0 / 10); -const uint64_t BLOCK_ALLOC_OVERHEAD_BYTES = 16; -const uint8_t RWL_POOL_VERSION = 1; -const uint64_t MAX_LOG_ENTRIES = (1024 * 1024); -const double AGGRESSIVE_RETIRE_HIGH_WATER = 0.75; -const double RETIRE_HIGH_WATER = 0.50; -const double RETIRE_LOW_WATER = 0.40; -const int RETIRE_BATCH_TIME_LIMIT_MS = 250; - -/* Defer a set of Contexts until destruct/exit. Used for deferring - * work on a given thread until a required lock is dropped. */ -class DeferredContexts { -private: - std::vector contexts; -public: - ~DeferredContexts(); - void add(Context* ctx); -}; - -/* Pmem structures */ -POBJ_LAYOUT_BEGIN(rbd_rwl); -POBJ_LAYOUT_ROOT(rbd_rwl, struct WriteLogPoolRoot); -POBJ_LAYOUT_TOID(rbd_rwl, uint8_t); -POBJ_LAYOUT_TOID(rbd_rwl, struct WriteLogPmemEntry); -POBJ_LAYOUT_END(rbd_rwl); - -struct WriteLogPmemEntry { - uint64_t sync_gen_number = 0; - uint64_t write_sequence_number = 0; - uint64_t image_offset_bytes; - uint64_t write_bytes; - TOID(uint8_t) write_data; - struct { - uint8_t entry_valid :1; /* if 0, this entry is free */ - uint8_t sync_point :1; /* No data. No write sequence number. Marks sync - point for this sync gen number */ - uint8_t sequenced :1; /* write sequence number is valid */ - uint8_t has_data :1; /* write_data field is valid (else ignore) */ - uint8_t discard :1; /* has_data will be 0 if this is a discard */ - uint8_t writesame :1; /* ws_datalen indicates length of data at write_bytes */ - }; - uint32_t ws_datalen = 0; /* Length of data buffer (writesame only) */ - uint32_t entry_index = 0; /* For debug consistency check. 
Can be removed if - * we need the space */ - WriteLogPmemEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes) - : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes), - entry_valid(0), sync_point(0), sequenced(0), has_data(0), discard(0), writesame(0) { - } - BlockExtent block_extent(); - uint64_t get_offset_bytes(); - uint64_t get_write_bytes(); - bool is_sync_point() { - return sync_point; - } - bool is_discard() { - return discard; - } - bool is_writesame() { - return writesame; - } - bool is_write() { - /* Log entry is a basic write */ - return !is_sync_point() && !is_discard() && !is_writesame(); - } - bool is_writer() { - /* Log entry is any type that writes data */ - return is_write() || is_discard() || is_writesame(); - } - friend std::ostream& operator<<(std::ostream& os, - const WriteLogPmemEntry &entry); -}; - -static_assert(sizeof(WriteLogPmemEntry) == 64); - -struct WriteLogPoolRoot { - union { - struct { - uint8_t layout_version; /* Version of this structure (RWL_POOL_VERSION) */ - }; - uint64_t _u64; - } header; - TOID(struct WriteLogPmemEntry) log_entries; /* contiguous array of log entries */ - uint64_t pool_size; - uint64_t flushed_sync_gen; /* All writing entries with this or a lower - * sync gen number are flushed. */ - uint32_t block_size; /* block size */ - uint32_t num_log_entries; - uint32_t first_free_entry; /* Entry following the newest valid entry */ - uint32_t first_valid_entry; /* Index of the oldest valid entry in the log */ -}; - -struct WriteBufferAllocation { - unsigned int allocation_size = 0; - pobj_action buffer_alloc_action; - TOID(uint8_t) buffer_oid = OID_NULL; - bool allocated = false; - utime_t allocation_lat; -}; - -static inline io::Extent image_extent(const BlockExtent& block_extent) { - return io::Extent(block_extent.block_start, - block_extent.block_end - block_extent.block_start); -} - -template -class ExtentsSummary { -public: - uint64_t total_bytes; - uint64_t first_image_byte; - uint64_t last_image_byte; - explicit ExtentsSummary(const ExtentsType &extents); - friend std::ostream &operator<<(std::ostream &os, - const ExtentsSummary &s) { - os << "total_bytes=" << s.total_bytes << ", " - << "first_image_byte=" << s.first_image_byte << ", " - << "last_image_byte=" << s.last_image_byte << ""; - return os; - } - BlockExtent block_extent() { - return BlockExtent(first_image_byte, last_image_byte); - } - io::Extent image_extent() { - return librbd::cache::rwl::image_extent(block_extent()); - } -}; - -io::Extent whole_volume_extent(); - -BlockExtent block_extent(const io::Extent& image_extent); - -Context * override_ctx(int r, Context *ctx); - -class ImageExtentBuf : public io::Extent { -public: - bufferlist m_bl; - ImageExtentBuf(io::Extent extent) - : io::Extent(extent) { } - ImageExtentBuf(io::Extent extent, bufferlist bl) - : io::Extent(extent), m_bl(bl) { } -}; - -} // namespace rwl -} // namespace cache -} // namespace librbd - -#endif // CEPH_LIBRBD_CACHE_RWL_TYPES_H diff --git a/src/librbd/exclusive_lock/PostAcquireRequest.cc b/src/librbd/exclusive_lock/PostAcquireRequest.cc index 6c920cc8aa6a4..cf75d91f0e9be 100644 --- a/src/librbd/exclusive_lock/PostAcquireRequest.cc +++ b/src/librbd/exclusive_lock/PostAcquireRequest.cc @@ -7,8 +7,8 @@ #include "common/dout.h" #include "common/errno.h" #include "include/stringify.h" -#include "librbd/cache/rwl/InitRequest.h" -#include "librbd/cache/rwl/ShutdownRequest.h" +#include "librbd/cache/pwl/InitRequest.h" +#include "librbd/cache/pwl/ShutdownRequest.h" #include 
"librbd/ExclusiveLock.h" #include "librbd/ImageCtx.h" #include "librbd/ImageState.h" @@ -186,7 +186,7 @@ void PostAcquireRequest::send_open_image_cache() { Context *ctx = create_async_context_callback( m_image_ctx, create_context_callback< klass, &klass::handle_open_image_cache>(this)); - cache::rwl::InitRequest *req = cache::rwl::InitRequest::create( + cache::pwl::InitRequest *req = cache::pwl::InitRequest::create( m_image_ctx, ctx); req->send(); } @@ -216,7 +216,7 @@ void PostAcquireRequest::send_close_image_cache() { using klass = PostAcquireRequest; Context *ctx = create_context_callback( this); - cache::rwl::ShutdownRequest *req = cache::rwl::ShutdownRequest::create( + cache::pwl::ShutdownRequest *req = cache::pwl::ShutdownRequest::create( m_image_ctx, ctx); req->send(); } diff --git a/src/librbd/exclusive_lock/PreReleaseRequest.cc b/src/librbd/exclusive_lock/PreReleaseRequest.cc index fb1aa8322fb90..be5f7c2aa620e 100644 --- a/src/librbd/exclusive_lock/PreReleaseRequest.cc +++ b/src/librbd/exclusive_lock/PreReleaseRequest.cc @@ -5,7 +5,7 @@ #include "common/AsyncOpTracker.h" #include "common/dout.h" #include "common/errno.h" -#include "librbd/cache/rwl/ShutdownRequest.h" +#include "librbd/cache/pwl/ShutdownRequest.h" #include "librbd/ExclusiveLock.h" #include "librbd/ImageState.h" #include "librbd/ImageWatcher.h" @@ -174,7 +174,7 @@ void PreReleaseRequest::send_shut_down_image_cache() { Context *ctx = create_async_context_callback(m_image_ctx, create_context_callback< PreReleaseRequest, &PreReleaseRequest::handle_shut_down_image_cache>(this)); - cache::rwl::ShutdownRequest *req = cache::rwl::ShutdownRequest::create( + cache::pwl::ShutdownRequest *req = cache::pwl::ShutdownRequest::create( m_image_ctx, ctx); req->send(); } diff --git a/src/test/librbd/CMakeLists.txt b/src/test/librbd/CMakeLists.txt index ae6966d5daf95..7f18790dfc1e6 100644 --- a/src/test/librbd/CMakeLists.txt +++ b/src/test/librbd/CMakeLists.txt @@ -121,7 +121,7 @@ if(WITH_RBD_RWL) set(unittest_librbd_srcs ${unittest_librbd_srcs} cache/test_mock_ReplicatedWriteLog.cc - cache/rwl/test_WriteLogMap.cc) + cache/pwl/test_WriteLogMap.cc) endif(WITH_RBD_RWL) add_executable(unittest_librbd diff --git a/src/test/librbd/cache/pwl/test_WriteLogMap.cc b/src/test/librbd/cache/pwl/test_WriteLogMap.cc new file mode 100644 index 0000000000000..7263d0831ac88 --- /dev/null +++ b/src/test/librbd/cache/pwl/test_WriteLogMap.cc @@ -0,0 +1,336 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "test/librbd/test_fixture.h" +#include "test/librbd/test_support.h" + +#include "librbd/cache/pwl/LogMap.cc" + +void register_test_write_log_map() { +} + +namespace librbd { +namespace cache { +namespace pwl { + +struct TestLogEntry { + uint64_t image_offset_bytes; + uint64_t write_bytes; + uint32_t referring_map_entries = 0; + TestLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes) + : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes) { + } + uint64_t get_offset_bytes() { + return image_offset_bytes; + } + uint64_t get_write_bytes() { + return write_bytes; + } + BlockExtent block_extent() { + return BlockExtent(image_offset_bytes, image_offset_bytes + write_bytes); + } + uint32_t get_map_ref() { + return referring_map_entries; + } + void inc_map_ref() { + referring_map_entries++; + } + void dec_map_ref() { + referring_map_entries--; + } + friend std::ostream &operator<<(std::ostream &os, + const TestLogEntry &entry) { + os << "referring_map_entries=" << 
entry.referring_map_entries << ", " + << "image_offset_bytes=" << entry.image_offset_bytes << ", " + << "write_bytes=" << entry.write_bytes; + return os; + }; +}; + +typedef std::list> TestLogEntries; +typedef LogMapEntry TestMapEntry; +typedef LogMapEntries TestLogMapEntries; +typedef LogMap TestLogMap; + +class TestWriteLogMap : public TestFixture { +public: + void SetUp() override { + TestFixture::SetUp(); + m_cct = reinterpret_cast(m_ioctx.cct()); + } + + CephContext *m_cct; +}; + +TEST_F(TestWriteLogMap, Simple) { + TestLogEntries es; + TestLogMapEntries lme; + TestLogMap map(m_cct); + + /* LogEntry takes offset, length, in bytes */ + auto e1 = make_shared(4, 8); + TestLogEntry *e1_ptr = e1.get(); + ASSERT_EQ(4, e1_ptr->get_offset_bytes()); + ASSERT_EQ(8, e1_ptr->get_write_bytes()); + map.add_log_entry(e1); + + /* BlockExtent takes first, last, in blocks */ + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100)); + int numfound = found0.size(); + /* Written range includes the single write above */ + ASSERT_EQ(1, numfound); + ASSERT_EQ(e1, found0.front().log_entry); + + /* Nothing before that */ + found0 = map.find_map_entries(BlockExtent(0, 3)); + numfound = found0.size(); + ASSERT_EQ(0, numfound); + + /* Nothing after that */ + found0 = map.find_map_entries(BlockExtent(12, 99)); + numfound = found0.size(); + ASSERT_EQ(0, numfound); + + /* 4-11 will be e1 */ + for (int i=4; i<12; i++) { + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); + int numfound = found0.size(); + ASSERT_EQ(1, numfound); + ASSERT_EQ(e1, found0.front().log_entry); + } + + map.remove_log_entry(e1); + /* Nothing should be found */ + for (int i=4; i<12; i++) { + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); + int numfound = found0.size(); + ASSERT_EQ(0, numfound); + } +} + +TEST_F(TestWriteLogMap, OverlapFront) { + TestLogMap map(m_cct); + + auto e0 = make_shared(4, 8); + map.add_log_entry(e0); + /* replaces block 4-7 of e0 */ + auto e1 = make_shared(0, 8); + map.add_log_entry(e1); + + /* Written range includes the two writes above */ + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100)); + int numfound = found0.size(); + ASSERT_EQ(2, numfound); + ASSERT_EQ(e1, found0.front().log_entry); + ASSERT_EQ(0, found0.front().block_extent.block_start); + ASSERT_EQ(8, found0.front().block_extent.block_end); + found0.pop_front(); + ASSERT_EQ(e0, found0.front().log_entry); + ASSERT_EQ(8, found0.front().block_extent.block_start); + ASSERT_EQ(12, found0.front().block_extent.block_end); + + /* 0-7 will be e1 */ + for (int i=0; i<8; i++) { + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); + int numfound = found0.size(); + ASSERT_EQ(1, numfound); + ASSERT_EQ(e1, found0.front().log_entry); + } + + /* 8-11 will be e0 */ + for (int i=8; i<12; i++) { + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); + int numfound = found0.size(); + ASSERT_EQ(1, numfound); + ASSERT_EQ(e0, found0.front().log_entry); + } +} + +TEST_F(TestWriteLogMap, OverlapBack) { + TestLogMap map(m_cct); + + auto e0 = make_shared(0, 8); + map.add_log_entry(e0); + /* replaces block 4-7 of e0 */ + auto e1 = make_shared(4, 8); + map.add_log_entry(e1); + + /* Written range includes the two writes above */ + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100)); + int numfound = found0.size(); + ASSERT_EQ(2, numfound); + ASSERT_EQ(e0, found0.front().log_entry); + ASSERT_EQ(0, found0.front().block_extent.block_start); + ASSERT_EQ(4, 
found0.front().block_extent.block_end); + found0.pop_front(); + ASSERT_EQ(e1, found0.front().log_entry); + ASSERT_EQ(4, found0.front().block_extent.block_start); + ASSERT_EQ(12, found0.front().block_extent.block_end); + + /* 0-3 will be e0 */ + for (int i=0; i<4; i++) { + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); + int numfound = found0.size(); + ASSERT_EQ(1, numfound); + ASSERT_EQ(e0, found0.front().log_entry); + } + + /* 4-11 will be e1 */ + for (int i=4; i<12; i++) { + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); + int numfound = found0.size(); + ASSERT_EQ(1, numfound); + ASSERT_EQ(e1, found0.front().log_entry); + } + + map.remove_log_entry(e0); + + /* 0-3 will find nothing */ + for (int i=0; i<4; i++) { + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); + int numfound = found0.size(); + ASSERT_EQ(0, numfound); + } + + /* 4-11 will still be e1 */ + for (int i=4; i<12; i++) { + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); + int numfound = found0.size(); + ASSERT_EQ(1, numfound); + ASSERT_EQ(e1, found0.front().log_entry); + } + +} + +TEST_F(TestWriteLogMap, OverlapMiddle) { + TestLogMap map(m_cct); + + auto e0 = make_shared(0, 1); + map.add_log_entry(e0); + + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 1)); + int numfound = found0.size(); + ASSERT_EQ(1, numfound); + ASSERT_EQ(e0, found0.front().log_entry); + TestLogEntries entries = map.find_log_entries(BlockExtent(0, 1)); + int entriesfound = entries.size(); + ASSERT_EQ(1, entriesfound); + ASSERT_EQ(e0, entries.front()); + + auto e1 = make_shared(1, 1); + map.add_log_entry(e1); + + found0 = map.find_map_entries(BlockExtent(1, 2)); + numfound = found0.size(); + ASSERT_EQ(1, numfound); + ASSERT_EQ(e1, found0.front().log_entry); + entries = map.find_log_entries(BlockExtent(1, 2)); + entriesfound = entries.size(); + ASSERT_EQ(1, entriesfound); + ASSERT_EQ(e1, entries.front()); + + auto e2 = make_shared(2, 1); + map.add_log_entry(e2); + + found0 = map.find_map_entries(BlockExtent(2, 3)); + numfound = found0.size(); + ASSERT_EQ(1, numfound); + ASSERT_EQ(e2, found0.front().log_entry); + entries = map.find_log_entries(BlockExtent(2, 3)); + entriesfound = entries.size(); + ASSERT_EQ(1, entriesfound); + ASSERT_EQ(e2, entries.front()); + + /* replaces e1 */ + auto e3 = make_shared(1, 1); + map.add_log_entry(e3); + + found0 = map.find_map_entries(BlockExtent(1, 2)); + numfound = found0.size(); + ASSERT_EQ(1, numfound); + ASSERT_EQ(e3, found0.front().log_entry); + entries = map.find_log_entries(BlockExtent(1, 2)); + entriesfound = entries.size(); + ASSERT_EQ(1, entriesfound); + ASSERT_EQ(e3, entries.front()); + + found0 = map.find_map_entries(BlockExtent(0, 100)); + numfound = found0.size(); + ASSERT_EQ(3, numfound); + ASSERT_EQ(e0, found0.front().log_entry); + found0.pop_front(); + ASSERT_EQ(e3, found0.front().log_entry); + found0.pop_front(); + ASSERT_EQ(e2, found0.front().log_entry); + entries = map.find_log_entries(BlockExtent(0, 100)); + entriesfound = entries.size(); + ASSERT_EQ(3, entriesfound); + ASSERT_EQ(e0, entries.front()); + entries.pop_front(); + ASSERT_EQ(e3, entries.front()); + entries.pop_front(); + ASSERT_EQ(e2, entries.front()); + + entries.clear(); + entries.emplace_back(e0); + entries.emplace_back(e1); + map.remove_log_entries(entries); + + found0 = map.find_map_entries(BlockExtent(0, 100)); + numfound = found0.size(); + ASSERT_EQ(2, numfound); + ASSERT_EQ(e3, found0.front().log_entry); + 
found0.pop_front(); + ASSERT_EQ(e2, found0.front().log_entry); +} + +TEST_F(TestWriteLogMap, OverlapSplit) { + TestLogMap map(m_cct); + + auto e0 = make_shared(0, 8); + map.add_log_entry(e0); + + /* Splits e0 at 1 */ + auto e1 = make_shared(1, 1); + map.add_log_entry(e1); + + /* Splits e0 again at 4 */ + auto e2 = make_shared(4, 2); + map.add_log_entry(e2); + + /* Replaces one block of e2, and one of e0 */ + auto e3 = make_shared(5, 2); + map.add_log_entry(e3); + + /* Expecting: 0:e0, 1:e1, 2..3:e0, 4:e2, 5..6:e3, 7:e0 */ + TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100)); + int numfound = found0.size(); + ASSERT_EQ(6, numfound); + ASSERT_EQ(e0, found0.front().log_entry); + ASSERT_EQ(0, found0.front().block_extent.block_start); + ASSERT_EQ(1, found0.front().block_extent.block_end); + found0.pop_front(); + ASSERT_EQ(e1, found0.front().log_entry); + ASSERT_EQ(1, found0.front().block_extent.block_start); + ASSERT_EQ(2, found0.front().block_extent.block_end); + found0.pop_front(); + ASSERT_EQ(e0, found0.front().log_entry); + ASSERT_EQ(2, found0.front().block_extent.block_start); + ASSERT_EQ(4, found0.front().block_extent.block_end); + found0.pop_front(); + ASSERT_EQ(e2, found0.front().log_entry); + ASSERT_EQ(4, found0.front().block_extent.block_start); + ASSERT_EQ(5, found0.front().block_extent.block_end); + found0.pop_front(); + ASSERT_EQ(e3, found0.front().log_entry); + ASSERT_EQ(5, found0.front().block_extent.block_start); + ASSERT_EQ(7, found0.front().block_extent.block_end); + found0.pop_front(); + ASSERT_EQ(e0, found0.front().log_entry); + ASSERT_EQ(7, found0.front().block_extent.block_start); + ASSERT_EQ(8, found0.front().block_extent.block_end); +} + +} // namespace pwl +} // namespace cache +} // namespace librbd diff --git a/src/test/librbd/cache/rwl/test_WriteLogMap.cc b/src/test/librbd/cache/rwl/test_WriteLogMap.cc deleted file mode 100644 index 1fc2cfd42e03e..0000000000000 --- a/src/test/librbd/cache/rwl/test_WriteLogMap.cc +++ /dev/null @@ -1,336 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "test/librbd/test_fixture.h" -#include "test/librbd/test_support.h" - -#include "librbd/cache/rwl/LogMap.cc" - -void register_test_write_log_map() { -} - -namespace librbd { -namespace cache { -namespace rwl { - -struct TestLogEntry { - uint64_t image_offset_bytes; - uint64_t write_bytes; - uint32_t referring_map_entries = 0; - TestLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes) - : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes) { - } - uint64_t get_offset_bytes() { - return image_offset_bytes; - } - uint64_t get_write_bytes() { - return write_bytes; - } - BlockExtent block_extent() { - return BlockExtent(image_offset_bytes, image_offset_bytes + write_bytes); - } - uint32_t get_map_ref() { - return referring_map_entries; - } - void inc_map_ref() { - referring_map_entries++; - } - void dec_map_ref() { - referring_map_entries--; - } - friend std::ostream &operator<<(std::ostream &os, - const TestLogEntry &entry) { - os << "referring_map_entries=" << entry.referring_map_entries << ", " - << "image_offset_bytes=" << entry.image_offset_bytes << ", " - << "write_bytes=" << entry.write_bytes; - return os; - }; -}; - -typedef std::list> TestLogEntries; -typedef LogMapEntry TestMapEntry; -typedef LogMapEntries TestLogMapEntries; -typedef LogMap TestLogMap; - -class TestWriteLogMap : public TestFixture { -public: - void SetUp() override { - TestFixture::SetUp(); - 
m_cct = reinterpret_cast(m_ioctx.cct()); - } - - CephContext *m_cct; -}; - -TEST_F(TestWriteLogMap, Simple) { - TestLogEntries es; - TestLogMapEntries lme; - TestLogMap map(m_cct); - - /* LogEntry takes offset, length, in bytes */ - auto e1 = make_shared(4, 8); - TestLogEntry *e1_ptr = e1.get(); - ASSERT_EQ(4, e1_ptr->get_offset_bytes()); - ASSERT_EQ(8, e1_ptr->get_write_bytes()); - map.add_log_entry(e1); - - /* BlockExtent takes first, last, in blocks */ - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100)); - int numfound = found0.size(); - /* Written range includes the single write above */ - ASSERT_EQ(1, numfound); - ASSERT_EQ(e1, found0.front().log_entry); - - /* Nothing before that */ - found0 = map.find_map_entries(BlockExtent(0, 3)); - numfound = found0.size(); - ASSERT_EQ(0, numfound); - - /* Nothing after that */ - found0 = map.find_map_entries(BlockExtent(12, 99)); - numfound = found0.size(); - ASSERT_EQ(0, numfound); - - /* 4-11 will be e1 */ - for (int i=4; i<12; i++) { - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); - int numfound = found0.size(); - ASSERT_EQ(1, numfound); - ASSERT_EQ(e1, found0.front().log_entry); - } - - map.remove_log_entry(e1); - /* Nothing should be found */ - for (int i=4; i<12; i++) { - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); - int numfound = found0.size(); - ASSERT_EQ(0, numfound); - } -} - -TEST_F(TestWriteLogMap, OverlapFront) { - TestLogMap map(m_cct); - - auto e0 = make_shared(4, 8); - map.add_log_entry(e0); - /* replaces block 4-7 of e0 */ - auto e1 = make_shared(0, 8); - map.add_log_entry(e1); - - /* Written range includes the two writes above */ - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100)); - int numfound = found0.size(); - ASSERT_EQ(2, numfound); - ASSERT_EQ(e1, found0.front().log_entry); - ASSERT_EQ(0, found0.front().block_extent.block_start); - ASSERT_EQ(8, found0.front().block_extent.block_end); - found0.pop_front(); - ASSERT_EQ(e0, found0.front().log_entry); - ASSERT_EQ(8, found0.front().block_extent.block_start); - ASSERT_EQ(12, found0.front().block_extent.block_end); - - /* 0-7 will be e1 */ - for (int i=0; i<8; i++) { - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); - int numfound = found0.size(); - ASSERT_EQ(1, numfound); - ASSERT_EQ(e1, found0.front().log_entry); - } - - /* 8-11 will be e0 */ - for (int i=8; i<12; i++) { - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); - int numfound = found0.size(); - ASSERT_EQ(1, numfound); - ASSERT_EQ(e0, found0.front().log_entry); - } -} - -TEST_F(TestWriteLogMap, OverlapBack) { - TestLogMap map(m_cct); - - auto e0 = make_shared(0, 8); - map.add_log_entry(e0); - /* replaces block 4-7 of e0 */ - auto e1 = make_shared(4, 8); - map.add_log_entry(e1); - - /* Written range includes the two writes above */ - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100)); - int numfound = found0.size(); - ASSERT_EQ(2, numfound); - ASSERT_EQ(e0, found0.front().log_entry); - ASSERT_EQ(0, found0.front().block_extent.block_start); - ASSERT_EQ(4, found0.front().block_extent.block_end); - found0.pop_front(); - ASSERT_EQ(e1, found0.front().log_entry); - ASSERT_EQ(4, found0.front().block_extent.block_start); - ASSERT_EQ(12, found0.front().block_extent.block_end); - - /* 0-3 will be e0 */ - for (int i=0; i<4; i++) { - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); - int numfound = found0.size(); - ASSERT_EQ(1, numfound); - 
ASSERT_EQ(e0, found0.front().log_entry); - } - - /* 4-11 will be e1 */ - for (int i=4; i<12; i++) { - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); - int numfound = found0.size(); - ASSERT_EQ(1, numfound); - ASSERT_EQ(e1, found0.front().log_entry); - } - - map.remove_log_entry(e0); - - /* 0-3 will find nothing */ - for (int i=0; i<4; i++) { - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); - int numfound = found0.size(); - ASSERT_EQ(0, numfound); - } - - /* 4-11 will still be e1 */ - for (int i=4; i<12; i++) { - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(i, i + 1)); - int numfound = found0.size(); - ASSERT_EQ(1, numfound); - ASSERT_EQ(e1, found0.front().log_entry); - } - -} - -TEST_F(TestWriteLogMap, OverlapMiddle) { - TestLogMap map(m_cct); - - auto e0 = make_shared(0, 1); - map.add_log_entry(e0); - - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 1)); - int numfound = found0.size(); - ASSERT_EQ(1, numfound); - ASSERT_EQ(e0, found0.front().log_entry); - TestLogEntries entries = map.find_log_entries(BlockExtent(0, 1)); - int entriesfound = entries.size(); - ASSERT_EQ(1, entriesfound); - ASSERT_EQ(e0, entries.front()); - - auto e1 = make_shared(1, 1); - map.add_log_entry(e1); - - found0 = map.find_map_entries(BlockExtent(1, 2)); - numfound = found0.size(); - ASSERT_EQ(1, numfound); - ASSERT_EQ(e1, found0.front().log_entry); - entries = map.find_log_entries(BlockExtent(1, 2)); - entriesfound = entries.size(); - ASSERT_EQ(1, entriesfound); - ASSERT_EQ(e1, entries.front()); - - auto e2 = make_shared(2, 1); - map.add_log_entry(e2); - - found0 = map.find_map_entries(BlockExtent(2, 3)); - numfound = found0.size(); - ASSERT_EQ(1, numfound); - ASSERT_EQ(e2, found0.front().log_entry); - entries = map.find_log_entries(BlockExtent(2, 3)); - entriesfound = entries.size(); - ASSERT_EQ(1, entriesfound); - ASSERT_EQ(e2, entries.front()); - - /* replaces e1 */ - auto e3 = make_shared(1, 1); - map.add_log_entry(e3); - - found0 = map.find_map_entries(BlockExtent(1, 2)); - numfound = found0.size(); - ASSERT_EQ(1, numfound); - ASSERT_EQ(e3, found0.front().log_entry); - entries = map.find_log_entries(BlockExtent(1, 2)); - entriesfound = entries.size(); - ASSERT_EQ(1, entriesfound); - ASSERT_EQ(e3, entries.front()); - - found0 = map.find_map_entries(BlockExtent(0, 100)); - numfound = found0.size(); - ASSERT_EQ(3, numfound); - ASSERT_EQ(e0, found0.front().log_entry); - found0.pop_front(); - ASSERT_EQ(e3, found0.front().log_entry); - found0.pop_front(); - ASSERT_EQ(e2, found0.front().log_entry); - entries = map.find_log_entries(BlockExtent(0, 100)); - entriesfound = entries.size(); - ASSERT_EQ(3, entriesfound); - ASSERT_EQ(e0, entries.front()); - entries.pop_front(); - ASSERT_EQ(e3, entries.front()); - entries.pop_front(); - ASSERT_EQ(e2, entries.front()); - - entries.clear(); - entries.emplace_back(e0); - entries.emplace_back(e1); - map.remove_log_entries(entries); - - found0 = map.find_map_entries(BlockExtent(0, 100)); - numfound = found0.size(); - ASSERT_EQ(2, numfound); - ASSERT_EQ(e3, found0.front().log_entry); - found0.pop_front(); - ASSERT_EQ(e2, found0.front().log_entry); -} - -TEST_F(TestWriteLogMap, OverlapSplit) { - TestLogMap map(m_cct); - - auto e0 = make_shared(0, 8); - map.add_log_entry(e0); - - /* Splits e0 at 1 */ - auto e1 = make_shared(1, 1); - map.add_log_entry(e1); - - /* Splits e0 again at 4 */ - auto e2 = make_shared(4, 2); - map.add_log_entry(e2); - - /* Replaces one block of e2, and one of e0 */ - auto e3 = 
make_shared(5, 2); - map.add_log_entry(e3); - - /* Expecting: 0:e0, 1:e1, 2..3:e0, 4:e2, 5..6:e3, 7:e0 */ - TestLogMapEntries found0 = map.find_map_entries(BlockExtent(0, 100)); - int numfound = found0.size(); - ASSERT_EQ(6, numfound); - ASSERT_EQ(e0, found0.front().log_entry); - ASSERT_EQ(0, found0.front().block_extent.block_start); - ASSERT_EQ(1, found0.front().block_extent.block_end); - found0.pop_front(); - ASSERT_EQ(e1, found0.front().log_entry); - ASSERT_EQ(1, found0.front().block_extent.block_start); - ASSERT_EQ(2, found0.front().block_extent.block_end); - found0.pop_front(); - ASSERT_EQ(e0, found0.front().log_entry); - ASSERT_EQ(2, found0.front().block_extent.block_start); - ASSERT_EQ(4, found0.front().block_extent.block_end); - found0.pop_front(); - ASSERT_EQ(e2, found0.front().log_entry); - ASSERT_EQ(4, found0.front().block_extent.block_start); - ASSERT_EQ(5, found0.front().block_extent.block_end); - found0.pop_front(); - ASSERT_EQ(e3, found0.front().log_entry); - ASSERT_EQ(5, found0.front().block_extent.block_start); - ASSERT_EQ(7, found0.front().block_extent.block_end); - found0.pop_front(); - ASSERT_EQ(e0, found0.front().log_entry); - ASSERT_EQ(7, found0.front().block_extent.block_start); - ASSERT_EQ(8, found0.front().block_extent.block_end); -} - -} // namespace rwl -} // namespace cache -} // namespace librbd diff --git a/src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc b/src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc index 717b19b938019..4e9f8065ae4c4 100644 --- a/src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc +++ b/src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc @@ -7,8 +7,8 @@ #include "test/librbd/test_support.h" #include "test/librbd/mock/MockImageCtx.h" #include "include/rbd/librbd.hpp" -#include "librbd/cache/rwl/ImageCacheState.h" -#include "librbd/cache/rwl/Types.h" +#include "librbd/cache/pwl/ImageCacheState.h" +#include "librbd/cache/pwl/Types.h" #include "librbd/cache/ImageWriteback.h" #include "librbd/cache/WriteLogCache.h" @@ -37,13 +37,13 @@ inline ImageCtx *get_image_ctx(MockImageCtx *image_ctx) { } // namespace librbd #include "librbd/cache/WriteLogCache.cc" -#include "librbd/cache/AbstractWriteLog.cc" -#include "librbd/cache/ReplicatedWriteLog.cc" +#include "librbd/cache/pwl/AbstractWriteLog.cc" +#include "librbd/cache/pwl/ReplicatedWriteLog.cc" // template definitions #include "librbd/cache/ImageWriteback.cc" -#include "librbd/cache/rwl/ImageCacheState.cc" -#include "librbd/cache/rwl/Request.cc" +#include "librbd/cache/pwl/ImageCacheState.cc" +#include "librbd/cache/pwl/Request.cc" namespace librbd { namespace cache { @@ -55,7 +55,7 @@ using ::testing::Invoke; struct TestMockCacheReplicatedWriteLog : public TestMockFixture { typedef WriteLogCache MockReplicatedWriteLog; - typedef librbd::cache::rwl::ImageCacheState MockImageCacheStateRWL; + typedef librbd::cache::pwl::ImageCacheState MockImageCacheStateRWL; MockImageCacheStateRWL *get_cache_state(MockImageCtx& mock_image_ctx) { MockImageCacheStateRWL *rwl_state = new MockImageCacheStateRWL(&mock_image_ctx); diff --git a/src/test/librbd/exclusive_lock/test_mock_PostAcquireRequest.cc b/src/test/librbd/exclusive_lock/test_mock_PostAcquireRequest.cc index 9379e3ecf8955..c0e2a0ecf7307 100644 --- a/src/test/librbd/exclusive_lock/test_mock_PostAcquireRequest.cc +++ b/src/test/librbd/exclusive_lock/test_mock_PostAcquireRequest.cc @@ -11,8 +11,8 @@ #include "test/librbd/mock/MockObjectMap.h" #include "test/librados_test_stub/MockTestMemIoCtxImpl.h" #include 
"test/librados_test_stub/MockTestMemRadosClient.h" -#include "librbd/cache/rwl/InitRequest.h" -#include "librbd/cache/rwl/ShutdownRequest.h" +#include "librbd/cache/pwl/InitRequest.h" +#include "librbd/cache/pwl/ShutdownRequest.h" #include "librbd/exclusive_lock/PostAcquireRequest.h" #include "librbd/image/RefreshRequest.h" @@ -63,7 +63,7 @@ RefreshRequest *RefreshRequest struct InitRequest { @@ -105,7 +105,7 @@ struct ShutdownRequest { ShutdownRequest *ShutdownRequest::s_instance = nullptr; -} // namespace rwl +} // namespace pwl } // namespace cache } // namespace librbd @@ -138,8 +138,8 @@ class TestMockExclusiveLockPostAcquireRequest : public TestMockFixture { public: typedef PostAcquireRequest MockPostAcquireRequest; typedef librbd::image::RefreshRequest MockRefreshRequest; - typedef librbd::cache::rwl::InitRequest MockInitRequest; - typedef librbd::cache::rwl::ShutdownRequest MockShutdownRequest; + typedef librbd::cache::pwl::InitRequest MockInitRequest; + typedef librbd::cache::pwl::ShutdownRequest MockShutdownRequest; void expect_test_features(MockTestImageCtx &mock_image_ctx, uint64_t features, bool enabled) { diff --git a/src/test/librbd/exclusive_lock/test_mock_PreReleaseRequest.cc b/src/test/librbd/exclusive_lock/test_mock_PreReleaseRequest.cc index d56ee1155f625..8c9c1fa44e35d 100644 --- a/src/test/librbd/exclusive_lock/test_mock_PreReleaseRequest.cc +++ b/src/test/librbd/exclusive_lock/test_mock_PreReleaseRequest.cc @@ -1,7 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#include "librbd/cache/rwl/ShutdownRequest.h" +#include "librbd/cache/pwl/ShutdownRequest.h" #include "test/librbd/test_mock_fixture.h" #include "test/librbd/test_support.h" #include "test/librbd/mock/cache/MockImageCache.h" @@ -40,7 +40,7 @@ struct ImageDispatch { } // namespace exclusive_lock namespace cache { -namespace rwl { +namespace pwl { template<> struct ShutdownRequest { static ShutdownRequest *s_instance; @@ -61,7 +61,7 @@ struct ShutdownRequest { ShutdownRequest *ShutdownRequest::s_instance = nullptr; -} // namespace rwl +} // namespace pwl } // namespace cache } // namespace librbd @@ -93,7 +93,7 @@ class TestMockExclusiveLockPreReleaseRequest : public TestMockFixture { public: typedef ImageDispatch MockImageDispatch; typedef PreReleaseRequest MockPreReleaseRequest; - typedef librbd::cache::rwl::ShutdownRequest MockShutdownRequest; + typedef librbd::cache::pwl::ShutdownRequest MockShutdownRequest; void expect_complete_context(MockContext &mock_context, int r) { EXPECT_CALL(mock_context, complete(r));