.set_min(0)
.set_description("maximum io delay (in milliseconds) for simple io scheduler (if set to 0 dalay is calculated based on latency stats)"),
- Option("rbd_rwl_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
- .set_default(false)
+ Option("rbd_persistent_cache_mode", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("disabled")
+ .set_enum_allowed({"disabled", "rwl", "ssd"})
.set_description("enable persistent write back cache for this volume"),
Option("rbd_rwl_log_periodic_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
add_library(rbd_types STATIC
${librbd_types_srcs})
-if (WITH_RBD_RWL AND WITH_RBD_SSD_CACHE)
+if (WITH_RBD_RWL)
target_link_libraries(rbd_types
PRIVATE pmem::pmemobj)
endif()
if(WITH_RBD_SSD_CACHE)
set(rbd_plugin_pwl_srcs
${rbd_plugin_pwl_srcs}
- cache/pwl/SSDWriteLog.cc)
+ cache/pwl/ssd/LogEntry.cc
+ cache/pwl/ssd/Request.cc
+ cache/pwl/ssd/WriteLog.cc)
endif()
if(WITH_RBD_RWL)
set(rbd_plugin_pwl_srcs
${rbd_plugin_pwl_srcs}
- cache/pwl/ReplicatedWriteLog.cc)
+ cache/pwl/rwl/WriteLog.cc
+ cache/pwl/rwl/LogEntry.cc
+ cache/pwl/rwl/LogOperation.cc
+ cache/pwl/rwl/Request.cc)
endif()
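# Both back ends are compiled into the single librbd_plugin_pwl_cache plugin
# defined below; the active mode is chosen at run time via
# rbd_persistent_cache_mode.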
add_library(librbd_plugin_pwl_cache SHARED
enum ImageCacheType {
IMAGE_CACHE_TYPE_RWL = 1,
IMAGE_CACHE_TYPE_SSD,
+ IMAGE_CACHE_TYPE_UNKNOWN
};
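+// The rbd_persistent_cache_mode string is mapped onto these values by
+// ImageCacheState<I>::get_image_cache_type(); any unrecognized mode
+// (including "disabled") maps to IMAGE_CACHE_TYPE_UNKNOWN.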
typedef std::list<Context *> Contexts;
#define CEPH_LIBRBD_CACHE_UTILS_H
#include "acconfig.h"
+#include <string>
class Context;
template <typename T>
bool is_pwl_enabled(T& image_ctx) {
-#if defined(WITH_RBD_RWL)
- return image_ctx.config.template get_val<bool>("rbd_rwl_enabled");
+#if defined(WITH_RBD_RWL) || defined(WITH_RBD_SSD_CACHE)
+ auto value = image_ctx.config.template get_val<std::string>("rbd_persistent_cache_mode");
+ return value != "disabled";
#else
return false;
-#endif // WITH_RBD_RWL
+#endif // WITH_RBD_RWL || WITH_RBD_SSD_CACHE
#undef dout_subsys
#define dout_subsys ceph_subsys_rbd_pwl
#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::pwl::AbstractWriteLog: " << this << " " \
- << __func__ << ": "
+#define dout_prefix *_dout << "librbd::cache::pwl::AbstractWriteLog: " << this \
+ << " " << __func__ << ": "
namespace librbd {
namespace cache {
typedef AbstractWriteLog<ImageCtx>::Extents Extents;
template <typename I>
-AbstractWriteLog<I>::AbstractWriteLog(I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
- cache::ImageWritebackInterface& image_writeback,
+AbstractWriteLog<I>::AbstractWriteLog(
+ I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
+ Builder<This> *builder, cache::ImageWritebackInterface& image_writeback,
plugin::Api<I>& plugin_api)
- : m_write_log_guard(image_ctx.cct),
+ : m_builder(builder),
+ m_write_log_guard(image_ctx.cct),
m_deferred_dispatch_lock(ceph::make_mutex(pwl::unique_lock_name(
"librbd::cache::pwl::AbstractWriteLog::m_deferred_dispatch_lock", this))),
m_blockguard_lock(ceph::make_mutex(pwl::unique_lock_name(
"librbd::cache::pwl::AbstractWriteLog::m_blockguard_lock", this))),
m_thread_pool(
- image_ctx.cct, "librbd::cache::pwl::AbstractWriteLog::thread_pool", "tp_pwl", 4, ""),
+ image_ctx.cct, "librbd::cache::pwl::AbstractWriteLog::thread_pool",
+ "tp_pwl", 4, ""),
m_cache_state(cache_state),
m_image_ctx(image_ctx),
m_log_pool_config_size(DEFAULT_POOL_SIZE),
template <typename I>
void AbstractWriteLog<I>::perf_start(std::string name) {
- PerfCountersBuilder plb(m_image_ctx.cct, name, l_librbd_pwl_first, l_librbd_pwl_last);
+ PerfCountersBuilder plb(m_image_ctx.cct, name, l_librbd_pwl_first,
+ l_librbd_pwl_last);
// Latency axis configuration for op histograms, values are in nanoseconds
PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
template <typename I>
void AbstractWriteLog<I>::update_entries(std::shared_ptr<GenericLogEntry> log_entry,
- WriteLogPmemEntry *pmem_entry, std::map<uint64_t, bool> &missing_sync_points,
+ WriteLogCacheEntry *cache_entry, std::map<uint64_t, bool> &missing_sync_points,
std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> &sync_point_entries,
int entry_index) {
- bool writer = pmem_entry->is_writer();
- if (pmem_entry->is_sync_point()) {
+ bool writer = cache_entry->is_writer();
+ if (cache_entry->is_sync_point()) {
ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
- << " is a sync point. pmem_entry=[" << *pmem_entry << "]" << dendl;
- auto sync_point_entry = std::make_shared<SyncPointLogEntry>(pmem_entry->sync_gen_number);
+ << " is a sync point. cache_entry=[" << *cache_entry << "]" << dendl;
+ auto sync_point_entry = std::make_shared<SyncPointLogEntry>(cache_entry->sync_gen_number);
log_entry = sync_point_entry;
- sync_point_entries[pmem_entry->sync_gen_number] = sync_point_entry;
- missing_sync_points.erase(pmem_entry->sync_gen_number);
- m_current_sync_gen = pmem_entry->sync_gen_number;
- } else if (pmem_entry->is_write()) {
+ sync_point_entries[cache_entry->sync_gen_number] = sync_point_entry;
+ missing_sync_points.erase(cache_entry->sync_gen_number);
+ m_current_sync_gen = cache_entry->sync_gen_number;
+ } else if (cache_entry->is_write()) {
ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
- << " is a write. pmem_entry=[" << *pmem_entry << "]" << dendl;
+ << " is a write. cache_entry=[" << *cache_entry << "]" << dendl;
auto write_entry =
- std::make_shared<WriteLogEntry>(nullptr, pmem_entry->image_offset_bytes, pmem_entry->write_bytes);
- write_data_to_buffer(write_entry, pmem_entry);
+ m_builder->create_write_log_entry(nullptr, cache_entry->image_offset_bytes, cache_entry->write_bytes);
+ write_data_to_buffer(write_entry, cache_entry);
log_entry = write_entry;
- } else if (pmem_entry->is_writesame()) {
+ } else if (cache_entry->is_writesame()) {
ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
- << " is a write same. pmem_entry=[" << *pmem_entry << "]" << dendl;
+ << " is a write same. cache_entry=[" << *cache_entry << "]" << dendl;
auto ws_entry =
- std::make_shared<WriteSameLogEntry>(nullptr, pmem_entry->image_offset_bytes,
- pmem_entry->write_bytes, pmem_entry->ws_datalen);
- write_data_to_buffer(ws_entry, pmem_entry);
+ m_builder->create_writesame_log_entry(nullptr, cache_entry->image_offset_bytes,
+ cache_entry->write_bytes, cache_entry->ws_datalen);
+ write_data_to_buffer(ws_entry, cache_entry);
log_entry = ws_entry;
- } else if (pmem_entry->is_discard()) {
+ } else if (cache_entry->is_discard()) {
ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
- << " is a discard. pmem_entry=[" << *pmem_entry << "]" << dendl;
+ << " is a discard. cache_entry=[" << *cache_entry << "]" << dendl;
auto discard_entry =
- std::make_shared<DiscardLogEntry>(nullptr, pmem_entry->image_offset_bytes, pmem_entry->write_bytes,
+ std::make_shared<DiscardLogEntry>(nullptr, cache_entry->image_offset_bytes, cache_entry->write_bytes,
m_discard_granularity_bytes);
log_entry = discard_entry;
} else {
lderr(m_image_ctx.cct) << "Unexpected entry type in entry " << entry_index
- << ", pmem_entry=[" << *pmem_entry << "]" << dendl;
+ << ", cache_entry=[" << *cache_entry << "]" << dendl;
}
if (writer) {
ldout(m_image_ctx.cct, 20) << "Entry " << entry_index
- << " writes. pmem_entry=[" << *pmem_entry << "]" << dendl;
- if (!sync_point_entries[pmem_entry->sync_gen_number]) {
- missing_sync_points[pmem_entry->sync_gen_number] = true;
+ << " writes. cache_entry=[" << *cache_entry << "]" << dendl;
+ if (!sync_point_entries[cache_entry->sync_gen_number]) {
+ missing_sync_points[cache_entry->sync_gen_number] = true;
}
}
}
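+/* update_entries() is invoked for each stored log entry while replaying an
+ * existing cache pool. Entry objects are now obtained through the Builder,
+ * so the pmem (rwl) and ssd back ends can materialize their own
+ * WriteLogEntry subclasses from the shared WriteLogCacheEntry layout. */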
template <typename I>
void AbstractWriteLog<I>::update_sync_points(std::map<uint64_t, bool> &missing_sync_points,
std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> &sync_point_entries,
- DeferredContexts &later) {
+ DeferredContexts &later, uint32_t alloc_size ) {
/* Create missing sync points. These must not be appended until the
* entry reload is complete and the write map is up to
* date. Currently this is handled by the deferred contexts object
}
if (log_entry->write_bytes() == log_entry->bytes_dirty()) {
/* This entry is a basic write */
- uint64_t bytes_allocated = MIN_WRITE_ALLOC_SIZE;
+ uint64_t bytes_allocated = alloc_size;
if (gen_write_entry->ram_entry.write_bytes > bytes_allocated) {
bytes_allocated = gen_write_entry->ram_entry.write_bytes;
}
uint64_t map_entry_buffer_offset = entry_image_extent.first - map_entry.log_entry->ram_entry.image_offset_bytes;
/* Offset into the log entry buffer of this read hit */
uint64_t read_buffer_offset = map_entry_buffer_offset + entry_offset;
- /* Create buffer object referring to pmem pool for this read hit */
+ /* Create buffer object referring to cache pool for this read hit */
auto write_entry = map_entry.log_entry;
-/* Make a bl for this hit extent. This will add references to the write_entry->pmem_bp */
+/* Make a bl for this hit extent. This will add references to the
+ * write_entry->cache_bp */
buffer::list hit_bl;
buffer::list entry_bl_copy;
- write_entry->copy_pmem_bl(&entry_bl_copy);
+ write_entry->copy_cache_bl(&entry_bl_copy);
entry_bl_copy.begin(read_buffer_offset).copy(entry_hit_length, hit_bl);
ceph_assert(hit_bl.length() == entry_hit_length);
ceph_assert(m_initialized);
- auto *write_req =
- new C_WriteRequestT(*this, now, std::move(image_extents), std::move(bl), fadvise_flags,
- m_lock, m_perfcounter, on_finish);
+ C_WriteRequestT *write_req =
+ m_builder->create_write_request(*this, now, std::move(image_extents), std::move(bl),
+ fadvise_flags, m_lock, m_perfcounter, on_finish);
m_perfcounter->inc(l_librbd_pwl_wr_bytes, write_req->image_extents_summary.total_bytes);
/* The lambda below will be called when the block guard for all
* as long as the length of the bl here, which is the pattern that's repeated
* in the image for the entire length of this WS. Read hits and flushing of
* write sames are different than normal writes. */
- auto *ws_req =
- new C_WriteSameRequestT(*this, now, std::move(ws_extents), std::move(bl),
- fadvise_flags, m_lock, m_perfcounter, on_finish);
+ C_WriteSameRequestT *ws_req =
+ m_builder->create_writesame_request(*this, now, std::move(ws_extents), std::move(bl),
+ fadvise_flags, m_lock, m_perfcounter, on_finish);
m_perfcounter->inc(l_librbd_pwl_ws_bytes, ws_req->image_extents_summary.total_bytes);
/* The lambda below will be called when the block guard for all
/* A compare and write request is also a write request. We only allocate
* resources and dispatch this write request if the compare phase
* succeeds. */
- auto *cw_req =
- new C_CompAndWriteRequestT(*this, now, std::move(image_extents), std::move(cmp_bl), std::move(bl),
- mismatch_offset, fadvise_flags, m_lock, m_perfcounter, on_finish);
+ C_WriteRequestT *cw_req =
+ m_builder->create_comp_and_write_request(
+ *this, now, std::move(image_extents), std::move(cmp_bl), std::move(bl),
+ mismatch_offset, fadvise_flags, m_lock, m_perfcounter, on_finish);
m_perfcounter->inc(l_librbd_pwl_cmp_bytes, cw_req->image_extents_summary.total_bytes);
/* The lambda below will be called when the block guard for all
}
}
-template <typename I>
-void AbstractWriteLog<I>::enlist_op_appender()
-{
- m_async_append_ops++;
- m_async_op_tracker.start_op();
- Context *append_ctx = new LambdaContext([this](int r) {
- append_scheduled_ops();
- m_async_append_ops--;
- m_async_op_tracker.finish_op();
- });
- m_work_queue.queue(append_ctx);
-}
-
template <typename I>
void AbstractWriteLog<I>::schedule_append(GenericLogOperationsVector &ops)
{
op->mark_log_entry_completed();
dirty_entries.push_back(log_entry);
}
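+ // release_ram() is a no-op by default; back ends that keep an in-RAM copy
+ // of the write data can free it here once the entry no longer needs it.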
+ if (log_entry->is_write_entry()) {
+ release_ram(log_entry);
+ }
if (op->reserved_allocated()) {
published_reserves++;
}
}
if (alloc_succeeds) {
- reserve_pmem(req, alloc_succeeds, no_space);
+ reserve_cache(req, alloc_succeeds, no_space);
}
if (alloc_succeeds) {
template <typename I>
void AbstractWriteLog<I>::add_into_log_map(GenericWriteLogEntries &log_entries,
C_BlockIORequestT *req) {
- copy_pmem(req);
+ req->copy_cache();
m_blocks_to_log_entries.add_log_entries(log_entries);
}
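+/* The data copy into the cache is now driven by the request itself
+ * (C_BlockIORequest::copy_cache()) instead of the old copy_pmem() write-log
+ * virtual, keeping buffer placement in the per-back-end request classes. */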
#include "librbd/cache/pwl/LogOperation.h"
#include "librbd/cache/pwl/Request.h"
#include "librbd/cache/pwl/LogMap.h"
+#include "librbd/cache/pwl/Builder.h"
#include <functional>
#include <list>
namespace plugin { template <typename> struct Api; }
namespace cache {
-
namespace pwl {
class GenericLogEntry;
class GenericWriteLogEntry;
class SyncPointLogEntry;
class WriteLogEntry;
-struct WriteLogPmemEntry;
+struct WriteLogCacheEntry;
typedef std::list<std::shared_ptr<WriteLogEntry>> WriteLogEntries;
typedef std::list<std::shared_ptr<GenericLogEntry>> GenericLogEntries;
typedef librbd::BlockGuard<GuardedRequest> WriteLogGuard;
class DeferredContexts;
-template <typename> class ImageCacheState;
+template <typename>
+class ImageCacheState;
+
+template<typename T>
+class Builder;
template <typename T>
struct C_BlockIORequest;
class AbstractWriteLog {
public:
typedef io::Extent Extent;
- typedef io::Extents Extents;
+ typedef io::Extents Extents;
+ using This = AbstractWriteLog<ImageCtxT>;
+ Builder<This> *m_builder;
- AbstractWriteLog(ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
+ AbstractWriteLog(ImageCtxT &image_ctx,
+ librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
+ Builder<This> *builder,
cache::ImageWritebackInterface& image_writeback,
plugin::Api<ImageCtxT>& plugin_api);
virtual ~AbstractWriteLog();
void invalidate(Context *on_finish);
void flush(Context *on_finish);
- using This = AbstractWriteLog<ImageCtxT>;
using C_WriteRequestT = pwl::C_WriteRequest<This>;
using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
using C_FlushRequestT = pwl::C_FlushRequest<This>;
using C_DiscardRequestT = pwl::C_DiscardRequest<This>;
using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
- using C_CompAndWriteRequestT = pwl::C_CompAndWriteRequest<This>;
CephContext * get_context();
void release_guarded_request(BlockGuardCell *cell);
pwl::GenericLogOperationsVector &ops, bool do_early_flush) = 0;
void schedule_append(pwl::GenericLogOperationsVector &ops);
void schedule_append(pwl::GenericLogOperationSharedPtr op);
- void flush_new_sync_point(C_FlushRequestT *flush_req, pwl::DeferredContexts &later);
+ void flush_new_sync_point(C_FlushRequestT *flush_req,
+ pwl::DeferredContexts &later);
std::shared_ptr<pwl::SyncPoint> get_current_sync_point() {
return m_current_sync_point;
}
void add_into_log_map(pwl::GenericWriteLogEntries &log_entries,
C_BlockIORequestT *req);
+ virtual void complete_user_request(Context *&user_req, int r) = 0;
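+ // copy_bl_to_buffer() lets a back end stage write data into its persistent
+ // buffers at dispatch time; the default below is a no-op.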
+ virtual void copy_bl_to_buffer(
+ WriteRequestResources *resources,
+ std::unique_ptr<WriteLogOperationSet> &op_set) {}
private:
typedef std::list<pwl::C_WriteRequest<This> *> C_WriteRequests;
bool m_persist_on_write_until_flush = true;
/* Debug counters for the places m_async_op_tracker is used */
- std::atomic<int> m_async_append_ops = {0};
std::atomic<int> m_async_complete_ops = {0};
std::atomic<int> m_async_null_flush_finish = {0};
std::atomic<int> m_async_process_work = {0};
Contexts m_flush_complete_contexts;
std::shared_ptr<pwl::SyncPoint> m_current_sync_point = nullptr;
- bool m_persist_on_flush = false; /* If false, persist each write before completion */
+ bool m_persist_on_flush = false; // If false, persist each write before completion
int m_flush_ops_in_flight = 0;
int m_flush_bytes_in_flight = 0;
uint32_t m_discard_granularity_bytes;
BlockGuardCell* detain_guarded_request_helper(pwl::GuardedRequest &req);
- BlockGuardCell* detain_guarded_request_barrier_helper(pwl::GuardedRequest &req);
+ BlockGuardCell* detain_guarded_request_barrier_helper(
+ pwl::GuardedRequest &req);
void detain_guarded_request(C_BlockIORequestT *request,
pwl::GuardedRequestFunctionContext *guarded_ctx,
bool is_barrier);
void flush_dirty_entries(Context *on_finish);
bool can_flush_entry(const std::shared_ptr<pwl::GenericLogEntry> log_entry);
- bool handle_flushed_sync_point(std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
- void sync_point_writer_flushed(std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
+ bool handle_flushed_sync_point(
+ std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
+ void sync_point_writer_flushed(
+ std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
void init_flush_new_sync_point(pwl::DeferredContexts &later);
void new_sync_point(pwl::DeferredContexts &later);
- pwl::C_FlushRequest<AbstractWriteLog<ImageCtxT>>* make_flush_req(Context *on_finish);
- void flush_new_sync_point_if_needed(C_FlushRequestT *flush_req, pwl::DeferredContexts &later);
+ pwl::C_FlushRequest<AbstractWriteLog<ImageCtxT>>* make_flush_req(
+ Context *on_finish);
+ void flush_new_sync_point_if_needed(C_FlushRequestT *flush_req,
+ pwl::DeferredContexts &later);
void alloc_and_dispatch_io_req(C_BlockIORequestT *write_req);
- void schedule_complete_op_log_entries(pwl::GenericLogOperations &&ops, const int r);
+ void schedule_complete_op_log_entries(pwl::GenericLogOperations &&ops,
+ const int r);
void internal_flush(bool invalidate, Context *on_finish);
protected:
AsyncOpTracker m_async_op_tracker;
/* Debug counters for the places m_async_op_tracker is used */
std::atomic<int> m_async_flush_ops = {0};
+ std::atomic<int> m_async_append_ops = {0};
/* Acquire locks in order declared here */
void update_entries(
std::shared_ptr<pwl::GenericLogEntry> log_entry,
- pwl::WriteLogPmemEntry *pmem_entry, std::map<uint64_t, bool> &missing_sync_points,
- std::map<uint64_t, std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
+ pwl::WriteLogCacheEntry *cache_entry,
+ std::map<uint64_t, bool> &missing_sync_points,
+ std::map<uint64_t,
+ std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
int entry_index);
void update_sync_points(
std::map<uint64_t, bool> &missing_sync_points,
- std::map<uint64_t, std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
- pwl::DeferredContexts &later);
+ std::map<uint64_t,
+ std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
+ pwl::DeferredContexts &later, uint32_t alloc_size);
Context *construct_flush_entry(
const std::shared_ptr<pwl::GenericLogEntry> log_entry, bool invalidating);
void process_writeback_dirty_entries();
bool can_retire_entry(const std::shared_ptr<pwl::GenericLogEntry> log_entry);
void dispatch_deferred_writes(void);
- void enlist_op_appender();
void complete_op_log_entries(pwl::GenericLogOperations &&ops, const int r);
bool check_allocation(
C_BlockIORequestT *req,
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
+ uint64_t &bytes_cached, uint64_t &bytes_dirtied,
+ uint64_t &bytes_allocated,
uint64_t &num_lanes, uint64_t &num_log_entries,
uint64_t &num_unpublished_reserves, uint64_t bytes_allocated_cap);
void append_scheduled(
- pwl::GenericLogOperations &ops, bool &ops_remain, bool &appending, bool isRWL=false);
-
+ pwl::GenericLogOperations &ops, bool &ops_remain, bool &appending,
+ bool isRWL=false);
+
virtual void process_work() = 0;
virtual void append_scheduled_ops(void) = 0;
virtual void schedule_append_ops(pwl::GenericLogOperations &ops) = 0;
virtual void remove_pool_file() = 0;
- virtual void initialize_pool(Context *on_finish, pwl::DeferredContexts &later) = 0;
+ virtual void initialize_pool(Context *on_finish,
+ pwl::DeferredContexts &later) = 0;
virtual void write_data_to_buffer(
- std::shared_ptr<pwl::WriteLogEntry> ws_entry, pwl::WriteLogPmemEntry *pmem_entry) {}
+ std::shared_ptr<pwl::WriteLogEntry> ws_entry,
+ pwl::WriteLogCacheEntry *cache_entry) {}
+ virtual void release_ram(
+ const std::shared_ptr<pwl::GenericLogEntry> log_entry) {}
virtual void alloc_op_log_entries(pwl::GenericLogOperations &ops) {}
- virtual bool retire_entries(const unsigned long int frees_per_tx) {return false;}
- virtual void schedule_flush_and_append(pwl::GenericLogOperationsVector &ops) {}
- virtual void copy_pmem(C_BlockIORequestT *req) {}
+ virtual bool retire_entries(const unsigned long int frees_per_tx) {
+ return false;
+ }
+ virtual void schedule_flush_and_append(
+ pwl::GenericLogOperationsVector &ops) {}
virtual void persist_last_flushed_sync_gen() {}
- virtual void reserve_pmem(C_BlockIORequestT *req, bool &alloc_succeeds, bool &no_space) {}
+ virtual void reserve_cache(C_BlockIORequestT *req, bool &alloc_succeeds,
+ bool &no_space) {}
virtual Context *construct_flush_entry_ctx(
- const std::shared_ptr<pwl::GenericLogEntry> log_entry) {return nullptr;}
+ const std::shared_ptr<pwl::GenericLogEntry> log_entry) {
+ return nullptr;
+ }
};
} // namespace pwl
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_BUILDER_H
+#define CEPH_LIBRBD_CACHE_PWL_BUILDER_H
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+template <typename T>
+class Builder {
+public:
+ virtual ~Builder() {}
+ virtual std::shared_ptr<WriteLogEntry> create_write_log_entry(
+ uint64_t image_offset_bytes, uint64_t write_bytes) = 0;
+ virtual std::shared_ptr<WriteLogEntry> create_write_log_entry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes) = 0;
+ virtual std::shared_ptr<WriteLogEntry> create_writesame_log_entry(
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length) = 0;
+ virtual std::shared_ptr<WriteLogEntry> create_writesame_log_entry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length) = 0;
+ virtual C_WriteRequest<T> *create_write_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) = 0;
+ virtual C_WriteSameRequest<T> *create_writesame_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) = 0;
+ virtual C_WriteRequest<T> *create_comp_and_write_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) = 0;
+ virtual std::shared_ptr<WriteLogOperation> create_write_log_operation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, CephContext *cct,
+ std::shared_ptr<WriteLogEntry> write_log_entry) = 0;
+ virtual std::shared_ptr<WriteLogOperation> create_write_log_operation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t data_len, CephContext *cct,
+ std::shared_ptr<WriteLogEntry> writesame_log_entry) = 0;
+};
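+// A concrete builder (sketch only; hypothetical names, the real builders
+// live with their rwl/ssd back ends) overrides each factory, e.g.:
+//
+//   class MyBuilder : public Builder<AbstractWriteLog<ImageCtx>> {
+//     std::shared_ptr<WriteLogEntry> create_write_log_entry(
+//         uint64_t image_offset_bytes, uint64_t write_bytes) override {
+//       return std::make_shared<MyWriteLogEntry>(image_offset_bytes,
+//                                                write_bytes);
+//     }
+//     // ... remaining factories follow the same pattern.
+//   };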
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_BUILDER_H
ConfigProxy &config = image_ctx->config;
log_periodic_stats = config.get_val<bool>("rbd_rwl_log_periodic_stats");
+ cache_type = config.get_val<std::string>("rbd_persistent_cache_mode");
}
template <typename I>
::encode_json("present", present, f);
::encode_json("empty", empty, f);
::encode_json("clean", clean, f);
- ::encode_json("cache_type", (int)get_image_cache_type(), f);
+ ::encode_json("cache_type", cache_type, f);
::encode_json("pwl_host", host, f);
::encode_json("pwl_path", path, f);
::encode_json("pwl_size", size, f);
int cache_type = (int)f["cache_type"];
switch (cache_type) {
+ case IMAGE_CACHE_TYPE_SSD:
case IMAGE_CACHE_TYPE_RWL:
if (!cache_exists) {
cache_state = new ImageCacheState<I>(image_ctx, plugin_api);
// vim: ts=8 sw=2 smarttab
#ifndef CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
-#define CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
+#define CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
#include "librbd/ImageCtx.h"
#include "librbd/cache/Types.h"
bool clean = true;
std::string host;
std::string path;
+ std::string cache_type;
uint64_t size = 0;
bool log_periodic_stats;
~ImageCacheState() {}
ImageCacheType get_image_cache_type() const {
- return IMAGE_CACHE_TYPE_RWL;
+ if (cache_type == "rwl") {
+ return IMAGE_CACHE_TYPE_RWL;
+ } else if (cache_type == "ssd") {
+ return IMAGE_CACHE_TYPE_SSD;
+ }
+ return IMAGE_CACHE_TYPE_UNKNOWN;
}
#include "librbd/cache/WriteLogImageDispatch.h"
#include "librbd/cache/ImageWriteback.h"
#ifdef WITH_RBD_RWL
-#include "librbd/cache/pwl/ReplicatedWriteLog.h"
+#include "librbd/cache/pwl/rwl/WriteLog.h"
#endif
#ifdef WITH_RBD_SSD_CACHE
-#include "librbd/cache/pwl/SSDWriteLog.h"
+#include "librbd/cache/pwl/ssd/WriteLog.h"
#endif
#include "librbd/cache/Utils.h"
#ifdef WITH_RBD_RWL
case cache::IMAGE_CACHE_TYPE_RWL:
m_image_cache =
- new librbd::cache::pwl::ReplicatedWriteLog<I>(m_image_ctx,
- cache_state,
- m_image_writeback,
- m_plugin_api);
+ new librbd::cache::pwl::rwl::WriteLog<I>(m_image_ctx,
+ cache_state,
+ m_image_writeback,
+ m_plugin_api);
break;
#endif
#ifdef WITH_RBD_SSD_CACHE
case cache::IMAGE_CACHE_TYPE_SSD:
m_image_cache =
- new librbd::cache::pwl::SSDWriteLog<I>(m_image_ctx,
- cache_state,
- m_image_writeback,
- m_plugin_api);
+ new librbd::cache::pwl::ssd::WriteLog<I>(m_image_ctx,
+ cache_state,
+ m_image_writeback,
+ m_plugin_api);
break;
#endif
default:
ldout(cct, 10) << dendl;
using klass = InitRequest<I>;
- Context *ctx = create_context_callback<klass, &klass::handle_init_image_cache>(
- this);
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_init_image_cache>(this);
m_image_cache->init(ctx);
}
ldout(cct, 10) << dendl;
using klass = InitRequest<I>;
- Context *ctx = create_context_callback<klass, &klass::handle_shutdown_image_cache>(
- this);
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_shutdown_image_cache>(this);
m_image_cache->shut_down(ctx);
}
<< __func__ << ": "
namespace librbd {
-
namespace cache {
-
namespace pwl {
std::ostream& GenericLogEntry::format(std::ostream &os) const {
os << "ram_entry=[" << ram_entry << "], "
- << "pmem_entry=" << (void*)pmem_entry << ", "
+ << "cache_entry=" << (void*)cache_entry << ", "
<< "log_entry_index=" << log_entry_index << ", "
<< "completed=" << completed;
return os;
return entry.format(os);
}
-#ifdef WITH_RBD_RWL
-void WriteLogEntry::init_pmem_buffer(std::vector<WriteBufferAllocation>::iterator allocation) {
- ram_entry.write_data = allocation->buffer_oid;
- ceph_assert(!TOID_IS_NULL(ram_entry.write_data));
- pmem_buffer = D_RW(ram_entry.write_data);
-}
-#endif
-
void WriteLogEntry::init(bool has_data,
- uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush) {
+ uint64_t current_sync_gen,
+ uint64_t last_op_sequence_num, bool persist_on_flush) {
ram_entry.has_data = 1;
ram_entry.sync_gen_number = current_sync_gen;
if (persist_on_flush) {
ram_entry.discard = 0;
}
-void WriteLogEntry::init_pmem_bp() {
- ceph_assert(!pmem_bp.have_raw());
- pmem_bp = buffer::ptr(buffer::create_static(this->write_bytes(), (char*)pmem_buffer));
-}
-
-void WriteLogEntry::init_pmem_bl() {
- pmem_bl.clear();
- init_pmem_bp();
- ceph_assert(pmem_bp.have_raw());
- int before_bl = pmem_bp.raw_nref();
- this->init_bl(pmem_bp, pmem_bl);
- int after_bl = pmem_bp.raw_nref();
- bl_refs = after_bl - before_bl;
-}
-
unsigned int WriteLogEntry::reader_count() const {
- if (pmem_bp.have_raw()) {
- return (pmem_bp.raw_nref() - bl_refs - 1);
+ if (cache_bp.have_raw()) {
+ return (cache_bp.raw_nref() - bl_refs - 1);
} else {
return 0;
}
}
-/* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */
-buffer::list& WriteLogEntry::get_pmem_bl() {
- if (0 == bl_refs) {
- std::lock_guard locker(m_entry_bl_lock);
- if (0 == bl_refs) {
- init_pmem_bl();
- }
- ceph_assert(0 != bl_refs);
- }
- return pmem_bl;
-}
-
-/* Constructs a new bl containing copies of pmem_bp */
-void WriteLogEntry::copy_pmem_bl(bufferlist *out_bl) {
- this->get_pmem_bl();
- /* pmem_bp is now initialized */
- buffer::ptr cloned_bp(pmem_bp.clone());
- out_bl->clear();
- this->init_bl(cloned_bp, *out_bl);
-}
-
-void WriteLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback,
- Context *ctx) {
- /* Pass a copy of the pmem buffer to ImageWriteback (which may hang on to the bl even after flush()). */
- bufferlist entry_bl;
- buffer::list entry_bl_copy;
- copy_pmem_bl(&entry_bl_copy);
- entry_bl_copy.begin(0).copy(write_bytes(), entry_bl);
- image_writeback.aio_write({{ram_entry.image_offset_bytes, ram_entry.write_bytes}},
- std::move(entry_bl), 0, ctx);
-}
-
std::ostream& WriteLogEntry::format(std::ostream &os) const {
os << "(Write) ";
GenericWriteLogEntry::format(os);
os << ", "
- << "pmem_buffer=" << (void*)pmem_buffer << ", ";
- os << "pmem_bp=" << pmem_bp << ", ";
- os << "pmem_bl=" << pmem_bl << ", ";
+ << "cache_buffer=" << (void*)cache_buffer << ", ";
+ os << "cache_bp=" << cache_bp << ", ";
+ os << "cache_bl=" << cache_bl << ", ";
os << "bl_refs=" << bl_refs;
return os;
}
return entry.format(os);
}
-void DiscardLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback,
- Context *ctx) {
- image_writeback.aio_discard(ram_entry.image_offset_bytes, ram_entry.write_bytes,
+void DiscardLogEntry::writeback(
+ librbd::cache::ImageWritebackInterface &image_writeback, Context *ctx) {
+ image_writeback.aio_discard(ram_entry.image_offset_bytes,
+ ram_entry.write_bytes,
m_discard_granularity_bytes, ctx);
}
-void DiscardLogEntry::init(uint64_t current_sync_gen, bool persist_on_flush, uint64_t last_op_sequence_num) {
+void DiscardLogEntry::init(uint64_t current_sync_gen, bool persist_on_flush,
+ uint64_t last_op_sequence_num) {
ram_entry.sync_gen_number = current_sync_gen;
if (persist_on_flush) {
/* Persist on flush. Sequence #0 is never used. */
return entry.format(os);
}
-void WriteSameLogEntry::init_bl(buffer::ptr &bp, buffer::list &bl) {
- for (uint64_t i = 0; i < ram_entry.write_bytes / ram_entry.ws_datalen; i++) {
- bl.append(bp);
- }
- int trailing_partial = ram_entry.write_bytes % ram_entry.ws_datalen;
- if (trailing_partial) {
- bl.append(bp, 0, trailing_partial);
- }
-}
-
-void WriteSameLogEntry::writeback(librbd::cache::ImageWritebackInterface &image_writeback,
- Context *ctx) {
- bufferlist entry_bl;
- buffer::list entry_bl_copy;
- copy_pmem_bl(&entry_bl_copy);
- entry_bl_copy.begin(0).copy(write_bytes(), entry_bl);
- image_writeback.aio_writesame(ram_entry.image_offset_bytes, ram_entry.write_bytes,
- std::move(entry_bl), 0, ctx);
-}
-
-std::ostream &WriteSameLogEntry::format(std::ostream &os) const {
- os << "(WriteSame) ";
- WriteLogEntry::format(os);
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const WriteSameLogEntry &entry) {
- return entry.format(os);
-}
-
} // namespace pwl
} // namespace cache
} // namespace librbd
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
-#define CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
+#ifndef CEPH_LIBRBD_CACHE_PWL_LOG_ENTRY_H
+#define CEPH_LIBRBD_CACHE_PWL_LOG_ENTRY_H
#include "common/ceph_mutex.h"
#include "librbd/Utils.h"
class GenericLogEntry {
public:
- WriteLogPmemEntry ram_entry;
- WriteLogPmemEntry *pmem_entry = nullptr;
+ WriteLogCacheEntry ram_entry;
+ WriteLogCacheEntry *cache_entry = nullptr;
uint32_t log_entry_index = 0;
bool completed = false;
- GenericLogEntry(const uint64_t image_offset_bytes = 0, const uint64_t write_bytes = 0)
+ GenericLogEntry(uint64_t image_offset_bytes = 0, uint64_t write_bytes = 0)
: ram_entry(image_offset_bytes, write_bytes) {
};
virtual ~GenericLogEntry() { };
Context *ctx) {
ceph_assert(false);
};
+ virtual void writeback_bl(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx, ceph::bufferlist &&bl) {
+ ceph_assert(false);
+ }
+ virtual bool is_write_entry() const {
+ return false;
+ }
+ virtual bool is_sync_point() const {
+ return false;
+ }
+ virtual unsigned int get_aligned_data_size() const {
+ return 0;
+ }
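+ // remove_cache_bl() drops any RAM-side bufferlist a back end holds for
+ // this entry; the base class holds none, so it is a no-op here.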
+ virtual void remove_cache_bl() {}
virtual std::ostream& format(std::ostream &os) const;
friend std::ostream &operator<<(std::ostream &os,
const GenericLogEntry &entry);
/* All writing entries using all prior sync gen numbers have been flushed */
std::atomic<bool> prior_sync_point_flushed = {true};
std::shared_ptr<SyncPointLogEntry> next_sync_point_entry = nullptr;
- SyncPointLogEntry(const uint64_t sync_gen_number) {
+ SyncPointLogEntry(uint64_t sync_gen_number) {
ram_entry.sync_gen_number = sync_gen_number;
ram_entry.sync_point = 1;
};
bool can_retire() const override {
return this->completed;
}
+ bool is_sync_point() const override {
+ return true;
+ }
std::ostream& format(std::ostream &os) const;
friend std::ostream &operator<<(std::ostream &os,
const SyncPointLogEntry &entry);
uint32_t referring_map_entries = 0;
std::shared_ptr<SyncPointLogEntry> sync_point_entry;
GenericWriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
- const uint64_t image_offset_bytes, const uint64_t write_bytes)
+ uint64_t image_offset_bytes, uint64_t write_bytes)
: GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(sync_point_entry) { }
- GenericWriteLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+ GenericWriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes)
: GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(nullptr) { }
~GenericWriteLogEntry() override {};
GenericWriteLogEntry(const GenericWriteLogEntry&) = delete;
std::shared_ptr<SyncPointLogEntry> get_sync_point_entry() override {
return sync_point_entry;
}
- virtual void copy_pmem_bl(bufferlist *out_bl) = 0;
+ virtual void copy_cache_bl(bufferlist *out_bl) = 0;
void set_flushed(bool flushed) override {
m_flushed = flushed;
}
class WriteLogEntry : public GenericWriteLogEntry {
protected:
- buffer::ptr pmem_bp;
- buffer::list pmem_bl;
- std::atomic<int> bl_refs = {0}; /* The refs held on pmem_bp by pmem_bl */
- /* Used in WriteLogEntry::get_pmem_bl() to syncronize between threads making entries readable */
+ bool is_writesame = false;
+ buffer::ptr cache_bp;
+ buffer::list cache_bl;
+ std::atomic<int> bl_refs = {0}; /* The refs held on cache_bp by cache_bl */
+ /* Used in WriteLogEntry::get_cache_bl() to synchronize between threads making entries readable */
mutable ceph::mutex m_entry_bl_lock;
- void init_pmem_bp();
-
- /* Write same will override */
- virtual void init_bl(buffer::ptr &bp, buffer::list &bl) {
- bl.append(bp);
- }
-
- void init_pmem_bl();
+ virtual void init_cache_bp() {}
+ virtual void init_bl(buffer::ptr &bp, buffer::list &bl) {}
public:
- uint8_t *pmem_buffer = nullptr;
+ uint8_t *cache_buffer = nullptr;
WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
- const uint64_t image_offset_bytes, const uint64_t write_bytes)
+ uint64_t image_offset_bytes, uint64_t write_bytes)
: GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes),
m_entry_bl_lock(ceph::make_mutex(pwl::unique_lock_name(
"librbd::cache::pwl::WriteLogEntry::m_entry_bl_lock", this)))
{ }
- WriteLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+ WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes)
: GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes),
m_entry_bl_lock(ceph::make_mutex(pwl::unique_lock_name(
"librbd::cache::pwl::WriteLogEntry::m_entry_bl_lock", this)))
{ }
- ~WriteLogEntry() override {};
+ WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {
+ ram_entry.writesame = 1;
+ ram_entry.ws_datalen = data_length;
+ is_writesame = true;
+ };
+ WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : WriteLogEntry(nullptr, image_offset_bytes, write_bytes) {
+ ram_entry.writesame = 1;
+ ram_entry.ws_datalen = data_length;
+ is_writesame = true;
+ };
+ ~WriteLogEntry() override {};
WriteLogEntry(const WriteLogEntry&) = delete;
WriteLogEntry &operator=(const WriteLogEntry&) = delete;
+ unsigned int write_bytes() const override {
+ // The valid bytes in this op's data buffer.
+ if (is_writesame) {
+ return ram_entry.ws_datalen;
+ }
+ return ram_entry.write_bytes;
+ };
+ unsigned int bytes_dirty() const override {
+ // The bytes in the image this op makes dirty.
+ return ram_entry.write_bytes;
+ };
void init(bool has_data,
uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush);
- #ifdef WITH_RBD_RWL
- void init_pmem_buffer(std::vector<WriteBufferAllocation>::iterator allocation);
- #endif
+ virtual void init_cache_buffer(std::vector<WriteBufferAllocation>::iterator allocation) {}
+ virtual void init_cache_bl(bufferlist &src_bl, uint64_t off, uint64_t len) {}
+ /* Returns a ref to a bl containing bufferptrs to the entry cache buffer */
+ virtual buffer::list &get_cache_bl() = 0;
+
BlockExtent block_extent();
unsigned int reader_count() const;
- /* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */
- buffer::list &get_pmem_bl();
- /* Constructs a new bl containing copies of pmem_bp */
- void copy_pmem_bl(bufferlist *out_bl) override;
- void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
- Context *ctx) override;
+ /* Constructs a new bl containing copies of cache_bp */
+ void copy_cache_bl(bufferlist *out_bl) override {};
bool can_retire() const override {
return (this->completed && this->get_flushed() && (0 == reader_count()));
}
+ bool is_write_entry() const override {
+ return true;
+ }
std::ostream &format(std::ostream &os) const;
friend std::ostream &operator<<(std::ostream &os,
const WriteLogEntry &entry);
class DiscardLogEntry : public GenericWriteLogEntry {
public:
DiscardLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
- const uint64_t image_offset_bytes, const uint64_t write_bytes,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
uint32_t discard_granularity_bytes)
: GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes),
m_discard_granularity_bytes(discard_granularity_bytes) {
ram_entry.discard = 1;
};
- DiscardLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes)
+ DiscardLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes)
: GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes) {
ram_entry.discard = 1;
};
bool can_retire() const override {
return this->completed;
}
- void copy_pmem_bl(bufferlist *out_bl) override {
+ void copy_cache_bl(bufferlist *out_bl) override {
ceph_assert(false);
}
void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
uint32_t m_discard_granularity_bytes;
};
-class WriteSameLogEntry : public WriteLogEntry {
-protected:
- void init_bl(buffer::ptr &bp, buffer::list &bl) override;
-
-public:
- WriteSameLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
- const uint64_t image_offset_bytes, const uint64_t write_bytes,
- const uint32_t data_length)
- : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {
- ram_entry.writesame = 1;
- ram_entry.ws_datalen = data_length;
- };
- WriteSameLogEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes,
- const uint32_t data_length)
- : WriteLogEntry(nullptr, image_offset_bytes, write_bytes) {
- ram_entry.writesame = 1;
- ram_entry.ws_datalen = data_length;
- };
- WriteSameLogEntry(const WriteSameLogEntry&) = delete;
- WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete;
- unsigned int write_bytes() const override {
- /* The valid bytes in this ops data buffer. */
- return ram_entry.ws_datalen;
- };
- unsigned int bytes_dirty() const override {
- /* The bytes in the image this op makes dirty. */
- return ram_entry.write_bytes;
- };
- void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
- Context *ctx) override;
- std::ostream &format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const WriteSameLogEntry &entry);
-};
-
} // namespace pwl
} // namespace cache
} // namespace librbd
-#endif // CEPH_LIBRBD_CACHE_RWL_LOG_ENTRY_H
+#endif // CEPH_LIBRBD_CACHE_PWL_LOG_ENTRY_H
#define dout_subsys ceph_subsys_rbd_pwl
#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::pwl::LogOperation: " << this << " " \
- << __func__ << ": "
+#define dout_prefix *_dout << "librbd::cache::pwl::LogOperation: " << this \
+ << " " << __func__ << ": "
namespace librbd {
-
namespace cache {
-
namespace pwl {
-GenericLogOperation::GenericLogOperation(const utime_t dispatch_time, PerfCounters *perfcounter)
+GenericLogOperation::GenericLogOperation(utime_t dispatch_time,
+ PerfCounters *perfcounter)
: m_perfcounter(perfcounter), dispatch_time(dispatch_time) {
}
SyncPointLogOperation::SyncPointLogOperation(ceph::mutex &lock,
std::shared_ptr<SyncPoint> sync_point,
- const utime_t dispatch_time,
+ utime_t dispatch_time,
PerfCounters *perfcounter,
CephContext *cct)
- : GenericLogOperation(dispatch_time, perfcounter), m_cct(cct), m_lock(lock), sync_point(sync_point) {
+ : GenericLogOperation(dispatch_time, perfcounter), m_cct(cct), m_lock(lock),
+ sync_point(sync_point) {
}
SyncPointLogOperation::~SyncPointLogOperation() { }
}
GenericWriteLogOperation::GenericWriteLogOperation(std::shared_ptr<SyncPoint> sync_point,
- const utime_t dispatch_time,
+ utime_t dispatch_time,
PerfCounters *perfcounter,
CephContext *cct)
: GenericLogOperation(dispatch_time, perfcounter),
on_write_persist = nullptr;
}
if (on_persist) {
- ldout(m_cct, 20) << __func__ << " " << this << " on_persist=" << on_persist << dendl;
+ ldout(m_cct, 20) << __func__ << " " << this << " on_persist=" << on_persist
+ << dendl;
on_persist->complete(result);
}
}
-WriteLogOperation::WriteLogOperation(WriteLogOperationSet &set,
- uint64_t image_offset_bytes, uint64_t write_bytes,
- CephContext *cct)
- : GenericWriteLogOperation(set.sync_point, set.dispatch_time, set.perfcounter, cct),
- log_entry(std::make_shared<WriteLogEntry>(set.sync_point->log_entry, image_offset_bytes, write_bytes)) {
+WriteLogOperation::WriteLogOperation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, CephContext *cct,
+ std::shared_ptr<WriteLogEntry> write_log_entry)
+ : GenericWriteLogOperation(set.sync_point, set.dispatch_time,
+ set.perfcounter, cct),
+ log_entry(write_log_entry) {
on_write_append = set.extent_ops_appending->new_sub();
on_write_persist = set.extent_ops_persist->new_sub();
log_entry->sync_point_entry->writes++;
log_entry->sync_point_entry->bytes += write_bytes;
}
+WriteLogOperation::WriteLogOperation(WriteLogOperationSet &set,
+ uint64_t image_offset_bytes,
+ uint64_t write_bytes,
+ uint32_t data_len,
+ CephContext *cct,
+ std::shared_ptr<WriteLogEntry> writesame_log_entry)
+ : WriteLogOperation(set, image_offset_bytes, write_bytes, cct,
+ writesame_log_entry) {
+ is_writesame = true;
+}
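+// WriteSame operations now reuse WriteLogOperation directly (the former
+// WriteSameLogOperation subclass is removed below); is_writesame only
+// changes the formatted name and the entry's write_bytes() accounting.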
+
WriteLogOperation::~WriteLogOperation() { }
-void WriteLogOperation::init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation, uint64_t current_sync_gen,
- uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset,
+void WriteLogOperation::init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation,
+ uint64_t current_sync_gen,
+ uint64_t last_op_sequence_num,
+ bufferlist &write_req_bl, uint64_t buffer_offset,
bool persist_on_flush) {
- log_entry->init(has_data, current_sync_gen, last_op_sequence_num, persist_on_flush);
+ log_entry->init(has_data, current_sync_gen, last_op_sequence_num,
+ persist_on_flush);
buffer_alloc = &(*allocation);
- bl.substr_of(write_req_bl, buffer_offset,
- log_entry->write_bytes());
+ bl.substr_of(write_req_bl, buffer_offset, log_entry->write_bytes());
+ log_entry->init_cache_bl(write_req_bl, buffer_offset,
+ log_entry->write_bytes());
}
std::ostream &WriteLogOperation::format(std::ostream &os) const {
- os << "(Write) ";
+ string op_name = is_writesame ? "(Write Same) " : "(Write) ";
+ os << op_name;
GenericWriteLogOperation::format(os);
os << ", ";
if (log_entry) {
m_perfcounter->tinc(l_librbd_pwl_log_op_buf_to_app_t, log_append_time - buf_persist_time);
}
-#ifdef WITH_RBD_RWL
-void WriteLogOperation::copy_bl_to_pmem_buffer(std::vector<WriteBufferAllocation>::iterator allocation) {
- /* operation is a shared_ptr, so write_op is only good as long as operation is in scope */
- bufferlist::iterator i(&bl);
- m_perfcounter->inc(l_librbd_pwl_log_op_bytes, log_entry->write_bytes());
- ldout(m_cct, 20) << bl << dendl;
- log_entry->init_pmem_buffer(allocation);
- i.copy((unsigned)log_entry->write_bytes(), (char*)log_entry->pmem_buffer);
-}
-
-void WriteLogOperation::flush_pmem_buf_to_cache(PMEMobjpool *log_pool) {
- buf_persist_time = ceph_clock_now();
- pmemobj_flush(log_pool, log_entry->pmem_buffer, log_entry->write_bytes());
-}
-#endif
-
WriteLogOperationSet::WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr<SyncPoint> sync_point,
bool persist_on_flush, CephContext *cct, Context *on_finish)
: m_cct(cct), m_on_finish(on_finish),
}
DiscardLogOperation::DiscardLogOperation(std::shared_ptr<SyncPoint> sync_point,
- const uint64_t image_offset_bytes,
- const uint64_t write_bytes,
+ uint64_t image_offset_bytes,
+ uint64_t write_bytes,
uint32_t discard_granularity_bytes,
- const utime_t dispatch_time,
+ utime_t dispatch_time,
PerfCounters *perfcounter,
CephContext *cct)
: GenericWriteLogOperation(sync_point, dispatch_time, perfcounter, cct),
return op.format(os);
}
-WriteSameLogOperation::WriteSameLogOperation(WriteLogOperationSet &set,
- uint64_t image_offset_bytes,
- uint64_t write_bytes,
- uint32_t data_len,
- CephContext *cct)
- : WriteLogOperation(set, image_offset_bytes, write_bytes, cct) {
- log_entry =
- std::make_shared<WriteSameLogEntry>(set.sync_point->log_entry, image_offset_bytes, write_bytes, data_len);
- ldout(m_cct, 20) << __func__ << " " << this << dendl;
-}
-
-WriteSameLogOperation::~WriteSameLogOperation() { }
-
-std::ostream &WriteSameLogOperation::format(std::ostream &os) const {
- os << "(Write Same) ";
- WriteLogOperation::format(os);
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os,
- const WriteSameLogOperation &op) {
- return op.format(os);
-}
-
} // namespace pwl
} // namespace cache
} // namespace librbd
namespace librbd {
namespace cache {
namespace pwl {
+
struct WriteBufferAllocation;
class WriteLogOperationSet;
class GenericLogOperation;
+template <typename T>
+class AbstractWriteLog;
+
using GenericLogOperationSharedPtr = std::shared_ptr<GenericLogOperation>;
using GenericLogOperationsVector = std::vector<GenericLogOperationSharedPtr>;
utime_t buf_persist_comp_time; // When buffer persist completes
utime_t log_append_time; // When log append begins
utime_t log_append_comp_time; // When log append completes
- GenericLogOperation(const utime_t dispatch_time, PerfCounters *perfcounter);
+ GenericLogOperation(utime_t dispatch_time, PerfCounters *perfcounter);
virtual ~GenericLogOperation() { };
GenericLogOperation(const GenericLogOperation&) = delete;
GenericLogOperation &operator=(const GenericLogOperation&) = delete;
virtual bool is_writing_op() const {
return false;
}
- #ifdef WITH_RBD_RWL
- virtual void copy_bl_to_pmem_buffer(
+ virtual void copy_bl_to_cache_buffer(
std::vector<WriteBufferAllocation>::iterator allocation) {};
- virtual void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) {};
- #endif
};
class SyncPointLogOperation : public GenericLogOperation {
std::shared_ptr<SyncPoint> sync_point;
SyncPointLogOperation(ceph::mutex &lock,
std::shared_ptr<SyncPoint> sync_point,
- const utime_t dispatch_time,
+ utime_t dispatch_time,
PerfCounters *perfcounter,
CephContext *cct);
~SyncPointLogOperation() override;
Context *on_write_persist = nullptr; /* Completion for things waiting on this
* write to persist */
GenericWriteLogOperation(std::shared_ptr<SyncPoint> sync_point,
- const utime_t dispatch_time,
+ utime_t dispatch_time,
PerfCounters *perfcounter,
CephContext *cct);
~GenericWriteLogOperation() override;
using GenericWriteLogOperation::on_write_persist;
std::shared_ptr<WriteLogEntry> log_entry;
bufferlist bl;
+ bool is_writesame = false;
WriteBufferAllocation *buffer_alloc = nullptr;
- WriteLogOperation(WriteLogOperationSet &set, const uint64_t image_offset_bytes,
- const uint64_t write_bytes, CephContext *cct);
- ~WriteLogOperation() override;
+ WriteLogOperation(WriteLogOperationSet &set,
+ uint64_t image_offset_bytes,
+ uint64_t write_bytes, CephContext *cct,
+ std::shared_ptr<WriteLogEntry> write_log_entry);
+ WriteLogOperation(WriteLogOperationSet &set,
+ uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t data_len,
+ CephContext *cct,
+ std::shared_ptr<WriteLogEntry> writesame_log_entry);
+ ~WriteLogOperation() override;
WriteLogOperation(const WriteLogOperation&) = delete;
WriteLogOperation &operator=(const WriteLogOperation&) = delete;
- void init(bool has_data, std::vector<WriteBufferAllocation>::iterator allocation, uint64_t current_sync_gen,
- uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset,
+ void init(bool has_data,
+ std::vector<WriteBufferAllocation>::iterator allocation,
+ uint64_t current_sync_gen, uint64_t last_op_sequence_num,
+ bufferlist &write_req_bl, uint64_t buffer_offset,
bool persist_on_flush);
std::ostream &format(std::ostream &os) const;
friend std::ostream &operator<<(std::ostream &os,
}
void complete(int r) override;
- #ifdef WITH_RBD_RWL
- void copy_bl_to_pmem_buffer(
- std::vector<WriteBufferAllocation>::iterator allocation) override;
- void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) override;
- #endif
};
utime_t dispatch_time; /* When set created */
PerfCounters *perfcounter = nullptr;
std::shared_ptr<SyncPoint> sync_point;
- WriteLogOperationSet(const utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr<SyncPoint> sync_point,
- const bool persist_on_flush, CephContext *cct, Context *on_finish);
+ WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter,
+ std::shared_ptr<SyncPoint> sync_point,
+ const bool persist_on_flush, CephContext *cct,
+ Context *on_finish);
~WriteLogOperationSet();
WriteLogOperationSet(const WriteLogOperationSet&) = delete;
WriteLogOperationSet &operator=(const WriteLogOperationSet&) = delete;
using GenericWriteLogOperation::on_write_persist;
std::shared_ptr<DiscardLogEntry> log_entry;
DiscardLogOperation(std::shared_ptr<SyncPoint> sync_point,
- const uint64_t image_offset_bytes,
- const uint64_t write_bytes,
+ uint64_t image_offset_bytes,
+ uint64_t write_bytes,
uint32_t discard_granularity_bytes,
- const utime_t dispatch_time,
+ utime_t dispatch_time,
PerfCounters *perfcounter,
CephContext *cct);
~DiscardLogOperation() override;
const DiscardLogOperation &op);
};
-class WriteSameLogOperation : public WriteLogOperation {
-public:
- using GenericWriteLogOperation::m_lock;
- using GenericWriteLogOperation::sync_point;
- using GenericWriteLogOperation::on_write_append;
- using GenericWriteLogOperation::on_write_persist;
- using WriteLogOperation::log_entry;
- using WriteLogOperation::bl;
- using WriteLogOperation::buffer_alloc;
- WriteSameLogOperation(WriteLogOperationSet &set,
- const uint64_t image_offset_bytes,
- const uint64_t write_bytes,
- const uint32_t data_len,
- CephContext *cct);
- ~WriteSameLogOperation();
- WriteSameLogOperation(const WriteSameLogOperation&) = delete;
- WriteSameLogOperation &operator=(const WriteSameLogOperation&) = delete;
- std::ostream &format(std::ostream &os) const;
- friend std::ostream &operator<<(std::ostream &os,
- const WriteSameLogOperation &op);
-};
-
} // namespace pwl
} // namespace cache
} // namespace librbd
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "ReplicatedWriteLog.h"
-#include "include/buffer.h"
-#include "include/Context.h"
-#include "include/ceph_assert.h"
-#include "common/deleter.h"
-#include "common/dout.h"
-#include "common/environment.h"
-#include "common/errno.h"
-#include "common/WorkQueue.h"
-#include "common/Timer.h"
-#include "common/perf_counters.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/asio/ContextWQ.h"
-#include "librbd/cache/pwl/ImageCacheState.h"
-#include "librbd/cache/pwl/LogEntry.h"
-#include "librbd/plugin/Api.h"
-#include <map>
-#include <vector>
-
-#undef dout_subsys
-#define dout_subsys ceph_subsys_rbd_pwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::pwl::ReplicatedWriteLog: " << this << " " \
- << __func__ << ": "
-
-namespace librbd {
-namespace cache {
-namespace pwl {
-
-using namespace librbd::cache::pwl;
-
-const unsigned long int OPS_APPENDED_TOGETHER = MAX_ALLOC_PER_TRANSACTION;
-
-template <typename I>
-ReplicatedWriteLog<I>::ReplicatedWriteLog(
- I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
- ImageWritebackInterface& image_writeback,
- plugin::Api<I>& plugin_api)
-: AbstractWriteLog<I>(image_ctx, cache_state, image_writeback, plugin_api),
- m_pwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_pwl))
-{
-}
-
-template <typename I>
-ReplicatedWriteLog<I>::~ReplicatedWriteLog() {
- m_log_pool = nullptr;
-}
-
-/*
- * Allocate the (already reserved) write log entries for a set of operations.
- *
- * Locking:
- * Acquires lock
- */
-template <typename I>
-void ReplicatedWriteLog<I>::alloc_op_log_entries(GenericLogOperations &ops)
-{
- TOID(struct WriteLogPoolRoot) pool_root;
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
- struct WriteLogPmemEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries);
-
- ceph_assert(ceph_mutex_is_locked_by_me(this->m_log_append_lock));
-
- /* Allocate the (already reserved) log entries */
- std::lock_guard locker(m_lock);
-
- for (auto &operation : ops) {
- uint32_t entry_index = this->m_first_free_entry;
- this->m_first_free_entry = (this->m_first_free_entry + 1) % this->m_total_log_entries;
- auto &log_entry = operation->get_log_entry();
- log_entry->log_entry_index = entry_index;
- log_entry->ram_entry.entry_index = entry_index;
- log_entry->pmem_entry = &pmem_log_entries[entry_index];
- log_entry->ram_entry.entry_valid = 1;
- m_log_entries.push_back(log_entry);
- ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl;
- }
-}
-
-/*
- * Write and persist the (already allocated) write log entries and
- * data buffer allocations for a set of ops. The data buffer for each
- * of these must already have been persisted to its reserved area.
- */
-template <typename I>
-int ReplicatedWriteLog<I>::append_op_log_entries(GenericLogOperations &ops)
-{
- CephContext *cct = m_image_ctx.cct;
- GenericLogOperationsVector entries_to_flush;
- TOID(struct WriteLogPoolRoot) pool_root;
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
- int ret = 0;
-
- ceph_assert(ceph_mutex_is_locked_by_me(this->m_log_append_lock));
-
- if (ops.empty()) {
- return 0;
- }
- entries_to_flush.reserve(OPS_APPENDED_TOGETHER);
-
- /* Write log entries to ring and persist */
- utime_t now = ceph_clock_now();
- for (auto &operation : ops) {
- if (!entries_to_flush.empty()) {
- /* Flush these and reset the list if the current entry wraps to the
- * tail of the ring */
- if (entries_to_flush.back()->get_log_entry()->log_entry_index >
- operation->get_log_entry()->log_entry_index) {
- ldout(m_image_ctx.cct, 20) << "entries to flush wrap around the end of the ring at "
- << "operation=[" << *operation << "]" << dendl;
- flush_op_log_entries(entries_to_flush);
- entries_to_flush.clear();
- now = ceph_clock_now();
- }
- }
- ldout(m_image_ctx.cct, 20) << "Copying entry for operation at index="
- << operation->get_log_entry()->log_entry_index << " "
- << "from " << &operation->get_log_entry()->ram_entry << " "
- << "to " << operation->get_log_entry()->pmem_entry << " "
- << "operation=[" << *operation << "]" << dendl;
- ldout(m_image_ctx.cct, 05) << "APPENDING: index="
- << operation->get_log_entry()->log_entry_index << " "
- << "operation=[" << *operation << "]" << dendl;
- operation->log_append_time = now;
- *operation->get_log_entry()->pmem_entry = operation->get_log_entry()->ram_entry;
- ldout(m_image_ctx.cct, 20) << "APPENDING: index="
- << operation->get_log_entry()->log_entry_index << " "
- << "pmem_entry=[" << *operation->get_log_entry()->pmem_entry
- << "]" << dendl;
- entries_to_flush.push_back(operation);
- }
- flush_op_log_entries(entries_to_flush);
-
- /* Drain once for all */
- pmemobj_drain(m_log_pool);
-
- /*
- * Atomically advance the log head pointer and publish the
- * allocations for all the data buffers they refer to.
- */
- utime_t tx_start = ceph_clock_now();
- TX_BEGIN(m_log_pool) {
- D_RW(pool_root)->first_free_entry = this->m_first_free_entry;
- for (auto &operation : ops) {
- if (operation->reserved_allocated()) {
- auto write_op = (std::shared_ptr<WriteLogOperation>&) operation;
- pmemobj_tx_publish(&write_op->buffer_alloc->buffer_alloc_action, 1);
- } else {
- ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
- }
- }
- } TX_ONCOMMIT {
- } TX_ONABORT {
- lderr(cct) << "failed to commit " << ops.size()
- << " log entries (" << this->m_log_pool_name << ")" << dendl;
- ceph_assert(false);
- ret = -EIO;
- } TX_FINALLY {
- } TX_END;
-
- utime_t tx_end = ceph_clock_now();
- m_perfcounter->tinc(l_librbd_pwl_append_tx_t, tx_end - tx_start);
- m_perfcounter->hinc(
- l_librbd_pwl_append_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), ops.size());
- for (auto &operation : ops) {
- operation->log_append_comp_time = tx_end;
- }
-
- return ret;
-}
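
The wrap check above exists because pmemobj_flush() covers one contiguous range, so a batch is cut whenever the ring index wraps back to the start. A minimal standalone sketch of that partitioning, with plain indices standing in for log operations and a hypothetical flush_run() in place of flush_op_log_entries():

#include <cstdint>
#include <iostream>
#include <vector>

// A run of ascending ring indices maps to one contiguous pmem region.
static void flush_run(const std::vector<uint32_t> &run) {
  std::cout << "flush " << run.size() << " contiguous entries starting at "
            << run.front() << "\n";
}

// Cut a new run whenever the next index is lower than the previous one,
// i.e. the batch wrapped past the end of the ring.
static void append_in_runs(const std::vector<uint32_t> &indices) {
  std::vector<uint32_t> run;
  for (uint32_t idx : indices) {
    if (!run.empty() && run.back() > idx) {  // same test as the wrap check above
      flush_run(run);
      run.clear();
    }
    run.push_back(idx);
  }
  if (!run.empty()) {
    flush_run(run);
  }
}

int main() {
  append_in_runs({6, 7, 0, 1, 2});  // ring of 8: flushes {6, 7}, then {0, 1, 2}
}
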
-
-/*
- * Flush the persistent write log entries for a set of ops. The entries must
- * be contiguous in persistent memory.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::flush_op_log_entries(GenericLogOperationsVector &ops)
-{
- if (ops.empty()) {
- return;
- }
-
- if (ops.size() > 1) {
- ceph_assert(ops.front()->get_log_entry()->pmem_entry < ops.back()->get_log_entry()->pmem_entry);
- }
-
- ldout(m_image_ctx.cct, 20) << "entry count=" << ops.size() << " "
- << "start address="
- << ops.front()->get_log_entry()->pmem_entry << " "
- << "bytes="
- << ops.size() * sizeof(*(ops.front()->get_log_entry()->pmem_entry))
- << dendl;
- pmemobj_flush(m_log_pool,
- ops.front()->get_log_entry()->pmem_entry,
- ops.size() * sizeof(*(ops.front()->get_log_entry()->pmem_entry)));
-}
-
-template <typename I>
-void ReplicatedWriteLog<I>::remove_pool_file() {
- if (m_log_pool) {
- ldout(m_image_ctx.cct, 6) << "closing pmem pool" << dendl;
- pmemobj_close(m_log_pool);
- }
- if (m_cache_state->clean) {
- ldout(m_image_ctx.cct, 5) << "Removing empty pool file: " << this->m_log_pool_name << dendl;
- if (remove(this->m_log_pool_name.c_str()) != 0) {
- lderr(m_image_ctx.cct) << "failed to remove empty pool \"" << this->m_log_pool_name << "\": "
- << pmemobj_errormsg() << dendl;
- } else {
- m_cache_state->clean = true;
- m_cache_state->empty = true;
- m_cache_state->present = false;
- }
- } else {
- ldout(m_image_ctx.cct, 5) << "Not removing pool file: " << this->m_log_pool_name << dendl;
- }
-}
-
-template <typename I>
-void ReplicatedWriteLog<I>::initialize_pool(Context *on_finish, pwl::DeferredContexts &later) {
- CephContext *cct = m_image_ctx.cct;
- TOID(struct WriteLogPoolRoot) pool_root;
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
- if (access(this->m_log_pool_name.c_str(), F_OK) != 0) {
- if ((m_log_pool =
- pmemobj_create(this->m_log_pool_name.c_str(),
- this->m_pwl_pool_layout_name,
- this->m_log_pool_config_size,
- (S_IWUSR | S_IRUSR))) == NULL) {
- lderr(cct) << "failed to create pool (" << this->m_log_pool_name << ")"
- << pmemobj_errormsg() << dendl;
- m_cache_state->present = false;
- m_cache_state->clean = true;
- m_cache_state->empty = true;
- /* TODO: filter/replace errnos that are meaningless to the caller */
- on_finish->complete(-errno);
- return;
- }
- m_cache_state->present = true;
- m_cache_state->clean = true;
- m_cache_state->empty = true;
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
-
- /* new pool, calculate and store metadata */
- size_t effective_pool_size = (size_t)(this->m_log_pool_config_size * USABLE_SIZE);
- size_t small_write_size = MIN_WRITE_ALLOC_SIZE + BLOCK_ALLOC_OVERHEAD_BYTES + sizeof(struct WriteLogPmemEntry);
- uint64_t num_small_writes = (uint64_t)(effective_pool_size / small_write_size);
- if (num_small_writes > MAX_LOG_ENTRIES) {
- num_small_writes = MAX_LOG_ENTRIES;
- }
- if (num_small_writes <= 2) {
- lderr(cct) << "num_small_writes needs to > 2" << dendl;
- on_finish->complete(-EINVAL);
- return;
- }
- this->m_log_pool_actual_size = this->m_log_pool_config_size;
- this->m_bytes_allocated_cap = effective_pool_size;
- /* Log ring empty */
- m_first_free_entry = 0;
- m_first_valid_entry = 0;
- TX_BEGIN(m_log_pool) {
- TX_ADD(pool_root);
- D_RW(pool_root)->header.layout_version = RWL_POOL_VERSION;
- D_RW(pool_root)->log_entries =
- TX_ZALLOC(struct WriteLogPmemEntry,
- sizeof(struct WriteLogPmemEntry) * num_small_writes);
- D_RW(pool_root)->pool_size = this->m_log_pool_actual_size;
- D_RW(pool_root)->flushed_sync_gen = this->m_flushed_sync_gen;
- D_RW(pool_root)->block_size = MIN_WRITE_ALLOC_SIZE;
- D_RW(pool_root)->num_log_entries = num_small_writes;
- D_RW(pool_root)->first_free_entry = m_first_free_entry;
- D_RW(pool_root)->first_valid_entry = m_first_valid_entry;
- } TX_ONCOMMIT {
- this->m_total_log_entries = D_RO(pool_root)->num_log_entries;
- this->m_free_log_entries = D_RO(pool_root)->num_log_entries - 1; // leave one free
- } TX_ONABORT {
- this->m_total_log_entries = 0;
- this->m_free_log_entries = 0;
- lderr(cct) << "failed to initialize pool (" << this->m_log_pool_name << ")" << dendl;
- on_finish->complete(-pmemobj_tx_errno());
- return;
- } TX_FINALLY {
- } TX_END;
- } else {
- m_cache_state->present = true;
- /* Open existing pool */
- if ((m_log_pool =
- pmemobj_open(this->m_log_pool_name.c_str(),
- this->m_pwl_pool_layout_name)) == NULL) {
- lderr(cct) << "failed to open pool (" << this->m_log_pool_name << "): "
- << pmemobj_errormsg() << dendl;
- on_finish->complete(-errno);
- return;
- }
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
- if (D_RO(pool_root)->header.layout_version != RWL_POOL_VERSION) {
- // TODO: will handle upgrading version in the future
- lderr(cct) << "Pool layout version is " << D_RO(pool_root)->header.layout_version
- << " expected " << RWL_POOL_VERSION << dendl;
- on_finish->complete(-EINVAL);
- return;
- }
- if (D_RO(pool_root)->block_size != MIN_WRITE_ALLOC_SIZE) {
- lderr(cct) << "Pool block size is " << D_RO(pool_root)->block_size
- << " expected " << MIN_WRITE_ALLOC_SIZE << dendl;
- on_finish->complete(-EINVAL);
- return;
- }
- this->m_log_pool_actual_size = D_RO(pool_root)->pool_size;
- this->m_flushed_sync_gen = D_RO(pool_root)->flushed_sync_gen;
- this->m_total_log_entries = D_RO(pool_root)->num_log_entries;
- m_first_free_entry = D_RO(pool_root)->first_free_entry;
- m_first_valid_entry = D_RO(pool_root)->first_valid_entry;
- if (m_first_free_entry < m_first_valid_entry) {
- /* Valid entries wrap around the end of the ring, so first_free is lower
- * than first_valid. If first_valid was == first_free+1, the entry at
- * first_free would be empty. The last entry is never used, so in
- * that case there would be zero free log entries. */
- this->m_free_log_entries = (m_first_valid_entry - m_first_free_entry) - 1;
- } else {
- /* first_valid is <= first_free. If they are == we have zero valid log
- * entries, and n-1 free log entries */
- this->m_free_log_entries = this->m_total_log_entries - (m_first_free_entry - m_first_valid_entry) - 1;
- }
- size_t effective_pool_size = (size_t)(this->m_log_pool_config_size * USABLE_SIZE);
- this->m_bytes_allocated_cap = effective_pool_size;
- load_existing_entries(later);
- m_cache_state->clean = this->m_dirty_log_entries.empty();
- m_cache_state->empty = m_log_entries.empty();
- }
-}
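
The free-entry arithmetic in both branches above follows from the ring invariant spelled out in the comments: entries from first_valid up to (but excluding) first_free are valid, with wrap, and one slot is always left unused so that first_free == first_valid can only mean an empty log. A small self-checking sketch of that invariant (a ring of 10 entries is assumed):

#include <cassert>
#include <cstdint>

// Usable free slots in a ring of n entries where [first_valid, first_free)
// is valid (modulo n) and one slot is always kept unused.
static uint64_t free_entries(uint64_t n, uint64_t first_free,
                             uint64_t first_valid) {
  uint64_t valid = (first_free + n - first_valid) % n;
  return n - valid - 1;
}

int main() {
  assert(free_entries(10, 0, 0) == 9);  // empty ring: n-1 usable slots
  assert(free_entries(10, 4, 2) == 7);  // two valid entries, no wrap
  assert(free_entries(10, 1, 3) == 1);  // valid entries wrap past the end
  assert(free_entries(10, 1, 2) == 0);  // first_valid == first_free+1: full
}
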
-
-/*
- * Loads the log entries from an existing log.
- *
- * Creates the in-memory structures to represent the state of the
- * re-opened log.
- *
- * Finds the last appended sync point, and any sync points referred to
- * in log entries, but missing from the log. These missing sync points
- * are created and scheduled for append. Some rudimentary consistency
- * checking is done.
- *
- * Rebuilds the m_blocks_to_log_entries map, to make log entries
- * readable.
- *
- * Places all writes on the dirty entries list, which causes them all
- * to be flushed.
- *
- */
-
-template <typename I>
-void ReplicatedWriteLog<I>::load_existing_entries(DeferredContexts &later) {
- TOID(struct WriteLogPoolRoot) pool_root;
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
- struct WriteLogPmemEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries);
- uint64_t entry_index = m_first_valid_entry;
- /* The map below allows us to find sync point log entries by sync
- * gen number, which is necessary so write entries can be linked to
- * their sync points. */
- std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> sync_point_entries;
- /* The map below tracks sync points referred to in writes but not
- * appearing in the sync_point_entries map. We'll use this to
- * determine which sync points are missing and need to be
- * created. */
- std::map<uint64_t, bool> missing_sync_points;
-
- /*
- * Read the existing log entries. Construct an in-memory log entry
- * object of the appropriate type for each. Add these to the global
- * log entries list.
- *
- * Write entries will not link to their sync points yet. We'll do
- * that in the next pass. Here we'll accumulate a map of sync point
- * gen numbers that are referred to in writes but do not appear in
- * the log.
- */
- while (entry_index != m_first_free_entry) {
- WriteLogPmemEntry *pmem_entry = &pmem_log_entries[entry_index];
- std::shared_ptr<GenericLogEntry> log_entry = nullptr;
- ceph_assert(pmem_entry->entry_index == entry_index);
-
- this->update_entries(log_entry, pmem_entry, missing_sync_points,
- sync_point_entries, entry_index);
-
- log_entry->ram_entry = *pmem_entry;
- log_entry->pmem_entry = pmem_entry;
- log_entry->log_entry_index = entry_index;
- log_entry->completed = true;
-
- m_log_entries.push_back(log_entry);
-
- entry_index = (entry_index + 1) % this->m_total_log_entries;
- }
-
- this->update_sync_points(missing_sync_points, sync_point_entries, later);
-}
-
-template <typename I>
-void ReplicatedWriteLog<I>::write_data_to_buffer(std::shared_ptr<WriteLogEntry> ws_entry,
- WriteLogPmemEntry *pmem_entry) {
- ws_entry->pmem_buffer = D_RW(pmem_entry->write_data);
-}
-
-/**
- * Retire up to MAX_ALLOC_PER_TRANSACTION of the oldest log entries
- * that are eligible to be retired. Returns true if anything was
- * retired.
- */
-template <typename I>
-bool ReplicatedWriteLog<I>::retire_entries(const unsigned long int frees_per_tx) {
- CephContext *cct = m_image_ctx.cct;
- GenericLogEntriesVector retiring_entries;
- uint32_t initial_first_valid_entry;
- uint32_t first_valid_entry;
-
- std::lock_guard retire_locker(this->m_log_retire_lock);
- ldout(cct, 20) << "Look for entries to retire" << dendl;
- {
- /* Entry readers can't be added while we hold m_entry_reader_lock */
- RWLock::WLocker entry_reader_locker(this->m_entry_reader_lock);
- std::lock_guard locker(m_lock);
- initial_first_valid_entry = this->m_first_valid_entry;
- first_valid_entry = this->m_first_valid_entry;
- auto entry = m_log_entries.front();
- while (!m_log_entries.empty() &&
- retiring_entries.size() < frees_per_tx &&
- this->can_retire_entry(entry)) {
- if (entry->log_entry_index != first_valid_entry) {
- lderr(cct) << "Retiring entry index (" << entry->log_entry_index
- << ") and first valid log entry index (" << first_valid_entry
- << ") must be ==." << dendl;
- }
- ceph_assert(entry->log_entry_index == first_valid_entry);
- first_valid_entry = (first_valid_entry + 1) % this->m_total_log_entries;
- m_log_entries.pop_front();
- retiring_entries.push_back(entry);
- /* Remove entry from map so there will be no more readers */
- if ((entry->write_bytes() > 0) || (entry->bytes_dirty() > 0)) {
- auto gen_write_entry = static_pointer_cast<GenericWriteLogEntry>(entry);
- if (gen_write_entry) {
- this->m_blocks_to_log_entries.remove_log_entry(gen_write_entry);
- }
- }
- entry = m_log_entries.front();
- }
- }
-
- if (retiring_entries.size()) {
- ldout(cct, 20) << "Retiring " << retiring_entries.size() << " entries" << dendl;
- TOID(struct WriteLogPoolRoot) pool_root;
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
-
- utime_t tx_start;
- utime_t tx_end;
- /* Advance first valid entry and release buffers */
- {
- uint64_t flushed_sync_gen;
- std::lock_guard append_locker(this->m_log_append_lock);
- {
- std::lock_guard locker(m_lock);
- flushed_sync_gen = this->m_flushed_sync_gen;
- }
-
- tx_start = ceph_clock_now();
- TX_BEGIN(m_log_pool) {
- if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
- ldout(m_image_ctx.cct, 20) << "flushed_sync_gen in log updated from "
- << D_RO(pool_root)->flushed_sync_gen << " to "
- << flushed_sync_gen << dendl;
- D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
- }
- D_RW(pool_root)->first_valid_entry = first_valid_entry;
- for (auto &entry: retiring_entries) {
- if (entry->write_bytes()) {
- ldout(cct, 20) << "Freeing " << entry->ram_entry.write_data.oid.pool_uuid_lo
- << "." << entry->ram_entry.write_data.oid.off << dendl;
- TX_FREE(entry->ram_entry.write_data);
- } else {
- ldout(cct, 20) << "Retiring non-write: " << *entry << dendl;
- }
- }
- } TX_ONCOMMIT {
- } TX_ONABORT {
- lderr(cct) << "failed to commit free of" << retiring_entries.size()
- << " log entries (" << this->m_log_pool_name << ")" << dendl;
- ceph_assert(false);
- } TX_FINALLY {
- } TX_END;
- tx_end = ceph_clock_now();
- }
- m_perfcounter->tinc(l_librbd_pwl_retire_tx_t, tx_end - tx_start);
- m_perfcounter->hinc(l_librbd_pwl_retire_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(),
- retiring_entries.size());
-
- /* Update runtime copy of first_valid, and free entries counts */
- {
- std::lock_guard locker(m_lock);
-
- ceph_assert(this->m_first_valid_entry == initial_first_valid_entry);
- this->m_first_valid_entry = first_valid_entry;
- this->m_free_log_entries += retiring_entries.size();
- for (auto &entry: retiring_entries) {
- if (entry->write_bytes()) {
- ceph_assert(this->m_bytes_cached >= entry->write_bytes());
- this->m_bytes_cached -= entry->write_bytes();
- uint64_t entry_allocation_size = entry->write_bytes();
- if (entry_allocation_size < MIN_WRITE_ALLOC_SIZE) {
- entry_allocation_size = MIN_WRITE_ALLOC_SIZE;
- }
- ceph_assert(this->m_bytes_allocated >= entry_allocation_size);
- this->m_bytes_allocated -= entry_allocation_size;
- }
- }
- this->m_alloc_failed_since_retire = false;
- this->wake_up();
- }
- } else {
- ldout(cct, 20) << "Nothing to retire" << dendl;
- return false;
- }
- return true;
-}
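
The credit returned per retired entry mirrors what reserve_pmem() charged: every write buffer occupies at least MIN_WRITE_ALLOC_SIZE, so small writes round up on both sides of the ledger. The rule as a self-checking one-liner (a 4096-byte minimum is assumed for illustration):

#include <cstdint>

constexpr uint64_t kMinWriteAllocSize = 4096;  // stand-in for MIN_WRITE_ALLOC_SIZE

// Bytes charged against m_bytes_allocated for a write of n bytes.
constexpr uint64_t charged_bytes(uint64_t n) {
  return n < kMinWriteAllocSize ? kMinWriteAllocSize : n;
}

static_assert(charged_bytes(512) == 4096, "small writes round up");
static_assert(charged_bytes(8192) == 8192, "large writes charge actual size");

int main() {}
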
-
-template <typename I>
-Context* ReplicatedWriteLog<I>::construct_flush_entry_ctx(
- std::shared_ptr<GenericLogEntry> log_entry) {
- bool invalidating = this->m_invalidating; // snapshot so we behave consistently
- Context *ctx = this->construct_flush_entry(log_entry, invalidating);
-
- if (invalidating) {
- return ctx;
- }
- return new LambdaContext(
- [this, log_entry, ctx](int r) {
- m_image_ctx.op_work_queue->queue(new LambdaContext(
- [this, log_entry, ctx](int r) {
- ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
- << " " << *log_entry << dendl;
- log_entry->writeback(this->m_image_writeback, ctx);
- }), 0);
- });
-}
-
-const unsigned long int ops_flushed_together = 4;
-/*
- * Performs the pmem buffer flush on all scheduled ops, then schedules
- * the log event append operation for all of them.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::flush_then_append_scheduled_ops(void)
-{
- GenericLogOperations ops;
- bool ops_remain = false;
- ldout(m_image_ctx.cct, 20) << dendl;
- do {
- {
- ops.clear();
- std::lock_guard locker(m_lock);
- if (m_ops_to_flush.size()) {
- auto last_in_batch = m_ops_to_flush.begin();
- unsigned int ops_to_flush = m_ops_to_flush.size();
- if (ops_to_flush > ops_flushed_together) {
- ops_to_flush = ops_flushed_together;
- }
- ldout(m_image_ctx.cct, 20) << "should flush " << ops_to_flush << dendl;
- std::advance(last_in_batch, ops_to_flush);
- ops.splice(ops.end(), m_ops_to_flush, m_ops_to_flush.begin(), last_in_batch);
- ops_remain = !m_ops_to_flush.empty();
- ldout(m_image_ctx.cct, 20) << "flushing " << ops.size() << ", "
- << m_ops_to_flush.size() << " remain" << dendl;
- } else {
- ops_remain = false;
- }
- }
- if (ops_remain) {
- enlist_op_flusher();
- }
-
- /* Ops subsequently scheduled for flush may finish before these,
- * which is fine. We're unconcerned with completion order until we
- * get to the log message append step. */
- if (ops.size()) {
- flush_pmem_buffer(ops);
- schedule_append_ops(ops);
- }
- } while (ops_remain);
- append_scheduled_ops();
-}
-
-/*
- * Performs the log event append operation for all of the scheduled
- * events.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::append_scheduled_ops(void) {
- GenericLogOperations ops;
- int append_result = 0;
- bool ops_remain = false;
- bool appending = false; /* true if we set m_appending */
- ldout(m_image_ctx.cct, 20) << dendl;
- do {
- ops.clear();
- this->append_scheduled(ops, ops_remain, appending, true);
-
- if (ops.size()) {
- std::lock_guard locker(this->m_log_append_lock);
- alloc_op_log_entries(ops);
- append_result = append_op_log_entries(ops);
- }
-
- int num_ops = ops.size();
- if (num_ops) {
- /* New entries may be flushable. Completion will wake up flusher. */
- this->complete_op_log_entries(std::move(ops), append_result);
- }
- } while (ops_remain);
-}
-
-template <typename I>
-void ReplicatedWriteLog<I>::enlist_op_flusher()
-{
- this->m_async_flush_ops++;
- this->m_async_op_tracker.start_op();
- Context *flush_ctx = new LambdaContext([this](int r) {
- flush_then_append_scheduled_ops();
- this->m_async_flush_ops--;
- this->m_async_op_tracker.finish_op();
- });
- this->m_work_queue.queue(flush_ctx);
-}
-
-template <typename I>
-void ReplicatedWriteLog<I>::setup_schedule_append(
- pwl::GenericLogOperationsVector &ops, bool do_early_flush) {
- if (do_early_flush) {
- /* This caller is waiting for persist, so we'll use their thread to
- * expedite it */
- flush_pmem_buffer(ops);
- this->schedule_append(ops);
- } else {
- /* This is probably not still the caller's thread, so do the payload
- * flushing/replicating later. */
- schedule_flush_and_append(ops);
- }
-}
-
-/*
- * Takes custody of ops. They'll all get their log entries appended,
- * and have their on_write_persist contexts completed once they and
- * all prior log entries are persisted everywhere.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::schedule_append_ops(GenericLogOperations &ops)
-{
- bool need_finisher;
- GenericLogOperationsVector appending;
-
- std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending));
- {
- std::lock_guard locker(m_lock);
-
- need_finisher = this->m_ops_to_append.empty() && !this->m_appending;
- this->m_ops_to_append.splice(this->m_ops_to_append.end(), ops);
- }
-
- if (need_finisher) {
- this->enlist_op_appender();
- }
-
- for (auto &op : appending) {
- op->appending();
- }
-}
-
-/*
- * Takes custody of ops. They'll all get their pmem blocks flushed,
- * then get their log entries appended.
- */
-template <typename I>
-void ReplicatedWriteLog<I>::schedule_flush_and_append(GenericLogOperationsVector &ops)
-{
- GenericLogOperations to_flush(ops.begin(), ops.end());
- bool need_finisher;
- ldout(m_image_ctx.cct, 20) << dendl;
- {
- std::lock_guard locker(m_lock);
-
- need_finisher = m_ops_to_flush.empty();
- m_ops_to_flush.splice(m_ops_to_flush.end(), to_flush);
- }
-
- if (need_finisher) {
- enlist_op_flusher();
- }
-}
-
-template <typename I>
-void ReplicatedWriteLog<I>::process_work() {
- CephContext *cct = m_image_ctx.cct;
- int max_iterations = 4;
- bool wake_up_requested = false;
- uint64_t aggressive_high_water_bytes = this->m_bytes_allocated_cap * AGGRESSIVE_RETIRE_HIGH_WATER;
- uint64_t high_water_bytes = this->m_bytes_allocated_cap * RETIRE_HIGH_WATER;
- uint64_t low_water_bytes = this->m_bytes_allocated_cap * RETIRE_LOW_WATER;
- uint64_t aggressive_high_water_entries = this->m_total_log_entries * AGGRESSIVE_RETIRE_HIGH_WATER;
- uint64_t high_water_entries = this->m_total_log_entries * RETIRE_HIGH_WATER;
- uint64_t low_water_entries = this->m_total_log_entries * RETIRE_LOW_WATER;
-
- ldout(cct, 20) << dendl;
-
- do {
- {
- std::lock_guard locker(m_lock);
- this->m_wake_up_requested = false;
- }
- if (this->m_alloc_failed_since_retire || this->m_invalidating ||
- this->m_bytes_allocated > high_water_bytes ||
- (m_log_entries.size() > high_water_entries)) {
- int retired = 0;
- utime_t started = ceph_clock_now();
- ldout(m_image_ctx.cct, 10) << "alloc_fail=" << this->m_alloc_failed_since_retire
- << ", allocated > high_water="
- << (this->m_bytes_allocated > high_water_bytes)
- << ", allocated_entries > high_water="
- << (m_log_entries.size() > high_water_entries)
- << dendl;
- while (this->m_alloc_failed_since_retire || this->m_invalidating ||
- (this->m_bytes_allocated > high_water_bytes) ||
- (m_log_entries.size() > high_water_entries) ||
- (((this->m_bytes_allocated > low_water_bytes) ||
- (m_log_entries.size() > low_water_entries)) &&
- (utime_t(ceph_clock_now() - started).to_msec() < RETIRE_BATCH_TIME_LIMIT_MS))) {
- if (!retire_entries((this->m_shutting_down || this->m_invalidating ||
- (this->m_bytes_allocated > aggressive_high_water_bytes) ||
- (m_log_entries.size() > aggressive_high_water_entries))
- ? MAX_ALLOC_PER_TRANSACTION
- : MAX_FREE_PER_TRANSACTION)) {
- break;
- }
- retired++;
- this->dispatch_deferred_writes();
- this->process_writeback_dirty_entries();
- }
- ldout(m_image_ctx.cct, 10) << "Retired " << retired << " times" << dendl;
- }
- this->dispatch_deferred_writes();
- this->process_writeback_dirty_entries();
-
- {
- std::lock_guard locker(m_lock);
- wake_up_requested = this->m_wake_up_requested;
- }
- } while (wake_up_requested && --max_iterations > 0);
-
- {
- std::lock_guard locker(m_lock);
- this->m_wake_up_scheduled = false;
- /* Reschedule if it's still requested */
- if (this->m_wake_up_requested) {
- this->wake_up();
- }
- }
-}
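
The loop above is governed by three allocation thresholds: retirement starts past the high-water mark, continues down toward the low-water mark while inside the time budget, and switches to larger transaction batches past the aggressive mark. A compact sketch of that decision with hypothetical mark values and batch sizes (stand-ins for the real RETIRE_* and MAX_*_PER_TRANSACTION constants):

#include <cstdint>

constexpr double kHighWater = 0.50;            // start retiring above this
constexpr double kLowWater = 0.40;             // stop retiring below this
constexpr double kAggressiveHighWater = 0.75;  // retire in bigger batches

// Entries to retire per transaction right now; 0 means no retirement needed.
static unsigned retire_batch(uint64_t allocated, uint64_t cap,
                             bool within_time_budget) {
  if (allocated > cap * kAggressiveHighWater) {
    return 1024;  // MAX_ALLOC_PER_TRANSACTION stand-in
  }
  if (allocated > cap * kHighWater ||
      (allocated > cap * kLowWater && within_time_budget)) {
    return 256;   // MAX_FREE_PER_TRANSACTION stand-in
  }
  return 0;
}

int main() {
  return retire_batch(800, 1000, false) == 1024 ? 0 : 1;  // 80% > aggressive mark
}
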
-
-/*
- * Flush the pmem regions for the data blocks of a set of operations
- *
- * V is expected to be GenericLogOperations<I>, or GenericLogOperationsVector<I>
- */
-template <typename I>
-template <typename V>
-void ReplicatedWriteLog<I>::flush_pmem_buffer(V& ops)
-{
- for (auto &operation : ops) {
- operation->flush_pmem_buf_to_cache(m_log_pool);
- }
-
- /* Drain once for all */
- pmemobj_drain(m_log_pool);
-
- utime_t now = ceph_clock_now();
- for (auto &operation : ops) {
- if (operation->reserved_allocated()) {
- operation->buf_persist_comp_time = now;
- } else {
- ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
- }
- }
-}
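
The flush-then-drain split above is the usual libpmemobj batching idiom: pmemobj_flush() starts cache-line writeback for each buffer, and a single pmemobj_drain() fence then covers the whole batch, rather than paying pmemobj_persist()'s fence once per buffer. A minimal sketch, assuming an already-opened pool:

#include <libpmemobj.h>
#include <cstddef>

// Persist several regions with one fence: flush each, then drain once.
static void persist_batch(PMEMobjpool *pool, void *const addrs[],
                          const size_t lens[], size_t n) {
  for (size_t i = 0; i < n; ++i) {
    pmemobj_flush(pool, addrs[i], lens[i]);  // begin writeback, no fence yet
  }
  pmemobj_drain(pool);  // one fence makes everything flushed above durable
}
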
-
-/**
- * Update/persist the last flushed sync point in the log
- */
-template <typename I>
-void ReplicatedWriteLog<I>::persist_last_flushed_sync_gen()
-{
- TOID(struct WriteLogPoolRoot) pool_root;
- pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
- uint64_t flushed_sync_gen;
-
- std::lock_guard append_locker(this->m_log_append_lock);
- {
- std::lock_guard locker(m_lock);
- flushed_sync_gen = this->m_flushed_sync_gen;
- }
-
- if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
- ldout(m_image_ctx.cct, 15) << "flushed_sync_gen in log updated from "
- << D_RO(pool_root)->flushed_sync_gen << " to "
- << flushed_sync_gen << dendl;
- TX_BEGIN(m_log_pool) {
- D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
- } TX_ONCOMMIT {
- } TX_ONABORT {
- lderr(m_image_ctx.cct) << "failed to commit update of flushed sync point" << dendl;
- ceph_assert(false);
- } TX_FINALLY {
- } TX_END;
- }
-}
-
-template <typename I>
-void ReplicatedWriteLog<I>::reserve_pmem(C_BlockIORequestT *req,
- bool &alloc_succeeds, bool &no_space) {
- std::vector<WriteBufferAllocation>& buffers = req->get_resources_buffers();
- for (auto &buffer : buffers) {
- utime_t before_reserve = ceph_clock_now();
- buffer.buffer_oid = pmemobj_reserve(m_log_pool,
- &buffer.buffer_alloc_action,
- buffer.allocation_size,
- 0 /* Object type */);
- buffer.allocation_lat = ceph_clock_now() - before_reserve;
- if (TOID_IS_NULL(buffer.buffer_oid)) {
- if (!req->has_io_waited_for_buffers()) {
- req->set_io_waited_for_entries(true);
- }
- ldout(m_image_ctx.cct, 5) << "can't allocate all data buffers: "
- << pmemobj_errormsg() << ". "
- << *req << dendl;
- alloc_succeeds = false;
- no_space = true; /* Entries need to be retired */
- break;
- } else {
- buffer.allocated = true;
- }
- ldout(m_image_ctx.cct, 20) << "Allocated " << buffer.buffer_oid.oid.pool_uuid_lo
- << "." << buffer.buffer_oid.oid.off
- << ", size=" << buffer.allocation_size << dendl;
- }
-}
-
-template <typename I>
-void ReplicatedWriteLog<I>::copy_pmem(C_BlockIORequestT *req) {
- req->copy_pmem();
-}
-
-template <typename I>
-bool ReplicatedWriteLog<I>::alloc_resources(C_BlockIORequestT *req) {
- bool alloc_succeeds = true;
- uint64_t bytes_allocated = 0;
- uint64_t bytes_cached = 0;
- uint64_t bytes_dirtied = 0;
- uint64_t num_lanes = 0;
- uint64_t num_unpublished_reserves = 0;
- uint64_t num_log_entries = 0;
-
- ldout(m_image_ctx.cct, 20) << dendl;
- // Set up the buffers and determine the number of resources required
- req->setup_buffer_resources(bytes_cached, bytes_dirtied, bytes_allocated,
- num_lanes, num_log_entries, num_unpublished_reserves);
-
- alloc_succeeds = this->check_allocation(req, bytes_cached, bytes_dirtied, bytes_allocated,
- num_lanes, num_log_entries, num_unpublished_reserves,
- this->m_bytes_allocated_cap);
-
- std::vector<WriteBufferAllocation>& buffers = req->get_resources_buffers();
- if (!alloc_succeeds) {
- /* On alloc failure, free any buffers we did allocate */
- for (auto &buffer : buffers) {
- if (buffer.allocated) {
- pmemobj_cancel(m_log_pool, &buffer.buffer_alloc_action, 1);
- }
- }
- }
-
- req->set_allocated(alloc_succeeds);
- return alloc_succeeds;
-}
-
-} // namespace pwl
-} // namespace cache
-} // namespace librbd
-
-template class librbd::cache::pwl::ReplicatedWriteLog<librbd::ImageCtx>;
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
-#define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
-
-#include <libpmemobj.h>
-#include "common/RWLock.h"
-#include "common/WorkQueue.h"
-#include "common/AsyncOpTracker.h"
-#include "librbd/cache/ImageWriteback.h"
-#include "librbd/Utils.h"
-#include "librbd/BlockGuard.h"
-#include "librbd/cache/Types.h"
-#include "librbd/cache/pwl/LogOperation.h"
-#include "librbd/cache/pwl/Request.h"
-#include "librbd/cache/pwl/LogMap.h"
-#include "AbstractWriteLog.h"
-#include <functional>
-#include <list>
-
-class Context;
-class SafeTimer;
-
-namespace librbd {
-
-struct ImageCtx;
-
-namespace cache {
-
-namespace pwl {
-
-template <typename ImageCtxT>
-class ReplicatedWriteLog : public AbstractWriteLog<ImageCtxT> {
-public:
- ReplicatedWriteLog(
- ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
- ImageWritebackInterface& image_writeback,
- plugin::Api<ImageCtxT>& plugin_api);
- ~ReplicatedWriteLog();
- ReplicatedWriteLog(const ReplicatedWriteLog&) = delete;
- ReplicatedWriteLog &operator=(const ReplicatedWriteLog&) = delete;
-
-private:
- using This = AbstractWriteLog<ImageCtxT>;
- using C_WriteRequestT = pwl::C_WriteRequest<This>;
- using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
- using C_FlushRequestT = pwl::C_FlushRequest<This>;
- using C_DiscardRequestT = pwl::C_DiscardRequest<This>;
- using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
- using C_CompAndWriteRequestT = pwl::C_CompAndWriteRequest<This>;
-
- PMEMobjpool *m_log_pool = nullptr;
- const char* m_pwl_pool_layout_name;
-
- void remove_pool_file();
- void load_existing_entries(pwl::DeferredContexts &later);
- void alloc_op_log_entries(pwl::GenericLogOperations &ops);
- int append_op_log_entries(pwl::GenericLogOperations &ops);
- void flush_then_append_scheduled_ops(void);
- void enlist_op_flusher();
- void flush_op_log_entries(pwl::GenericLogOperationsVector &ops);
- template <typename V>
- void flush_pmem_buffer(V& ops);
-
-protected:
- using AbstractWriteLog<ImageCtxT>::m_lock;
- using AbstractWriteLog<ImageCtxT>::m_log_entries;
- using AbstractWriteLog<ImageCtxT>::m_image_ctx;
- using AbstractWriteLog<ImageCtxT>::m_perfcounter;
- using AbstractWriteLog<ImageCtxT>::m_ops_to_flush;
- using AbstractWriteLog<ImageCtxT>::m_cache_state;
- using AbstractWriteLog<ImageCtxT>::m_first_free_entry;
- using AbstractWriteLog<ImageCtxT>::m_first_valid_entry;
-
- void process_work() override;
- void copy_pmem(C_BlockIORequestT *req) override;
- void schedule_append_ops(pwl::GenericLogOperations &ops) override;
- void append_scheduled_ops(void) override;
- void reserve_pmem(C_BlockIORequestT *req, bool &alloc_succeeds, bool &no_space) override;
- bool retire_entries(const unsigned long int frees_per_tx) override;
- void persist_last_flushed_sync_gen() override;
- bool alloc_resources(C_BlockIORequestT *req) override;
- void schedule_flush_and_append(pwl::GenericLogOperationsVector &ops) override;
- void setup_schedule_append(
- pwl::GenericLogOperationsVector &ops, bool do_early_flush) override;
- Context *construct_flush_entry_ctx(
- const std::shared_ptr<pwl::GenericLogEntry> log_entry) override;
- void initialize_pool(Context *on_finish, pwl::DeferredContexts &later) override;
- void write_data_to_buffer(
- std::shared_ptr<pwl::WriteLogEntry> ws_entry,
- pwl::WriteLogPmemEntry *pmem_entry) override;
-};
-
-} // namespace pwl
-} // namespace cache
-} // namespace librbd
-
-extern template class librbd::cache::pwl::ReplicatedWriteLog<librbd::ImageCtx>;
-
-#endif // CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
if (m_user_req_completed.compare_exchange_strong(initial, true)) {
ldout(pwl.get_context(), 15) << this << " completing user req" << dendl;
m_user_req_completed_time = ceph_clock_now();
- user_req->complete(r);
- // Set user_req as null as it is deleted
- user_req = nullptr;
+ pwl.complete_user_request(user_req, r);
} else {
ldout(pwl.get_context(), 20) << this << " user req already completed" << dendl;
}
ldout(pwl.get_context(), 99) << this << dendl;
}
+template <typename T>
+C_WriteRequest<T>::C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
+ Context *user_req)
+ : C_BlockIORequest<T>(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, user_req),
+ mismatch_offset(mismatch_offset), cmp_bl(std::move(cmp_bl)),
+ m_perfcounter(perfcounter), m_lock(lock) {
+ is_comp_and_write = true;
+ ldout(pwl.get_context(), 20) << dendl;
+}
+
template <typename T>
C_WriteRequest<T>::~C_WriteRequest() {
ldout(pwl.get_context(), 99) << this << dendl;
/* Completed to caller by here (in finish(), which calls this) */
utime_t now = ceph_clock_now();
+ if (is_comp_and_write && !compare_succeeded) {
+ update_req_stats(now);
+ return;
+ }
pwl.release_write_lanes(this);
ceph_assert(m_resources.allocated);
m_resources.allocated = false;
}
template <typename T>
-void C_WriteRequest<T>::setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) {
-
- ceph_assert(!m_resources.allocated);
-
- auto image_extents_size = this->image_extents.size();
- m_resources.buffers.reserve(image_extents_size);
-
- bytes_cached = 0;
- bytes_allocated = 0;
- number_lanes = image_extents_size;
- number_log_entries = image_extents_size;
- number_unpublished_reserves = image_extents_size;
-
- for (auto &extent : this->image_extents) {
- m_resources.buffers.emplace_back();
- struct WriteBufferAllocation &buffer = m_resources.buffers.back();
- buffer.allocation_size = MIN_WRITE_ALLOC_SIZE;
- buffer.allocated = false;
- bytes_cached += extent.second;
- if (extent.second > buffer.allocation_size) {
- buffer.allocation_size = extent.second;
- }
- bytes_allocated += buffer.allocation_size;
- }
- bytes_dirtied = bytes_cached;
-}
-
-template <typename T>
-std::shared_ptr<WriteLogOperation> C_WriteRequest<T>::create_operation(uint64_t offset, uint64_t len) {
- return std::make_shared<WriteLogOperation>(*op_set, offset, len, pwl.get_context());
+std::shared_ptr<WriteLogOperation> C_WriteRequest<T>::create_operation(
+ uint64_t offset, uint64_t len) {
+ return pwl.m_builder->create_write_log_operation(
+ *op_set, offset, len, pwl.get_context(),
+ pwl.m_builder->create_write_log_entry(op_set->sync_point->log_entry, offset, len));
}
template <typename T>
pwl.add_into_log_map(log_entries, this);
}
-#ifdef WITH_RBD_RWL
template <typename T>
-void C_WriteRequest<T>::copy_pmem() {
- auto allocation = m_resources.buffers.begin();
- for (auto &operation : op_set->operations) {
- operation->copy_bl_to_pmem_buffer(allocation);
- allocation++;
- }
+void C_WriteRequest<T>::copy_cache() {
+ pwl.copy_bl_to_buffer(&m_resources, op_set);
}
-#endif
template <typename T>
bool C_WriteRequest<T>::append_write_request(std::shared_ptr<SyncPoint> sync_point) {
template <typename T>
void C_FlushRequest<T>::setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) {
- number_log_entries = 1;
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {
+ *number_log_entries = 1;
}
template <typename T>
template <typename T>
void C_DiscardRequest<T>::setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) {
- number_log_entries = 1;
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {
+ *number_log_entries = 1;
/* No bytes are allocated for a discard, but we count the discarded bytes
* as dirty. This means it's possible to have more bytes dirty than
* there are bytes cached or allocated. */
for (auto &extent : this->image_extents) {
- bytes_dirtied = extent.second;
+ *bytes_dirtied = extent.second;
break;
}
}
}
template <typename T>
-C_WriteSameRequest<T>::C_WriteSameRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
- bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
- PerfCounters *perfcounter, Context *user_req)
- : C_WriteRequest<T>(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, lock, perfcounter, user_req) {
+C_WriteSameRequest<T>::C_WriteSameRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : C_WriteRequest<T>(pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req) {
ldout(pwl.get_context(), 20) << this << dendl;
}
this->m_perfcounter->tinc(l_librbd_pwl_ws_latency, comp_latency);
}
-/* Write sames will allocate one buffer, the size of the repeating pattern */
template <typename T>
-void C_WriteSameRequest<T>::setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) {
- ldout(pwl.get_context(), 20) << this << dendl;
+std::shared_ptr<WriteLogOperation> C_WriteSameRequest<T>::create_operation(
+ uint64_t offset, uint64_t len) {
ceph_assert(this->image_extents.size() == 1);
- bytes_dirtied += this->image_extents[0].second;
- auto pattern_length = this->bl.length();
- this->m_resources.buffers.emplace_back();
- struct WriteBufferAllocation &buffer = this->m_resources.buffers.back();
- buffer.allocation_size = MIN_WRITE_ALLOC_SIZE;
- buffer.allocated = false;
- bytes_cached += pattern_length;
- if (pattern_length > buffer.allocation_size) {
- buffer.allocation_size = pattern_length;
- }
- bytes_allocated += buffer.allocation_size;
-}
-
-template <typename T>
-std::shared_ptr<WriteLogOperation> C_WriteSameRequest<T>::create_operation(uint64_t offset, uint64_t len) {
- ceph_assert(this->image_extents.size() == 1);
- return std::make_shared<WriteSameLogOperation>(*this->op_set.get(), offset, len,
- this->bl.length(), pwl.get_context());
+ WriteLogOperationSet &set = *this->op_set.get();
+ return pwl.m_builder->create_write_log_operation(
+ *this->op_set.get(), offset, len, this->bl.length(), pwl.get_context(),
+ pwl.m_builder->create_writesame_log_entry(set.sync_point->log_entry, offset,
+ len, this->bl.length()));
}
template <typename T>
}
template <typename T>
-C_CompAndWriteRequest<T>::C_CompAndWriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
- bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
- int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
- Context *user_req)
- : C_WriteRequest<T>(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, lock, perfcounter, user_req),
- mismatch_offset(mismatch_offset), cmp_bl(std::move(cmp_bl)) {
- ldout(pwl.get_context(), 20) << dendl;
-}
-
-template <typename T>
-C_CompAndWriteRequest<T>::~C_CompAndWriteRequest() {
- ldout(pwl.get_context(), 20) << dendl;
-}
-
-template <typename T>
-void C_CompAndWriteRequest<T>::finish_req(int r) {
- if (compare_succeeded) {
- C_WriteRequest<T>::finish_req(r);
- } else {
- utime_t now = ceph_clock_now();
- update_req_stats(now);
- }
-}
-
-template <typename T>
-void C_CompAndWriteRequest<T>::update_req_stats(utime_t &now) {
+void C_WriteRequest<T>::update_req_stats(utime_t &now) {
/* Compare-and-write stats. Compare-and-write excluded from most write
* stats because the read phase will make them look like slow writes in
* those histograms. */
- if (!compare_succeeded) {
- this->m_perfcounter->inc(l_librbd_pwl_cmp_fails, 1);
+ if (is_comp_and_write) {
+ if (!compare_succeeded) {
+ this->m_perfcounter->inc(l_librbd_pwl_cmp_fails, 1);
+ }
+ utime_t comp_latency = now - this->m_arrived_time;
+ this->m_perfcounter->tinc(l_librbd_pwl_cmp_latency, comp_latency);
}
- utime_t comp_latency = now - this->m_arrived_time;
- this->m_perfcounter->tinc(l_librbd_pwl_cmp_latency, comp_latency);
-}
-
-template <typename T>
-std::ostream &operator<<(std::ostream &os,
- const C_CompAndWriteRequest<T> &req) {
- os << (C_WriteRequest<T>&)req
- << "cmp_bl=" << req.cmp_bl << ", "
- << "read_bl=" << req.read_bl << ", "
- << "compare_succeeded=" << req.compare_succeeded << ", "
- << "mismatch_offset=" << req.mismatch_offset;
- return os;
}
} // namespace pwl
template class librbd::cache::pwl::C_FlushRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
template class librbd::cache::pwl::C_DiscardRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
template class librbd::cache::pwl::C_WriteSameRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
-template class librbd::cache::pwl::C_CompAndWriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#ifndef CEPH_LIBRBD_CACHE_RWL_REQUEST_H
-#define CEPH_LIBRBD_CACHE_RWL_REQUEST_H
+#ifndef CEPH_LIBRBD_CACHE_PWL_REQUEST_H
+#define CEPH_LIBRBD_CACHE_PWL_REQUEST_H
#include "include/Context.h"
#include "librbd/cache/pwl/Types.h"
virtual void dispatch() = 0;
- virtual void copy_pmem() {};
+ virtual void copy_cache() {};
virtual const char *get_name() const {
return "C_BlockIORequest";
}
virtual void setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) {};
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {};
protected:
utime_t m_arrived_time;
class C_WriteRequest : public C_BlockIORequest<T> {
public:
using C_BlockIORequest<T>::pwl;
+ bool compare_succeeded = false;
+ uint64_t *mismatch_offset = nullptr;
+ bufferlist cmp_bl;
+ bufferlist read_bl;
+ bool is_comp_and_write = false;
unique_ptr<WriteLogOperationSet> op_set = nullptr;
C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
PerfCounters *perfcounter, Context *user_req);
+ C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
+ Context *user_req);
+
~C_WriteRequest() override;
void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx);
void finish_req(int r) override;
/* Compare and write will override this */
- virtual void update_req_stats(utime_t &now) {
- // TODO: Add in later PRs
- }
+ virtual void update_req_stats(utime_t &now);
+
bool alloc_resources() override;
void deferred_handler() override { }
void dispatch() override;
- #ifdef WITH_RBD_RWL
- void copy_pmem() override;
- #endif
+ void copy_cache() override;
- virtual std::shared_ptr<WriteLogOperation> create_operation(uint64_t offset, uint64_t len);
+ virtual std::shared_ptr<WriteLogOperation> create_operation(uint64_t offset,
+ uint64_t len);
virtual void setup_log_operations(DeferredContexts &on_exit);
protected:
using C_BlockIORequest<T>::m_resources;
PerfCounters *m_perfcounter = nullptr;
- /* Plain writes will allocate one buffer per request extent */
- void setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) override;
private:
bool m_do_early_flush = false;
}
void setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) override;
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+ uint64_t *bytes_allocated, uint64_t *number_lanes,
+ uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) override;
private:
std::shared_ptr<SyncPointLogOperation> op;
ceph::mutex &m_lock;
return "C_DiscardRequest";
}
void setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) override;
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) override;
private:
uint32_t m_discard_granularity_bytes;
ceph::mutex &m_lock;
void update_req_stats(utime_t &now) override;
- void setup_buffer_resources(
- uint64_t &bytes_cached, uint64_t &bytes_dirtied, uint64_t &bytes_allocated,
- uint64_t &number_lanes, uint64_t &number_log_entries,
- uint64_t &number_unpublished_reserves) override;
-
std::shared_ptr<WriteLogOperation> create_operation(uint64_t offset, uint64_t len) override;
const char *get_name() const override {
const C_WriteSameRequest<U> &req);
};
-/**
- * This is the custodian of the BlockGuard cell for this compare and write. The
- * block guard is acquired before the read begins to guarantee atomicity of this
- * operation. If this results in a write, the block guard will be released
- * when the write completes to all replicas.
- */
-template <typename T>
-class C_CompAndWriteRequest : public C_WriteRequest<T> {
-public:
- using C_BlockIORequest<T>::pwl;
- bool compare_succeeded = false;
- uint64_t *mismatch_offset;
- bufferlist cmp_bl;
- bufferlist read_bl;
- C_CompAndWriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents,
- bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
- int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter,
- Context *user_req);
- ~C_CompAndWriteRequest();
-
- void finish_req(int r) override;
-
- void update_req_stats(utime_t &now) override;
-
- /*
- * Compare and write doesn't implement alloc_resources(), deferred_handler(),
- * or dispatch(). We use the implementation in C_WriteRequest(), and only if the
- * compare phase succeeds and a write is actually performed.
- */
-
- const char *get_name() const override {
- return "C_CompAndWriteRequest";
- }
- template <typename U>
- friend std::ostream &operator<<(std::ostream &os,
- const C_CompAndWriteRequest<U> &req);
-};
-
struct BlockGuardReqState {
bool barrier = false; /* This is a barrier request */
bool current_barrier = false; /* This is the currently active barrier */
} // namespace cache
} // namespace librbd
-#endif // CEPH_LIBRBD_CACHE_RWL_REQUEST_H
+#endif // CEPH_LIBRBD_CACHE_PWL_REQUEST_H
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_SSD_TYPES_H
-#define CEPH_LIBRBD_CACHE_SSD_TYPES_H
-
-#include "acconfig.h"
-
-#include "librbd/io/Types.h"
-#include "Types.h" //generic type = to be renamed
-
-namespace librbd {
-namespace cache {
-namespace pwl {
-
-struct SuperBlock{
- WriteLogPoolRoot root;
-
- DENC(SuperBlock, v, p) {
- DENC_START(1, 1, p);
- denc(v.root, p);
- DENC_FINISH(p);
- }
-
- void dump(Formatter *f) const {
- f->dump_object("super", root);
- }
-
- static void generate_test_instances(list<SuperBlock*>& ls) {
- ls.push_back(new SuperBlock);
- ls.push_back(new SuperBlock);
- ls.back()->root.first_valid_entry = 2;
- }
-};
-
-} // namespace pwl
-} // namespace cache
-} // namespace librbd
-
-WRITE_CLASS_DENC(librbd::cache::pwl::SuperBlock)
-
-#endif // CEPH_LIBRBD_CACHE_SSD_TYPES_H
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "SSDWriteLog.h"
-#include "include/buffer.h"
-#include "include/Context.h"
-#include "include/ceph_assert.h"
-#include "common/deleter.h"
-#include "common/dout.h"
-#include "common/environment.h"
-#include "common/errno.h"
-#include "common/WorkQueue.h"
-#include "common/Timer.h"
-#include "common/perf_counters.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/asio/ContextWQ.h"
-#include "librbd/cache/pwl/ImageCacheState.h"
-#include "librbd/cache/pwl/LogEntry.h"
-#include <map>
-#include <vector>
-
-#undef dout_subsys
-#define dout_subsys ceph_subsys_rbd_pwl
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::cache::pwl::SSDWriteLog: " << this << " " \
- << __func__ << ": "
-
-namespace librbd {
-namespace cache {
-namespace pwl {
-
-using namespace librbd::cache::pwl;
-
-// SSD: this number can be updated later
-const unsigned long int ops_appended_together = MAX_WRITES_PER_SYNC_POINT;
-
-template <typename I>
-SSDWriteLog<I>::SSDWriteLog(
- I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
- cache::ImageWritebackInterface& image_writeback,
- plugin::Api<I>& plugin_api)
- : AbstractWriteLog<I>(image_ctx, cache_state, image_writeback, plugin_api)
-{
-}
-
-template <typename I>
-void SSDWriteLog<I>::initialize_pool(Context *on_finish, pwl::DeferredContexts &later) {
- CephContext *cct = m_image_ctx.cct;
- ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
- if (access(this->m_log_pool_name.c_str(), F_OK) != 0) {
- int fd = ::open(this->m_log_pool_name.c_str(), O_RDWR|O_CREAT, 0644);
- bool succeed = true;
- if (fd >= 0) {
- if (truncate(this->m_log_pool_name.c_str(), this->m_log_pool_config_size) != 0) {
- succeed = false;
- }
- ::close(fd);
- } else {
- succeed = false;
- }
- if (!succeed) {
- m_cache_state->present = false;
- m_cache_state->clean = true;
- m_cache_state->empty = true;
- /* TODO: filter/replace errnos that are meaningless to the caller */
- on_finish->complete(-errno);
- return;
- }
-
- bdev = BlockDevice::create(cct, this->m_log_pool_name, aio_cache_cb,
- nullptr, nullptr, nullptr);
- int r = bdev->open(this->m_log_pool_name);
- if (r < 0) {
- delete bdev;
- on_finish->complete(r);
- return;
- }
- m_cache_state->present = true;
- m_cache_state->clean = true;
- m_cache_state->empty = true;
- /* new pool, calculate and store metadata */
- size_t small_write_size = MIN_WRITE_ALLOC_SIZE + sizeof(struct WriteLogPmemEntry);
-
- uint64_t num_small_writes = (uint64_t)(this->m_log_pool_config_size / small_write_size);
- if (num_small_writes > MAX_LOG_ENTRIES) {
- num_small_writes = MAX_LOG_ENTRIES;
- }
- ceph_assert(num_small_writes > 2);
- m_log_pool_ring_buffer_size = this->m_log_pool_config_size - DATA_RING_BUFFER_OFFSET;
- /* Log ring empty */
- m_first_free_entry = DATA_RING_BUFFER_OFFSET;
- m_first_valid_entry = DATA_RING_BUFFER_OFFSET;
-
- pool_size = this->m_log_pool_config_size;
- auto new_root = std::make_shared<WriteLogPoolRoot>(pool_root);
- new_root->pool_size = this->m_log_pool_config_size;
- new_root->flushed_sync_gen = this->m_flushed_sync_gen;
- new_root->block_size = MIN_WRITE_ALLOC_SIZE;
- new_root->first_free_entry = m_first_free_entry;
- new_root->first_valid_entry = m_first_valid_entry;
- new_root->num_log_entries = num_small_writes;
- pool_root = *new_root;
-
- r = update_pool_root_sync(new_root);
- if (r != 0) {
- this->m_total_log_entries = 0;
- this->m_free_log_entries = 0;
- lderr(m_image_ctx.cct) << "failed to initialize pool ("
- << this->m_log_pool_name << ")" << dendl;
- on_finish->complete(r);
- return;
- }
- this->m_total_log_entries = new_root->num_log_entries;
- this->m_free_log_entries = new_root->num_log_entries - 1;
- } else {
- m_cache_state->present = true;
- bdev = BlockDevice::create(
- cct, this->m_log_pool_name, aio_cache_cb,
- static_cast<void*>(this), nullptr, static_cast<void*>(this));
- int r = bdev->open(this->m_log_pool_name);
- if (r < 0) {
- delete bdev;
- on_finish->complete(r);
- return;
- }
- // load_existing_entries(later); // TODO: implement and uncomment in a later PR
- if (m_first_free_entry < m_first_valid_entry) {
- /* Valid entries wrap around the end of the ring, so first_free is lower
- * than first_valid. If first_valid was == first_free+1, the entry at
- * first_free would be empty. The last entry is never used, so in
- * that case there would be zero free log entries. */
- this->m_free_log_entries = this->m_total_log_entries -
- (m_first_valid_entry - m_first_free_entry) - 1;
- } else {
- /* first_valid is <= first_free. If they are == we have zero valid log
- * entries, and n-1 free log entries */
- this->m_free_log_entries = this->m_total_log_entries -
- (m_first_free_entry - m_first_valid_entry) - 1;
- }
- m_cache_state->clean = this->m_dirty_log_entries.empty();
- m_cache_state->empty = m_log_entries.empty();
- }
-}
-
-template <typename I>
-int SSDWriteLog<I>::update_pool_root_sync(
- std::shared_ptr<WriteLogPoolRoot> root) {
- bufferlist bl;
- SuperBlock superblock;
- superblock.root = *root;
- encode(superblock, bl);
- bl.append_zero(MIN_WRITE_ALLOC_SIZE - bl.length());
- ceph_assert(bl.length() % MIN_WRITE_ALLOC_SIZE == 0);
- return bdev->write(0, bl, false);
-}
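
The append_zero() above pads the encoded superblock out to MIN_WRITE_ALLOC_SIZE because BlockDevice writes must be block-aligned, and the superblock is assumed to fit in a single block. The general rounding rule, as a self-checking sketch with a 4096-byte block assumed:

#include <cstddef>

// Round a byte count up to the next multiple of the device block size.
constexpr size_t align_up(size_t len, size_t block = 4096) {
  return (len + block - 1) / block * block;
}

static_assert(align_up(100) == 4096, "a short superblock pads to one block");
static_assert(align_up(4096) == 4096, "aligned lengths are unchanged");
static_assert(align_up(4097) == 8192, "overflow spills into a second block");

int main() {}
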
-
-} // namespace pwl
-} // namespace cache
-} // namespace librbd
-
-template class librbd::cache::pwl::SSDWriteLog<librbd::ImageCtx>;
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
-#define CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
-
-#include "AbstractWriteLog.h"
-#include "blk/BlockDevice.h"
-#include "common/AsyncOpTracker.h"
-#include "common/Checksummer.h"
-#include "common/environment.h"
-#include "common/RWLock.h"
-#include "common/WorkQueue.h"
-#include "librbd/BlockGuard.h"
-#include "librbd/Utils.h"
-#include "librbd/cache/ImageWriteback.h"
-#include "librbd/cache/Types.h"
-#include "librbd/cache/pwl/LogMap.h"
-#include "librbd/cache/pwl/LogOperation.h"
-#include "librbd/cache/pwl/Request.h"
-#include "librbd/cache/pwl/SSDTypes.h"
-#include <functional>
-#include <list>
-
-namespace librbd {
-
-struct ImageCtx;
-
-namespace cache {
-
-namespace pwl {
-
-template <typename ImageCtxT>
-class SSDWriteLog : public AbstractWriteLog<ImageCtxT> {
-public:
- SSDWriteLog(ImageCtxT &image_ctx,
- librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
- cache::ImageWritebackInterface& image_writeback,
- plugin::Api<ImageCtxT>& plugin_api);
- ~SSDWriteLog() {}
- SSDWriteLog(const SSDWriteLog&) = delete;
- SSDWriteLog &operator=(const SSDWriteLog&) = delete;
-
- using This = AbstractWriteLog<ImageCtxT>;
- using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
-
- // TODO: implement the functions below in a later PR
- bool alloc_resources(C_BlockIORequestT *req) override { return false; }
- void setup_schedule_append(
- pwl::GenericLogOperationsVector &ops, bool do_early_flush) override {}
-
-protected:
- using AbstractWriteLog<ImageCtxT>::m_lock;
- using AbstractWriteLog<ImageCtxT>::m_log_entries;
- using AbstractWriteLog<ImageCtxT>::m_image_ctx;
- using AbstractWriteLog<ImageCtxT>::m_cache_state;
- using AbstractWriteLog<ImageCtxT>::m_first_free_entry;
- using AbstractWriteLog<ImageCtxT>::m_first_valid_entry;
-
- void initialize_pool(Context *on_finish, pwl::DeferredContexts &later) override;
- // TODO: implement the functions below in a later PR
- void process_work() override {}
- void append_scheduled_ops(void) override {}
- void schedule_append_ops(pwl::GenericLogOperations &ops) override {}
- void remove_pool_file() override {}
-
-private:
- uint64_t m_log_pool_ring_buffer_size; /* Size of ring buffer */
-
- // classes and functions to facilitate block device operations
- class AioTransContext {
- public:
- Context *on_finish;
- ::IOContext ioc;
- explicit AioTransContext(CephContext* cct, Context *cb)
- :on_finish(cb), ioc(cct, this) {
- }
- ~AioTransContext(){}
-
- void aio_finish() {
- on_finish->complete(ioc.get_return_value());
- delete this;
- }
- }; //class AioTransContext
-
- BlockDevice *bdev = nullptr;
- uint64_t pool_size;
- pwl::WriteLogPoolRoot pool_root;
-
- int update_pool_root_sync(std::shared_ptr<pwl::WriteLogPoolRoot> root);
-
- static void aio_cache_cb(void *priv, void *priv2) {
- AioTransContext *c = static_cast<AioTransContext*>(priv2);
- c->aio_finish();
- }
-};//class SSDWriteLog
-
-} // namespace pwl
-} // namespace cache
-} // namespace librbd
-
-extern template class librbd::cache::pwl::SSDWriteLog<librbd::ImageCtx>;
-
-#endif // CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
namespace plugin { template <typename> struct Api; }
namespace cache {
-
namespace pwl {
template<typename>
* convert between image and block extents here using a "block size"
* of 1.
*/
-BlockExtent convert_to_block_extent(const uint64_t offset_bytes, const uint64_t length_bytes)
+BlockExtent convert_to_block_extent(uint64_t offset_bytes, uint64_t length_bytes)
{
return BlockExtent(offset_bytes,
offset_bytes + length_bytes);
}
-BlockExtent WriteLogPmemEntry::block_extent() {
+BlockExtent WriteLogCacheEntry::block_extent() {
return convert_to_block_extent(image_offset_bytes, write_bytes);
}
-uint64_t WriteLogPmemEntry::get_offset_bytes() {
+uint64_t WriteLogCacheEntry::get_offset_bytes() {
return image_offset_bytes;
}
-uint64_t WriteLogPmemEntry::get_write_bytes() {
+uint64_t WriteLogCacheEntry::get_write_bytes() {
return write_bytes;
}
#ifdef WITH_RBD_SSD_CACHE
-void WriteLogPmemEntry::dump(Formatter *f) const {
+void WriteLogCacheEntry::dump(Formatter *f) const {
f->dump_unsigned("sync_gen_number", sync_gen_number);
f->dump_unsigned("write_sequence_number", write_sequence_number);
f->dump_unsigned("image_offset_bytes", image_offset_bytes);
f->dump_unsigned("entry_index", entry_index);
}
-void WriteLogPmemEntry::generate_test_instances(list<WriteLogPmemEntry*>& ls) {
- ls.push_back(new WriteLogPmemEntry);
- ls.push_back(new WriteLogPmemEntry);
+void WriteLogCacheEntry::generate_test_instances(list<WriteLogCacheEntry*>& ls) {
+ ls.push_back(new WriteLogCacheEntry);
+ ls.push_back(new WriteLogCacheEntry);
ls.back()->sync_gen_number = 1;
ls.back()->write_sequence_number = 1;
ls.back()->image_offset_bytes = 1;
#endif
std::ostream& operator<<(std::ostream& os,
- const WriteLogPmemEntry &entry) {
+ const WriteLogCacheEntry &entry) {
os << "entry_valid=" << (bool)entry.entry_valid << ", "
<< "sync_point=" << (bool)entry.sync_point << ", "
<< "sequenced=" << (bool)entry.sequenced << ", "
POBJ_LAYOUT_BEGIN(rbd_pwl);
POBJ_LAYOUT_ROOT(rbd_pwl, struct WriteLogPoolRoot);
POBJ_LAYOUT_TOID(rbd_pwl, uint8_t);
-POBJ_LAYOUT_TOID(rbd_pwl, struct WriteLogPmemEntry);
+POBJ_LAYOUT_TOID(rbd_pwl, struct WriteLogCacheEntry);
POBJ_LAYOUT_END(rbd_pwl);
#endif
-struct WriteLogPmemEntry {
+struct WriteLogCacheEntry {
uint64_t sync_gen_number = 0;
uint64_t write_sequence_number = 0;
uint64_t image_offset_bytes;
TOID(uint8_t) write_data;
#endif
#ifdef WITH_RBD_SSD_CACHE
- uint64_t write_data_pos; /* SSD data offset */
+ uint64_t write_data_pos = 0; /* SSD data offset */
#endif
union {
uint8_t flags;
uint32_t ws_datalen = 0; /* Length of data buffer (writesame only) */
uint32_t entry_index = 0; /* For debug consistency check. Can be removed if
* we need the space */
- WriteLogPmemEntry(const uint64_t image_offset_bytes=0, const uint64_t write_bytes=0)
+ WriteLogCacheEntry(uint64_t image_offset_bytes=0, uint64_t write_bytes=0)
: image_offset_bytes(image_offset_bytes), write_bytes(write_bytes),
entry_valid(0), sync_point(0), sequenced(0), has_data(0), discard(0), writesame(0) {
}
return is_write() || is_discard() || is_writesame();
}
friend std::ostream& operator<<(std::ostream& os,
- const WriteLogPmemEntry &entry);
+ const WriteLogCacheEntry &entry);
#ifdef WITH_RBD_SSD_CACHE
- DENC(WriteLogPmemEntry, v, p) {
+ DENC(WriteLogCacheEntry, v, p) {
DENC_START(1, 1, p);
denc(v.sync_gen_number, p);
denc(v.write_sequence_number, p);
}
#endif
void dump(ceph::Formatter *f) const;
- static void generate_test_instances(list<WriteLogPmemEntry*>& ls);
+ static void generate_test_instances(list<WriteLogCacheEntry*>& ls);
};
struct WriteLogPoolRoot {
};
uint64_t _u64;
} header;
- TOID(struct WriteLogPmemEntry) log_entries; /* contiguous array of log entries */
+ TOID(struct WriteLogCacheEntry) log_entries; /* contiguous array of log entries */
#endif
#ifdef WITH_RBD_SSD_CACHE
uint64_t layout_version = 0;
} // namespace librbd
#ifdef WITH_RBD_SSD_CACHE
-WRITE_CLASS_DENC(librbd::cache::pwl::WriteLogPmemEntry)
+WRITE_CLASS_DENC(librbd::cache::pwl::WriteLogCacheEntry)
WRITE_CLASS_DENC(librbd::cache::pwl::WriteLogPoolRoot)
#endif
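
// A minimal round-trip sketch for the DENC-encoded entry above, assuming a
// build with WITH_RBD_SSD_CACHE and the encode/decode helpers that
// WRITE_CLASS_DENC generates; illustrative only, not part of the patch.
#include "include/buffer.h"
#include "librbd/cache/pwl/Types.h"

static librbd::cache::pwl::WriteLogCacheEntry
roundtrip_entry(const librbd::cache::pwl::WriteLogCacheEntry &in) {
  ceph::bufferlist bl;
  encode(in, bl);             // provided by WRITE_CLASS_DENC
  librbd::cache::pwl::WriteLogCacheEntry out;
  auto it = bl.cbegin();
  decode(out, it);            // throws on malformed input
  return out;
}
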
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_BUILDER_H
+#define CEPH_LIBRBD_CACHE_PWL_RWL_BUILDER_H
+
+#include <iostream>
+#include "LogEntry.h"
+#include "Request.h"
+#include "LogOperation.h"
+
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/pwl/Builder.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+template <typename T>
+class Builder : public pwl::Builder<T> {
+public:
+ std::shared_ptr<pwl::WriteLogEntry> create_write_log_entry(
+ uint64_t image_offset_bytes, uint64_t write_bytes) override {
+ return std::make_shared<WriteLogEntry>(image_offset_bytes, write_bytes);
+ }
+ std::shared_ptr<pwl::WriteLogEntry> create_write_log_entry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes) override {
+ return std::make_shared<WriteLogEntry>(
+ sync_point_entry, image_offset_bytes, write_bytes);
+ }
+ std::shared_ptr<pwl::WriteLogEntry> create_writesame_log_entry(
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length) override {
+ return std::make_shared<WriteSameLogEntry>(
+ image_offset_bytes, write_bytes, data_length);
+ }
+ std::shared_ptr<pwl::WriteLogEntry> create_writesame_log_entry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length) override {
+ return std::make_shared<WriteSameLogEntry>(
+ sync_point_entry, image_offset_bytes, write_bytes, data_length);
+ }
+ pwl::C_WriteRequest<T> *create_write_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) override {
+ return new C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req);
+ }
+ pwl::C_WriteSameRequest<T> *create_writesame_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) override {
+ return new C_WriteSameRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req);
+ }
+ pwl::C_WriteRequest<T> *create_comp_and_write_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) override {
+ return new rwl::C_CompAndWriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset, fadvise_flags,
+ lock, perfcounter, user_req);
+ }
+ std::shared_ptr<pwl::WriteLogOperation> create_write_log_operation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, CephContext *cct,
+ std::shared_ptr<pwl::WriteLogEntry> write_log_entry) {
+ return std::make_shared<WriteLogOperation>(
+ set, image_offset_bytes, write_bytes, cct, write_log_entry);
+ }
+ std::shared_ptr<pwl::WriteLogOperation> create_write_log_operation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t data_len, CephContext *cct,
+ std::shared_ptr<pwl::WriteLogEntry> writesame_log_entry) {
+ return std::make_shared<WriteLogOperation>(
+ set, image_offset_bytes, write_bytes, data_len, cct,
+ writesame_log_entry);
+ }
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_RWL_BUILDER_H
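
// Sketch of the indirection this Builder enables: generic PWL code can create
// backend-specific log entries through a pwl::Builder pointer without naming
// rwl:: or ssd:: types. make_write_entry is a hypothetical helper, not part
// of the patch.
#include <cstdint>
#include <memory>
#include "librbd/cache/pwl/Builder.h"

template <typename T>
std::shared_ptr<librbd::cache::pwl::WriteLogEntry> make_write_entry(
    librbd::cache::pwl::Builder<T> *builder,
    uint64_t image_offset_bytes, uint64_t write_bytes) {
  // Virtual dispatch picks the rwl or ssd WriteLogEntry subclass.
  return builder->create_write_log_entry(image_offset_bytes, write_bytes);
}
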
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/ImageWriteback.h"
+#include "LogEntry.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::WriteLogEntry: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+void WriteLogEntry::writeback(
+ librbd::cache::ImageWritebackInterface &image_writeback, Context *ctx) {
+ /* Pass a copy of the pmem buffer to ImageWriteback (which may hang on to the
+ * bl even after flush()). */
+ bufferlist entry_bl;
+ buffer::list entry_bl_copy;
+ copy_cache_bl(&entry_bl_copy);
+ entry_bl_copy.begin(0).copy(write_bytes(), entry_bl);
+ image_writeback.aio_write({{ram_entry.image_offset_bytes,
+ ram_entry.write_bytes}},
+ std::move(entry_bl), 0, ctx);
+}
+
+void WriteLogEntry::init_cache_bp() {
+ ceph_assert(!this->cache_bp.have_raw());
+ cache_bp = buffer::ptr(buffer::create_static(this->write_bytes(),
+ (char*)this->cache_buffer));
+}
+
+void WriteLogEntry::init_bl(buffer::ptr &bp, buffer::list &bl) {
+ if (!is_writesame) {
+ bl.append(bp);
+ return;
+ }
+ for (uint64_t i = 0; i < ram_entry.write_bytes / ram_entry.ws_datalen; i++) {
+ bl.append(bp);
+ }
+ int trailing_partial = ram_entry.write_bytes % ram_entry.ws_datalen;
+ if (trailing_partial) {
+ bl.append(bp, 0, trailing_partial);
+ }
+}
+
+void WriteLogEntry::init_cache_buffer(
+ std::vector<WriteBufferAllocation>::iterator allocation) {
+ this->ram_entry.write_data = allocation->buffer_oid;
+ ceph_assert(!TOID_IS_NULL(this->ram_entry.write_data));
+ cache_buffer = D_RW(this->ram_entry.write_data);
+}
+
+buffer::list& WriteLogEntry::get_cache_bl() {
+ if (0 == bl_refs) {
+ std::lock_guard locker(m_entry_bl_lock);
+ if (0 == bl_refs) {
+ // init pmem bufferlist
+ cache_bl.clear();
+ init_cache_bp();
+ ceph_assert(cache_bp.have_raw());
+ int before_bl = cache_bp.raw_nref();
+ this->init_bl(cache_bp, cache_bl);
+ int after_bl = cache_bp.raw_nref();
+ bl_refs = after_bl - before_bl;
+ }
+ ceph_assert(0 != bl_refs);
+ }
+ return cache_bl;
+}
+
+void WriteLogEntry::copy_cache_bl(bufferlist *out_bl) {
+ this->get_cache_bl();
+ // cache_bp is now initialized
+ buffer::ptr cloned_bp(cache_bp.clone());
+ out_bl->clear();
+ this->init_bl(cloned_bp, *out_bl);
+}
+
+void WriteSameLogEntry::writeback(
+ librbd::cache::ImageWritebackInterface &image_writeback, Context *ctx) {
+ bufferlist entry_bl;
+ buffer::list entry_bl_copy;
+ copy_cache_bl(&entry_bl_copy);
+ entry_bl_copy.begin(0).copy(write_bytes(), entry_bl);
+ image_writeback.aio_writesame(ram_entry.image_offset_bytes,
+ ram_entry.write_bytes,
+ std::move(entry_bl), 0, ctx);
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
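
// Standalone sketch of the writesame expansion performed by init_bl() above:
// repeat a ws_datalen-byte pattern across write_bytes, then append a trailing
// partial copy. Plain std:: types; illustrative only.
#include <cstdint>
#include <vector>

std::vector<uint8_t> expand_writesame(const std::vector<uint8_t> &pattern,
                                      uint64_t write_bytes) {
  std::vector<uint8_t> out;
  out.reserve(write_bytes);
  for (uint64_t i = 0; i < write_bytes / pattern.size(); i++) {
    out.insert(out.end(), pattern.begin(), pattern.end());  // full repeats
  }
  uint64_t trailing = write_bytes % pattern.size();
  out.insert(out.end(), pattern.begin(), pattern.begin() + trailing);
  return out;
}
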
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_LOG_ENTRY_H
+#define CEPH_LIBRBD_CACHE_PWL_RWL_LOG_ENTRY_H
+
+#include "librbd/cache/pwl/LogEntry.h"
+
+namespace librbd {
+namespace cache {
+class ImageWritebackInterface;
+namespace pwl {
+namespace rwl {
+
+class WriteLogEntry : public pwl::WriteLogEntry {
+public:
+ WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes)
+ : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {}
+ WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes)
+ : pwl::WriteLogEntry(image_offset_bytes, write_bytes) {}
+ WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes,
+ data_length) {}
+ WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : pwl::WriteLogEntry(image_offset_bytes, write_bytes, data_length) {}
+ ~WriteLogEntry() {}
+ WriteLogEntry(const WriteLogEntry&) = delete;
+ WriteLogEntry &operator=(const WriteLogEntry&) = delete;
+
+ void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) override;
+ void init_cache_bp() override;
+ void init_bl(buffer::ptr &bp, buffer::list &bl) override;
+ void init_cache_buffer(
+ std::vector<WriteBufferAllocation>::iterator allocation) override;
+ buffer::list &get_cache_bl() override;
+ void copy_cache_bl(bufferlist *out_bl) override;
+};
+
+class WriteSameLogEntry : public WriteLogEntry {
+public:
+ WriteSameLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes,
+ data_length) {}
+ WriteSameLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : WriteLogEntry(image_offset_bytes, write_bytes, data_length) {}
+ ~WriteSameLogEntry() {}
+ WriteSameLogEntry(const WriteSameLogEntry&) = delete;
+ WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete;
+
+ void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx) override;
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_RWL_LOG_ENTRY_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "LogOperation.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::LogOperation: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+void WriteLogOperation::copy_bl_to_cache_buffer(
+ std::vector<WriteBufferAllocation>::iterator allocation) {
+ /* The operation is held by shared_ptr, so the entry's raw cache_buffer
+  * pointer is only valid while the operation remains in scope */
+ bufferlist::iterator i(&bl);
+ m_perfcounter->inc(l_librbd_pwl_log_op_bytes, log_entry->write_bytes());
+ ldout(m_cct, 20) << bl << dendl;
+ log_entry->init_cache_buffer(allocation);
+ i.copy((unsigned)log_entry->write_bytes(), (char*)log_entry->cache_buffer);
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_LOG_OPERATION_H
+#define CEPH_LIBRBD_CACHE_PWL_RWL_LOG_OPERATION_H
+
+#include "librbd/cache/pwl/LogOperation.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+class WriteLogOperation : public pwl::WriteLogOperation {
+public:
+ WriteLogOperation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, CephContext *cct,
+ std::shared_ptr<pwl::WriteLogEntry> write_log_entry)
+ : pwl::WriteLogOperation(set, image_offset_bytes, write_bytes, cct,
+ write_log_entry) {}
+
+ WriteLogOperation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t data_len, CephContext *cct,
+ std::shared_ptr<pwl::WriteLogEntry> writesame_log_entry)
+ : pwl::WriteLogOperation(set, image_offset_bytes, write_bytes, cct,
+ writesame_log_entry) {}
+
+ void copy_bl_to_cache_buffer(
+ std::vector<WriteBufferAllocation>::iterator allocation) override;
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_RWL_LOG_OPERATION_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Request.h"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::Request: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+template <typename T>
+void C_WriteRequest<T>::setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {
+
+ ceph_assert(!this->m_resources.allocated);
+
+ auto image_extents_size = this->image_extents.size();
+ this->m_resources.buffers.reserve(image_extents_size);
+
+ *bytes_cached = 0;
+ *bytes_allocated = 0;
+ *number_lanes = image_extents_size;
+ *number_log_entries = image_extents_size;
+ *number_unpublished_reserves = image_extents_size;
+
+ for (auto &extent : this->image_extents) {
+ this->m_resources.buffers.emplace_back();
+ struct WriteBufferAllocation &buffer = this->m_resources.buffers.back();
+ buffer.allocation_size = MIN_WRITE_ALLOC_SIZE;
+ buffer.allocated = false;
+ *bytes_cached += extent.second;
+ if (extent.second > buffer.allocation_size) {
+ buffer.allocation_size = extent.second;
+ }
+ *bytes_allocated += buffer.allocation_size;
+ }
+ *bytes_dirtied = *bytes_cached;
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_CompAndWriteRequest<T> &req) {
+ os << (C_WriteRequest<T>&)req
+ << "cmp_bl=" << req.cmp_bl << ", "
+ << "read_bl=" << req.read_bl << ", "
+ << "compare_succeeded=" << req.compare_succeeded << ", "
+ << "mismatch_offset=" << req.mismatch_offset;
+ return os;
+}
+
+template <typename T>
+void C_WriteSameRequest<T>::setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {
+ ceph_assert(this->image_extents.size() == 1);
+ *bytes_dirtied += this->image_extents[0].second;
+ auto pattern_length = this->bl.length();
+ this->m_resources.buffers.emplace_back();
+ struct WriteBufferAllocation &buffer = this->m_resources.buffers.back();
+ buffer.allocation_size = MIN_WRITE_ALLOC_SIZE;
+ buffer.allocated = false;
+ *bytes_cached += pattern_length;
+ if (pattern_length > buffer.allocation_size) {
+ buffer.allocation_size = pattern_length;
+ }
+ *bytes_allocated += buffer.allocation_size;
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::rwl::C_WriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::rwl::C_WriteSameRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::rwl::C_CompAndWriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
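
// Sketch of the accounting rule implemented by setup_buffer_resources() above:
// every extent consumes max(extent_len, MIN_WRITE_ALLOC_SIZE) of pool space.
// min_alloc stands in for MIN_WRITE_ALLOC_SIZE; illustrative only.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

struct Accounting {
  uint64_t bytes_cached = 0;     // payload bytes the cache must hold
  uint64_t bytes_allocated = 0;  // pool bytes actually reserved
};

Accounting account_extents(
    const std::vector<std::pair<uint64_t, uint64_t>> &extents,
    uint64_t min_alloc) {
  Accounting a;
  for (const auto &extent : extents) {
    a.bytes_cached += extent.second;
    a.bytes_allocated += std::max(extent.second, min_alloc);
  }
  return a;
}
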
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_REQUEST_H
+#define CEPH_LIBRBD_CACHE_RWL_REQUEST_H
+
+#include "librbd/cache/pwl/Request.h"
+
+namespace librbd {
+class BlockGuardCell;
+
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+template <typename T>
+class C_WriteRequest : public pwl::C_WriteRequest<T> {
+public:
+ C_WriteRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : pwl::C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset, fadvise_flags,
+ lock, perfcounter, user_req) {}
+
+ C_WriteRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : pwl::C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req) {}
+protected:
+ // Plain writes will allocate one buffer per request extent
+ void setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+ uint64_t *bytes_allocated, uint64_t *number_lanes,
+ uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) override;
+};
+
+template <typename T>
+class C_CompAndWriteRequest : public C_WriteRequest<T> {
+public:
+ C_CompAndWriteRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset, fadvise_flags,
+ lock, perfcounter, user_req) {}
+
+ const char *get_name() const override {
+ return "C_CompAndWriteRequest";
+ }
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_CompAndWriteRequest<U> &req);
+};
+
+template <typename T>
+class C_WriteSameRequest : public pwl::C_WriteSameRequest<T> {
+public:
+ C_WriteSameRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : pwl::C_WriteSameRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags,
+ lock, perfcounter, user_req) {}
+
+ void setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+ uint64_t *bytes_allocated, uint64_t *number_lanes,
+ uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) override;
+
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_RWL_REQUEST_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "WriteLog.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/plugin/Api.h"
+#include <map>
+#include <vector>
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::rwl::WriteLog: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+using namespace librbd::cache::pwl;
+namespace rwl {
+
+const unsigned long int OPS_APPENDED_TOGETHER = MAX_ALLOC_PER_TRANSACTION;
+
+template <typename I>
+Builder<AbstractWriteLog<I>>* WriteLog<I>::create_builder() {
+ m_builderobj = new Builder<This>();
+ return m_builderobj;
+}
+
+template <typename I>
+WriteLog<I>::WriteLog(
+ I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
+ ImageWritebackInterface& image_writeback,
+ plugin::Api<I>& plugin_api)
+: AbstractWriteLog<I>(image_ctx, cache_state, create_builder(), image_writeback,
+ plugin_api),
+ m_pwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_pwl))
+{
+}
+
+template <typename I>
+WriteLog<I>::~WriteLog() {
+ m_log_pool = nullptr;
+ delete m_builderobj;
+}
+
+/*
+ * Allocate the (already reserved) write log entries for a set of operations.
+ *
+ * Locking:
+ * Acquires lock
+ */
+template <typename I>
+void WriteLog<I>::alloc_op_log_entries(GenericLogOperations &ops)
+{
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ struct WriteLogCacheEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries);
+
+ ceph_assert(ceph_mutex_is_locked_by_me(this->m_log_append_lock));
+
+ /* Allocate the (already reserved) log entries */
+ std::lock_guard locker(m_lock);
+
+ for (auto &operation : ops) {
+ uint32_t entry_index = this->m_first_free_entry;
+ this->m_first_free_entry = (this->m_first_free_entry + 1) % this->m_total_log_entries;
+ auto &log_entry = operation->get_log_entry();
+ log_entry->log_entry_index = entry_index;
+ log_entry->ram_entry.entry_index = entry_index;
+ log_entry->cache_entry = &pmem_log_entries[entry_index];
+ log_entry->ram_entry.entry_valid = 1;
+ m_log_entries.push_back(log_entry);
+ ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl;
+ }
+}
+
+/*
+ * Write and persist the (already allocated) write log entries and
+ * data buffer allocations for a set of ops. The data buffer for each
+ * of these must already have been persisted to its reserved area.
+ */
+template <typename I>
+int WriteLog<I>::append_op_log_entries(GenericLogOperations &ops)
+{
+ CephContext *cct = m_image_ctx.cct;
+ GenericLogOperationsVector entries_to_flush;
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ int ret = 0;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(this->m_log_append_lock));
+
+ if (ops.empty()) {
+ return 0;
+ }
+ entries_to_flush.reserve(OPS_APPENDED_TOGETHER);
+
+ /* Write log entries to ring and persist */
+ utime_t now = ceph_clock_now();
+ for (auto &operation : ops) {
+ if (!entries_to_flush.empty()) {
+ /* Flush these and reset the list if the current entry wraps to the
+ * tail of the ring */
+ if (entries_to_flush.back()->get_log_entry()->log_entry_index >
+ operation->get_log_entry()->log_entry_index) {
+ ldout(m_image_ctx.cct, 20) << "entries to flush wrap around the end of the ring at "
+ << "operation=[" << *operation << "]" << dendl;
+ flush_op_log_entries(entries_to_flush);
+ entries_to_flush.clear();
+ now = ceph_clock_now();
+ }
+ }
+ ldout(m_image_ctx.cct, 20) << "Copying entry for operation at index="
+ << operation->get_log_entry()->log_entry_index << " "
+ << "from " << &operation->get_log_entry()->ram_entry << " "
+ << "to " << operation->get_log_entry()->cache_entry << " "
+ << "operation=[" << *operation << "]" << dendl;
+ ldout(m_image_ctx.cct, 5) << "APPENDING: index="
+ << operation->get_log_entry()->log_entry_index << " "
+ << "operation=[" << *operation << "]" << dendl;
+ operation->log_append_time = now;
+ *operation->get_log_entry()->cache_entry = operation->get_log_entry()->ram_entry;
+ ldout(m_image_ctx.cct, 20) << "APPENDING: index="
+ << operation->get_log_entry()->log_entry_index << " "
+ << "pmem_entry=[" << *operation->get_log_entry()->cache_entry
+ << "]" << dendl;
+ entries_to_flush.push_back(operation);
+ }
+ flush_op_log_entries(entries_to_flush);
+
+ /* Drain once for all */
+ pmemobj_drain(m_log_pool);
+
+ /*
+ * Atomically advance the log head pointer and publish the
+ * allocations for all the data buffers they refer to.
+ */
+ utime_t tx_start = ceph_clock_now();
+ TX_BEGIN(m_log_pool) {
+ D_RW(pool_root)->first_free_entry = this->m_first_free_entry;
+ for (auto &operation : ops) {
+ if (operation->reserved_allocated()) {
+ auto write_op = (std::shared_ptr<WriteLogOperation>&) operation;
+ pmemobj_tx_publish(&write_op->buffer_alloc->buffer_alloc_action, 1);
+ } else {
+ ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
+ }
+ }
+ } TX_ONCOMMIT {
+ } TX_ONABORT {
+ lderr(cct) << "failed to commit " << ops.size()
+ << " log entries (" << this->m_log_pool_name << ")" << dendl;
+ ceph_assert(false);
+ ret = -EIO;
+ } TX_FINALLY {
+ } TX_END;
+
+ utime_t tx_end = ceph_clock_now();
+ m_perfcounter->tinc(l_librbd_pwl_append_tx_t, tx_end - tx_start);
+ m_perfcounter->hinc(
+ l_librbd_pwl_append_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), ops.size());
+ for (auto &operation : ops) {
+ operation->log_append_comp_time = tx_end;
+ }
+
+ return ret;
+}
+
+/*
+ * Flush the persistent write log entries for a set of ops. The entries must
+ * be contiguous in persistent memory.
+ */
+template <typename I>
+void WriteLog<I>::flush_op_log_entries(GenericLogOperationsVector &ops)
+{
+ if (ops.empty()) {
+ return;
+ }
+
+ if (ops.size() > 1) {
+ ceph_assert(ops.front()->get_log_entry()->cache_entry < ops.back()->get_log_entry()->cache_entry);
+ }
+
+ ldout(m_image_ctx.cct, 20) << "entry count=" << ops.size() << " "
+ << "start address="
+ << ops.front()->get_log_entry()->cache_entry << " "
+ << "bytes="
+ << ops.size() * sizeof(*(ops.front()->get_log_entry()->cache_entry))
+ << dendl;
+ pmemobj_flush(m_log_pool,
+ ops.front()->get_log_entry()->cache_entry,
+ ops.size() * sizeof(*(ops.front()->get_log_entry()->cache_entry)));
+}
+
+template <typename I>
+void WriteLog<I>::remove_pool_file() {
+ if (m_log_pool) {
+ ldout(m_image_ctx.cct, 6) << "closing pmem pool" << dendl;
+ pmemobj_close(m_log_pool);
+ }
+ if (m_cache_state->clean) {
+ ldout(m_image_ctx.cct, 5) << "Removing empty pool file: " << this->m_log_pool_name << dendl;
+ if (remove(this->m_log_pool_name.c_str()) != 0) {
+ lderr(m_image_ctx.cct) << "failed to remove empty pool \"" << this->m_log_pool_name << "\": "
+ << pmemobj_errormsg() << dendl;
+ } else {
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ m_cache_state->present = false;
+ }
+ } else {
+ ldout(m_image_ctx.cct, 5) << "Not removing pool file: " << this->m_log_pool_name << dendl;
+ }
+}
+
+template <typename I>
+void WriteLog<I>::initialize_pool(Context *on_finish, pwl::DeferredContexts &later) {
+ CephContext *cct = m_image_ctx.cct;
+ TOID(struct WriteLogPoolRoot) pool_root;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ if (access(this->m_log_pool_name.c_str(), F_OK) != 0) {
+ if ((m_log_pool =
+ pmemobj_create(this->m_log_pool_name.c_str(),
+ this->m_pwl_pool_layout_name,
+ this->m_log_pool_config_size,
+ (S_IWUSR | S_IRUSR))) == NULL) {
+ lderr(cct) << "failed to create pool (" << this->m_log_pool_name << ")"
+ << pmemobj_errormsg() << dendl;
+ m_cache_state->present = false;
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ /* TODO: filter/replace errnos that are meaningless to the caller */
+ on_finish->complete(-errno);
+ return;
+ }
+ m_cache_state->present = true;
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+
+ /* new pool, calculate and store metadata */
+ size_t effective_pool_size = (size_t)(this->m_log_pool_config_size * USABLE_SIZE);
+ size_t small_write_size = MIN_WRITE_ALLOC_SIZE + BLOCK_ALLOC_OVERHEAD_BYTES + sizeof(struct WriteLogCacheEntry);
+ uint64_t num_small_writes = (uint64_t)(effective_pool_size / small_write_size);
+ if (num_small_writes > MAX_LOG_ENTRIES) {
+ num_small_writes = MAX_LOG_ENTRIES;
+ }
+ if (num_small_writes <= 2) {
+ lderr(cct) << "num_small_writes needs to > 2" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ this->m_log_pool_actual_size = this->m_log_pool_config_size;
+ this->m_bytes_allocated_cap = effective_pool_size;
+ /* Log ring empty */
+ m_first_free_entry = 0;
+ m_first_valid_entry = 0;
+ TX_BEGIN(m_log_pool) {
+ TX_ADD(pool_root);
+ D_RW(pool_root)->header.layout_version = RWL_POOL_VERSION;
+ D_RW(pool_root)->log_entries =
+ TX_ZALLOC(struct WriteLogCacheEntry,
+ sizeof(struct WriteLogCacheEntry) * num_small_writes);
+ D_RW(pool_root)->pool_size = this->m_log_pool_actual_size;
+ D_RW(pool_root)->flushed_sync_gen = this->m_flushed_sync_gen;
+ D_RW(pool_root)->block_size = MIN_WRITE_ALLOC_SIZE;
+ D_RW(pool_root)->num_log_entries = num_small_writes;
+ D_RW(pool_root)->first_free_entry = m_first_free_entry;
+ D_RW(pool_root)->first_valid_entry = m_first_valid_entry;
+ } TX_ONCOMMIT {
+ this->m_total_log_entries = D_RO(pool_root)->num_log_entries;
+ this->m_free_log_entries = D_RO(pool_root)->num_log_entries - 1; // leave one free
+ } TX_ONABORT {
+ this->m_total_log_entries = 0;
+ this->m_free_log_entries = 0;
+ lderr(cct) << "failed to initialize pool (" << this->m_log_pool_name << ")" << dendl;
+ on_finish->complete(-pmemobj_tx_errno());
+ return;
+ } TX_FINALLY {
+ } TX_END;
+ } else {
+ m_cache_state->present = true;
+ /* Open existing pool */
+ if ((m_log_pool =
+ pmemobj_open(this->m_log_pool_name.c_str(),
+ this->m_pwl_pool_layout_name)) == NULL) {
+ lderr(cct) << "failed to open pool (" << this->m_log_pool_name << "): "
+ << pmemobj_errormsg() << dendl;
+ on_finish->complete(-errno);
+ return;
+ }
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ if (D_RO(pool_root)->header.layout_version != RWL_POOL_VERSION) {
+ // TODO: will handle upgrading version in the future
+ lderr(cct) << "Pool layout version is "
+ << D_RO(pool_root)->header.layout_version
+ << " expected " << RWL_POOL_VERSION << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ if (D_RO(pool_root)->block_size != MIN_WRITE_ALLOC_SIZE) {
+ lderr(cct) << "Pool block size is " << D_RO(pool_root)->block_size
+ << " expected " << MIN_WRITE_ALLOC_SIZE << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ this->m_log_pool_actual_size = D_RO(pool_root)->pool_size;
+ this->m_flushed_sync_gen = D_RO(pool_root)->flushed_sync_gen;
+ this->m_total_log_entries = D_RO(pool_root)->num_log_entries;
+ m_first_free_entry = D_RO(pool_root)->first_free_entry;
+ m_first_valid_entry = D_RO(pool_root)->first_valid_entry;
+ if (m_first_free_entry < m_first_valid_entry) {
+ /* Valid entries wrap around the end of the ring, so first_free is lower
+ * than first_valid. If first_valid was == first_free+1, the entry at
+ * first_free would be empty. The last entry is never used, so in
+ * that case there would be zero free log entries. */
+ this->m_free_log_entries = this->m_total_log_entries - (m_first_valid_entry - m_first_free_entry) - 1;
+ } else {
+ /* first_valid is <= first_free. If they are == we have zero valid log
+ * entries, and n-1 free log entries */
+ this->m_free_log_entries = this->m_total_log_entries - (m_first_free_entry - m_first_valid_entry) - 1;
+ }
+ size_t effective_pool_size = (size_t)(this->m_log_pool_config_size * USABLE_SIZE);
+ this->m_bytes_allocated_cap = effective_pool_size;
+ load_existing_entries(later);
+ m_cache_state->clean = this->m_dirty_log_entries.empty();
+ m_cache_state->empty = m_log_entries.empty();
+ }
+}
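
// Worked sketch of the ring arithmetic above: one slot always stays unused,
// so a log of N entries holds at most N-1 valid ones. Illustrative only.
#include <cstdint>

uint64_t free_entries(uint64_t total, uint64_t first_free,
                      uint64_t first_valid) {
  if (first_free < first_valid) {
    // valid entries wrap past the end of the ring
    return total - (first_valid - first_free) - 1;
  }
  // first_valid <= first_free; equal indices mean an empty log
  return total - (first_free - first_valid) - 1;
}
// e.g. total=8, first_valid=6, first_free=2: entries 6,7,0,1 are valid,
// leaving 8 - 4 - 1 = 3 free.
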
+
+/*
+ * Loads the log entries from an existing log.
+ *
+ * Creates the in-memory structures to represent the state of the
+ * re-opened log.
+ *
+ * Finds the last appended sync point, and any sync points referred to
+ * in log entries, but missing from the log. These missing sync points
+ * are created and scheduled for append. Some rudimentary consistency
+ * checking is done.
+ *
+ * Rebuilds the m_blocks_to_log_entries map, to make log entries
+ * readable.
+ *
+ * Places all writes on the dirty entries list, which causes them all
+ * to be flushed.
+ */
+
+template <typename I>
+void WriteLog<I>::load_existing_entries(DeferredContexts &later) {
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ struct WriteLogCacheEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries);
+ uint64_t entry_index = m_first_valid_entry;
+ /* The map below allows us to find sync point log entries by sync
+ * gen number, which is necessary so write entries can be linked to
+ * their sync points. */
+ std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> sync_point_entries;
+ /* The map below tracks sync points referred to in writes but not
+ * appearing in the sync_point_entries map. We'll use this to
+ * determine which sync points are missing and need to be
+ * created. */
+ std::map<uint64_t, bool> missing_sync_points;
+
+ /*
+ * Read the existing log entries. Construct an in-memory log entry
+ * object of the appropriate type for each. Add these to the global
+ * log entries list.
+ *
+ * Write entries will not link to their sync points yet. We'll do
+ * that in the next pass. Here we'll accumulate a map of sync point
+ * gen numbers that are referred to in writes but do not appear in
+ * the log.
+ */
+ while (entry_index != m_first_free_entry) {
+ WriteLogCacheEntry *pmem_entry = &pmem_log_entries[entry_index];
+ std::shared_ptr<GenericLogEntry> log_entry = nullptr;
+ ceph_assert(pmem_entry->entry_index == entry_index);
+
+ this->update_entries(log_entry, pmem_entry, missing_sync_points,
+ sync_point_entries, entry_index);
+
+ log_entry->ram_entry = *pmem_entry;
+ log_entry->cache_entry = pmem_entry;
+ log_entry->log_entry_index = entry_index;
+ log_entry->completed = true;
+
+ m_log_entries.push_back(log_entry);
+
+ entry_index = (entry_index + 1) % this->m_total_log_entries;
+ }
+
+ this->update_sync_points(missing_sync_points, sync_point_entries, later, MIN_WRITE_ALLOC_SIZE);
+}
+
+template <typename I>
+void WriteLog<I>::write_data_to_buffer(
+ std::shared_ptr<pwl::WriteLogEntry> ws_entry,
+ WriteLogCacheEntry *pmem_entry) {
+ ws_entry->cache_buffer = D_RW(pmem_entry->write_data);
+}
+
+/**
+ * Retire up to MAX_ALLOC_PER_TRANSACTION of the oldest log entries
+ * that are eligible to be retired. Returns true if anything was
+ * retired.
+ */
+template <typename I>
+bool WriteLog<I>::retire_entries(const unsigned long int frees_per_tx) {
+ CephContext *cct = m_image_ctx.cct;
+ GenericLogEntriesVector retiring_entries;
+ uint32_t initial_first_valid_entry;
+ uint32_t first_valid_entry;
+
+ std::lock_guard retire_locker(this->m_log_retire_lock);
+ ldout(cct, 20) << "Look for entries to retire" << dendl;
+ {
+ /* Entry readers can't be added while we hold m_entry_reader_lock */
+ RWLock::WLocker entry_reader_locker(this->m_entry_reader_lock);
+ std::lock_guard locker(m_lock);
+ initial_first_valid_entry = this->m_first_valid_entry;
+ first_valid_entry = this->m_first_valid_entry;
+ auto entry = m_log_entries.front();
+ while (!m_log_entries.empty() &&
+ retiring_entries.size() < frees_per_tx &&
+ this->can_retire_entry(entry)) {
+ if (entry->log_entry_index != first_valid_entry) {
+ lderr(cct) << "Retiring entry index (" << entry->log_entry_index
+ << ") and first valid log entry index (" << first_valid_entry
+ << ") must be ==." << dendl;
+ }
+ ceph_assert(entry->log_entry_index == first_valid_entry);
+ first_valid_entry = (first_valid_entry + 1) % this->m_total_log_entries;
+ m_log_entries.pop_front();
+ retiring_entries.push_back(entry);
+ /* Remove entry from map so there will be no more readers */
+ if ((entry->write_bytes() > 0) || (entry->bytes_dirty() > 0)) {
+ auto gen_write_entry = static_pointer_cast<GenericWriteLogEntry>(entry);
+ if (gen_write_entry) {
+ this->m_blocks_to_log_entries.remove_log_entry(gen_write_entry);
+ }
+ }
+ entry = m_log_entries.front();
+ }
+ }
+
+ if (retiring_entries.size()) {
+ ldout(cct, 20) << "Retiring " << retiring_entries.size() << " entries" << dendl;
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+
+ utime_t tx_start;
+ utime_t tx_end;
+ /* Advance first valid entry and release buffers */
+ {
+ uint64_t flushed_sync_gen;
+ std::lock_guard append_locker(this->m_log_append_lock);
+ {
+ std::lock_guard locker(m_lock);
+ flushed_sync_gen = this->m_flushed_sync_gen;
+ }
+
+ tx_start = ceph_clock_now();
+ TX_BEGIN(m_log_pool) {
+ if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
+ ldout(m_image_ctx.cct, 20) << "flushed_sync_gen in log updated from "
+ << D_RO(pool_root)->flushed_sync_gen << " to "
+ << flushed_sync_gen << dendl;
+ D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
+ }
+ D_RW(pool_root)->first_valid_entry = first_valid_entry;
+ for (auto &entry: retiring_entries) {
+ if (entry->write_bytes()) {
+ ldout(cct, 20) << "Freeing " << entry->ram_entry.write_data.oid.pool_uuid_lo
+ << "." << entry->ram_entry.write_data.oid.off << dendl;
+ TX_FREE(entry->ram_entry.write_data);
+ } else {
+ ldout(cct, 20) << "Retiring non-write: " << *entry << dendl;
+ }
+ }
+ } TX_ONCOMMIT {
+ } TX_ONABORT {
+ lderr(cct) << "failed to commit free of" << retiring_entries.size()
+ << " log entries (" << this->m_log_pool_name << ")" << dendl;
+ ceph_assert(false);
+ } TX_FINALLY {
+ } TX_END;
+ tx_end = ceph_clock_now();
+ }
+ m_perfcounter->tinc(l_librbd_pwl_retire_tx_t, tx_end - tx_start);
+ m_perfcounter->hinc(l_librbd_pwl_retire_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(),
+ retiring_entries.size());
+
+ /* Update runtime copy of first_valid, and free entries counts */
+ {
+ std::lock_guard locker(m_lock);
+
+ ceph_assert(this->m_first_valid_entry == initial_first_valid_entry);
+ this->m_first_valid_entry = first_valid_entry;
+ this->m_free_log_entries += retiring_entries.size();
+ for (auto &entry: retiring_entries) {
+ if (entry->write_bytes()) {
+ ceph_assert(this->m_bytes_cached >= entry->write_bytes());
+ this->m_bytes_cached -= entry->write_bytes();
+ uint64_t entry_allocation_size = entry->write_bytes();
+ if (entry_allocation_size < MIN_WRITE_ALLOC_SIZE) {
+ entry_allocation_size = MIN_WRITE_ALLOC_SIZE;
+ }
+ ceph_assert(this->m_bytes_allocated >= entry_allocation_size);
+ this->m_bytes_allocated -= entry_allocation_size;
+ }
+ }
+ this->m_alloc_failed_since_retire = false;
+ this->wake_up();
+ }
+ } else {
+ ldout(cct, 20) << "Nothing to retire" << dendl;
+ return false;
+ }
+ return true;
+}
+
+template <typename I>
+Context* WriteLog<I>::construct_flush_entry_ctx(
+ std::shared_ptr<GenericLogEntry> log_entry) {
+ bool invalidating = this->m_invalidating; // snapshot so we behave consistently
+ Context *ctx = this->construct_flush_entry(log_entry, invalidating);
+
+ if (invalidating) {
+ return ctx;
+ }
+ return new LambdaContext(
+ [this, log_entry, ctx](int r) {
+ m_image_ctx.op_work_queue->queue(new LambdaContext(
+ [this, log_entry, ctx](int r) {
+ ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
+ << " " << *log_entry << dendl;
+ log_entry->writeback(this->m_image_writeback, ctx);
+ }), 0);
+ });
+}
+
+const unsigned long int ops_flushed_together = 4;
+/*
+ * Performs the pmem buffer flush on all scheduled ops, then schedules
+ * the log event append operation for all of them.
+ */
+template <typename I>
+void WriteLog<I>::flush_then_append_scheduled_ops(void)
+{
+ GenericLogOperations ops;
+ bool ops_remain = false;
+ ldout(m_image_ctx.cct, 20) << dendl;
+ do {
+ {
+ ops.clear();
+ std::lock_guard locker(m_lock);
+ if (m_ops_to_flush.size()) {
+ auto last_in_batch = m_ops_to_flush.begin();
+ unsigned int ops_to_flush = m_ops_to_flush.size();
+ if (ops_to_flush > ops_flushed_together) {
+ ops_to_flush = ops_flushed_together;
+ }
+ ldout(m_image_ctx.cct, 20) << "should flush " << ops_to_flush << dendl;
+ std::advance(last_in_batch, ops_to_flush);
+ ops.splice(ops.end(), m_ops_to_flush, m_ops_to_flush.begin(), last_in_batch);
+ ops_remain = !m_ops_to_flush.empty();
+ ldout(m_image_ctx.cct, 20) << "flushing " << ops.size() << ", "
+ << m_ops_to_flush.size() << " remain" << dendl;
+ } else {
+ ops_remain = false;
+ }
+ }
+ if (ops_remain) {
+ enlist_op_flusher();
+ }
+
+ /* Ops subsequently scheduled for flush may finish before these,
+ * which is fine. We're unconcerned with completion order until we
+ * get to the log message append step. */
+ if (ops.size()) {
+ flush_pmem_buffer(ops);
+ schedule_append_ops(ops);
+ }
+ } while (ops_remain);
+ append_scheduled_ops();
+}
+
+/*
+ * Performs the log event append operation for all of the scheduled
+ * events.
+ */
+template <typename I>
+void WriteLog<I>::append_scheduled_ops(void) {
+ GenericLogOperations ops;
+ int append_result = 0;
+ bool ops_remain = false;
+ bool appending = false; /* true if we set m_appending */
+ ldout(m_image_ctx.cct, 20) << dendl;
+ do {
+ ops.clear();
+ this->append_scheduled(ops, ops_remain, appending, true);
+
+ if (ops.size()) {
+ std::lock_guard locker(this->m_log_append_lock);
+ alloc_op_log_entries(ops);
+ append_result = append_op_log_entries(ops);
+ }
+
+ int num_ops = ops.size();
+ if (num_ops) {
+ /* New entries may be flushable. Completion will wake up flusher. */
+ this->complete_op_log_entries(std::move(ops), append_result);
+ }
+ } while (ops_remain);
+}
+
+template <typename I>
+void WriteLog<I>::enlist_op_flusher()
+{
+ this->m_async_flush_ops++;
+ this->m_async_op_tracker.start_op();
+ Context *flush_ctx = new LambdaContext([this](int r) {
+ flush_then_append_scheduled_ops();
+ this->m_async_flush_ops--;
+ this->m_async_op_tracker.finish_op();
+ });
+ this->m_work_queue.queue(flush_ctx);
+}
+
+template <typename I>
+void WriteLog<I>::setup_schedule_append(
+ pwl::GenericLogOperationsVector &ops, bool do_early_flush) {
+ if (do_early_flush) {
+ /* This caller is waiting for persist, so we'll use their thread to
+ * expedite it */
+ flush_pmem_buffer(ops);
+ this->schedule_append(ops);
+ } else {
+ /* This is probably not still the caller's thread, so do the payload
+ * flushing/replicating later. */
+ schedule_flush_and_append(ops);
+ }
+}
+
+/*
+ * Takes custody of ops. They'll all get their log entries appended,
+ * and have their on_write_persist contexts completed once they and
+ * all prior log entries are persisted everywhere.
+ */
+template <typename I>
+void WriteLog<I>::schedule_append_ops(GenericLogOperations &ops)
+{
+ bool need_finisher;
+ GenericLogOperationsVector appending;
+
+ std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending));
+ {
+ std::lock_guard locker(m_lock);
+
+ need_finisher = this->m_ops_to_append.empty() && !this->m_appending;
+ this->m_ops_to_append.splice(this->m_ops_to_append.end(), ops);
+ }
+
+ if (need_finisher) {
+ // enlist op appender
+ this->m_async_append_ops++;
+ this->m_async_op_tracker.start_op();
+ Context *append_ctx = new LambdaContext([this](int r) {
+ append_scheduled_ops();
+ this->m_async_append_ops--;
+ this->m_async_op_tracker.finish_op();
+ });
+ this->m_work_queue.queue(append_ctx);
+ }
+
+ for (auto &op : appending) {
+ op->appending();
+ }
+}
+
+/*
+ * Takes custody of ops. They'll all get their pmem blocks flushed,
+ * then get their log entries appended.
+ */
+template <typename I>
+void WriteLog<I>::schedule_flush_and_append(GenericLogOperationsVector &ops)
+{
+ GenericLogOperations to_flush(ops.begin(), ops.end());
+ bool need_finisher;
+ ldout(m_image_ctx.cct, 20) << dendl;
+ {
+ std::lock_guard locker(m_lock);
+
+ need_finisher = m_ops_to_flush.empty();
+ m_ops_to_flush.splice(m_ops_to_flush.end(), to_flush);
+ }
+
+ if (need_finisher) {
+ enlist_op_flusher();
+ }
+}
+
+template <typename I>
+void WriteLog<I>::process_work() {
+ CephContext *cct = m_image_ctx.cct;
+ int max_iterations = 4;
+ bool wake_up_requested = false;
+ uint64_t aggressive_high_water_bytes = this->m_bytes_allocated_cap * AGGRESSIVE_RETIRE_HIGH_WATER;
+ uint64_t high_water_bytes = this->m_bytes_allocated_cap * RETIRE_HIGH_WATER;
+ uint64_t low_water_bytes = this->m_bytes_allocated_cap * RETIRE_LOW_WATER;
+ uint64_t aggressive_high_water_entries = this->m_total_log_entries * AGGRESSIVE_RETIRE_HIGH_WATER;
+ uint64_t high_water_entries = this->m_total_log_entries * RETIRE_HIGH_WATER;
+ uint64_t low_water_entries = this->m_total_log_entries * RETIRE_LOW_WATER;
+
+ ldout(cct, 20) << dendl;
+
+ do {
+ {
+ std::lock_guard locker(m_lock);
+ this->m_wake_up_requested = false;
+ }
+ if (this->m_alloc_failed_since_retire || this->m_invalidating ||
+ this->m_bytes_allocated > high_water_bytes ||
+ (m_log_entries.size() > high_water_entries)) {
+ int retired = 0;
+ utime_t started = ceph_clock_now();
+ ldout(m_image_ctx.cct, 10) << "alloc_fail=" << this->m_alloc_failed_since_retire
+ << ", allocated > high_water="
+ << (this->m_bytes_allocated > high_water_bytes)
+ << ", allocated_entries > high_water="
+ << (m_log_entries.size() > high_water_entries)
+ << dendl;
+ while (this->m_alloc_failed_since_retire || this->m_invalidating ||
+ (this->m_bytes_allocated > high_water_bytes) ||
+ (m_log_entries.size() > high_water_entries) ||
+ (((this->m_bytes_allocated > low_water_bytes) ||
+ (m_log_entries.size() > low_water_entries)) &&
+ (utime_t(ceph_clock_now() - started).to_msec() < RETIRE_BATCH_TIME_LIMIT_MS))) {
+ if (!retire_entries((this->m_shutting_down || this->m_invalidating ||
+ (this->m_bytes_allocated > aggressive_high_water_bytes) ||
+ (m_log_entries.size() > aggressive_high_water_entries))
+ ? MAX_ALLOC_PER_TRANSACTION
+ : MAX_FREE_PER_TRANSACTION)) {
+ break;
+ }
+ retired++;
+ this->dispatch_deferred_writes();
+ this->process_writeback_dirty_entries();
+ }
+ ldout(m_image_ctx.cct, 10) << "Retired " << retired << " times" << dendl;
+ }
+ this->dispatch_deferred_writes();
+ this->process_writeback_dirty_entries();
+
+ {
+ std::lock_guard locker(m_lock);
+ wake_up_requested = this->m_wake_up_requested;
+ }
+ } while (wake_up_requested && --max_iterations > 0);
+
+ {
+ std::lock_guard locker(m_lock);
+ this->m_wake_up_scheduled = false;
+ /* Reschedule if it's still requested */
+ if (this->m_wake_up_requested) {
+ this->wake_up();
+ }
+ }
+}
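
// Sketch of the watermark policy process_work() applies above: the retire
// batch grows once usage crosses the aggressive threshold. The fractions
// mirror the RETIRE_* constants; names and values here are illustrative.
#include <cstdint>

enum class RetireBatch { None, Normal, Aggressive };

RetireBatch pick_batch(uint64_t allocated, uint64_t cap,
                       double high_water, double aggressive_high_water) {
  if (allocated > cap * aggressive_high_water) {
    return RetireBatch::Aggressive;  // MAX_ALLOC_PER_TRANSACTION entries
  }
  if (allocated > cap * high_water) {
    return RetireBatch::Normal;      // MAX_FREE_PER_TRANSACTION entries
  }
  return RetireBatch::None;
}
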
+
+/*
+ * Flush the pmem regions for the data blocks of a set of operations
+ *
+ * V is expected to be GenericLogOperations<I>, or GenericLogOperationsVector<I>
+ */
+template <typename I>
+template <typename V>
+void WriteLog<I>::flush_pmem_buffer(V& ops)
+{
+ for (auto &operation : ops) {
+ if (operation->is_writing_op()) {
+ auto log_entry = static_pointer_cast<WriteLogEntry>(operation->get_log_entry());
+ pmemobj_flush(m_log_pool, log_entry->cache_buffer, log_entry->write_bytes());
+ }
+ }
+
+ /* Drain once for all */
+ pmemobj_drain(m_log_pool);
+
+ utime_t now = ceph_clock_now();
+ for (auto &operation : ops) {
+ if (operation->reserved_allocated()) {
+ operation->buf_persist_comp_time = now;
+ } else {
+ ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl;
+ }
+ }
+}
+
+/**
+ * Update/persist the last flushed sync point in the log
+ */
+template <typename I>
+void WriteLog<I>::persist_last_flushed_sync_gen()
+{
+ TOID(struct WriteLogPoolRoot) pool_root;
+ pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot);
+ uint64_t flushed_sync_gen;
+
+ std::lock_guard append_locker(this->m_log_append_lock);
+ {
+ std::lock_guard locker(m_lock);
+ flushed_sync_gen = this->m_flushed_sync_gen;
+ }
+
+ if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) {
+ ldout(m_image_ctx.cct, 15) << "flushed_sync_gen in log updated from "
+ << D_RO(pool_root)->flushed_sync_gen << " to "
+ << flushed_sync_gen << dendl;
+ TX_BEGIN(m_log_pool) {
+ D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen;
+ } TX_ONCOMMIT {
+ } TX_ONABORT {
+ lderr(m_image_ctx.cct) << "failed to commit update of flushed sync point" << dendl;
+ ceph_assert(false);
+ } TX_FINALLY {
+ } TX_END;
+ }
+}
+
+template <typename I>
+void WriteLog<I>::reserve_cache(C_BlockIORequestT *req,
+ bool &alloc_succeeds, bool &no_space) {
+ std::vector<WriteBufferAllocation>& buffers = req->get_resources_buffers();
+ for (auto &buffer : buffers) {
+ utime_t before_reserve = ceph_clock_now();
+ buffer.buffer_oid = pmemobj_reserve(m_log_pool,
+ &buffer.buffer_alloc_action,
+ buffer.allocation_size,
+ 0 /* Object type */);
+ buffer.allocation_lat = ceph_clock_now() - before_reserve;
+ if (TOID_IS_NULL(buffer.buffer_oid)) {
+ if (!req->has_io_waited_for_buffers()) {
+ req->set_io_waited_for_buffers(true);
+ }
+ ldout(m_image_ctx.cct, 5) << "can't allocate all data buffers: "
+ << pmemobj_errormsg() << ". "
+ << *req << dendl;
+ alloc_succeeds = false;
+ no_space = true; /* Entries need to be retired */
+ break;
+ } else {
+ buffer.allocated = true;
+ }
+ ldout(m_image_ctx.cct, 20) << "Allocated " << buffer.buffer_oid.oid.pool_uuid_lo
+ << "." << buffer.buffer_oid.oid.off
+ << ", size=" << buffer.allocation_size << dendl;
+ }
+}
+
+template<typename I>
+void WriteLog<I>::copy_bl_to_buffer(
+ WriteRequestResources *resources, std::unique_ptr<WriteLogOperationSet> &op_set) {
+ auto allocation = resources->buffers.begin();
+ for (auto &operation : op_set->operations) {
+ operation->copy_bl_to_cache_buffer(allocation);
+ allocation++;
+ }
+}
+
+template <typename I>
+bool WriteLog<I>::alloc_resources(C_BlockIORequestT *req) {
+ bool alloc_succeeds = true;
+ uint64_t bytes_allocated = 0;
+ uint64_t bytes_cached = 0;
+ uint64_t bytes_dirtied = 0;
+ uint64_t num_lanes = 0;
+ uint64_t num_unpublished_reserves = 0;
+ uint64_t num_log_entries = 0;
+
+ ldout(m_image_ctx.cct, 20) << dendl;
+ // Set up the buffers and compute the counts of required resources
+ req->setup_buffer_resources(&bytes_cached, &bytes_dirtied, &bytes_allocated,
+ &num_lanes, &num_log_entries, &num_unpublished_reserves);
+
+ alloc_succeeds = this->check_allocation(req, bytes_cached, bytes_dirtied, bytes_allocated,
+ num_lanes, num_log_entries, num_unpublished_reserves,
+ this->m_bytes_allocated_cap);
+
+ std::vector<WriteBufferAllocation>& buffers = req->get_resources_buffers();
+ if (!alloc_succeeds) {
+ /* On alloc failure, free any buffers we did allocate */
+ for (auto &buffer : buffers) {
+ if (buffer.allocated) {
+ pmemobj_cancel(m_log_pool, &buffer.buffer_alloc_action, 1);
+ }
+ }
+ }
+
+ req->set_allocated(alloc_succeeds);
+ return alloc_succeeds;
+}
+
+template <typename I>
+void WriteLog<I>::complete_user_request(Context *&user_req, int r) {
+ user_req->complete(r);
+ // Clear user_req; complete() deletes the context
+ user_req = nullptr;
+}
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::rwl::WriteLog<librbd::ImageCtx>;
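
// Minimal sketch of the libpmemobj transaction pattern used throughout this
// file: snapshot root fields with TX_ADD inside TX_BEGIN so the update
// commits atomically or rolls back. my_layout/my_root are illustrative
// stand-ins, not part of the patch.
#include <cstdint>
#include <libpmemobj.h>

struct my_root { uint64_t first_free_entry; };
POBJ_LAYOUT_BEGIN(my_layout);
POBJ_LAYOUT_ROOT(my_layout, struct my_root);
POBJ_LAYOUT_END(my_layout);

int advance_head(PMEMobjpool *pop, uint64_t new_head) {
  TOID(struct my_root) root = POBJ_ROOT(pop, struct my_root);
  int ret = 0;
  TX_BEGIN(pop) {
    TX_ADD(root);                             // undo-log the root object
    D_RW(root)->first_free_entry = new_head;  // persisted on commit
  } TX_ONABORT {
    ret = -1;
  } TX_END;
  return ret;
}
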
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
+#define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
+
+#include <functional>
+#include <libpmemobj.h>
+#include <list>
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "common/AsyncOpTracker.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/Utils.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/cache/Types.h"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+#include "librbd/cache/pwl/LogMap.h"
+#include "librbd/cache/pwl/LogOperation.h"
+#include "librbd/cache/pwl/Request.h"
+#include "librbd/cache/pwl/rwl/Builder.h"
+
+class Context;
+class SafeTimer;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+namespace pwl {
+namespace rwl {
+
+template <typename ImageCtxT>
+class WriteLog : public AbstractWriteLog<ImageCtxT> {
+public:
+ WriteLog(
+ ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
+ ImageWritebackInterface& image_writeback,
+ plugin::Api<ImageCtxT>& plugin_api);
+ ~WriteLog();
+ WriteLog(const WriteLog&) = delete;
+ WriteLog &operator=(const WriteLog&) = delete;
+
+ using This = AbstractWriteLog<ImageCtxT>;
+ using C_WriteRequestT = pwl::C_WriteRequest<This>;
+ using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
+
+ void copy_bl_to_buffer(
+ WriteRequestResources *resources, std::unique_ptr<WriteLogOperationSet> &op_set) override;
+ void complete_user_request(Context *&user_req, int r) override;
+private:
+ using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
+ using C_FlushRequestT = pwl::C_FlushRequest<This>;
+ using C_DiscardRequestT = pwl::C_DiscardRequest<This>;
+
+ PMEMobjpool *m_log_pool = nullptr;
+ Builder<This> *m_builderobj;
+ const char* m_pwl_pool_layout_name;
+
+ Builder<This>* create_builder();
+ void remove_pool_file();
+ void load_existing_entries(pwl::DeferredContexts &later);
+ void alloc_op_log_entries(pwl::GenericLogOperations &ops);
+ int append_op_log_entries(pwl::GenericLogOperations &ops);
+ void flush_then_append_scheduled_ops(void);
+ void enlist_op_flusher();
+ void flush_op_log_entries(pwl::GenericLogOperationsVector &ops);
+ template <typename V>
+ void flush_pmem_buffer(V& ops);
+
+protected:
+ using AbstractWriteLog<ImageCtxT>::m_lock;
+ using AbstractWriteLog<ImageCtxT>::m_log_entries;
+ using AbstractWriteLog<ImageCtxT>::m_image_ctx;
+ using AbstractWriteLog<ImageCtxT>::m_perfcounter;
+ using AbstractWriteLog<ImageCtxT>::m_ops_to_flush;
+ using AbstractWriteLog<ImageCtxT>::m_cache_state;
+ using AbstractWriteLog<ImageCtxT>::m_first_free_entry;
+ using AbstractWriteLog<ImageCtxT>::m_first_valid_entry;
+
+ void process_work() override;
+ void schedule_append_ops(pwl::GenericLogOperations &ops) override;
+ void append_scheduled_ops(void) override;
+ void reserve_cache(C_BlockIORequestT *req, bool &alloc_succeeds, bool &no_space) override;
+ bool retire_entries(const unsigned long int frees_per_tx) override;
+ void persist_last_flushed_sync_gen() override;
+ bool alloc_resources(C_BlockIORequestT *req) override;
+ void schedule_flush_and_append(pwl::GenericLogOperationsVector &ops) override;
+ void setup_schedule_append(
+ pwl::GenericLogOperationsVector &ops, bool do_early_flush) override;
+ Context *construct_flush_entry_ctx(
+ const std::shared_ptr<pwl::GenericLogEntry> log_entry) override;
+ void initialize_pool(Context *on_finish, pwl::DeferredContexts &later) override;
+ void write_data_to_buffer(
+ std::shared_ptr<pwl::WriteLogEntry> ws_entry,
+ pwl::WriteLogCacheEntry *pmem_entry) override;
+};
+
+} // namespace rwl
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::rwl::WriteLog<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_BUILDER_H
+#define CEPH_LIBRBD_CACHE_PWL_SSD_BUILDER_H
+
+#include <iostream>
+#include "LogEntry.h"
+#include "Request.h"
+
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/pwl/Builder.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+template <typename T>
+class Builder : public pwl::Builder<T> {
+public:
+ std::shared_ptr<pwl::WriteLogEntry> create_write_log_entry(
+ uint64_t image_offset_bytes, uint64_t write_bytes) override {
+ return std::make_shared<WriteLogEntry>(image_offset_bytes, write_bytes);
+ }
+ std::shared_ptr<pwl::WriteLogEntry> create_write_log_entry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes) override {
+ return std::make_shared<WriteLogEntry>(
+ sync_point_entry, image_offset_bytes, write_bytes);
+ }
+ std::shared_ptr<pwl::WriteLogEntry> create_writesame_log_entry(
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length) override {
+ return std::make_shared<WriteSameLogEntry>(
+ image_offset_bytes, write_bytes, data_length);
+ }
+ std::shared_ptr<pwl::WriteLogEntry> create_writesame_log_entry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length) override {
+ return std::make_shared<WriteSameLogEntry>(
+ sync_point_entry, image_offset_bytes, write_bytes, data_length);
+ }
+ pwl::C_WriteRequest<T> *create_write_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) override {
+ return new C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req);
+ }
+ pwl::C_WriteSameRequest<T> *create_writesame_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) override {
+ return new C_WriteSameRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req);
+ }
+ pwl::C_WriteRequest<T> *create_comp_and_write_request(
+ T &pwl, utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req) override {
+ return new C_CompAndWriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset, fadvise_flags,
+ lock, perfcounter, user_req);
+ }
+ std::shared_ptr<pwl::WriteLogOperation> create_write_log_operation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, CephContext *cct,
+ std::shared_ptr<pwl::WriteLogEntry> write_log_entry) {
+ return std::make_shared<WriteLogOperation>(
+ set, image_offset_bytes, write_bytes, cct, write_log_entry);
+ }
+ std::shared_ptr<pwl::WriteLogOperation> create_write_log_operation(
+ WriteLogOperationSet &set, uint64_t image_offset_bytes,
+ uint64_t write_bytes, uint32_t data_len, CephContext *cct,
+ std::shared_ptr<pwl::WriteLogEntry> writesame_log_entry) {
+ return std::make_shared<WriteLogOperation>(
+ set, image_offset_bytes, write_bytes, data_len, cct,
+ writesame_log_entry);
+ }
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_BUILDER_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/pwl/ssd/LogEntry.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::WriteLogEntry: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
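+// Note: init_cache_bl() shares src_bl's underlying buffers rather than
+// copying, and (judging by its call sites) runs before the entry is visible
+// to other threads, which is why it does not take m_entry_bl_lock the way
+// get_cache_bl() and remove_cache_bl() do.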
+void WriteLogEntry::init_cache_bl(
+ bufferlist &src_bl, uint64_t off, uint64_t len) {
+ cache_bl.clear();
+ cache_bl.substr_of(src_bl, off, len);
+}
+
+buffer::list& WriteLogEntry::get_cache_bl() {
+ std::lock_guard locker(m_entry_bl_lock);
+ return cache_bl;
+}
+
+void WriteLogEntry::remove_cache_bl() {
+ std::lock_guard locker(m_entry_bl_lock);
+ cache_bl.clear();
+}
+
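+// Payload footprint of this entry in the SSD ring, rounded up to the
+// allocation unit. Illustrative example: a 5000-byte write occupies 8192
+// bytes, assuming a 4 KiB MIN_WRITE_ALLOC_SSD_SIZE.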
+unsigned int WriteLogEntry::get_aligned_data_size() const {
+ if (cache_bl.length()) {
+ return round_up_to(cache_bl.length(), MIN_WRITE_ALLOC_SSD_SIZE);
+ }
+ return round_up_to(write_bytes(), MIN_WRITE_ALLOC_SSD_SIZE);
+}
+
+void WriteLogEntry::writeback_bl(
+ librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx, ceph::bufferlist&& bl) {
+ image_writeback.aio_write({{ram_entry.image_offset_bytes,
+ ram_entry.write_bytes}},
+ std::move(bl), 0, ctx);
+}
+
+void WriteSameLogEntry::writeback_bl(
+ librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx, ceph::bufferlist &&bl) {
+ image_writeback.aio_writesame(ram_entry.image_offset_bytes,
+ ram_entry.write_bytes,
+ std::move(bl), 0, ctx);
+}
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_LOG_ENTRY_H
+#define CEPH_LIBRBD_CACHE_PWL_SSD_LOG_ENTRY_H
+
+#include "librbd/cache/pwl/LogEntry.h"
+
+namespace librbd {
+namespace cache {
+class ImageWritebackInterface;
+namespace pwl {
+namespace ssd {
+
+class WriteLogEntry : public pwl::WriteLogEntry {
+public:
+ WriteLogEntry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes)
+ : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {}
+ WriteLogEntry(
+ uint64_t image_offset_bytes, uint64_t write_bytes)
+ : pwl::WriteLogEntry(image_offset_bytes, write_bytes) {}
+ WriteLogEntry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes,
+ write_bytes, data_length) {}
+ WriteLogEntry(
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : pwl::WriteLogEntry(image_offset_bytes, write_bytes, data_length) {}
+ ~WriteLogEntry() {}
+ WriteLogEntry(const WriteLogEntry&) = delete;
+ WriteLogEntry &operator=(const WriteLogEntry&) = delete;
+ void writeback_bl(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx, ceph::bufferlist &&bl) override;
+ void init_cache_bl(bufferlist &src_bl, uint64_t off, uint64_t len) override;
+ buffer::list &get_cache_bl() override;
+ void remove_cache_bl() override;
+ unsigned int get_aligned_data_size() const override;
+};
+
+class WriteSameLogEntry : public WriteLogEntry {
+public:
+ WriteSameLogEntry(
+ std::shared_ptr<SyncPointLogEntry> sync_point_entry,
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : WriteLogEntry(sync_point_entry, image_offset_bytes,
+ write_bytes, data_length) {}
+ WriteSameLogEntry(
+ uint64_t image_offset_bytes, uint64_t write_bytes,
+ uint32_t data_length)
+ : WriteLogEntry(image_offset_bytes, write_bytes, data_length) {}
+ ~WriteSameLogEntry() {}
+ WriteSameLogEntry(const WriteSameLogEntry&) = delete;
+ WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete;
+ void writeback_bl(librbd::cache::ImageWritebackInterface &image_writeback,
+ Context *ctx, ceph::bufferlist &&bl) override;
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_LOG_ENTRY_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Request.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::Request: " \
+                           << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
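+// Worked example for setup_buffer_resources() below (illustrative, assuming
+// a 4 KiB MIN_WRITE_ALLOC_SSD_SIZE): extents of 1 KiB and 5 KiB give
+// bytes_cached = 6144, bytes_allocated = 4096 + 8192 = 12288, and two lanes
+// and two log entries.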
+template <typename T>
+void C_WriteRequest<T>::setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {
+
+ auto image_extents_size = this->image_extents.size();
+ *bytes_cached = 0;
+ *bytes_allocated = 0;
+ *number_lanes = image_extents_size;
+ *number_log_entries = image_extents_size;
+
+ for (auto &extent : this->image_extents) {
+ *bytes_cached += extent.second;
+ *bytes_allocated += round_up_to(extent.second, MIN_WRITE_ALLOC_SSD_SIZE);
+ }
+ *bytes_dirtied = *bytes_cached;
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+ const C_CompAndWriteRequest<T> &req) {
+  os << (C_WriteRequest<T>&)req << ", "
+    << "cmp_bl=" << req.cmp_bl << ", "
+ << "read_bl=" << req.read_bl << ", "
+ << "compare_succeeded=" << req.compare_succeeded << ", "
+ << "mismatch_offset=" << req.mismatch_offset;
+ return os;
+}
+
+template <typename T>
+void C_WriteSameRequest<T>::setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+ uint64_t *number_lanes, uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) {
+  ceph_assert(this->image_extents.size() == 1);
+  *number_log_entries = 1;
+  *bytes_dirtied = this->image_extents[0].second;
+ *bytes_cached = this->bl.length();
+ *bytes_allocated = round_up_to(*bytes_cached, MIN_WRITE_ALLOC_SSD_SIZE);
+}
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ssd::C_WriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::ssd::C_WriteSameRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
+template class librbd::cache::pwl::ssd::C_CompAndWriteRequest<librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx> >;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_SSD_REQUEST_H
+#define CEPH_LIBRBD_CACHE_SSD_REQUEST_H
+
+#include "librbd/cache/pwl/Request.h"
+
+namespace librbd {
+class BlockGuardCell;
+
+namespace cache {
+namespace pwl {
+
+template<typename T>
+class AbstractWriteLog;
+
+namespace ssd {
+
+template <typename T>
+class C_WriteRequest : public pwl::C_WriteRequest<T> {
+public:
+ C_WriteRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : pwl::C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset, fadvise_flags,
+ lock, perfcounter, user_req) {}
+
+ C_WriteRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : pwl::C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl),
+ fadvise_flags, lock, perfcounter, user_req) {}
+protected:
+ void setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+ uint64_t *bytes_allocated, uint64_t *number_lanes,
+ uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) override;
+};
+
+template <typename T>
+class C_CompAndWriteRequest : public C_WriteRequest<T> {
+public:
+ C_CompAndWriteRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset,
+ const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : C_WriteRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(cmp_bl),
+ std::move(bl), mismatch_offset,fadvise_flags,
+ lock, perfcounter, user_req) {}
+
+ const char *get_name() const override {
+ return "C_CompAndWriteRequest";
+ }
+ template <typename U>
+ friend std::ostream &operator<<(std::ostream &os,
+ const C_CompAndWriteRequest<U> &req);
+};
+
+template <typename T>
+class C_WriteSameRequest : public pwl::C_WriteSameRequest<T> {
+public:
+ C_WriteSameRequest(
+ T &pwl, const utime_t arrived, io::Extents &&image_extents,
+ bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock,
+ PerfCounters *perfcounter, Context *user_req)
+ : pwl::C_WriteSameRequest<T>(
+ pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags,
+ lock, perfcounter, user_req) {}
+
+ void setup_buffer_resources(
+ uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+ uint64_t *bytes_allocated, uint64_t *number_lanes,
+ uint64_t *number_log_entries,
+ uint64_t *number_unpublished_reserves) override;
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_SSD_REQUEST_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_SSD_TYPES_H
+#define CEPH_LIBRBD_CACHE_SSD_TYPES_H
+
+#include "acconfig.h"
+
+#include "librbd/io/Types.h"
+#include "librbd/cache/pwl/Types.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
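+// On-disk superblock, stored in the first MIN_WRITE_ALLOC_SSD_SIZE block of
+// the cache file (see WriteLog::update_pool_root_sync()).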
+struct SuperBlock {
+ WriteLogPoolRoot root;
+
+ DENC(SuperBlock, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.root, p);
+ DENC_FINISH(p);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_object("super", root);
+ }
+
+  static void generate_test_instances(std::list<SuperBlock*>& ls) {
+ ls.push_back(new SuperBlock);
+ ls.push_back(new SuperBlock);
+ ls.back()->root.first_valid_entry = 2;
+ }
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+WRITE_CLASS_DENC(librbd::cache::pwl::ssd::SuperBlock)
+
+#endif // CEPH_LIBRBD_CACHE_SSD_TYPES_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "WriteLog.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include <map>
+#include <vector>
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::WriteLog: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+using namespace librbd::cache::pwl;
+
+// SSD: this number can be updated later
+const unsigned long int ops_appended_together = MAX_WRITES_PER_SYNC_POINT;
+
+template <typename I>
+Builder<AbstractWriteLog<I>>* WriteLog<I>::create_builder() {
+ m_builderobj = new Builder<This>();
+ return m_builderobj;
+}
+
+template <typename I>
+WriteLog<I>::WriteLog(
+ I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
+ cache::ImageWritebackInterface& image_writeback,
+ plugin::Api<I>& plugin_api)
+ : AbstractWriteLog<I>(image_ctx, cache_state, create_builder(),
+ image_writeback, plugin_api)
+{
+}
+
+template <typename I>
+WriteLog<I>::~WriteLog() {
+ delete m_builderobj;
+}
+
+template <typename I>
+void WriteLog<I>::initialize_pool(Context *on_finish,
+ pwl::DeferredContexts &later) {
+ CephContext *cct = m_image_ctx.cct;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ if (access(this->m_log_pool_name.c_str(), F_OK) != 0) {
+ int fd = ::open(this->m_log_pool_name.c_str(), O_RDWR|O_CREAT, 0644);
+ bool succeed = true;
+ if (fd >= 0) {
+ if (truncate(this->m_log_pool_name.c_str(),
+ this->m_log_pool_config_size) != 0) {
+ succeed = false;
+ }
+ ::close(fd);
+ } else {
+ succeed = false;
+ }
+ if (!succeed) {
+ m_cache_state->present = false;
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ /* TODO: filter/replace errnos that are meaningless to the caller */
+ on_finish->complete(-errno);
+ return;
+ }
+
+ bdev = BlockDevice::create(cct, this->m_log_pool_name, aio_cache_cb,
+ nullptr, nullptr, nullptr);
+ int r = bdev->open(this->m_log_pool_name);
+ if (r < 0) {
+ delete bdev;
+      on_finish->complete(r);
+ return;
+ }
+ m_cache_state->present = true;
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ /* new pool, calculate and store metadata */
+ size_t small_write_size = MIN_WRITE_ALLOC_SSD_SIZE + sizeof(struct WriteLogCacheEntry);
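+    /* Illustrative sizing, assuming a 4 KiB MIN_WRITE_ALLOC_SSD_SIZE and a
+     * 1 GiB pool: each small write costs one data block plus one on-disk
+     * WriteLogCacheEntry, so roughly 1 GiB / (4 KiB + sizeof(entry))
+     * entries fit, capped at MAX_LOG_ENTRIES below. */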
+
+ uint64_t num_small_writes = (uint64_t)(this->m_log_pool_config_size / small_write_size);
+ if (num_small_writes > MAX_LOG_ENTRIES) {
+ num_small_writes = MAX_LOG_ENTRIES;
+ }
+    ceph_assert(num_small_writes > 2);
+ m_log_pool_ring_buffer_size = this->m_log_pool_config_size - DATA_RING_BUFFER_OFFSET;
+ /* Log ring empty */
+ m_first_free_entry = DATA_RING_BUFFER_OFFSET;
+ m_first_valid_entry = DATA_RING_BUFFER_OFFSET;
+
+ pool_size = this->m_log_pool_config_size;
+ auto new_root = std::make_shared<WriteLogPoolRoot>(pool_root);
+ new_root->pool_size = this->m_log_pool_config_size;
+ new_root->flushed_sync_gen = this->m_flushed_sync_gen;
+ new_root->block_size = MIN_WRITE_ALLOC_SSD_SIZE;
+ new_root->first_free_entry = m_first_free_entry;
+ new_root->first_valid_entry = m_first_valid_entry;
+ new_root->num_log_entries = num_small_writes;
+ pool_root = *new_root;
+
+ r = update_pool_root_sync(new_root);
+ if (r != 0) {
+ this->m_total_log_entries = 0;
+ this->m_free_log_entries = 0;
+ lderr(m_image_ctx.cct) << "failed to initialize pool ("
+ << this->m_log_pool_name << ")" << dendl;
+      on_finish->complete(r);
+      return;
+    }
+ this->m_total_log_entries = new_root->num_log_entries;
+ this->m_free_log_entries = new_root->num_log_entries - 1;
+ } else {
+ m_cache_state->present = true;
+ bdev = BlockDevice::create(
+ cct, this->m_log_pool_name, aio_cache_cb,
+ static_cast<void*>(this), nullptr, static_cast<void*>(this));
+ int r = bdev->open(this->m_log_pool_name);
+ if (r < 0) {
+ delete bdev;
+ on_finish->complete(r);
+ return;
+ }
+ load_existing_entries(later);
+ if (m_first_free_entry < m_first_valid_entry) {
+ /* Valid entries wrap around the end of the ring, so first_free is lower
+ * than first_valid. If first_valid was == first_free+1, the entry at
+ * first_free would be empty. The last entry is never used, so in
+ * that case there would be zero free log entries. */
+ this->m_free_log_entries = this->m_total_log_entries -
+ (m_first_valid_entry - m_first_free_entry) - 1;
+ } else {
+ /* first_valid is <= first_free. If they are == we have zero valid log
+ * entries, and n-1 free log entries */
+ this->m_free_log_entries = this->m_total_log_entries -
+ (m_first_free_entry - m_first_valid_entry) - 1;
+ }
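+    /* Illustrative example: with 100 total entries, first_valid = 10 and
+     * first_free = 60, 50 entries are in use and 100 - 50 - 1 = 49 are
+     * free; keeping one slot unused lets first_free == first_valid mean
+     * "empty" unambiguously. */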
+ m_cache_state->clean = this->m_dirty_log_entries.empty();
+ m_cache_state->empty = m_log_entries.empty();
+ }
+}
+
+template <typename I>
+void WriteLog<I>::remove_pool_file() {
+ ceph_assert(bdev);
+ bdev->close();
+ delete bdev;
+ bdev = nullptr;
+ ldout(m_image_ctx.cct, 5) << "block device is closed" << dendl;
+
+ if (m_cache_state->clean) {
+ ldout(m_image_ctx.cct, 5) << "Removing empty pool file: "
+ << this->m_log_pool_name << dendl;
+ if (remove(this->m_log_pool_name.c_str()) != 0) {
+ lderr(m_image_ctx.cct) << "failed to remove empty pool \""
+ << this->m_log_pool_name << "\": " << dendl;
+ } else {
+ m_cache_state->clean = true;
+ m_cache_state->empty = true;
+ m_cache_state->present = false;
+ }
+ } else {
+ ldout(m_image_ctx.cct, 5) << "Not removing pool file: "
+ << this->m_log_pool_name << dendl;
+ }
+}
+
+template <typename I>
+void WriteLog<I>::load_existing_entries(pwl::DeferredContexts &later) {
+ bufferlist bl;
+ CephContext *cct = m_image_ctx.cct;
+ ::IOContext ioctx(cct, nullptr);
+ bdev->read(0, MIN_WRITE_ALLOC_SSD_SIZE, &bl, &ioctx, false);
+ SuperBlock superblock;
+
+ auto p = bl.cbegin();
+ decode(superblock, p);
+  ldout(cct, 5) << "Decoded superblock" << dendl;
+
+ WriteLogPoolRoot current_pool_root = superblock.root;
+  uint64_t next_log_pos = current_pool_root.first_valid_entry;
+  uint64_t first_free_entry = current_pool_root.first_free_entry;
+ uint64_t curr_log_pos;
+
+ pool_root = current_pool_root;
+ m_first_free_entry = first_free_entry;
+ m_first_valid_entry = next_log_pos;
+ this->m_total_log_entries = current_pool_root.num_log_entries;
+ this->m_flushed_sync_gen = current_pool_root.flushed_sync_gen;
+  this->m_log_pool_actual_size = current_pool_root.pool_size;
+  // These two are otherwise only initialized on the new-pool path, but the
+  // flush and allocation paths read them for an existing pool as well.
+  pool_size = current_pool_root.pool_size;
+  m_log_pool_ring_buffer_size = pool_size - DATA_RING_BUFFER_OFFSET;
+
+ std::map<uint64_t, std::shared_ptr<SyncPointLogEntry>> sync_point_entries;
+
+ std::map<uint64_t, bool> missing_sync_points;
+
+  // Walk the on-disk ring from the first valid entry to the first free
+  // entry: decode each control block, append its entries to the in-memory
+  // log, and advance by each entry's aligned payload plus the control
+  // block itself.
+ while (next_log_pos != first_free_entry) {
+ // read the entries from SSD cache and decode
+ bufferlist bl_entries;
+ ::IOContext ioctx_entry(cct, nullptr);
+ bdev->read(next_log_pos, MIN_WRITE_ALLOC_SSD_SIZE, &bl_entries,
+ &ioctx_entry, false);
+ std::vector<WriteLogCacheEntry> ssd_log_entries;
+ auto pl = bl_entries.cbegin();
+ decode(ssd_log_entries, pl);
+ ldout(cct, 5) << "decoded ssd log entries" << dendl;
+ curr_log_pos = next_log_pos;
+ std::shared_ptr<GenericLogEntry> log_entry = nullptr;
+
+ for (auto it = ssd_log_entries.begin(); it != ssd_log_entries.end(); ++it) {
+ this->update_entries(log_entry, &*it, missing_sync_points,
+ sync_point_entries, curr_log_pos);
+ log_entry->ram_entry = *it;
+ log_entry->log_entry_index = curr_log_pos;
+ log_entry->completed = true;
+ m_log_entries.push_back(log_entry);
+ next_log_pos += round_up_to(it->write_bytes, MIN_WRITE_ALLOC_SSD_SIZE);
+ }
+ // along with the write_bytes, add control block size too
+ next_log_pos += MIN_WRITE_ALLOC_SSD_SIZE;
+ if (next_log_pos >= this->m_log_pool_actual_size) {
+ next_log_pos = next_log_pos % this->m_log_pool_actual_size + DATA_RING_BUFFER_OFFSET;
+ }
+ }
+ this->update_sync_points(missing_sync_points, sync_point_entries, later,
+ MIN_WRITE_ALLOC_SSD_SIZE);
+}
+
+template <typename I>
+bool WriteLog<I>::alloc_resources(C_BlockIORequestT *req) {
+ bool alloc_succeeds = true;
+ uint64_t bytes_allocated = 0;
+ uint64_t bytes_cached = 0;
+ uint64_t bytes_dirtied = 0;
+ uint64_t num_lanes = 0;
+ uint64_t num_unpublished_reserves = 0;
+ uint64_t num_log_entries = 0;
+
+ // Setup buffer, and get all the number of required resources
+ req->setup_buffer_resources(&bytes_cached, &bytes_dirtied, &bytes_allocated,
+ &num_lanes, &num_log_entries,
+ &num_unpublished_reserves);
+
+ bytes_allocated += num_log_entries * MIN_WRITE_ALLOC_SSD_SIZE;
+
+ alloc_succeeds = this->check_allocation(req, bytes_cached, bytes_dirtied,
+ bytes_allocated, num_lanes,
+ num_log_entries,
+ num_unpublished_reserves,
+ m_log_pool_ring_buffer_size);
+ req->set_allocated(alloc_succeeds);
+ return alloc_succeeds;
+}
+
+template <typename I>
+bool WriteLog<I>::has_sync_point_logs(GenericLogOperations &ops) {
+ for (auto &op : ops) {
+    if (op->get_log_entry()->is_sync_point()) {
+      return true;
+    }
+ }
+ return false;
+}
+
+template<typename I>
+void WriteLog<I>::enlist_op_appender() {
+ this->m_async_append_ops++;
+ this->m_async_op_tracker.start_op();
+ Context *append_ctx = new LambdaContext([this](int r) {
+ append_scheduled_ops();
+ });
+ this->m_work_queue.queue(append_ctx);
+}
+/*
+ * Takes custody of ops. They'll all get their log entries appended,
+ * and have their on_write_persist contexts completed once they and
+ * all prior log entries are persisted everywhere.
+ */
+template<typename I>
+void WriteLog<I>::schedule_append_ops(GenericLogOperations &ops) {
+ bool need_finisher = false;
+ GenericLogOperationsVector appending;
+
+ std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending));
+ {
+ std::lock_guard locker(m_lock);
+
+ bool persist_on_flush = this->get_persist_on_flush();
+ need_finisher = !this->m_appending &&
+ ((this->m_ops_to_append.size() >= CONTROL_BLOCK_MAX_LOG_ENTRIES) ||
+ !persist_on_flush);
+
+ // Only flush logs into SSD when there is internal/external flush request
+ if (!need_finisher) {
+ need_finisher = has_sync_point_logs(ops);
+ }
+ this->m_ops_to_append.splice(this->m_ops_to_append.end(), ops);
+ }
+
+ if (need_finisher) {
+ this->enlist_op_appender();
+ }
+
+ for (auto &op : appending) {
+ op->appending();
+ }
+}
+
+template <typename I>
+void WriteLog<I>::setup_schedule_append(pwl::GenericLogOperationsVector &ops,
+ bool do_early_flush) {
+ this->schedule_append(ops);
+}
+
+template <typename I>
+void WriteLog<I>::append_scheduled_ops(void) {
+ GenericLogOperations ops;
+ ldout(m_image_ctx.cct, 20) << dendl;
+
+  bool ops_remain = false;  // no-op variable for SSD
+  bool appending = false;   // no-op variable for SSD
+ this->append_scheduled(ops, ops_remain, appending);
+
+ if (ops.size()) {
+ alloc_op_log_entries(ops);
+ append_op_log_entries(ops);
+ } else {
+ this->m_async_append_ops--;
+ this->m_async_op_tracker.finish_op();
+ }
+}
+
+/*
+ * Write and persist the (already allocated) write log entries and
+ * data buffer allocations for a set of ops. The data buffer for each
+ * of these must already have been persisted to its reserved area.
+ */
+template <typename I>
+void WriteLog<I>::append_op_log_entries(GenericLogOperations &ops) {
+ ceph_assert(!ops.empty());
+ ldout(m_image_ctx.cct, 20) << dendl;
+ Context *ctx = new LambdaContext([this, ops](int r) {
+    ceph_assert(r == 0);
+ ldout(m_image_ctx.cct, 20) << "Finished root update " << dendl;
+ this->m_async_update_superblock--;
+ this->m_async_op_tracker.finish_op();
+
+ auto captured_ops = std::move(ops);
+ this->complete_op_log_entries(std::move(captured_ops), r);
+
+ bool need_finisher = false;
+ {
+ std::lock_guard locker1(m_lock);
+ bool persist_on_flush = this->get_persist_on_flush();
+ need_finisher = ((this->m_ops_to_append.size() >= CONTROL_BLOCK_MAX_LOG_ENTRIES) ||
+ !persist_on_flush);
+
+ if (!need_finisher) {
+ need_finisher = has_sync_point_logs(this->m_ops_to_append);
+ }
+ }
+
+ if (need_finisher) {
+ this->enlist_op_appender();
+ }
+ });
+  uint64_t *new_first_free_entry = new uint64_t;
+ Context *append_ctx = new LambdaContext(
+ [this, new_first_free_entry, ops, ctx](int r) {
+ std::shared_ptr<WriteLogPoolRoot> new_root;
+ {
+ ldout(m_image_ctx.cct, 20) << "Finished appending at "
+ << *new_first_free_entry << dendl;
+ utime_t now = ceph_clock_now();
+ for (auto &operation : ops) {
+ operation->log_append_comp_time = now;
+ }
+ this->m_async_append_ops--;
+ this->m_async_op_tracker.finish_op();
+
+ std::lock_guard locker(this->m_log_append_lock);
+ std::lock_guard locker1(m_lock);
+      ceph_assert(this->m_appending);
+ this->m_appending = false;
+ new_root = std::make_shared<WriteLogPoolRoot>(pool_root);
+ pool_root.first_free_entry = *new_first_free_entry;
+ new_root->first_free_entry = *new_first_free_entry;
+ delete new_first_free_entry;
+ schedule_update_root(new_root, ctx);
+ }
+ });
+ // Append logs and update first_free_update
+ uint64_t bytes_allocated_updated;
+ append_ops(ops, append_ctx, new_first_free_entry, bytes_allocated_updated);
+
+ {
+ std::lock_guard locker1(m_lock);
+ m_first_free_entry = *new_first_free_entry;
+ m_bytes_allocated -= bytes_allocated_updated;
+ }
+
+ if (ops.size()) {
+ this->dispatch_deferred_writes();
+ }
+}
+
+template <typename I>
+void WriteLog<I>::release_ram(std::shared_ptr<GenericLogEntry> log_entry) {
+ log_entry->remove_cache_bl();
+}
+
+template <typename I>
+void WriteLog<I>::alloc_op_log_entries(GenericLogOperations &ops) {
+ std::lock_guard locker(m_lock);
+
+ for (auto &operation : ops) {
+ auto &log_entry = operation->get_log_entry();
+ log_entry->ram_entry.entry_valid = 1;
+ m_log_entries.push_back(log_entry);
+ ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl;
+ }
+}
+
+template <typename I>
+Context* WriteLog<I>::construct_flush_entry_ctx(
+ std::shared_ptr<GenericLogEntry> log_entry) {
+ // snapshot so we behave consistently
+ bool invalidating = this->m_invalidating;
+
+ Context *ctx = this->construct_flush_entry(log_entry, invalidating);
+
+ if (invalidating) {
+ return ctx;
+ }
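+  // The contexts below are chained in reverse order of execution: the
+  // returned context reads the data block back from the SSD, and its
+  // completion queues the writeback of the claimed buffer to the image.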
+  if (log_entry->is_write_entry()) {
+    bufferlist *read_bl_ptr = new bufferlist;
+    ctx = new LambdaContext(
+      [this, log_entry, read_bl_ptr, ctx](int r) {
+        bufferlist captured_entry_bl;
+        captured_entry_bl.claim_append(*read_bl_ptr);
+        delete read_bl_ptr;
+        m_image_ctx.op_work_queue->queue(new LambdaContext(
+          [this, log_entry, entry_bl=std::move(captured_entry_bl), ctx](int r) {
+ auto captured_entry_bl = std::move(entry_bl);
+ ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
+ << " " << *log_entry << dendl;
+ log_entry->writeback_bl(this->m_image_writeback, ctx,
+ std::move(captured_entry_bl));
+ }), 0);
+ });
+ ctx = new LambdaContext(
+ [this, log_entry, read_bl_ptr, ctx](int r) {
+ aio_read_data_block(&log_entry->ram_entry, read_bl_ptr, ctx);
+ });
+ return ctx;
+ } else {
+ return new LambdaContext(
+ [this, log_entry, ctx](int r) {
+ m_image_ctx.op_work_queue->queue(new LambdaContext(
+ [this, log_entry, ctx](int r) {
+ ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry
+ << " " << *log_entry << dendl;
+ log_entry->writeback(this->m_image_writeback, ctx);
+ }), 0);
+ });
+ }
+}
+
+template <typename I>
+void WriteLog<I>::process_work() {
+ CephContext *cct = m_image_ctx.cct;
+ int max_iterations = 4;
+ bool wake_up_requested = false;
+ uint64_t high_water_bytes = m_log_pool_ring_buffer_size * RETIRE_HIGH_WATER;
+ uint64_t high_water_entries = this->m_total_log_entries * RETIRE_HIGH_WATER;
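+  // RETIRE_HIGH_WATER is a fractional threshold of the ring capacity;
+  // crossing either mark is meant to trigger entry retirement, which is
+  // still a TODO below.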
+
+ ldout(cct, 20) << dendl;
+
+ do {
+ {
+ std::lock_guard locker(m_lock);
+ this->m_wake_up_requested = false;
+ }
+ if (this->m_alloc_failed_since_retire || (this->m_shutting_down) ||
+ this->m_invalidating || m_bytes_allocated > high_water_bytes ||
+ (m_log_entries.size() > high_water_entries)) {
+ ldout(m_image_ctx.cct, 10) << "alloc_fail=" << this->m_alloc_failed_since_retire
+ << ", allocated > high_water="
+ << (m_bytes_allocated > high_water_bytes)
+ << ", allocated_entries > high_water="
+ << (m_log_entries.size() > high_water_entries)
+ << dendl;
+ //TODO: Implement and uncomment this in next PR
+ /*retire_entries((this->m_shutting_down || this->m_invalidating ||
+ (m_bytes_allocated > aggressive_high_water_bytes) ||
+ (m_log_entries.size() > aggressive_high_water_entries))
+ ? MAX_ALLOC_PER_TRANSACTION : MAX_FREE_PER_TRANSACTION);*/
+ }
+ this->dispatch_deferred_writes();
+ this->process_writeback_dirty_entries();
+ {
+ std::lock_guard locker(m_lock);
+ wake_up_requested = this->m_wake_up_requested;
+ }
+ } while (wake_up_requested && --max_iterations > 0);
+
+ {
+ std::lock_guard locker(m_lock);
+ this->m_wake_up_scheduled = false;
+ // Reschedule if it's still requested
+ if (this->m_wake_up_requested) {
+ this->wake_up();
+ }
+ }
+}
+
+template <typename I>
+void WriteLog<I>::append_ops(GenericLogOperations &ops, Context *ctx,
+ uint64_t* new_first_free_entry,
+ uint64_t &bytes_allocated) {
+ GenericLogEntriesVector log_entries;
+ CephContext *cct = m_image_ctx.cct;
+ uint64_t span_payload_len = 0;
+ bytes_allocated = 0;
+ ldout(cct, 20) << "Appending " << ops.size() << " log entries." << dendl;
+
+ AioTransContext* aio = new AioTransContext(cct, ctx);
+
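+  // Entries are grouped into spans: each span shares one control block and
+  // is cut off at CONTROL_BLOCK_MAX_LOG_ENTRIES entries or SPAN_MAX_DATA_LEN
+  // bytes of payload, whichever is reached first.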
+ utime_t now = ceph_clock_now();
+ for (auto &operation : ops) {
+ operation->log_append_time = now;
+ auto log_entry = operation->get_log_entry();
+
+ if (log_entries.size() == CONTROL_BLOCK_MAX_LOG_ENTRIES ||
+ span_payload_len >= SPAN_MAX_DATA_LEN) {
+ if (log_entries.size() > 1) {
+ bytes_allocated += (log_entries.size() - 1) * MIN_WRITE_ALLOC_SSD_SIZE;
+ }
+ write_log_entries(log_entries, aio);
+ log_entries.clear();
+ span_payload_len = 0;
+ }
+ log_entries.push_back(log_entry);
+ span_payload_len += log_entry->write_bytes();
+ }
+  if (!log_entries.empty()) {
+ if (log_entries.size() > 1) {
+ bytes_allocated += (log_entries.size() - 1) * MIN_WRITE_ALLOC_SSD_SIZE;
+ }
+ write_log_entries(log_entries, aio);
+ }
+  // Publish the new first_free_entry before submitting, so the completion
+  // callback reads the value set by write_log_entries().
+  *new_first_free_entry = pool_root.first_free_entry;
+  bdev->aio_submit(&aio->ioc);
+}
+
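+/*
+ * On-disk span layout, as implemented below: one MIN_WRITE_ALLOC_SSD_SIZE
+ * control block holding the encoded WriteLogCacheEntry vector, immediately
+ * followed by each write's payload padded to the allocation unit. A write
+ * that would run past pool_size is split and wraps to
+ * DATA_RING_BUFFER_OFFSET.
+ */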
+template <typename I>
+void WriteLog<I>::write_log_entries(GenericLogEntriesVector log_entries,
+ AioTransContext *aio) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(m_image_ctx.cct, 20) << dendl;
+ bufferlist data_bl;
+ // The first block is for log entries
+ uint64_t data_pos = pool_root.first_free_entry + MIN_WRITE_ALLOC_SSD_SIZE;
+ ldout(m_image_ctx.cct, 20) << "data_pos: " << data_pos << dendl;
+  if (data_pos == pool_root.pool_size) {
+ data_pos = data_pos % pool_root.pool_size + DATA_RING_BUFFER_OFFSET;
+ }
+
+ std::vector<WriteLogCacheEntry> persist_log_entries;
+ for (auto &log_entry : log_entries) {
+ log_entry->log_entry_index = pool_root.first_free_entry;
+ // Append data buffer for write operations
+ persist_log_entries.push_back(log_entry->ram_entry);
+ if (log_entry->is_write_entry()) {
+ auto write_entry = static_pointer_cast<WriteLogEntry>(log_entry);
+ auto cache_bl = write_entry->get_cache_bl();
+ auto align_size = write_entry->get_aligned_data_size();
+ data_bl.append(cache_bl);
+ data_bl.append_zero(align_size - cache_bl.length());
+
+ write_entry->ram_entry.write_data_pos = data_pos;
+ data_pos += align_size;
+ if (data_pos >= pool_root.pool_size) {
+ data_pos = data_pos % pool_root.pool_size + DATA_RING_BUFFER_OFFSET;
+ }
+ }
+ }
+
+  // Encode the control block, pad it to one allocation unit, then append
+  // the data payload and issue the (possibly split) aio write.
+ bufferlist bl;
+ encode(persist_log_entries, bl);
+ ceph_assert(bl.length() <= MIN_WRITE_ALLOC_SSD_SIZE);
+ bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - bl.length());
+ bl.append(data_bl);
+ ceph_assert(bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+ if (pool_root.first_free_entry + bl.length() > pool_root.pool_size) {
+ //exceeds border, need to split
+ uint64_t size = bl.length();
+ auto end = pool_root.pool_size - pool_root.first_free_entry;
+ bufferlist bl1;
+ bl.splice(0, end, &bl1);
+ ceph_assert(bl.length() == (size - bl1.length()));
+ ldout(cct, 20) << "The write on " << pool_root.first_free_entry
+ << " with length " << size << " is split into two: "
+ << "pos=" << pool_root.first_free_entry << ", "
+ << "length=" << bl1.length() << "; "
+ << "pos=" << DATA_RING_BUFFER_OFFSET << ", "
+ << "length=" << bl.length() << dendl;
+
+ bdev->aio_write(pool_root.first_free_entry, bl1, &aio->ioc, false,
+ WRITE_LIFE_NOT_SET);
+ bdev->aio_write(DATA_RING_BUFFER_OFFSET, bl, &aio->ioc, false,
+ WRITE_LIFE_NOT_SET);
+ } else {
+ ldout(cct, 20) << "first_free_entry: " << pool_root.first_free_entry
+ << " bl length: " << bl.length() << dendl;
+ bdev->aio_write(pool_root.first_free_entry, bl, &aio->ioc, false,
+ WRITE_LIFE_NOT_SET);
+ ldout(cct, 20) << "finished aio_write log entries" << dendl;
+ }
+ // New first free entry
+ pool_root.first_free_entry = data_pos;
+}
+
+template <typename I>
+void WriteLog<I>::schedule_update_root(
+ std::shared_ptr<WriteLogPoolRoot> root, Context *ctx) {
+ bool need_finisher;
+ {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ need_finisher = m_poolroot_to_update.empty() && !m_updating_pool_root;
+ std::shared_ptr<WriteLogPoolRootUpdate> entry =
+ std::make_shared<WriteLogPoolRootUpdate>(root, ctx);
+ this->m_async_update_superblock++;
+ this->m_async_op_tracker.start_op();
+ m_poolroot_to_update.emplace_back(entry);
+ }
+ if (need_finisher) {
+ enlist_op_update_root();
+ }
+}
+
+template <typename I>
+void WriteLog<I>::enlist_op_update_root() {
+ Context *append_ctx = new LambdaContext([this](int r) {
+ update_root_scheduled_ops();
+ });
+ this->m_work_queue.queue(append_ctx);
+}
+
+template <typename I>
+void WriteLog<I>::update_root_scheduled_ops() {
+ ldout(m_image_ctx.cct, 20) << dendl;
+
+ std::shared_ptr<WriteLogPoolRoot> root;
+ WriteLogPoolRootUpdateList root_updates;
+ Context *ctx = nullptr;
+ {
+ std::lock_guard locker(m_lock);
+ if (m_updating_pool_root) {
+ /* Another thread is appending */
+ ldout(m_image_ctx.cct, 15) << "Another thread is updating pool root"
+ << dendl;
+ return;
+ }
+ if (m_poolroot_to_update.size()) {
+ m_updating_pool_root = true;
+ root_updates.swap(m_poolroot_to_update);
+ }
+ }
+ ceph_assert(!root_updates.empty());
+ ldout(m_image_ctx.cct, 15) << "Update root number: " << root_updates.size()
+ << dendl;
+ // We just update the last one, and call all the completions.
+ auto entry = root_updates.back();
+ root = entry->root;
+
+ ctx = new LambdaContext([this, updates = std::move(root_updates)](int r) {
+ ldout(m_image_ctx.cct, 15) << "Start to callback." << dendl;
+ for (auto it = updates.begin(); it != updates.end(); it++) {
+ Context *it_ctx = (*it)->ctx;
+ it_ctx->complete(r);
+ }
+ });
+ Context *append_ctx = new LambdaContext([this, ctx](int r) {
+ ldout(m_image_ctx.cct, 15) << "Finish the update of pool root." << dendl;
+    bool need_finisher = false;
+    ceph_assert(r == 0);
+ {
+ std::lock_guard locker(m_lock);
+ m_updating_pool_root = false;
+ need_finisher = !m_poolroot_to_update.empty();
+ }
+ if (need_finisher) {
+ enlist_op_update_root();
+ }
+ ctx->complete(r);
+ });
+ AioTransContext* aio = new AioTransContext(m_image_ctx.cct, append_ctx);
+ update_pool_root(root, aio);
+}
+
+template <typename I>
+void WriteLog<I>::update_pool_root(std::shared_ptr<WriteLogPoolRoot> root,
+ AioTransContext *aio) {
+ bufferlist bl;
+ SuperBlock superblock;
+ superblock.root = *root;
+ encode(superblock, bl);
+ bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - bl.length());
+ ceph_assert(bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+ bdev->aio_write(0, bl, &aio->ioc, false, WRITE_LIFE_NOT_SET);
+ bdev->aio_submit(&aio->ioc);
+}
+
+template <typename I>
+int WriteLog<I>::update_pool_root_sync(
+ std::shared_ptr<WriteLogPoolRoot> root) {
+ bufferlist bl;
+ SuperBlock superblock;
+ superblock.root = *root;
+ encode(superblock, bl);
+ bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - bl.length());
+ ceph_assert(bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+ return bdev->write(0, bl, false);
+}
+
+template <typename I>
+void WriteLog<I>::pre_io_check(WriteLogCacheEntry *log_entry,
+ uint64_t &length) {
+  ceph_assert(log_entry->is_write() || log_entry->is_writesame());
+ ceph_assert(log_entry->write_data_pos <= pool_size);
+
+ length = log_entry->is_write() ? log_entry->write_bytes :
+ log_entry->ws_datalen;
+ length = round_up_to(length, MIN_WRITE_ALLOC_SSD_SIZE);
+ ceph_assert(length != 0 && log_entry->write_data_pos + length <= pool_size);
+}
+
+template <typename I>
+void WriteLog<I>::aio_read_data_block(
+ WriteLogCacheEntry *log_entry, bufferlist *bl, Context *ctx) {
+ std::vector<WriteLogCacheEntry*> log_entries {log_entry};
+ std::vector<bufferlist *> bls {bl};
+ aio_read_data_block(log_entries, bls, ctx);
+}
+
+template <typename I>
+void WriteLog<I>::aio_read_data_block(
+ std::vector<WriteLogCacheEntry*> &log_entries,
+ std::vector<bufferlist *> &bls, Context *ctx) {
+ ceph_assert(log_entries.size() == bls.size());
+
+  // Reads below are rounded up to MIN_WRITE_ALLOC_SSD_SIZE; once they
+  // complete, trim each buffer back to the entry's logical length.
+ Context *read_ctx = new LambdaContext(
+ [this, log_entries, bls, ctx](int r) {
+ for (unsigned int i = 0; i < log_entries.size(); i++) {
+ bufferlist valid_data_bl;
+ auto length = log_entries[i]->is_write() ? log_entries[i]->write_bytes :
+ log_entries[i]->ws_datalen;
+ valid_data_bl.substr_of(*bls[i], 0, length);
+ bls[i]->clear();
+ bls[i]->append(valid_data_bl);
+ }
+ ctx->complete(r);
+ });
+
+ CephContext *cct = m_image_ctx.cct;
+ AioTransContext *aio = new AioTransContext(cct, read_ctx);
+ for (unsigned int i = 0; i < log_entries.size(); i++) {
+ auto log_entry = log_entries[i];
+
+ uint64_t length;
+ pre_io_check(log_entry, length);
+ ldout(cct, 20) << "Read at " << log_entry->write_data_pos
+ << ", length " << length << dendl;
+
+ bdev->aio_read(log_entry->write_data_pos, length, bls[i], &aio->ioc);
+ }
+ bdev->aio_submit(&aio->ioc);
+}
+
+template <typename I>
+void WriteLog<I>::complete_user_request(Context *&user_req, int r) {
+ m_image_ctx.op_work_queue->queue(user_req, r);
+}
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ssd::WriteLog<librbd::ImageCtx>;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
+#define CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
+
+#include "blk/BlockDevice.h"
+#include "common/AsyncOpTracker.h"
+#include "common/Checksummer.h"
+#include "common/environment.h"
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "librbd/BlockGuard.h"
+#include "librbd/Utils.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/cache/Types.h"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+#include "librbd/cache/pwl/LogMap.h"
+#include "librbd/cache/pwl/LogOperation.h"
+#include "librbd/cache/pwl/Request.h"
+#include "librbd/cache/pwl/ssd/Builder.h"
+#include "librbd/cache/pwl/ssd/Types.h"
+#include <functional>
+#include <list>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+template <typename ImageCtxT>
+class WriteLog : public AbstractWriteLog<ImageCtxT> {
+public:
+ WriteLog(ImageCtxT &image_ctx,
+ librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
+ cache::ImageWritebackInterface& image_writeback,
+ plugin::Api<ImageCtxT>& plugin_api);
+ ~WriteLog();
+ WriteLog(const WriteLog&) = delete;
+ WriteLog &operator=(const WriteLog&) = delete;
+
+ using This = AbstractWriteLog<ImageCtxT>;
+ using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
+ using C_WriteRequestT = pwl::C_WriteRequest<This>;
+ using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;
+
+ bool alloc_resources(C_BlockIORequestT *req) override;
+ void setup_schedule_append(
+ pwl::GenericLogOperationsVector &ops, bool do_early_flush) override;
+ void complete_user_request(Context *&user_req, int r) override;
+
+protected:
+ using AbstractWriteLog<ImageCtxT>::m_lock;
+ using AbstractWriteLog<ImageCtxT>::m_log_entries;
+ using AbstractWriteLog<ImageCtxT>::m_image_ctx;
+ using AbstractWriteLog<ImageCtxT>::m_cache_state;
+ using AbstractWriteLog<ImageCtxT>::m_first_free_entry;
+ using AbstractWriteLog<ImageCtxT>::m_first_valid_entry;
+ using AbstractWriteLog<ImageCtxT>::m_bytes_allocated;
+
+ void initialize_pool(Context *on_finish,
+ pwl::DeferredContexts &later) override;
+ void process_work() override;
+ void append_scheduled_ops(void) override;
+ void schedule_append_ops(pwl::GenericLogOperations &ops) override;
+ void remove_pool_file() override;
+ void release_ram(std::shared_ptr<GenericLogEntry> log_entry) override;
+
+private:
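+  // Pairs a completion Context with the ::IOContext handed to the
+  // BlockDevice; aio_cache_cb() below bridges the device's C-style callback
+  // back to aio_finish(), which completes the context with the aio return
+  // value and frees the wrapper.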
+ class AioTransContext {
+ public:
+ Context *on_finish;
+ ::IOContext ioc;
+ explicit AioTransContext(CephContext* cct, Context *cb)
+ : on_finish(cb), ioc(cct, this) {}
+
+    ~AioTransContext() {}
+
+ void aio_finish() {
+ on_finish->complete(ioc.get_return_value());
+ delete this;
+ }
+ }; //class AioTransContext
+
+ struct WriteLogPoolRootUpdate {
+ std::shared_ptr<pwl::WriteLogPoolRoot> root;
+ Context *ctx;
+ WriteLogPoolRootUpdate(std::shared_ptr<pwl::WriteLogPoolRoot> r,
+ Context* c)
+ : root(r), ctx(c) {}
+ };
+
+ using WriteLogPoolRootUpdateList = std::list<std::shared_ptr<WriteLogPoolRootUpdate>>;
+ WriteLogPoolRootUpdateList m_poolroot_to_update; /* pool root list to update to SSD */
+ bool m_updating_pool_root = false;
+
+  uint64_t m_log_pool_ring_buffer_size = 0; /* Size of ring buffer */
+ std::atomic<int> m_async_update_superblock = {0};
+ BlockDevice *bdev = nullptr;
+  uint64_t pool_size = 0;
+ pwl::WriteLogPoolRoot pool_root;
+ Builder<This> *m_builderobj;
+
+ Builder<This>* create_builder();
+ void load_existing_entries(pwl::DeferredContexts &later);
+ void enlist_op_appender();
+ bool has_sync_point_logs(GenericLogOperations &ops);
+ void append_op_log_entries(GenericLogOperations &ops);
+ void alloc_op_log_entries(GenericLogOperations &ops);
+ Context* construct_flush_entry_ctx(
+ std::shared_ptr<GenericLogEntry> log_entry);
+ void append_ops(GenericLogOperations &ops, Context *ctx,
+ uint64_t* new_first_free_entry,
+ uint64_t &bytes_allocated);
+ void write_log_entries(GenericLogEntriesVector log_entries,
+ AioTransContext *aio);
+ void schedule_update_root(std::shared_ptr<WriteLogPoolRoot> root,
+ Context *ctx);
+ void enlist_op_update_root();
+ void update_root_scheduled_ops();
+ int update_pool_root_sync(std::shared_ptr<pwl::WriteLogPoolRoot> root);
+ void update_pool_root(std::shared_ptr<WriteLogPoolRoot> root,
+ AioTransContext *aio);
+ void pre_io_check(WriteLogCacheEntry *log_entry, uint64_t &length);
+ void aio_read_data_block(WriteLogCacheEntry *log_entry, bufferlist *bl,
+ Context *ctx);
+ void aio_read_data_block(std::vector<WriteLogCacheEntry*> &log_entries,
+ std::vector<bufferlist *> &bls, Context *ctx);
+ static void aio_cache_cb(void *priv, void *priv2) {
+ AioTransContext *c = static_cast<AioTransContext*>(priv2);
+ c->aio_finish();
+ }
+};//class WriteLog
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::ssd::WriteLog<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
watcher/test_mock_RewatchRequest.cc
)
-if(WITH_RBD_RWL)
+if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE)
set(unittest_librbd_srcs
${unittest_librbd_srcs}
- cache/pwl/test_mock_ReplicatedWriteLog.cc
cache/pwl/test_WriteLogMap.cc)
-endif(WITH_RBD_RWL)
+ if(WITH_RBD_RWL)
+ set(unittest_librbd_srcs
+ ${unittest_librbd_srcs}
+ cache/pwl/test_mock_ReplicatedWriteLog.cc)
+ endif()
+ if(WITH_RBD_SSD_CACHE)
+ set(unittest_librbd_srcs
+ ${unittest_librbd_srcs}
+ cache/pwl/test_mock_SSDWriteLog.cc)
+ endif()
+endif()
if(LINUX AND HAVE_LIBCRYPTSETUP)
list(APPEND unittest_librbd_srcs
} // namespace librbd
#include "librbd/cache/pwl/AbstractWriteLog.cc"
-#include "librbd/cache/pwl/ReplicatedWriteLog.cc"
-template class librbd::cache::pwl::ReplicatedWriteLog<librbd::MockImageCtx>;
+#include "librbd/cache/pwl/rwl/WriteLog.cc"
+template class librbd::cache::pwl::rwl::WriteLog<librbd::MockImageCtx>;
// template definitions
#include "librbd/cache/ImageWriteback.cc"
#include "librbd/cache/pwl/ImageCacheState.cc"
#include "librbd/cache/pwl/Request.cc"
+#include "librbd/cache/pwl/rwl/Request.cc"
#include "librbd/plugin/Api.cc"
namespace librbd {
typedef io::Extents Extents;
struct TestMockCacheReplicatedWriteLog : public TestMockFixture {
- typedef librbd::cache::pwl::ReplicatedWriteLog<librbd::MockImageCtx> MockReplicatedWriteLog;
+ typedef librbd::cache::pwl::rwl::WriteLog<librbd::MockImageCtx> MockReplicatedWriteLog;
typedef librbd::cache::pwl::ImageCacheState<librbd::MockImageCtx> MockImageCacheStateRWL;
typedef librbd::cache::ImageWriteback<librbd::MockImageCtx> MockImageWriteback;
typedef librbd::plugin::Api<librbd::MockImageCtx> MockApi;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "common/hostname.h"
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "include/rbd/librbd.hpp"
+#include "librbd/cache/pwl/AbstractWriteLog.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/Types.h"
+#include "librbd/cache/ImageWriteback.h"
+#include "librbd/plugin/Api.h"
+
+namespace librbd {
+namespace {
+
+struct MockContextSSD : public C_SaferCond {
+ MOCK_METHOD1(complete, void(int));
+ MOCK_METHOD1(finish, void(int));
+
+ void do_complete(int r) {
+ C_SaferCond::complete(r);
+ }
+};
+
+} // anonymous namespace
+
+namespace util {
+
+inline ImageCtx *get_image_ctx(MockImageCtx *image_ctx) {
+ return image_ctx->image_ctx;
+}
+
+} // namespace util
+} // namespace librbd
+
+#include "librbd/cache/pwl/AbstractWriteLog.cc"
+#include "librbd/cache/pwl/ssd/WriteLog.cc"
+template class librbd::cache::pwl::ssd::WriteLog<librbd::MockImageCtx>;
+
+// template definitions
+#include "librbd/cache/ImageWriteback.cc"
+#include "librbd/cache/pwl/ImageCacheState.cc"
+#include "librbd/cache/pwl/Request.cc"
+#include "librbd/plugin/Api.cc"
+#include "librbd/cache/pwl/ssd/Request.cc"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+using ::testing::_;
+using ::testing::DoDefault;
+using ::testing::InSequence;
+using ::testing::Invoke;
+
+typedef io::Extent Extent;
+typedef io::Extents Extents;
+
+struct TestMockCacheSSDWriteLog : public TestMockFixture {
+ typedef librbd::cache::pwl::ssd::WriteLog<librbd::MockImageCtx> MockSSDWriteLog;
+ typedef librbd::cache::pwl::ImageCacheState<librbd::MockImageCtx> MockImageCacheStateSSD;
+ typedef librbd::cache::ImageWriteback<librbd::MockImageCtx> MockImageWriteback;
+ typedef librbd::plugin::Api<librbd::MockImageCtx> MockApi;
+
+ MockImageCacheStateSSD *get_cache_state(
+ MockImageCtx& mock_image_ctx, MockApi& mock_api) {
+    MockImageCacheStateSSD *ssd_state = new MockImageCacheStateSSD(&mock_image_ctx, mock_api);
+    return ssd_state;
+ }
+
+ void validate_cache_state(librbd::ImageCtx *image_ctx,
+ MockImageCacheStateSSD &state,
+ bool present, bool empty, bool clean,
+ string host, string path,
+ uint64_t size) {
+ ConfigProxy &config = image_ctx->config;
+ ASSERT_EQ(present, state.present);
+ ASSERT_EQ(empty, state.empty);
+ ASSERT_EQ(clean, state.clean);
+
+ ASSERT_EQ(host, state.host);
+ ASSERT_EQ(path, state.path);
+ ASSERT_EQ(size, state.size);
+ ASSERT_EQ(config.get_val<bool>("rbd_rwl_log_periodic_stats"),
+ state.log_periodic_stats);
+ }
+
+ void expect_op_work_queue(MockImageCtx& mock_image_ctx) {
+ EXPECT_CALL(*mock_image_ctx.op_work_queue, queue(_, _))
+ .WillRepeatedly(Invoke([](Context* ctx, int r) {
+ ctx->complete(r);
+ }));
+ }
+
+ void expect_context_complete(MockContextSSD& mock_context, int r) {
+ EXPECT_CALL(mock_context, complete(r))
+ .WillRepeatedly(Invoke([&mock_context](int r) {
+ mock_context.do_complete(r);
+ }));
+ }
+
+ void expect_metadata_set(MockImageCtx& mock_image_ctx) {
+ EXPECT_CALL(*mock_image_ctx.operations, execute_metadata_set(_, _, _))
+ .WillRepeatedly(Invoke([](std::string key, std::string val, Context* ctx) {
+ ctx->complete(0);
+ }));
+ }
+
+ void expect_metadata_remove(MockImageCtx& mock_image_ctx) {
+ EXPECT_CALL(*mock_image_ctx.operations, execute_metadata_remove(_, _))
+ .WillRepeatedly(Invoke([](std::string key, Context* ctx) {
+ ctx->complete(0);
+ }));
+ }
+};
+
+TEST_F(TestMockCacheSSDWriteLog, init_state_write) {
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockImageCtx mock_image_ctx(*ictx);
+ MockApi mock_api;
+ MockImageCacheStateSSD image_cache_state(&mock_image_ctx, mock_api);
+
+ validate_cache_state(ictx, image_cache_state, false, true, true, "", "", 0);
+
+ image_cache_state.empty = false;
+ image_cache_state.clean = false;
+ MockContextSSD finish_ctx;
+ expect_metadata_set(mock_image_ctx);
+ expect_context_complete(finish_ctx, 0);
+ image_cache_state.write_image_cache_state(&finish_ctx);
+ ASSERT_EQ(0, finish_ctx.wait());
+}
+
+static void get_jf(const string& s, JSONFormattable *f)
+{
+ JSONParser p;
+ bool result = p.parse(s.c_str(), s.size());
+ if (!result) {
+ cout << "Failed to parse: '" << s << "'" << std::endl;
+ }
+ ASSERT_EQ(true, result);
+ try {
+ decode_json_obj(*f, &p);
+ } catch (JSONDecoder::err& e) {
+ ASSERT_TRUE(0 == "Failed to decode JSON object");
+ }
+}
+
+TEST_F(TestMockCacheSSDWriteLog, init_state_json_write) {
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockImageCtx mock_image_ctx(*ictx);
+
+ JSONFormattable f;
+ string strf = "{ \"present\": \"1\", \"empty\": \"0\", \"clean\": \"0\", \
+ \"pwl_host\": \"testhost\", \
+ \"pwl_path\": \"/tmp\", \
+ \"pwl_size\": \"1024\" }";
+ get_jf(strf, &f);
+ MockApi mock_api;
+ MockImageCacheStateSSD image_cache_state(&mock_image_ctx, f, mock_api);
+
+ validate_cache_state(ictx, image_cache_state, true, false, false,
+ "testhost", "/tmp", 1024);
+
+ MockContextSSD finish_ctx;
+ expect_metadata_remove(mock_image_ctx);
+ expect_context_complete(finish_ctx, 0);
+ image_cache_state.clear_image_cache_state(&finish_ctx);
+ ASSERT_EQ(0, finish_ctx.wait());
+}
+
+TEST_F(TestMockCacheSSDWriteLog, init_shutdown) {
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockImageCtx mock_image_ctx(*ictx);
+ MockImageWriteback mock_image_writeback(mock_image_ctx);
+ MockApi mock_api;
+  MockSSDWriteLog ssd(
+ mock_image_ctx, get_cache_state(mock_image_ctx, mock_api),
+ mock_image_writeback, mock_api);
+ MockContextSSD finish_ctx1;
+ expect_op_work_queue(mock_image_ctx);
+ expect_metadata_set(mock_image_ctx);
+
+ expect_context_complete(finish_ctx1, 0);
+  ssd.init(&finish_ctx1);
+ ASSERT_EQ(0, finish_ctx1.wait());
+
+ MockContextSSD finish_ctx2;
+ expect_context_complete(finish_ctx2, 0);
+  ssd.shut_down(&finish_ctx2);
+ ASSERT_EQ(0, finish_ctx2.wait());
+}
+
+TEST_F(TestMockCacheSSDWriteLog, write) {
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockImageCtx mock_image_ctx(*ictx);
+ MockImageWriteback mock_image_writeback(mock_image_ctx);
+ MockApi mock_api;
+  MockSSDWriteLog ssd(
+ mock_image_ctx, get_cache_state(mock_image_ctx, mock_api),
+ mock_image_writeback, mock_api);
+
+ MockContextSSD finish_ctx1;
+ expect_op_work_queue(mock_image_ctx);
+ expect_metadata_set(mock_image_ctx);
+ expect_context_complete(finish_ctx1, 0);
+  ssd.init(&finish_ctx1);
+ ASSERT_EQ(0, finish_ctx1.wait());
+
+ MockContextSSD finish_ctx2;
+ expect_context_complete(finish_ctx2, 0);
+ Extents image_extents{{0, 4096}};
+ bufferlist bl;
+ bl.append(std::string(4096, '1'));
+ int fadvise_flags = 0;
+  ssd.write(std::move(image_extents), std::move(bl), fadvise_flags, &finish_ctx2);
+ ASSERT_EQ(0, finish_ctx2.wait());
+
+ MockContextSSD finish_ctx3;
+ expect_context_complete(finish_ctx3, 0);
+  ssd.shut_down(&finish_ctx3);
+ ASSERT_EQ(0, finish_ctx3.wait());
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
void TestFixture::SetUp() {
ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), m_ioctx));
m_cct = reinterpret_cast<CephContext*>(m_ioctx.cct());
+ librados::Rados rados(m_ioctx);
+ rados.conf_set("rbd_rwl_path", ".");
m_image_name = get_temp_image_name();
m_image_size = 2 << 20;
iter != m_ictxs.end(); ++iter) {
(*iter)->state->close();
}
-
m_ioctx.close();
}
TYPED_TEST(DiffIterateTest, DiffIterateStress)
{
- REQUIRE(!is_rbd_rwl_enabled((CephContext *)this->_rados.cct()));
+ REQUIRE(!is_rbd_pwl_enabled((CephContext *)this->_rados.cct()));
librados::IoCtx ioctx;
ASSERT_EQ(0, this->_rados.ioctx_create(this->m_pool_name.c_str(), ioctx));
TEST_F(TestLibRBD, BreakLock)
{
REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
- REQUIRE(!is_rbd_rwl_enabled((CephContext *)_rados.cct()));
+ REQUIRE(!is_rbd_pwl_enabled((CephContext *)_rados.cct()));
static char buf[10];
bool init_shutdown) {
if (mock_image_ctx.clone_copy_on_read ||
(mock_image_ctx.features & RBD_FEATURE_JOURNALING) != 0 ||
- is_rbd_rwl_enabled(mock_image_ctx.cct)) {
+ is_rbd_pwl_enabled(mock_image_ctx.cct)) {
expect_set_require_lock(mock_image_dispatch, init_shutdown,
io::DIRECTION_BOTH);
} else {
return fsid == "00000000-1111-2222-3333-444444444444";
}
-bool is_rbd_rwl_enabled(ceph::common::CephContext *cct) {
-#if defined(WITH_RBD_RWL)
- return cct->_conf.get_val<bool>("rbd_rwl_enabled");
+bool is_rbd_pwl_enabled(ceph::common::CephContext *cct) {
+#if defined(WITH_RBD_RWL) || defined(WITH_RBD_SSD_CACHE)
+ auto value = cct->_conf.get_val<std::string>("rbd_persistent_cache_mode");
+ return value == "disabled" ? false : true;
#else
return false;
#endif
bool is_librados_test_stub(librados::Rados &rados);
-bool is_rbd_rwl_enabled(ceph::common::CephContext *ctx);
+bool is_rbd_pwl_enabled(ceph::common::CephContext *ctx);
#define REQUIRE(x) { \
if (!(x)) { \
#if defined(WITH_RBD) && defined(WITH_RBD_SSD_CACHE)
#include "librbd/cache/pwl/Types.h"
-#include "librbd/cache/pwl/SSDTypes.h"
-TYPE(librbd::cache::pwl::WriteLogPmemEntry)
+#include "librbd/cache/pwl/ssd/Types.h"
+TYPE(librbd::cache::pwl::WriteLogCacheEntry)
TYPE(librbd::cache::pwl::WriteLogPoolRoot)
-TYPE(librbd::cache::pwl::SuperBlock)
+TYPE(librbd::cache::pwl::ssd::SuperBlock)
#endif
#ifdef WITH_RBD