From 01400fedc35b73b63c5535e5f86eef7453f9fbe0 Mon Sep 17 00:00:00 2001 From: Mahati Chamarthy Date: Thu, 8 Oct 2020 16:25:00 +0530 Subject: [PATCH] librbd/cache: init functionality for SSD Cache Adds build option and implements init functionality for SSD cache Signed-off-by: Lisa Li Signed-off-by: Mahati Chamarthy Signed-off-by: Changcheng Liu --- CMakeLists.txt | 3 + src/blk/CMakeLists.txt | 2 +- src/include/config-h.in.cmake | 3 + src/librbd/CMakeLists.txt | 30 +++- src/librbd/cache/Types.h | 1 + src/librbd/cache/pwl/AbstractWriteLog.cc | 6 +- src/librbd/cache/pwl/AbstractWriteLog.h | 5 +- src/librbd/cache/pwl/InitRequest.cc | 24 ++- src/librbd/cache/pwl/LogEntry.cc | 11 +- src/librbd/cache/pwl/LogEntry.h | 5 +- src/librbd/cache/pwl/LogOperation.cc | 7 +- src/librbd/cache/pwl/LogOperation.h | 10 +- src/librbd/cache/pwl/ReplicatedWriteLog.cc | 10 +- src/librbd/cache/pwl/ReplicatedWriteLog.h | 3 + src/librbd/cache/pwl/Request.cc | 17 +- src/librbd/cache/pwl/Request.h | 6 + src/librbd/cache/pwl/SSDTypes.h | 42 +++++ src/librbd/cache/pwl/SSDWriteLog.cc | 158 ++++++++++++++++++ src/librbd/cache/pwl/SSDWriteLog.h | 102 +++++++++++ src/librbd/cache/pwl/Types.cc | 62 ++++++- src/librbd/cache/pwl/Types.h | 100 +++++++++-- src/test/librbd/CMakeLists.txt | 2 +- .../{ => pwl}/test_mock_ReplicatedWriteLog.cc | 1 - src/tools/ceph-dencoder/rbd_types.h | 8 + 24 files changed, 566 insertions(+), 52 deletions(-) create mode 100644 src/librbd/cache/pwl/SSDTypes.h create mode 100644 src/librbd/cache/pwl/SSDWriteLog.cc create mode 100644 src/librbd/cache/pwl/SSDWriteLog.h rename src/test/librbd/cache/{ => pwl}/test_mock_ReplicatedWriteLog.cc (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index c8065e54f227..b90b6c633d20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -210,6 +210,9 @@ CMAKE_DEPENDENT_OPTION(WITH_BLUESTORE_PMEM "Enable PMDK libraries" OFF CMAKE_DEPENDENT_OPTION(WITH_RBD_RWL "Enable librbd persistent write back cache" OFF "WITH_RBD" OFF) 
+CMAKE_DEPENDENT_OPTION(WITH_RBD_SSD_CACHE "Enable librbd persistent write back cache for SSDs" OFF + "WITH_RBD" OFF) + CMAKE_DEPENDENT_OPTION(WITH_SYSTEM_PMDK "Require and build with system PMDK" OFF "WITH_RBD_RWL OR WITH_BLUESTORE_PMEM" OFF) diff --git a/src/blk/CMakeLists.txt b/src/blk/CMakeLists.txt index 2f0cd695bd07..3ef16b1c895d 100644 --- a/src/blk/CMakeLists.txt +++ b/src/blk/CMakeLists.txt @@ -1,4 +1,4 @@ -if(WITH_BLUESTORE OR WITH_RBD_RWL) +if(WITH_BLUESTORE OR WITH_RBD_SSD_CACHE) list(APPEND libblk_srcs BlockDevice.cc) endif() diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake index b5a81eb772d0..cdf9cc909ad5 100644 --- a/src/include/config-h.in.cmake +++ b/src/include/config-h.in.cmake @@ -354,6 +354,9 @@ /* Define if RWL is enabled */ #cmakedefine WITH_RBD_RWL +/* Define if PWL-SSD is enabled */ +#cmakedefine WITH_RBD_SSD_CACHE + /* Shared library extension, such as .so, .dll or .dylib */ #cmakedefine CMAKE_SHARED_LIBRARY_SUFFIX "@CMAKE_SHARED_LIBRARY_SUFFIX@" diff --git a/src/librbd/CMakeLists.txt b/src/librbd/CMakeLists.txt index c0aae05bd11e..2f6931dbed30 100644 --- a/src/librbd/CMakeLists.txt +++ b/src/librbd/CMakeLists.txt @@ -1,10 +1,17 @@ -add_library(rbd_types STATIC +set(librbd_types_srcs journal/Types.cc mirroring_watcher/Types.cc trash_watcher/Types.cc watcher/Types.cc WatchNotifyTypes.cc) +if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE) + list(APPEND librbd_types_srcs cache/pwl/Types.cc) +endif() + +add_library(rbd_types STATIC + ${librbd_types_srcs}) + set(librbd_internal_srcs AsioEngine.cc AsyncObjectThrottle.cc @@ -186,7 +193,7 @@ if(WITH_EVENTTRACE) list(APPEND librbd_internal_srcs ../common/EventTrace.cc) endif() -if(WITH_RBD_RWL) +if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE) set(librbd_internal_srcs ${librbd_internal_srcs} cache/pwl/ImageCacheState.cc @@ -196,10 +203,18 @@ if(WITH_RBD_RWL) cache/pwl/ReadRequest.cc cache/pwl/Request.cc cache/pwl/SyncPoint.cc - cache/pwl/Types.cc - cache/pwl/ReplicatedWriteLog.cc 
cache/pwl/AbstractWriteLog.cc cache/WriteLogImageDispatch.cc) + if(WITH_RBD_RWL) + set(librbd_internal_srcs + ${librbd_internal_srcs} + cache/pwl/ReplicatedWriteLog.cc) + endif() + if(WITH_RBD_SSD_CACHE) + set(librbd_internal_srcs + ${librbd_internal_srcs} + cache/pwl/SSDWriteLog.cc) + endif() endif() add_library(rbd_api STATIC librbd.cc) @@ -219,11 +234,14 @@ target_link_libraries(rbd_internal PRIVATE osdc rbd_types) target_include_directories(rbd_internal PRIVATE ${OPENSSL_INCLUDE_DIR}) -if(WITH_RBD_RWL) +if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE) target_link_libraries(rbd_internal PUBLIC blk) endif() - +if(WITH_RBD_RWL) + target_link_libraries(rbd_types + PUBLIC blk) +endif() add_custom_target(librbd_plugins) set(librbd_plugins_dir ${CEPH_INSTALL_PKGLIBDIR}/librbd) diff --git a/src/librbd/cache/Types.h b/src/librbd/cache/Types.h index 2b08b7b4676e..682d30c91edc 100644 --- a/src/librbd/cache/Types.h +++ b/src/librbd/cache/Types.h @@ -14,6 +14,7 @@ namespace cache { enum ImageCacheType { IMAGE_CACHE_TYPE_RWL = 1, + IMAGE_CACHE_TYPE_SSD, }; typedef std::list Contexts; diff --git a/src/librbd/cache/pwl/AbstractWriteLog.cc b/src/librbd/cache/pwl/AbstractWriteLog.cc index f1e5ed702c95..fba6300c142a 100644 --- a/src/librbd/cache/pwl/AbstractWriteLog.cc +++ b/src/librbd/cache/pwl/AbstractWriteLog.cc @@ -18,7 +18,6 @@ #include "librbd/cache/pwl/ImageCacheState.h" #include "librbd/cache/pwl/LogEntry.h" #include "librbd/cache/pwl/ReadRequest.h" -#include "librbd/cache/pwl/Types.h" #include #include @@ -47,7 +46,6 @@ AbstractWriteLog::AbstractWriteLog(I &image_ctx, librbd::cache::pwl::ImageCac m_thread_pool( image_ctx.cct, "librbd::cache::pwl::AbstractWriteLog::thread_pool", "tp_pwl", 4, ""), m_cache_state(cache_state), - m_pwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_pwl)), m_image_ctx(image_ctx), m_log_pool_config_size(DEFAULT_POOL_SIZE), m_image_writeback(image_ctx), @@ -1995,7 +1993,9 @@ void AbstractWriteLog::internal_flush(bool invalidate, Context *on_finish) { } template 
-void AbstractWriteLog::add_into_log_map(GenericWriteLogEntries &log_entries) { +void AbstractWriteLog::add_into_log_map(GenericWriteLogEntries &log_entries, + C_BlockIORequestT *req) { + copy_pmem(req); m_blocks_to_log_entries.add_log_entries(log_entries); } diff --git a/src/librbd/cache/pwl/AbstractWriteLog.h b/src/librbd/cache/pwl/AbstractWriteLog.h index ac8b707115ea..e22bbcb6c8a0 100644 --- a/src/librbd/cache/pwl/AbstractWriteLog.h +++ b/src/librbd/cache/pwl/AbstractWriteLog.h @@ -139,7 +139,8 @@ public: uint32_t get_free_log_entries() { return m_free_log_entries; } - void add_into_log_map(pwl::GenericWriteLogEntries &log_entries); + void add_into_log_map(pwl::GenericWriteLogEntries &log_entries, + C_BlockIORequestT *req); private: typedef std::list *> C_WriteRequests; @@ -236,7 +237,6 @@ protected: std::atomic m_shutting_down = {false}; std::atomic m_invalidating = {false}; - const char* m_pwl_pool_layout_name; ImageCtxT &m_image_ctx; @@ -342,6 +342,7 @@ protected: virtual void alloc_op_log_entries(pwl::GenericLogOperations &ops) {} virtual bool retire_entries(const unsigned long int frees_per_tx) {return false;} virtual void schedule_flush_and_append(pwl::GenericLogOperationsVector &ops) {} + virtual void copy_pmem(C_BlockIORequestT *req) {} virtual void persist_last_flushed_sync_gen() {} virtual void reserve_pmem(C_BlockIORequestT *req, bool &alloc_succeeds, bool &no_space) {} virtual Context *construct_flush_entry_ctx( diff --git a/src/librbd/cache/pwl/InitRequest.cc b/src/librbd/cache/pwl/InitRequest.cc index 1f696d518bd3..eae6e4657c78 100644 --- a/src/librbd/cache/pwl/InitRequest.cc +++ b/src/librbd/cache/pwl/InitRequest.cc @@ -8,11 +8,16 @@ #include "common/errno.h" #include "librbd/asio/ContextWQ.h" -#if defined(WITH_RBD_RWL) +#if defined(WITH_RBD_RWL) || defined(WITH_RBD_SSD_CACHE) #include "librbd/cache/pwl/ImageCacheState.h" -#include "librbd/cache/pwl/ReplicatedWriteLog.h" #include "librbd/cache/WriteLogImageDispatch.h" -#endif // WITH_RBD_RWL 
+#endif // WITH_RBD_RWL || WITH_RBD_SSD_CACHE +#ifdef WITH_RBD_RWL +#include "librbd/cache/pwl/ReplicatedWriteLog.h" +#endif +#ifdef WITH_RBD_SSD_CACHE +#include "librbd/cache/pwl/SSDWriteLog.h" +#endif #include "librbd/cache/Utils.h" #include "librbd/ImageCtx.h" @@ -44,14 +49,14 @@ InitRequest::InitRequest(I &image_ctx, Context *on_finish) template void InitRequest::send() { -#if defined(WITH_RBD_RWL) +#if defined(WITH_RBD_RWL) || defined(WITH_RBD_SSD_CACHE) get_image_cache_state(); #else finish(); #endif // WITH_RBD_RWL } -#if defined(WITH_RBD_RWL) +#if defined(WITH_RBD_RWL) || defined(WITH_RBD_SSD_CACHE) template void InitRequest::get_image_cache_state() { CephContext *cct = m_image_ctx.cct; @@ -76,11 +81,20 @@ void InitRequest::get_image_cache_state() { auto cache_type = cache_state->get_image_cache_type(); switch(cache_type) { + #ifdef WITH_RBD_RWL case cache::IMAGE_CACHE_TYPE_RWL: m_image_cache = new librbd::cache::pwl::ReplicatedWriteLog(m_image_ctx, cache_state); break; + #endif + #ifdef WITH_RBD_SSD_CACHE + case cache::IMAGE_CACHE_TYPE_SSD: + m_image_cache = + new librbd::cache::pwl::SSDWriteLog(m_image_ctx, + cache_state); + break; + #endif default: delete cache_state; cache_state = nullptr; diff --git a/src/librbd/cache/pwl/LogEntry.cc b/src/librbd/cache/pwl/LogEntry.cc index 4e7612c94f04..06f7931ea8c7 100644 --- a/src/librbd/cache/pwl/LogEntry.cc +++ b/src/librbd/cache/pwl/LogEntry.cc @@ -73,12 +73,17 @@ std::ostream &operator<<(std::ostream &os, return entry.format(os); } -void WriteLogEntry::init(bool has_data, std::vector::iterator allocation, - uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush) { - ram_entry.has_data = 1; +#ifdef WITH_RBD_RWL +void WriteLogEntry::init_pmem_buffer(std::vector::iterator allocation) { ram_entry.write_data = allocation->buffer_oid; ceph_assert(!TOID_IS_NULL(ram_entry.write_data)); pmem_buffer = D_RW(ram_entry.write_data); +} +#endif + +void WriteLogEntry::init(bool has_data, + uint64_t 
current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush) { + ram_entry.has_data = 1; ram_entry.sync_gen_number = current_sync_gen; if (persist_on_flush) { /* Persist on flush. Sequence #0 is never used. */ diff --git a/src/librbd/cache/pwl/LogEntry.h b/src/librbd/cache/pwl/LogEntry.h index fb0f7d3fd0b4..0edd387f4de3 100644 --- a/src/librbd/cache/pwl/LogEntry.h +++ b/src/librbd/cache/pwl/LogEntry.h @@ -168,8 +168,11 @@ public: ~WriteLogEntry() override {}; WriteLogEntry(const WriteLogEntry&) = delete; WriteLogEntry &operator=(const WriteLogEntry&) = delete; - void init(bool has_data, std::vector::iterator allocation, + void init(bool has_data, uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush); + #ifdef WITH_RBD_RWL + void init_pmem_buffer(std::vector::iterator allocation); + #endif BlockExtent block_extent(); unsigned int reader_count() const; /* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */ diff --git a/src/librbd/cache/pwl/LogOperation.cc b/src/librbd/cache/pwl/LogOperation.cc index d47eb3a066e1..8125a5d41a72 100644 --- a/src/librbd/cache/pwl/LogOperation.cc +++ b/src/librbd/cache/pwl/LogOperation.cc @@ -179,7 +179,7 @@ WriteLogOperation::~WriteLogOperation() { } void WriteLogOperation::init(bool has_data, std::vector::iterator allocation, uint64_t current_sync_gen, uint64_t last_op_sequence_num, bufferlist &write_req_bl, uint64_t buffer_offset, bool persist_on_flush) { - log_entry->init(has_data, allocation, current_sync_gen, last_op_sequence_num, persist_on_flush); + log_entry->init(has_data, current_sync_gen, last_op_sequence_num, persist_on_flush); buffer_alloc = &(*allocation); bl.substr_of(write_req_bl, buffer_offset, log_entry->write_bytes()); @@ -215,11 +215,13 @@ void WriteLogOperation::complete(int result) { m_perfcounter->tinc(l_librbd_pwl_log_op_buf_to_app_t, log_append_time - buf_persist_time); } -void WriteLogOperation::copy_bl_to_pmem_buffer() { +#ifdef WITH_RBD_RWL +void 
WriteLogOperation::copy_bl_to_pmem_buffer(std::vector::iterator allocation) { /* operation is a shared_ptr, so write_op is only good as long as operation is in scope */ bufferlist::iterator i(&bl); m_perfcounter->inc(l_librbd_pwl_log_op_bytes, log_entry->write_bytes()); ldout(m_cct, 20) << bl << dendl; + log_entry->init_pmem_buffer(allocation); i.copy((unsigned)log_entry->write_bytes(), (char*)log_entry->pmem_buffer); } @@ -227,6 +229,7 @@ void WriteLogOperation::flush_pmem_buf_to_cache(PMEMobjpool *log_pool) { buf_persist_time = ceph_clock_now(); pmemobj_flush(log_pool, log_entry->pmem_buffer, log_entry->write_bytes()); } +#endif WriteLogOperationSet::WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr sync_point, bool persist_on_flush, CephContext *cct, Context *on_finish) diff --git a/src/librbd/cache/pwl/LogOperation.h b/src/librbd/cache/pwl/LogOperation.h index 8ae6351cc1b9..d3aa37e87c0f 100644 --- a/src/librbd/cache/pwl/LogOperation.h +++ b/src/librbd/cache/pwl/LogOperation.h @@ -53,8 +53,11 @@ public: virtual bool is_writing_op() const { return false; } - virtual void copy_bl_to_pmem_buffer() {}; + #ifdef WITH_RBD_RWL + virtual void copy_bl_to_pmem_buffer( + std::vector::iterator allocation) {}; virtual void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) {}; + #endif }; class SyncPointLogOperation : public GenericLogOperation { @@ -143,8 +146,11 @@ public: } void complete(int r) override; - void copy_bl_to_pmem_buffer() override; + #ifdef WITH_RBD_RWL + void copy_bl_to_pmem_buffer( + std::vector::iterator allocation) override; void flush_pmem_buf_to_cache(PMEMobjpool *log_pool) override; + #endif }; diff --git a/src/librbd/cache/pwl/ReplicatedWriteLog.cc b/src/librbd/cache/pwl/ReplicatedWriteLog.cc index 2bc03fa6fab9..c7d1b4a6b5c8 100644 --- a/src/librbd/cache/pwl/ReplicatedWriteLog.cc +++ b/src/librbd/cache/pwl/ReplicatedWriteLog.cc @@ -1,7 +1,6 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: 
ts=8 sw=2 smarttab -#include #include "ReplicatedWriteLog.h" #include "include/buffer.h" #include "include/Context.h" @@ -17,7 +16,6 @@ #include "librbd/asio/ContextWQ.h" #include "librbd/cache/pwl/ImageCacheState.h" #include "librbd/cache/pwl/LogEntry.h" -#include "librbd/cache/pwl/Types.h" #include #include @@ -38,7 +36,8 @@ const unsigned long int OPS_APPENDED_TOGETHER = MAX_ALLOC_PER_TRANSACTION; template ReplicatedWriteLog::ReplicatedWriteLog( I &image_ctx, librbd::cache::pwl::ImageCacheState* cache_state) -: AbstractWriteLog(image_ctx, cache_state) +: AbstractWriteLog(image_ctx, cache_state), + m_pwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_pwl)) { } @@ -851,6 +850,11 @@ void ReplicatedWriteLog::reserve_pmem(C_BlockIORequestT *req, } } +template +void ReplicatedWriteLog::copy_pmem(C_BlockIORequestT *req) { + req->copy_pmem(); +} + template bool ReplicatedWriteLog::alloc_resources(C_BlockIORequestT *req) { bool alloc_succeeds = true; diff --git a/src/librbd/cache/pwl/ReplicatedWriteLog.h b/src/librbd/cache/pwl/ReplicatedWriteLog.h index 995c8bf96b62..2464405de1aa 100644 --- a/src/librbd/cache/pwl/ReplicatedWriteLog.h +++ b/src/librbd/cache/pwl/ReplicatedWriteLog.h @@ -4,6 +4,7 @@ #ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG #define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG +#include #include "common/RWLock.h" #include "common/WorkQueue.h" #include "common/AsyncOpTracker.h" @@ -48,6 +49,7 @@ private: using C_CompAndWriteRequestT = pwl::C_CompAndWriteRequest; PMEMobjpool *m_log_pool = nullptr; + const char* m_pwl_pool_layout_name; void remove_pool_file(); void load_existing_entries(pwl::DeferredContexts &later); @@ -70,6 +72,7 @@ protected: using AbstractWriteLog::m_first_valid_entry; void process_work() override; + void copy_pmem(C_BlockIORequestT *req) override; void schedule_append_ops(pwl::GenericLogOperations &ops) override; void append_scheduled_ops(void) override; void reserve_pmem(C_BlockIORequestT *req, bool &alloc_succeeds, bool &no_space) override; diff 
--git a/src/librbd/cache/pwl/Request.cc b/src/librbd/cache/pwl/Request.cc index aecf3b6a3f53..85ba30cab66e 100644 --- a/src/librbd/cache/pwl/Request.cc +++ b/src/librbd/cache/pwl/Request.cc @@ -4,7 +4,7 @@ #include "Request.h" #include "librbd/BlockGuard.h" #include "librbd/cache/pwl/LogEntry.h" -#include "librbd/cache/pwl/ReplicatedWriteLog.h" +#include "librbd/cache/pwl/AbstractWriteLog.h" #define dout_subsys ceph_subsys_rbd_pwl #undef dout_prefix @@ -251,12 +251,19 @@ void C_WriteRequest::setup_log_operations(DeferredContexts &on_exit) { op_set->extent_ops_appending->activate(); op_set->extent_ops_persist->activate(); - /* Write data */ + pwl.add_into_log_map(log_entries, this); +} + +#ifdef WITH_RBD_RWL +template +void C_WriteRequest::copy_pmem() { + auto allocation = m_resources.buffers.begin(); for (auto &operation : op_set->operations) { - operation->copy_bl_to_pmem_buffer(); + operation->copy_bl_to_pmem_buffer(allocation); + allocation++; } - pwl.add_into_log_map(log_entries); } +#endif template bool C_WriteRequest::append_write_request(std::shared_ptr sync_point) { @@ -455,7 +462,7 @@ void C_DiscardRequest::setup_log_operations() { discard_req->release_cell(); }); op->init(current_sync_gen, persist_on_flush, pwl.get_last_op_sequence_num(), on_write_persist); - pwl.add_into_log_map(log_entries); + pwl.add_into_log_map(log_entries, this); } template diff --git a/src/librbd/cache/pwl/Request.h b/src/librbd/cache/pwl/Request.h index 53a013d46d4b..fc7aecb24c18 100644 --- a/src/librbd/cache/pwl/Request.h +++ b/src/librbd/cache/pwl/Request.h @@ -66,6 +66,8 @@ public: virtual void dispatch() = 0; + virtual void copy_pmem() {}; + virtual const char *get_name() const { return "C_BlockIORequest"; } @@ -152,6 +154,10 @@ public: void dispatch() override; + #ifdef WITH_RBD_RWL + void copy_pmem() override; + #endif + virtual std::shared_ptr create_operation(uint64_t offset, uint64_t len); virtual void setup_log_operations(DeferredContexts &on_exit); diff --git 
a/src/librbd/cache/pwl/SSDTypes.h b/src/librbd/cache/pwl/SSDTypes.h new file mode 100644 index 000000000000..7e6f2dff680d --- /dev/null +++ b/src/librbd/cache/pwl/SSDTypes.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_SSD_TYPES_H +#define CEPH_LIBRBD_CACHE_SSD_TYPES_H + +#include "acconfig.h" + +#include "librbd/io/Types.h" +#include "Types.h" //generic type = to be renamed + +namespace librbd { +namespace cache { +namespace pwl { + +struct SuperBlock{ + WriteLogPoolRoot root; + + DENC(SuperBlock, v, p) { + DENC_START(1, 1, p); + denc(v.root, p); + DENC_FINISH(p); + } + + void dump(Formatter *f) const { + f->dump_object("super", root); + } + + static void generate_test_instances(list& ls) { + ls.push_back(new SuperBlock); + ls.push_back(new SuperBlock); + ls.back()->root.first_valid_entry = 2; + } +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +WRITE_CLASS_DENC(librbd::cache::pwl::SuperBlock) + +#endif // CEPH_LIBRBD_CACHE_SSD_TYPES_H diff --git a/src/librbd/cache/pwl/SSDWriteLog.cc b/src/librbd/cache/pwl/SSDWriteLog.cc new file mode 100644 index 000000000000..b34d1ce5f961 --- /dev/null +++ b/src/librbd/cache/pwl/SSDWriteLog.cc @@ -0,0 +1,158 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "SSDWriteLog.h" +#include "include/buffer.h" +#include "include/Context.h" +#include "include/ceph_assert.h" +#include "common/deleter.h" +#include "common/dout.h" +#include "common/environment.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "common/Timer.h" +#include "common/perf_counters.h" +#include "librbd/ImageCtx.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/cache/pwl/ImageCacheState.h" +#include "librbd/cache/pwl/LogEntry.h" +#include +#include + +#undef dout_subsys +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix 
*_dout << "librbd::cache::pwl::SSDWriteLog: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +using namespace librbd::cache::pwl; + +// SSD: this number can be updated later +const unsigned long int ops_appended_together = MAX_WRITES_PER_SYNC_POINT; + +template +SSDWriteLog::SSDWriteLog( + I &image_ctx, librbd::cache::pwl::ImageCacheState* cache_state) + : AbstractWriteLog(image_ctx, cache_state) +{ +} + +template +void SSDWriteLog::initialize_pool(Context *on_finish, pwl::DeferredContexts &later) { + CephContext *cct = m_image_ctx.cct; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + if (access(this->m_log_pool_name.c_str(), F_OK) != 0) { + int fd = ::open(this->m_log_pool_name.c_str(), O_RDWR|O_CREAT, 0644); + bool succeed = true; + if (fd >= 0) { + if (truncate(this->m_log_pool_name.c_str(), this->m_log_pool_config_size) != 0) { + succeed = false; + } + ::close(fd); + } else { + succeed = false; + } + if (!succeed) { + m_cache_state->present = false; + m_cache_state->clean = true; + m_cache_state->empty = true; + /* TODO: filter/replace errnos that are meaningless to the caller */ + on_finish->complete(-errno); + return; + } + + bdev = BlockDevice::create(cct, this->m_log_pool_name, aio_cache_cb, + nullptr, nullptr, nullptr); + int r = bdev->open(this->m_log_pool_name); + if (r < 0) { + delete bdev; + on_finish->complete(-1); + return; + } + m_cache_state->present = true; + m_cache_state->clean = true; + m_cache_state->empty = true; + /* new pool, calculate and store metadata */ + size_t small_write_size = MIN_WRITE_ALLOC_SIZE + sizeof(struct WriteLogPmemEntry); + + uint64_t num_small_writes = (uint64_t)(this->m_log_pool_config_size / small_write_size); + if (num_small_writes > MAX_LOG_ENTRIES) { + num_small_writes = MAX_LOG_ENTRIES; + } + assert(num_small_writes > 2); + m_log_pool_ring_buffer_size = this->m_log_pool_config_size - DATA_RING_BUFFER_OFFSET; + /* Log ring empty */ + m_first_free_entry = 
DATA_RING_BUFFER_OFFSET; + m_first_valid_entry = DATA_RING_BUFFER_OFFSET; + + pool_size = this->m_log_pool_config_size; + auto new_root = std::make_shared(pool_root); + new_root->pool_size = this->m_log_pool_config_size; + new_root->flushed_sync_gen = this->m_flushed_sync_gen; + new_root->block_size = MIN_WRITE_ALLOC_SIZE; + new_root->first_free_entry = m_first_free_entry; + new_root->first_valid_entry = m_first_valid_entry; + new_root->num_log_entries = num_small_writes; + pool_root = *new_root; + + r = update_pool_root_sync(new_root); + if (r != 0) { + this->m_total_log_entries = 0; + this->m_free_log_entries = 0; + lderr(m_image_ctx.cct) << "failed to initialize pool (" + << this->m_log_pool_name << ")" << dendl; + on_finish->complete(r); + } + this->m_total_log_entries = new_root->num_log_entries; + this->m_free_log_entries = new_root->num_log_entries - 1; + } else { + m_cache_state->present = true; + bdev = BlockDevice::create( + cct, this->m_log_pool_name, aio_cache_cb, + static_cast(this), nullptr, static_cast(this)); + int r = bdev->open(this->m_log_pool_name); + if (r < 0) { + delete bdev; + on_finish->complete(r); + return; + } + //load_existing_entries(later); #TODO: Implement and uncomment in later PR + if (m_first_free_entry < m_first_valid_entry) { + /* Valid entries wrap around the end of the ring, so first_free is lower + * than first_valid. If first_valid was == first_free+1, the entry at + * first_free would be empty. The last entry is never used, so in + * that case there would be zero free log entries. */ + this->m_free_log_entries = this->m_total_log_entries - + (m_first_valid_entry - m_first_free_entry) - 1; + } else { + /* first_valid is <= first_free. 
If they are == we have zero valid log + * entries, and n-1 free log entries */ + this->m_free_log_entries = this->m_total_log_entries - + (m_first_free_entry - m_first_valid_entry) - 1; + } + m_cache_state->clean = this->m_dirty_log_entries.empty(); + m_cache_state->empty = m_log_entries.empty(); + } +} + +template +int SSDWriteLog::update_pool_root_sync( + std::shared_ptr root) { + bufferlist bl; + SuperBlock superblock; + superblock.root = *root; + encode(superblock, bl); + bl.append_zero(MIN_WRITE_ALLOC_SIZE - bl.length()); + ceph_assert(bl.length() % MIN_WRITE_ALLOC_SIZE == 0); + return bdev->write(0, bl, false); +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::SSDWriteLog; diff --git a/src/librbd/cache/pwl/SSDWriteLog.h b/src/librbd/cache/pwl/SSDWriteLog.h new file mode 100644 index 000000000000..0052535273e0 --- /dev/null +++ b/src/librbd/cache/pwl/SSDWriteLog.h @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG +#define CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG + +#include "AbstractWriteLog.h" +#include "blk/BlockDevice.h" +#include "common/AsyncOpTracker.h" +#include "common/Checksummer.h" +#include "common/environment.h" +#include "common/RWLock.h" +#include "common/WorkQueue.h" +#include "librbd/BlockGuard.h" +#include "librbd/Utils.h" +#include "librbd/cache/ImageWriteback.h" +#include "librbd/cache/Types.h" +#include "librbd/cache/pwl/LogMap.h" +#include "librbd/cache/pwl/LogOperation.h" +#include "librbd/cache/pwl/Request.h" +#include "librbd/cache/pwl/SSDTypes.h" +#include +#include + +namespace librbd { + +struct ImageCtx; + +namespace cache { + +namespace pwl { + +template +class SSDWriteLog : public AbstractWriteLog { +public: + SSDWriteLog(ImageCtxT &image_ctx, + librbd::cache::pwl::ImageCacheState* cache_state); + ~SSDWriteLog() {} + SSDWriteLog(const SSDWriteLog&) = delete; + 
SSDWriteLog &operator=(const SSDWriteLog&) = delete; + + using This = AbstractWriteLog; + using C_BlockIORequestT = pwl::C_BlockIORequest; + + //TODO: Implement below functions in later PR + bool alloc_resources(C_BlockIORequestT *req) override { return false; } + void setup_schedule_append( + pwl::GenericLogOperationsVector &ops, bool do_early_flush) override {} + +protected: + using AbstractWriteLog::m_lock; + using AbstractWriteLog::m_log_entries; + using AbstractWriteLog::m_image_ctx; + using AbstractWriteLog::m_cache_state; + using AbstractWriteLog::m_first_free_entry; + using AbstractWriteLog::m_first_valid_entry; + + void initialize_pool(Context *on_finish, pwl::DeferredContexts &later) override; + //TODO: Implement below functions in later PR + void process_work() override {} + void append_scheduled_ops(void) override {} + void schedule_append_ops(pwl::GenericLogOperations &ops) override {} + void remove_pool_file() override {} + +private: + uint64_t m_log_pool_ring_buffer_size; /* Size of ring buffer */ + + //classes and functions to facilitate block device operations + class AioTransContext { + public: + Context *on_finish; + ::IOContext ioc; + explicit AioTransContext(CephContext* cct, Context *cb) + :on_finish(cb), ioc(cct, this) { + } + ~AioTransContext(){} + + void aio_finish() { + on_finish->complete(ioc.get_return_value()); + delete this; + } + }; //class AioTransContext + + BlockDevice *bdev = nullptr; + uint64_t pool_size; + pwl::WriteLogPoolRoot pool_root; + + int update_pool_root_sync(std::shared_ptr root); + + static void aio_cache_cb(void *priv, void *priv2) { + AioTransContext *c = static_cast(priv2); + c->aio_finish(); + } +};//class SSDWriteLog + +} // namespace pwl +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::pwl::SSDWriteLog; + +#endif // CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG diff --git a/src/librbd/cache/pwl/Types.cc b/src/librbd/cache/pwl/Types.cc index 31bd0af00ad3..25c9dce130e1 100644 --- 
a/src/librbd/cache/pwl/Types.cc +++ b/src/librbd/cache/pwl/Types.cc @@ -10,11 +10,10 @@ #undef dout_prefix #define dout_prefix *_dout << "librbd::cache::pwl::Types: " << this << " " \ << __func__ << ": " +using ceph::Formatter; namespace librbd { - namespace cache { - namespace pwl { DeferredContexts::~DeferredContexts() { @@ -53,6 +52,65 @@ uint64_t WriteLogPmemEntry::get_write_bytes() { return write_bytes; } +#ifdef WITH_RBD_SSD_CACHE +void WriteLogPmemEntry::dump(Formatter *f) const { + f->dump_unsigned("sync_gen_number", sync_gen_number); + f->dump_unsigned("write_sequence_number", write_sequence_number); + f->dump_unsigned("image_offset_bytes", image_offset_bytes); + f->dump_unsigned("write_bytes", write_bytes); + f->dump_unsigned("write_data_pos", write_data_pos); + f->dump_unsigned("entry_valid", entry_valid); + f->dump_unsigned("sync_point", sync_point); + f->dump_unsigned("sequenced", sequenced); + f->dump_unsigned("has_data", has_data); + f->dump_unsigned("discard", discard); + f->dump_unsigned("writesame", writesame); + f->dump_unsigned("ws_datalen", ws_datalen); + f->dump_unsigned("entry_index", entry_index); +} + +void WriteLogPmemEntry::generate_test_instances(list& ls) { + ls.push_back(new WriteLogPmemEntry); + ls.push_back(new WriteLogPmemEntry); + ls.back()->sync_gen_number = 1; + ls.back()->write_sequence_number = 1; + ls.back()->image_offset_bytes = 1; + ls.back()->write_bytes = 1; + ls.back()->write_data_pos = 1; + ls.back()->entry_valid = 1; + ls.back()->sync_point = 1; + ls.back()->sequenced = 1; + ls.back()->has_data = 1; + ls.back()->discard = 1; + ls.back()->writesame = 1; + ls.back()->ws_datalen = 1; + ls.back()->entry_index = 1; +} + +void WriteLogPoolRoot::dump(Formatter *f) const { + f->dump_unsigned("layout_version", layout_version); + f->dump_unsigned("cur_sync_gen", cur_sync_gen); + f->dump_unsigned("pool_size", pool_size); + f->dump_unsigned("flushed_sync_gen", flushed_sync_gen); + f->dump_unsigned("block_size", block_size); + 
f->dump_unsigned("num_log_entries", num_log_entries); + f->dump_unsigned("first_free_entry", first_free_entry); + f->dump_unsigned("first_valid_entry", first_valid_entry); } + +void WriteLogPoolRoot::generate_test_instances(list& ls) { + ls.push_back(new WriteLogPoolRoot); + ls.push_back(new WriteLogPoolRoot); + ls.back()->layout_version = 2; + ls.back()->cur_sync_gen = 1; + ls.back()->pool_size = 1024; + ls.back()->flushed_sync_gen = 1; + ls.back()->block_size = 4096; + ls.back()->num_log_entries = 10000000; + ls.back()->first_free_entry = 1; + ls.back()->first_valid_entry = 0; +} +#endif + std::ostream& operator<<(std::ostream& os, const WriteLogPmemEntry &entry) { os << "entry_valid=" << (bool)entry.entry_valid << ", " diff --git a/src/librbd/cache/pwl/Types.h b/src/librbd/cache/pwl/Types.h index 78a2440e51f3..ab6c696a132d 100644 --- a/src/librbd/cache/pwl/Types.h +++ b/src/librbd/cache/pwl/Types.h @@ -1,14 +1,23 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#ifndef CEPH_LIBRBD_CACHE_RWL_TYPES_H -#define CEPH_LIBRBD_CACHE_RWL_TYPES_H +#ifndef CEPH_LIBRBD_CACHE_PWL_TYPES_H +#define CEPH_LIBRBD_CACHE_PWL_TYPES_H + +#include "acconfig.h" + +#ifdef WITH_RBD_RWL +#include "libpmemobj.h" +#endif #include -#include #include "librbd/BlockGuard.h" #include "librbd/io/Types.h" +namespace ceph { +class Formatter; +} + class Context; enum { @@ -155,6 +164,7 @@ const uint64_t MAX_WRITES_PER_SYNC_POINT = 256; const uint64_t MAX_BYTES_PER_SYNC_POINT = (1024 * 1024 * 8); const uint32_t MIN_WRITE_ALLOC_SIZE = 512; +const uint32_t MIN_WRITE_ALLOC_SSD_SIZE = 4096; const uint32_t LOG_STATS_INTERVAL_SECONDS = 5; /**** Write log entries ****/ @@ -172,6 +182,11 @@ const double AGGRESSIVE_RETIRE_HIGH_WATER = 0.75; const double RETIRE_HIGH_WATER = 0.50; const double RETIRE_LOW_WATER = 0.40; const int RETIRE_BATCH_TIME_LIMIT_MS = 250; +const uint64_t CONTROL_BLOCK_MAX_LOG_ENTRIES = 32; +const uint64_t SPAN_MAX_DATA_LEN = 
(16*1024*1024); + +/* offset of ring on SSD */ +const uint64_t DATA_RING_BUFFER_OFFSET = 8192; /* Defer a set of Contexts until destruct/exit. Used for deferring * work on a given thread until a required lock is dropped. */ @@ -184,31 +199,41 @@ public: }; /* Pmem structures */ +#ifdef WITH_RBD_RWL POBJ_LAYOUT_BEGIN(rbd_pwl); POBJ_LAYOUT_ROOT(rbd_pwl, struct WriteLogPoolRoot); POBJ_LAYOUT_TOID(rbd_pwl, uint8_t); POBJ_LAYOUT_TOID(rbd_pwl, struct WriteLogPmemEntry); POBJ_LAYOUT_END(rbd_pwl); +#endif struct WriteLogPmemEntry { uint64_t sync_gen_number = 0; uint64_t write_sequence_number = 0; uint64_t image_offset_bytes; uint64_t write_bytes; + #ifdef WITH_RBD_RWL TOID(uint8_t) write_data; - struct { - uint8_t entry_valid :1; /* if 0, this entry is free */ - uint8_t sync_point :1; /* No data. No write sequence number. Marks sync - point for this sync gen number */ - uint8_t sequenced :1; /* write sequence number is valid */ - uint8_t has_data :1; /* write_data field is valid (else ignore) */ - uint8_t discard :1; /* has_data will be 0 if this is a discard */ - uint8_t writesame :1; /* ws_datalen indicates length of data at write_bytes */ + #endif + #ifdef WITH_RBD_SSD_CACHE + uint64_t write_data_pos; /* SSD data offset */ + #endif + union { + uint8_t flags; + struct { + uint8_t entry_valid :1; /* if 0, this entry is free */ + uint8_t sync_point :1; /* No data. No write sequence number. Marks sync + point for this sync gen number */ + uint8_t sequenced :1; /* write sequence number is valid */ + uint8_t has_data :1; /* write_data field is valid (else ignore) */ + uint8_t discard :1; /* has_data will be 0 if this is a discard */ + uint8_t writesame :1; /* ws_datalen indicates length of data at write_bytes */ + }; }; uint32_t ws_datalen = 0; /* Length of data buffer (writesame only) */ uint32_t entry_index = 0; /* For debug consistency check. 
Can be removed if * we need the space */ - WriteLogPmemEntry(const uint64_t image_offset_bytes, const uint64_t write_bytes) + WriteLogPmemEntry(const uint64_t image_offset_bytes=0, const uint64_t write_bytes=0) : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes), entry_valid(0), sync_point(0), sequenced(0), has_data(0), discard(0), writesame(0) { } @@ -234,11 +259,26 @@ struct WriteLogPmemEntry { } friend std::ostream& operator<<(std::ostream& os, const WriteLogPmemEntry &entry); + #ifdef WITH_RBD_SSD_CACHE + DENC(WriteLogPmemEntry, v, p) { + DENC_START(1, 1, p); + denc(v.sync_gen_number, p); + denc(v.write_sequence_number, p); + denc(v.image_offset_bytes, p); + denc(v.write_bytes, p); + denc(v.write_data_pos, p); + denc(v.flags, p); + denc(v.ws_datalen, p); + denc(v.entry_index, p); + DENC_FINISH(p); + } + #endif + void dump(ceph::Formatter *f) const; + static void generate_test_instances(list& ls); }; -static_assert(sizeof(WriteLogPmemEntry) == 64); - struct WriteLogPoolRoot { + #ifdef WITH_RBD_RWL union { struct { uint8_t layout_version; /* Version of this structure (RWL_POOL_VERSION) */ @@ -246,6 +286,11 @@ struct WriteLogPoolRoot { uint64_t _u64; } header; TOID(struct WriteLogPmemEntry) log_entries; /* contiguous array of log entries */ + #endif + #ifdef WITH_RBD_SSD_CACHE + uint64_t layout_version = 0; + uint64_t cur_sync_gen = 0; + #endif uint64_t pool_size; uint64_t flushed_sync_gen; /* All writing entries with this or a lower * sync gen number are flushed. 
*/ @@ -253,12 +298,32 @@ struct WriteLogPoolRoot { uint32_t num_log_entries; uint32_t first_free_entry; /* Entry following the newest valid entry */ uint32_t first_valid_entry; /* Index of the oldest valid entry in the log */ + + #ifdef WITH_RBD_SSD_CACHE + DENC(WriteLogPoolRoot, v, p) { + DENC_START(1, 1, p); + denc(v.layout_version, p); + denc(v.cur_sync_gen, p); + denc(v.pool_size, p); + denc(v.flushed_sync_gen, p); + denc(v.block_size, p); + denc(v.num_log_entries, p); + denc(v.first_free_entry, p); + denc(v.first_valid_entry, p); + DENC_FINISH(p); + } + #endif + + void dump(ceph::Formatter *f) const; + static void generate_test_instances(list& ls); }; struct WriteBufferAllocation { unsigned int allocation_size = 0; + #ifdef WITH_RBD_RWL pobj_action buffer_alloc_action; TOID(uint8_t) buffer_oid = OID_NULL; + #endif bool allocated = false; utime_t allocation_lat; }; @@ -309,4 +374,9 @@ public: } // namespace cache } // namespace librbd -#endif // CEPH_LIBRBD_CACHE_RWL_TYPES_H +#ifdef WITH_RBD_SSD_CACHE +WRITE_CLASS_DENC(librbd::cache::pwl::WriteLogPmemEntry) +WRITE_CLASS_DENC(librbd::cache::pwl::WriteLogPoolRoot) +#endif + +#endif // CEPH_LIBRBD_CACHE_PWL_TYPES_H diff --git a/src/test/librbd/CMakeLists.txt b/src/test/librbd/CMakeLists.txt index b0ba3f250ba9..2c77d38b35db 100644 --- a/src/test/librbd/CMakeLists.txt +++ b/src/test/librbd/CMakeLists.txt @@ -125,7 +125,7 @@ set(unittest_librbd_srcs if(WITH_RBD_RWL) set(unittest_librbd_srcs ${unittest_librbd_srcs} - cache/test_mock_ReplicatedWriteLog.cc + cache/pwl/test_mock_ReplicatedWriteLog.cc cache/pwl/test_WriteLogMap.cc) endif(WITH_RBD_RWL) diff --git a/src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc b/src/test/librbd/cache/pwl/test_mock_ReplicatedWriteLog.cc similarity index 99% rename from src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc rename to src/test/librbd/cache/pwl/test_mock_ReplicatedWriteLog.cc index 80208e9a9977..d1622a2ce34a 100644 --- 
a/src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc +++ b/src/test/librbd/cache/pwl/test_mock_ReplicatedWriteLog.cc @@ -12,7 +12,6 @@ #include "librbd/cache/pwl/Types.h" #include "librbd/cache/ImageWriteback.h" - namespace librbd { namespace { diff --git a/src/tools/ceph-dencoder/rbd_types.h b/src/tools/ceph-dencoder/rbd_types.h index 2068946fb896..5c7c15fe98c7 100644 --- a/src/tools/ceph-dencoder/rbd_types.h +++ b/src/tools/ceph-dencoder/rbd_types.h @@ -19,6 +19,14 @@ TYPE(rbd_replay::action::ActionEntry) TYPE(rbd::mirror::image_map::PolicyData) #endif +#if defined(WITH_RBD) && defined(WITH_RBD_SSD_CACHE) +#include "librbd/cache/pwl/Types.h" +#include "librbd/cache/pwl/SSDTypes.h" +TYPE(librbd::cache::pwl::WriteLogPmemEntry) +TYPE(librbd::cache::pwl::WriteLogPoolRoot) +TYPE(librbd::cache::pwl::SuperBlock) +#endif + #ifdef WITH_RBD #include "cls/rbd/cls_rbd.h" TYPE_FEATUREFUL(cls_rbd_parent) -- 2.47.3