From: Yuan Lu
Date: Tue, 14 Apr 2020 07:58:28 +0000 (+0800)
Subject: librbd: add aio_read
X-Git-Tag: v17.0.0~2516^2~1
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=3d63bde4a41b247b11ca5d01dc599065f209d23f;p=ceph.git

librbd: add aio_read

Signed-off-by: Peterson, Scott
Signed-off-by: Li, Xiaoyan
Signed-off-by: Lu, Yuan
Signed-off-by: Chamarthy, Mahati
---

diff --git a/src/librbd/cache/ReplicatedWriteLog.cc b/src/librbd/cache/ReplicatedWriteLog.cc
index 661215085d33b..533ba7eacb6a6 100644
--- a/src/librbd/cache/ReplicatedWriteLog.cc
+++ b/src/librbd/cache/ReplicatedWriteLog.cc
@@ -52,6 +52,7 @@ ReplicatedWriteLog<I>::ReplicatedWriteLog(I &image_ctx, librbd::cache::rwl::Imag
       "librbd::cache::ReplicatedWriteLog::m_lock", this))),
     m_blockguard_lock(ceph::make_mutex(util::unique_lock_name(
       "librbd::cache::ReplicatedWriteLog::m_blockguard_lock", this))),
+    m_blocks_to_log_entries(image_ctx.cct),
     m_thread_pool(image_ctx.cct, "librbd::cache::ReplicatedWriteLog::thread_pool", "tp_rwl",
                   4, ""),
@@ -565,6 +566,114 @@
 template <typename I>
 void ReplicatedWriteLog<I>::aio_read(Extents&& image_extents, ceph::bufferlist* bl,
                                      int fadvise_flags, Context *on_finish) {
+  // TODO: handle writesame and discard case in later PRs
+  CephContext *cct = m_image_ctx.cct;
+  utime_t now = ceph_clock_now();
+  C_ReadRequest *read_ctx = new C_ReadRequest(cct, now, m_perfcounter, bl, on_finish);
+  ldout(cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id
+                 << "image_extents=" << image_extents << ", "
+                 << "bl=" << bl << ", "
+                 << "on_finish=" << on_finish << dendl;
+
+  ceph_assert(m_initialized);
+  bl->clear();
+  m_perfcounter->inc(l_librbd_rwl_rd_req, 1);
+
+  /*
+   * The strategy here is to look up all the WriteLogMapEntries that overlap
+   * this read, and iterate through those to separate this read into hits and
+   * misses. A new Extents object is produced here with Extents for each miss
+   * region. The miss Extents is then passed on to the read cache below RWL. We
+   * also produce an ImageExtentBufs for all the extents (hit or miss) in this
+   * read. When the read from the lower cache layer completes, we iterate
+   * through the ImageExtentBufs and insert buffers for each cache hit at the
+   * appropriate spot in the bufferlist returned from below for the miss
+   * read. The buffers we insert here refer directly to regions of various
+   * write log entry data buffers.
+   *
+   * Locking: These buffer objects hold a reference on the write log entries
+   * they refer to. Log entries can't be retired until there are no references.
+   * The GenericWriteLogEntry references are released by the buffer destructor.
+   */
+  for (auto &extent : image_extents) {
+    uint64_t extent_offset = 0;
+    RWLock::RLocker entry_reader_locker(m_entry_reader_lock);
+    WriteLogMapEntries map_entries = m_blocks_to_log_entries.find_map_entries(block_extent(extent));
+    for (auto &map_entry : map_entries) {
+      Extent entry_image_extent(rwl::image_extent(map_entry.block_extent));
+      /* If this map entry starts after the current image extent offset ... */
+      if (entry_image_extent.first > extent.first + extent_offset) {
+        /* ... add range before map_entry to miss extents */
+        uint64_t miss_extent_start = extent.first + extent_offset;
+        uint64_t miss_extent_length = entry_image_extent.first - miss_extent_start;
+        Extent miss_extent(miss_extent_start, miss_extent_length);
+        read_ctx->miss_extents.push_back(miss_extent);
+        /* Add miss range to read extents */
+        ImageExtentBuf miss_extent_buf(miss_extent);
+        read_ctx->read_extents.push_back(miss_extent_buf);
+        extent_offset += miss_extent_length;
+      }
+      ceph_assert(entry_image_extent.first <= extent.first + extent_offset);
+      uint64_t entry_offset = 0;
+      /* If this map entry starts before the current image extent offset ... */
+      if (entry_image_extent.first < extent.first + extent_offset) {
+        /* ... compute offset into log entry for this read extent */
+        entry_offset = (extent.first + extent_offset) - entry_image_extent.first;
+      }
+      /* This read hit ends at the end of the extent or the end of the log
+         entry, whichever is less. */
+      uint64_t entry_hit_length = min(entry_image_extent.second - entry_offset,
+                                      extent.second - extent_offset);
+      Extent hit_extent(entry_image_extent.first, entry_hit_length);
+
+      /* Offset of the map entry into the log entry's buffer */
+      uint64_t map_entry_buffer_offset = entry_image_extent.first - map_entry.log_entry->ram_entry.image_offset_bytes;
+      /* Offset into the log entry buffer of this read hit */
+      uint64_t read_buffer_offset = map_entry_buffer_offset + entry_offset;
+      /* Create buffer object referring to pmem pool for this read hit */
+      auto write_entry = map_entry.log_entry;
+
+      /* Make a bl for this hit extent. This will add references to the write_entry->pmem_bp */
+      buffer::list hit_bl;
+
+      buffer::list entry_bl_copy;
+      write_entry->copy_pmem_bl(&entry_bl_copy);
+      entry_bl_copy.begin(read_buffer_offset).copy(entry_hit_length, hit_bl);
+
+      ceph_assert(hit_bl.length() == entry_hit_length);
+
+      /* Add hit extent to read extents */
+      ImageExtentBuf hit_extent_buf(hit_extent, hit_bl);
+      read_ctx->read_extents.push_back(hit_extent_buf);
+
+      /* Exclude RWL hit range from buffer and extent */
+      extent_offset += entry_hit_length;
+      ldout(cct, 20) << map_entry << dendl;
+    }
+    /* If the last map entry didn't consume the entire image extent ... */
+    if (extent.second > extent_offset) {
+      /* ... add the rest of this extent to miss extents */
+      uint64_t miss_extent_start = extent.first + extent_offset;
+      uint64_t miss_extent_length = extent.second - extent_offset;
+      Extent miss_extent(miss_extent_start, miss_extent_length);
+      read_ctx->miss_extents.push_back(miss_extent);
+      /* Add miss range to read extents */
+      ImageExtentBuf miss_extent_buf(miss_extent);
+      read_ctx->read_extents.push_back(miss_extent_buf);
+      extent_offset += miss_extent_length;
+    }
+  }
+
+  ldout(cct, 20) << "miss_extents=" << read_ctx->miss_extents << ", "
+                 << "miss_bl=" << read_ctx->miss_bl << dendl;
+
+  if (read_ctx->miss_extents.empty()) {
+    /* All of this read comes from RWL */
+    read_ctx->complete(0);
+  } else {
+    /* Pass the read misses on to the layer below RWL */
+    m_image_writeback.aio_read(std::move(read_ctx->miss_extents), &read_ctx->miss_bl, fadvise_flags, read_ctx);
+  }
 }
 
 template <typename I>
@@ -2041,6 +2150,11 @@ void ReplicatedWriteLog<I>::internal_flush(Context *on_finish) {
   detain_guarded_request(nullptr, guarded_ctx, true);
 }
 
+template <typename I>
+void ReplicatedWriteLog<I>::add_into_log_map(GenericWriteLogEntries &log_entries) {
+  m_blocks_to_log_entries.add_log_entries(log_entries);
+}
+
 } // namespace cache
 } // namespace librbd
diff --git a/src/librbd/cache/ReplicatedWriteLog.h b/src/librbd/cache/ReplicatedWriteLog.h
index 88fa56e5a71a6..668289889dc4a 100644
--- a/src/librbd/cache/ReplicatedWriteLog.h
+++ b/src/librbd/cache/ReplicatedWriteLog.h
@@ -14,6 +14,7 @@
 #include "librbd/cache/Types.h"
 #include "librbd/cache/rwl/LogOperation.h"
 #include "librbd/cache/rwl/Request.h"
+#include "librbd/cache/rwl/LogMap.h"
 #include <functional>
 #include <list>
@@ -36,6 +37,8 @@ class GenericLogEntry;
 typedef std::list<std::shared_ptr<WriteLogEntry>> WriteLogEntries;
 typedef std::list<std::shared_ptr<GenericLogEntry>> GenericLogEntries;
 typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
+typedef LogMapEntries<GenericWriteLogEntry> WriteLogMapEntries;
+typedef LogMap<GenericWriteLogEntry> WriteLogMap;
 
 /**** Write log entries end ****/
 
@@ -125,6 +128,7 @@ public:
   uint32_t get_free_log_entries() {
     return m_free_log_entries;
   }
+  void add_into_log_map(rwl::GenericWriteLogEntries &log_entries);
 private:
   typedef std::list<rwl::C_WriteRequest<This> *> C_WriteRequests;
   typedef std::list<rwl::C_BlockIORequest<This> *> C_BlockIORequests;
@@ -220,6 +224,8 @@ private:
   rwl::GenericLogOperations m_ops_to_flush; /* Write ops needing flush in local log */
   rwl::GenericLogOperations m_ops_to_append; /* Write ops needing event append in local log */
 
+  rwl::WriteLogMap m_blocks_to_log_entries;
+
   /* New entries are at the back. Oldest at the front */
   rwl::GenericLogEntries m_log_entries;
   rwl::GenericLogEntries m_dirty_log_entries;
diff --git a/src/librbd/cache/rwl/LogEntry.h b/src/librbd/cache/rwl/LogEntry.h
index 471102f6c5e83..9e5d3c53403df 100644
--- a/src/librbd/cache/rwl/LogEntry.h
+++ b/src/librbd/cache/rwl/LogEntry.h
@@ -19,6 +19,8 @@ class SyncPointLogEntry;
 class GenericWriteLogEntry;
 class WriteLogEntry;
 
+typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
+
 class GenericLogEntry {
 public:
   WriteLogPmemEntry ram_entry;
@@ -106,6 +108,7 @@ public:
   std::shared_ptr<SyncPointLogEntry> get_sync_point_entry() override {
     return sync_point_entry;
   }
+  virtual void copy_pmem_bl(bufferlist *out_bl) = 0;
   std::ostream &format(std::ostream &os) const;
   friend std::ostream &operator<<(std::ostream &os,
                                   const GenericWriteLogEntry &entry);
@@ -151,7 +154,7 @@ public:
   /* Returns a ref to a bl containing bufferptrs to the entry pmem buffer */
   buffer::list &get_pmem_bl();
   /* Constructs a new bl containing copies of pmem_bp */
-  void copy_pmem_bl(bufferlist *out_bl);
+  void copy_pmem_bl(bufferlist *out_bl) override;
   void writeback(librbd::cache::ImageWritebackInterface &image_writeback,
                  Context *ctx) override;
   std::ostream &format(std::ostream &os) const;
diff --git a/src/librbd/cache/rwl/Request.cc b/src/librbd/cache/rwl/Request.cc
index 6978328940901..a19dfbba469cd 100644
--- a/src/librbd/cache/rwl/Request.cc
+++ b/src/librbd/cache/rwl/Request.cc
@@ -14,8 +14,6 @@ namespace librbd {
 namespace cache {
 namespace rwl {
 
-typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
-
 template <typename T>
 C_BlockIORequest<T>::C_BlockIORequest(T &rwl, const utime_t arrived, io::Extents &&extents,
                                       bufferlist&& bl, const int fadvise_flags, Context *user_req)
@@ -191,6 +189,7 @@ void C_WriteRequest<T>::setup_buffer_resources(
 
 template <typename T>
 void C_WriteRequest<T>::setup_log_operations(DeferredContexts &on_exit) {
+  GenericWriteLogEntries log_entries;
   {
     std::lock_guard locker(m_lock);
     std::shared_ptr<SyncPoint> current_sync_point = rwl.get_current_sync_point();
@@ -232,6 +231,7 @@ void C_WriteRequest<T>::setup_log_operations(DeferredContexts &on_exit) {
       /* A WS is also a write */
      ldout(rwl.get_context(), 20) << "write_req=" << *this << " op_set=" << op_set.get()
                                   << " operation=" << operation << dendl;
+      log_entries.emplace_back(operation->log_entry);
       rwl.inc_last_op_sequence_num();
       operation->init(true, allocation, current_sync_gen,
                       rwl.get_last_op_sequence_num(), this->bl, buffer_offset, op_set->persist_on_flush);
@@ -248,6 +248,7 @@ void C_WriteRequest<T>::setup_log_operations(DeferredContexts &on_exit) {
   for (auto &operation : op_set->operations) {
     operation->copy_bl_to_pmem_buffer();
   }
+  rwl.add_into_log_map(log_entries);
 }
 
 template <typename T>
diff --git a/src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc b/src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc
index d6b230fc6de59..d2a07b9298530 100644
--- a/src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc
+++ b/src/test/librbd/cache/test_mock_ReplicatedWriteLog.cc
@@ -45,6 +45,7 @@ inline ImageCtx *get_image_ctx(MockImageCtx *image_ctx) {
 #include "librbd/cache/rwl/Request.cc"
 #include "librbd/cache/rwl/Types.cc"
 #include "librbd/cache/rwl/LogOperation.cc"
+#include "librbd/cache/rwl/LogMap.cc"
 
 template class librbd::cache::ImageWriteback<librbd::MockImageCtx>;
 template class librbd::cache::rwl::ImageCacheState<librbd::MockImageCtx>;
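
Note (illustration only, not part of the patch): the extent-splitting that aio_read() performs above can be distilled into a small standalone C++ sketch. The names below (Extent modeled as a plain offset/length pair, SplitResult, split_read) are hypothetical and do not exist in librbd; the real code walks the WriteLogMapEntries returned by find_map_entries() under m_entry_reader_lock and records ImageExtentBuf objects on a C_ReadRequest, but the hit/miss arithmetic is the same idea.

// Sketch of splitting one read extent into cache hits and misses,
// assuming the cached extents are sorted by offset and non-overlapping
// (which the write log map guarantees for the entries it returns).
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

using Extent = std::pair<uint64_t, uint64_t>;  // {image offset, length}

struct SplitResult {
  std::vector<Extent> hits;    // ranges served from the write log
  std::vector<Extent> misses;  // ranges read from the layer below
};

SplitResult split_read(const Extent &read, const std::vector<Extent> &cached) {
  SplitResult out;
  uint64_t pos = read.first;
  const uint64_t end = read.first + read.second;
  for (const auto &entry : cached) {
    uint64_t entry_end = entry.first + entry.second;
    if (entry_end <= pos || entry.first >= end) {
      continue;  // no overlap with this read
    }
    if (entry.first > pos) {
      out.misses.emplace_back(pos, entry.first - pos);  // gap before the hit
    }
    uint64_t hit_start = std::max(pos, entry.first);
    uint64_t hit_len = std::min(end, entry_end) - hit_start;
    out.hits.emplace_back(hit_start, hit_len);
    pos = hit_start + hit_len;
  }
  if (pos < end) {
    out.misses.emplace_back(pos, end - pos);  // tail not covered by any entry
  }
  return out;
}

int main() {
  // Read 0~4096 with cached entries 512~512 and 2048~1024 (offset~length).
  SplitResult r = split_read({0, 4096}, {{512, 512}, {2048, 1024}});
  for (const auto &h : r.hits)   std::cout << "hit  " << h.first << "~" << h.second << "\n";
  for (const auto &m : r.misses) std::cout << "miss " << m.first << "~" << m.second << "\n";
  // Expected: hits 512~512, 2048~1024; misses 0~512, 1024~1024, 3072~1024.
}

Keeping hits and misses in image-extent order is what lets the completion path later splice the cached buffers into the bufferlist returned by the lower layer at the right offsets.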