From c5b13b73c596c0e0219ef087f8cc164c0ceff6f0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 21 May 2021 15:27:31 +0200 Subject: [PATCH] librbd/cache/pwl/ssd: avoid corrupting first_free_entry In append_ops(), new_first_free_entry is assigned to after aio_submit() is called. This can result in accessing uninitialized or freed memory because all I/Os may complete and append_ctx callback may run before the assignment is executed. Garbage value gets written to first_free_entry and we eventually crash, most likely in bufferlist manipulation code. But worse, the corrupted first_free_entry makes it to media almost all the time. The result is a corrupted cache -- dirty user data is lost. Fixes: https://tracker.ceph.com/issues/50832 Signed-off-by: Ilya Dryomov (cherry picked from commit ef381d993ce29c5d0d774a6af27c3af861392ca1) --- src/librbd/cache/pwl/ssd/WriteLog.cc | 75 ++++++++++++---------------- src/librbd/cache/pwl/ssd/WriteLog.h | 2 +- 2 files changed, 32 insertions(+), 45 deletions(-) diff --git a/src/librbd/cache/pwl/ssd/WriteLog.cc b/src/librbd/cache/pwl/ssd/WriteLog.cc index b37fd52588612..34a724b62ed9c 100644 --- a/src/librbd/cache/pwl/ssd/WriteLog.cc +++ b/src/librbd/cache/pwl/ssd/WriteLog.cc @@ -818,6 +818,7 @@ void WriteLog::append_ops(GenericLogOperations &ops, Context *ctx, uint64_t bytes_to_free = 0; ldout(cct, 20) << "Appending " << ops.size() << " log entries." << dendl; + *new_first_free_entry = pool_root.first_free_entry; AioTransContext* aio = new AioTransContext(cct, ctx); utime_t now = ceph_clock_now(); @@ -830,7 +831,7 @@ void WriteLog::append_ops(GenericLogOperations &ops, Context *ctx, if (log_entries.size() > 1) { bytes_to_free += (log_entries.size() - 1) * MIN_WRITE_ALLOC_SSD_SIZE; } - write_log_entries(log_entries, aio); + write_log_entries(log_entries, aio, new_first_free_entry); log_entries.clear(); span_payload_len = 0; } @@ -841,41 +842,32 @@ void WriteLog::append_ops(GenericLogOperations &ops, Context *ctx, if (log_entries.size() > 1) { bytes_to_free += (log_entries.size() - 1) * MIN_WRITE_ALLOC_SSD_SIZE; } - write_log_entries(log_entries, aio); - } - - { - std::lock_guard locker1(m_lock); - m_first_free_entry = *new_first_free_entry; - m_bytes_allocated -= bytes_to_free; - } - - { - std::lock_guard locker1(m_lock); - m_first_free_entry = *new_first_free_entry; - m_bytes_allocated -= bytes_to_free; + write_log_entries(log_entries, aio, new_first_free_entry); } bdev->aio_submit(&aio->ioc); - *new_first_free_entry = pool_root.first_free_entry; } template void WriteLog::write_log_entries(GenericLogEntriesVector log_entries, - AioTransContext *aio) { + AioTransContext *aio, uint64_t *pos) { CephContext *cct = m_image_ctx.cct; - ldout(m_image_ctx.cct, 20) << dendl; - bufferlist data_bl; + ldout(m_image_ctx.cct, 20) << "pos=" << *pos << dendl; + ceph_assert(*pos >= DATA_RING_BUFFER_OFFSET && + *pos < this->m_log_pool_size && + *pos % MIN_WRITE_ALLOC_SSD_SIZE == 0); + // The first block is for log entries - uint64_t data_pos = pool_root.first_free_entry + MIN_WRITE_ALLOC_SSD_SIZE; - ldout(m_image_ctx.cct, 20) << "data_pos: " << data_pos << dendl; - if (data_pos == pool_root.pool_size) { - data_pos = data_pos % pool_root.pool_size + DATA_RING_BUFFER_OFFSET; + uint64_t control_block_pos = *pos; + *pos += MIN_WRITE_ALLOC_SSD_SIZE; + if (*pos == this->m_log_pool_size) { + *pos = DATA_RING_BUFFER_OFFSET; } std::vector persist_log_entries; + bufferlist data_bl; for (auto &log_entry : log_entries) { - log_entry->log_entry_index = pool_root.first_free_entry; + log_entry->log_entry_index = control_block_pos; // Append data buffer for write operations if (log_entry->is_write_entry()) { auto write_entry = static_pointer_cast(log_entry); @@ -884,10 +876,10 @@ void WriteLog::write_log_entries(GenericLogEntriesVector log_entries, data_bl.append(cache_bl); data_bl.append_zero(align_size - cache_bl.length()); - write_entry->ram_entry.write_data_pos = data_pos; - data_pos += align_size; - if (data_pos >= pool_root.pool_size) { - data_pos = data_pos % pool_root.pool_size + DATA_RING_BUFFER_OFFSET; + write_entry->ram_entry.write_data_pos = *pos; + *pos += align_size; + if (*pos >= this->m_log_pool_size) { + *pos = *pos % this->m_log_pool_size + DATA_RING_BUFFER_OFFSET; } } // push_back _after_ setting write_data_pos @@ -901,33 +893,28 @@ void WriteLog::write_log_entries(GenericLogEntriesVector log_entries, bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - bl.length()); bl.append(data_bl); ceph_assert(bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0); - if (pool_root.first_free_entry + bl.length() > pool_root.pool_size) { + if (control_block_pos + bl.length() > this->m_log_pool_size) { //exceeds border, need to split uint64_t size = bl.length(); - auto end = pool_root.pool_size - pool_root.first_free_entry; bufferlist bl1; - bl.splice(0, end, &bl1); + bl.splice(0, this->m_log_pool_size - control_block_pos, &bl1); ceph_assert(bl.length() == (size - bl1.length())); - ldout(cct, 20) << "The write on " << pool_root.first_free_entry - << " with length " << size << " is split into two: " - << "pos=" << pool_root.first_free_entry << ", " - << "length=" << bl1.length() << "; " - << "pos=" << DATA_RING_BUFFER_OFFSET << ", " - << "length=" << bl.length() << dendl; - - bdev->aio_write(pool_root.first_free_entry, bl1, &aio->ioc, false, + + ldout(cct, 20) << "write " << control_block_pos << "~" + << size << " spans boundary, split into " + << control_block_pos << "~" << bl1.length() + << " and " << DATA_RING_BUFFER_OFFSET << "~" + << bl.length() << dendl; + bdev->aio_write(control_block_pos, bl1, &aio->ioc, false, WRITE_LIFE_NOT_SET); bdev->aio_write(DATA_RING_BUFFER_OFFSET, bl, &aio->ioc, false, WRITE_LIFE_NOT_SET); } else { - ldout(cct, 20) << "first_free_entry: " << pool_root.first_free_entry - << " bl length: " << bl.length() << dendl; - bdev->aio_write(pool_root.first_free_entry, bl, &aio->ioc, false, + ldout(cct, 20) << "write " << control_block_pos << "~" + << bl.length() << dendl; + bdev->aio_write(control_block_pos, bl, &aio->ioc, false, WRITE_LIFE_NOT_SET); - ldout(cct, 20) << "finished aio_write log entries" << dendl; } - // New first free entry - pool_root.first_free_entry = data_pos; } template diff --git a/src/librbd/cache/pwl/ssd/WriteLog.h b/src/librbd/cache/pwl/ssd/WriteLog.h index 5a0b2305d98bc..69cc366628f24 100644 --- a/src/librbd/cache/pwl/ssd/WriteLog.h +++ b/src/librbd/cache/pwl/ssd/WriteLog.h @@ -128,7 +128,7 @@ private: void append_ops(GenericLogOperations &ops, Context *ctx, uint64_t* new_first_free_entry); void write_log_entries(GenericLogEntriesVector log_entries, - AioTransContext *aio); + AioTransContext *aio, uint64_t *pos); void schedule_update_root(std::shared_ptr root, Context *ctx); void enlist_op_update_root(); -- 2.39.5