From 06d05e5ed7e09fa873cc05021d16f21317a1f8ef Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Wed, 10 Apr 2013 14:16:56 -0700 Subject: [PATCH] LibrbdWriteback: complete writes strictly in order RADOS returns writes to the same object in the same order. The ObjectCacher relies on this assumption to make sure previous writes are complete and maintain consistency. Reads, however, may be reordered with respect to each other. When writing to an rbd clone, reads to the parent must be performed when the object does not exist in the child yet. These reads may be reordered, resulting in the original writes being reordered. This breaks the assumptions of the ObjectCacher, causing an assert to fail. To fix this, keep a per-object queue of outstanding writes to an object in the LibrbdWriteback handler, and finish them in the order in which they were sent. Fixes: #4531 Signed-off-by: Josh Durgin --- src/librbd/LibrbdWriteback.cc | 57 +++++++++++++++++++++++++++++++++-- src/librbd/LibrbdWriteback.h | 18 +++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/src/librbd/LibrbdWriteback.cc b/src/librbd/LibrbdWriteback.cc index 8c0de81078aff..237901dc61fed 100644 --- a/src/librbd/LibrbdWriteback.cc +++ b/src/librbd/LibrbdWriteback.cc @@ -62,6 +62,29 @@ namespace librbd { Mutex *m_lock; }; + class C_OrderedWrite : public Context { + public: + C_OrderedWrite(CephContext *cct, LibrbdWriteback::write_result_d *result, + LibrbdWriteback *wb) + : m_cct(cct), m_result(result), m_wb_handler(wb) {} + virtual ~C_OrderedWrite() {} + virtual void finish(int r) { + ldout(m_cct, 20) << "C_OrderedWrite completing " << m_result << dendl; + { + Mutex::Locker l(m_wb_handler->m_lock); + assert(!m_result->done); + m_result->done = true; + m_result->ret = r; + m_wb_handler->complete_writes(m_result->oid); + } + ldout(m_cct, 20) << "C_OrderedWrite finished " << m_result << dendl; + } + private: + CephContext *m_cct; + LibrbdWriteback::write_result_d *m_result; + LibrbdWriteback 
*m_wb_handler; + }; + LibrbdWriteback::LibrbdWriteback(ImageCtx *ictx, Mutex& lock) : m_tid(0), m_lock(lock), m_ictx(ictx) { @@ -130,8 +153,10 @@ namespace librbd { object_no, 0, m_ictx->layout.fl_object_size, objectx); uint64_t object_overlap = m_ictx->prune_parent_extents(objectx, overlap); - - C_Request *req_comp = new C_Request(m_ictx->cct, oncommit, &m_lock); + write_result_d *result = new write_result_d(oid.name, oncommit); + m_writes[oid.name].push(result); + ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl; + C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, this); AioWrite *req = new AioWrite(m_ictx, oid.name, object_no, off, objectx, object_overlap, bl, snapc, snap_id, @@ -139,4 +164,32 @@ namespace librbd { req->send(); return ++m_tid; } + + void LibrbdWriteback::complete_writes(const std::string& oid) + { + assert(m_lock.is_locked()); + std::queue<write_result_d*>& results = m_writes[oid]; + ldout(m_ictx->cct, 20) << "complete_writes() oid " << oid << dendl; + std::list<write_result_d*> finished; + + while (!results.empty()) { + write_result_d *result = results.front(); + if (!result->done) + break; + finished.push_back(result); + results.pop(); + } + + if (results.empty()) + m_writes.erase(oid); + + for (std::list<write_result_d*>::iterator it = finished.begin(); + it != finished.end(); ++it) { + write_result_d *result = *it; + ldout(m_ictx->cct, 20) << "complete_writes() completing " << result + << dendl; + result->oncommit->complete(result->ret); + delete result; + } + } } diff --git a/src/librbd/LibrbdWriteback.h b/src/librbd/LibrbdWriteback.h index 6466a23ce98c9..ba8ff1f114d17 100644 --- a/src/librbd/LibrbdWriteback.h +++ b/src/librbd/LibrbdWriteback.h @@ -3,6 +3,8 @@ #ifndef CEPH_LIBRBD_LIBRBDWRITEBACKHANDLER_H #define CEPH_LIBRBD_LIBRBDWRITEBACKHANDLER_H +#include <queue> + #include "include/Context.h" #include "include/types.h" #include "include/rados/librados.hpp" @@ -35,10 +37,26 @@ namespace librbd { const bufferlist &bl, utime_t mtime, uint64_t trunc_size, 
__u32 trunc_seq, Context *oncommit); + struct write_result_d { + bool done; + int ret; + std::string oid; + Context *oncommit; + write_result_d(const std::string& oid, Context *oncommit) : + done(false), ret(0), oid(oid), oncommit(oncommit) {} + private: + write_result_d(const write_result_d& rhs); + const write_result_d& operator=(const write_result_d& rhs); + }; + private: + void complete_writes(const std::string& oid); + tid_t m_tid; Mutex& m_lock; librbd::ImageCtx *m_ictx; + hash_map<std::string, std::queue<write_result_d*> > m_writes; + friend class C_OrderedWrite; }; } -- 2.39.5