]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
LibrbdWriteback: complete writes strictly in order 214/head
author Josh Durgin <josh.durgin@inktank.com>
Wed, 10 Apr 2013 21:16:56 +0000 (14:16 -0700)
committer Josh Durgin <josh.durgin@inktank.com>
Wed, 10 Apr 2013 23:57:08 +0000 (16:57 -0700)
RADOS returns writes to the same object in the same order. The
ObjectCacher relies on this assumption to make sure previous writes
are complete and maintain consistency. Reads, however, may be
reordered with respect to each other. When writing to an rbd clone,
reads to the parent must be performed when the object does not exist
in the child yet. These reads may be reordered, resulting in the
original writes being reordered. This breaks the assumptions of the
ObjectCacher, causing an assert to fail.

To fix this, keep a per-object queue of outstanding writes to an
object in the LibrbdWriteback handler, and finish them in the order in
which they were sent.

Fixes: #4531
Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
src/librbd/LibrbdWriteback.cc
src/librbd/LibrbdWriteback.h

index 8c0de81078aff1c66fc2a66363d4b18664dd3e82..237901dc61fed3303b74bb0428bc935ec94cd426 100644 (file)
@@ -62,6 +62,29 @@ namespace librbd {
     Mutex *m_lock;
   };
 
+  class C_OrderedWrite : public Context {  // Completion context for one write: records its result, then drains that object's queue in order.
+  public:
+    C_OrderedWrite(CephContext *cct, LibrbdWriteback::write_result_d *result,
+                  LibrbdWriteback *wb)
+      : m_cct(cct), m_result(result), m_wb_handler(wb) {}  // stores the three pointers as given; nothing is copied here
+    virtual ~C_OrderedWrite() {}
+    virtual void finish(int r) {  // r is recorded as this write's return code
+      ldout(m_cct, 20) << "C_OrderedWrite completing " << m_result << dendl;
+      {
+       Mutex::Locker l(m_wb_handler->m_lock);  // complete_writes() asserts that this lock is held
+       assert(!m_result->done);  // a result is expected to be marked done exactly once
+       m_result->done = true;
+       m_result->ret = r;
+       m_wb_handler->complete_writes(m_result->oid);  // deletes every finished result at the head of the queue, possibly including m_result
+      }
+      ldout(m_cct, 20) << "C_OrderedWrite finished " << m_result << dendl;  // streams only the pointer value; m_result is not dereferenced after the block above
+    }
+  private:
+    CephContext *m_cct;  // used for ldout logging
+    LibrbdWriteback::write_result_d *m_result;  // queue entry this completion fills in
+    LibrbdWriteback *m_wb_handler;  // handler whose m_lock and complete_writes() are used
+  };
+
   LibrbdWriteback::LibrbdWriteback(ImageCtx *ictx, Mutex& lock)
     : m_tid(0), m_lock(lock), m_ictx(ictx)
   {
@@ -130,8 +153,10 @@ namespace librbd {
                          object_no, 0, m_ictx->layout.fl_object_size,
                          objectx);
     uint64_t object_overlap = m_ictx->prune_parent_extents(objectx, overlap);
-
-    C_Request *req_comp = new C_Request(m_ictx->cct, oncommit, &m_lock);
+    write_result_d *result = new write_result_d(oid.name, oncommit);
+    m_writes[oid.name].push(result);
+    ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl;
+    C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, this);
     AioWrite *req = new AioWrite(m_ictx, oid.name,
                                 object_no, off, objectx, object_overlap,
                                 bl, snapc, snap_id,
@@ -139,4 +164,32 @@ namespace librbd {
     req->send();
     return ++m_tid;
   }
+
+  void LibrbdWriteback::complete_writes(const std::string& oid)  // Completes, in queue order, the leading run of finished writes queued for oid.
+  {
+    assert(m_lock.is_locked());  // caller must already hold m_lock
+    std::queue<write_result_d*>& results = m_writes[oid];  // reference into m_writes; must not be used after the erase below
+    ldout(m_ictx->cct, 20) << "complete_writes() oid " << oid << dendl;
+    std::list<write_result_d*> finished;  // results popped from the queue, completed after the queue bookkeeping is done
+
+    while (!results.empty()) {
+      write_result_d *result = results.front();
+      if (!result->done)  // stop at the first write still outstanding so later finished writes wait behind it
+       break;
+      finished.push_back(result);
+      results.pop();
+    }
+
+    if (results.empty())  // nothing left pending for this object: drop its map entry
+      m_writes.erase(oid);
+
+    for (std::list<write_result_d*>::iterator it = finished.begin();
+        it != finished.end(); ++it) {
+      write_result_d *result = *it;
+      ldout(m_ictx->cct, 20) << "complete_writes() completing " << result
+                            << dendl;
+      result->oncommit->complete(result->ret);  // hand the recorded return code to the caller's commit callback
+      delete result;  // the result is freed here, so no pointer to it may be dereferenced afterwards
+    }
+  }
 }
index 6466a23ce98c9098e6003daf596280ac7b1053ba..ba8ff1f114d17150a6a0bff13070d3d7722b6047 100644 (file)
@@ -3,6 +3,8 @@
 #ifndef CEPH_LIBRBD_LIBRBDWRITEBACKHANDLER_H
 #define CEPH_LIBRBD_LIBRBDWRITEBACKHANDLER_H
 
+#include <queue>
+
 #include "include/Context.h"
 #include "include/types.h"
 #include "include/rados/librados.hpp"
@@ -35,10 +37,26 @@ namespace librbd {
                        const bufferlist &bl, utime_t mtime, uint64_t trunc_size,
                        __u32 trunc_seq, Context *oncommit);
 
+    struct write_result_d {  // Bookkeeping record for one outstanding write to a single object.
+      bool done;  // false at construction; set to true by C_OrderedWrite::finish()
+      int ret;  // return code recorded by the completion; 0 at construction
+      std::string oid;  // object name, used as the key into m_writes
+      Context *oncommit;  // callback completed with ret by complete_writes()
+      write_result_d(const std::string& oid, Context *oncommit) :
+       done(false), ret(0), oid(oid), oncommit(oncommit) {}
+    private:
+      write_result_d(const write_result_d& rhs);  // private with no body visible here; presumably to forbid copying
+      const write_result_d& operator=(const write_result_d& rhs);  // same for assignment
+    };
+
   private:
+    void complete_writes(const std::string& oid);
+
     tid_t m_tid;
     Mutex& m_lock;
     librbd::ImageCtx *m_ictx;
+    hash_map<std::string, std::queue<write_result_d*> > m_writes;
+    friend class C_OrderedWrite;
   };
 }