]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd: copy-on-read
authorCheng Cheng <ccheng.leo@gmail.com>
Wed, 14 Jan 2015 15:52:46 +0000 (10:52 -0500)
committerJason Dillaman <dillaman@redhat.com>
Tue, 20 Jan 2015 15:11:39 +0000 (10:11 -0500)
  Asynchronously perform copyup on read path.
  Synchronously perform copyup on write path.

Signed-off-by: Cheng Cheng <ccheng.leo@gmail.com>
src/common/config_opts.h
src/include/xlist.h
src/librbd/AioRequest.cc
src/librbd/AioRequest.h
src/librbd/CopyupRequest.cc [new file with mode: 0644]
src/librbd/CopyupRequest.h [new file with mode: 0644]
src/librbd/ImageCtx.cc
src/librbd/ImageCtx.h
src/librbd/Makefile.am
src/librbd/internal.cc
src/test/librbd/test_librbd.cc

index 622e9f4bd7c22f3cc0f8f8554c50c4ee0c7b5aa7..0885130327c9da64da02084aeb188694ab279afc 100644 (file)
@@ -850,7 +850,7 @@ OPTION(rbd_localize_parent_reads, OPT_BOOL, true)
 OPTION(rbd_readahead_trigger_requests, OPT_INT, 10) // number of sequential requests necessary to trigger readahead
 OPTION(rbd_readahead_max_bytes, OPT_LONGLONG, 512 * 1024) // set to 0 to disable readahead
 OPTION(rbd_readahead_disable_after_bytes, OPT_LONGLONG, 50 * 1024 * 1024) // how many bytes are read in total before readahead is disabled
-OPTION(rbd_clone_copy_on_read, OPT_BOOL, false)//copy-on-read option for rbd clones
+OPTION(rbd_clone_copy_on_read, OPT_BOOL, false)
 
 /*
  * The following options change the behavior for librbd's image creation methods that
index 7fdbe355959253e652f696f72a9d552c7a64fc9d..53aa3ab06fc5195dbd76c0d5636619f5f81b6ef1 100644 (file)
@@ -160,7 +160,6 @@ public:
       return *this;
     }
     bool end() const { return cur == 0; }
-    item *get_cur() const { return cur; }
   };
 
   iterator begin() { return iterator(_front); }
index 9591eb34d083d38710ba8821c72552f007b5a1cf..dc61c00f591079471bcbecdf2cdd411987176d0b 100644 (file)
@@ -11,6 +11,7 @@
 #include "librbd/internal.h"
 
 #include "librbd/AioRequest.h"
+#include "librbd/CopyupRequest.h"
 
 #define dout_subsys ceph_subsys_rbd
 #undef dout_prefix
@@ -22,26 +23,26 @@ namespace librbd {
     m_ictx(NULL), m_ioctx(NULL),
     m_object_no(0), m_object_off(0), m_object_len(0),
     m_snap_id(CEPH_NOSNAP), m_completion(NULL), m_parent_completion(NULL),
-    m_hide_enoent(false), m_parent_completion_cor(NULL){}
+    m_hide_enoent(false), m_entire_object(NULL) {}
   AioRequest::AioRequest(ImageCtx *ictx, const std::string &oid,
                         uint64_t objectno, uint64_t off, uint64_t len,
-                        librados::snap_t snap_id,
+                        const ::SnapContext &snapc, librados::snap_t snap_id,
                         Context *completion,
                         bool hide_enoent) :
     m_ictx(ictx), m_ioctx(&ictx->data_ctx), m_oid(oid), m_object_no(objectno),
     m_object_off(off), m_object_len(len), m_snap_id(snap_id),
     m_completion(completion), m_parent_completion(NULL),
-    m_hide_enoent(hide_enoent), m_parent_completion_cor(NULL) {}
+    m_hide_enoent(hide_enoent), m_entire_object(NULL) {
+    for (std::vector<snapid_t>::const_iterator it = snapc.snaps.begin();
+         it != snapc.snaps.end(); ++it)
+      m_snaps.push_back(it->val);
+  }
 
   AioRequest::~AioRequest() {
     if (m_parent_completion) {
       m_parent_completion->release();
       m_parent_completion = NULL;
     }
-    if (m_parent_completion_cor) {
-      m_parent_completion_cor->release();
-      m_parent_completion_cor = NULL;
-    }
   }
 
   void AioRequest::read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents)
@@ -56,115 +57,160 @@ namespace librbd {
             m_parent_completion, 0);
   }
 
-  //copy-on-read : read the entire object from parent, using bufferlist m_entire_object
-  void AioRequest::read_from_parent_cor(vector<pair<uint64_t,uint64_t> >& image_extents)
-  {
-    assert(!m_parent_completion_cor);
-    m_parent_completion_cor = aio_create_completion_internal(this, rbd_req_cb);
-    ldout(m_ictx->cct, 20) << "read_from_parent_cor this = " << this
-                          << " parent completion cor " << m_parent_completion_cor
-                          << " extents " << image_extents
-                          << dendl;
-    aio_read(m_ictx->parent, image_extents, NULL, &m_entire_object,
-            m_parent_completion_cor);
+  static inline bool is_cor(ImageCtx *ictx, librados::snap_t snap_id) {
+     return (ictx->cct->_conf->rbd_clone_copy_on_read) &&
+             (!ictx->read_only) && (snap_id == CEPH_NOSNAP);
   }
 
   /** read **/
 
-  //copy-on-read: after read entire object, just write it into child
-  ssize_t AioRead::write_cor()
+  void AioRead::guard_read()
   {
-    ldout(m_ictx->cct, 20) << "write_cor" << dendl;
-    int ret = 0;
-
-    m_ictx->snap_lock.get_read();
-    ::SnapContext snapc = m_ictx->snapc;
-    m_ictx->snap_lock.put_read();
+    RWLock::RLocker l(m_ictx->snap_lock);
+    RWLock::RLocker l2(m_ictx->parent_lock);
 
-    librados::ObjectWriteOperation copyup_cor;
-    copyup_cor.exec("rbd", "copyup", m_entire_object);
+    vector<pair<uint64_t,uint64_t> > image_extents;
+    Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
+                            m_object_no, 0, m_ictx->layout.fl_object_size,
+                            image_extents);
 
-    std::vector<librados::snap_t> m_snaps;
-    for (std::vector<snapid_t>::const_iterator it = snapc.snaps.begin();
-                it != snapc.snaps.end(); ++it) {
-      m_snaps.push_back(it->val);
+    uint64_t image_overlap = 0;
+    m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
+    uint64_t object_overlap =
+      m_ictx->prune_parent_extents(image_extents, image_overlap);
+    if (object_overlap) {
+      ldout(m_ictx->cct, 20) << __func__ << " guarding read" << dendl;
+      m_state = LIBRBD_AIO_READ_GUARD;
     }
-
-    librados::AioCompletion *cor_completion =
-        librados::Rados::aio_create_completion(m_ictx, librbd::cor_completion_callback, NULL);
-
-    xlist<librados::AioCompletion *>::item *comp =
-       new xlist<librados::AioCompletion *>::item(cor_completion);
-
-    m_ictx->add_cor_completion(comp);//add cor_completion to xlist
-    //asynchronously write object
-    ret = m_ictx->md_ctx.aio_operate(m_oid, cor_completion, &copyup_cor, snapc.seq.val, m_snaps);
-
-    return ret;
   }
 
   bool AioRead::should_complete(int r)
   {
-    ldout(m_ictx->cct, 20) << "should_complete " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len
-                          << " r = " << r << dendl;
-
-    //get copy-on-read option and check image if read_only
-    bool cor = (m_ictx->cct->_conf->rbd_clone_copy_on_read) && (!m_ictx->read_only);
-    ldout(m_ictx->cct, 20) << "should_complete cor = " << cor << " read_only = " << m_ictx->read_only << dendl;
+    ldout(m_ictx->cct, 20) << "AioRead::should_complete " << this
+                           << " " << m_oid << " " << m_object_off
+                           << "~" << m_object_len << " r = " << r
+                           << " cor = " << is_cor(m_ictx, m_snap_id)
+                           << " readonly = " << m_ictx->read_only
+                           << dendl;
 
-    if (!m_tried_parent && r == -ENOENT) {
-      RWLock::RLocker l(m_ictx->snap_lock);
-      RWLock::RLocker l2(m_ictx->parent_lock);
+    bool finished = true;
 
-      // calculate reverse mapping onto the image
-      vector<pair<uint64_t,uint64_t> > image_extents;
-      Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
-                           m_object_no, m_object_off, m_object_len,
-                           image_extents);
+    switch (m_state) {
+    case LIBRBD_AIO_READ_GUARD:
+      ldout(m_ictx->cct, 20) << "should_complete " << this
+                             << " READ_CHECK_GUARD" << dendl;
 
-      uint64_t image_overlap = 0;
-      r = m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
-      if (r < 0) {
-       assert(0 == "FIXME");
-      }
-      uint64_t object_overlap = m_ictx->prune_parent_extents(image_extents, image_overlap);
-      if (object_overlap) {
-       m_tried_parent = true;
-       if (cor) {//copy-on-read option  
-           vector<pair<uint64_t,uint64_t> > extend_image_extents;
-           //extend range to entire object
-           Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
-                           m_object_no, 0, m_ictx->layout.fl_object_size,
-                           extend_image_extents);
-           //read entire object from parent , and put it in m_entire_object
-           read_from_parent_cor(extend_image_extents);
-       } else {
-           read_from_parent(image_extents);
-       }
-       return false;
-      }
-    }
+      // This is the step to read from parent
+      if (!m_tried_parent && r == -ENOENT) {
+        RWLock::RLocker l(m_ictx->snap_lock);
+        RWLock::RLocker l2(m_ictx->parent_lock);
 
-    if (cor) {//copy-on-read option
-      //if read entire object from parent success
-      if (m_tried_parent && r > 0) {
+        // calculate reverse mapping onto the image
         vector<pair<uint64_t,uint64_t> > image_extents;
         Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
-                            m_object_no, m_object_off, m_object_len,
-                            image_extents);
+                               m_object_no, m_object_off, m_object_len,
+                               image_extents);
+
         uint64_t image_overlap = 0;
-        int r = m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
+        r = m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
         if (r < 0) {
           assert(0 == "FIXME");
         }
-       m_ictx->prune_parent_extents(image_extents, image_overlap);
-       // copy the read range to m_read_data
-       m_read_data.substr_of(m_entire_object, m_object_off, m_object_len);
-       write_cor();
+        uint64_t object_overlap = m_ictx->prune_parent_extents(image_extents, image_overlap);
+        if (object_overlap) {
+          m_tried_parent = true;
+
+          if (is_cor(m_ictx, m_snap_id)) {
+            // If there is a valid object currently being copied up, directly
+            // extract its content and finish up.
+            Mutex::Locker l3(m_ictx->copyup_list_lock);
+
+            map<uint64_t, CopyupRequest*>::iterator it =
+              m_ictx->copyup_list.find(m_object_no);
+            if (it != m_ictx->copyup_list.end()) {
+              Mutex::Locker l4(it->second->get_lock());
+              if (it->second->is_ready()) {
+                ceph::bufferlist &copyup_data = it->second->get_copyup_data();
+                m_read_data.substr_of(copyup_data, m_object_off, m_object_len);
+                ldout(m_ictx->cct, 20) << __func__ << " extract content from copyup_list, obj-"
+                                       << m_object_no << dendl;
+                finished = true;
+                break;
+              }
+            }
+          }
+
+          read_from_parent(image_extents);
+
+          if (is_cor(m_ictx, m_snap_id))
+            m_state = LIBRBD_AIO_READ_COPYUP; 
+          else 
+            m_state = LIBRBD_AIO_READ_GUARD;
+
+          finished = false;
+        }
       }
+      break;
+    case LIBRBD_AIO_READ_COPYUP:
+      ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_COPYUP" << dendl;
+      // This is the extra step for copy-on-read: kick off an asynchronous copyup.
+      // It is different from copy-on-write as asynchronous copyup will finish
+      // by itself so state won't go back to LIBRBD_AIO_READ_GUARD.
+
+      if (m_tried_parent && r > 0) {
+        // If reading the entire object from the parent succeeded and CoR is
+        // possible, kick off an asynchronous copyup. This approach minimizes
+        // the latency impact.
+        m_ictx->copyup_list_lock.Lock();
+        map<uint64_t, CopyupRequest*>::iterator it =
+          m_ictx->copyup_list.find(m_object_no);
+        if (it == m_ictx->copyup_list.end()) {
+          // create and kick off a CopyupRequest
+          CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
+                                                     m_object_no, true);
+          pair<uint64_t, CopyupRequest*> new_slot =
+            pair<uint64_t, CopyupRequest*>(m_object_no, new_req);
+          m_ictx->copyup_list.insert(new_slot);
+          m_ictx->copyup_list_lock.Unlock();
+
+          RWLock::RLocker l(m_ictx->snap_lock);
+          RWLock::RLocker l2(m_ictx->parent_lock);
+
+          vector<pair<uint64_t,uint64_t> > extend_image_extents;
+          //extend range to entire object
+          Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
+                                  0, m_ictx->layout.fl_object_size,
+                                  extend_image_extents);
+          uint64_t image_overlap = 0;
+          r = m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
+          if (r < 0) {
+            assert(0 == "FIXME");
+          }
+
+          m_ictx->prune_parent_extents(extend_image_extents, image_overlap);
+          m_ictx->copyup_list[m_object_no]->read_from_parent(extend_image_extents);
+        } else {
+          m_ictx->copyup_list_lock.Unlock();
+        }
+
+        finished = true;
+      }
+
+      if (r < 0) {
+        ldout(m_ictx->cct, 20) << "error checking for object existence" << dendl;
+      }
+
+      break;
+    case LIBRBD_AIO_READ_FLAT:
+      ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_FLAT" << dendl;
+      // The read content should already be deposited in m_read_data
+      break;
+    default:
+      lderr(m_ictx->cct) << "invalid request state: " << m_state << dendl;
+      assert(0);
     }
 
-    return true;
+    return finished;
   }
 
   int AioRead::send() {
@@ -202,18 +248,12 @@ namespace librbd {
                               const ::SnapContext &snapc, librados::snap_t snap_id,
                               Context *completion,
                               bool hide_enoent)
-    : AioRequest(ictx, oid, object_no, object_off, len, snap_id, completion,
-                hide_enoent),
+    : AioRequest(ictx, oid, object_no, object_off, len, snapc, snap_id, 
+                 completion, hide_enoent),
       m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val)
   {
     m_object_image_extents = objectx;
     m_parent_overlap = object_overlap;
-
-    // TODO: find a way to make this less stupid
-    for (std::vector<snapid_t>::const_iterator it = snapc.snaps.begin();
-        it != snapc.snaps.end(); ++it) {
-      m_snaps.push_back(it->val);
-    }
   }
 
   void AbstractWrite::guard_write()
@@ -230,6 +270,7 @@ namespace librbd {
     ldout(m_ictx->cct, 20) << "write " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len
                           << " should_complete: r = " << r << dendl;
 
+    map<uint64_t, CopyupRequest*>::iterator it;
     bool finished = true;
     switch (m_state) {
     case LIBRBD_AIO_WRITE_GUARD:
@@ -269,7 +310,48 @@ namespace librbd {
                                 << m_object_image_extents << dendl;
 
          m_state = LIBRBD_AIO_WRITE_COPYUP;
-         read_from_parent(m_object_image_extents);
+
+          if (is_cor(m_ictx, m_snap_id)) {
+            m_ictx->copyup_list_lock.Lock();
+            it = m_ictx->copyup_list.find(m_object_no);
+            if (it == m_ictx->copyup_list.end()) {
+              // If it is not in the list, create a CopyupRequest and wait for it.
+              CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
+                                                         m_object_no, false);
+              // make sure to wait on this CopyupRequest
+              new_req->append_request(this);
+              pair<uint64_t, CopyupRequest*> new_slot =
+                pair<uint64_t, CopyupRequest*>(m_object_no, new_req);
+              m_ictx->copyup_list.insert(new_slot);
+              m_ictx->copyup_list_lock.Unlock();
+
+              m_entire_object = &(m_ictx->copyup_list[m_object_no]->get_copyup_data());
+              ldout(m_ictx->cct, 20) << __func__ << " creating new Copyup for AioWrite, obj-"
+                                     << m_object_no << dendl;
+              m_ictx->copyup_list[m_object_no]->read_from_parent(m_object_image_extents);
+              ldout(m_ictx->cct, 20) << __func__ << " issuing read_from_parent" << dendl;
+            } else {
+              Mutex::Locker l3(it->second->get_lock());
+              if (it->second->is_ready()) {
+                ldout(m_ictx->cct, 20) << __func__ << " extracting contect from copyup_list, obj-"
+                                       << m_object_no << " length=" << it->second->get_copyup_data().length()
+                                       << dendl;
+                m_read_data.append(it->second->get_copyup_data());
+                ldout(m_ictx->cct, 20) << __func__ << " extracted contect from copyup_list, obj-"
+                                       << m_object_no << " length=" << m_read_data.length()
+                                       << dendl;
+                m_ictx->copyup_list_lock.Unlock();
+                return should_complete(1);
+              } else {
+                ldout(m_ictx->cct, 20) << __func__ << " someone is reading back from parent" << dendl;
+                it->second->append_request(this);
+                m_ictx->copyup_list_lock.Unlock();
+
+                m_entire_object = &(m_ictx->copyup_list[m_object_no]->get_copyup_data());
+              }
+            }
+          } else 
+            read_from_parent(m_object_image_extents);
        } else {
          ldout(m_ictx->cct, 20) << "should_complete(" << this
                                 << "): parent overlap now 0" << dendl;
@@ -291,6 +373,20 @@ namespace librbd {
       m_state = LIBRBD_AIO_WRITE_GUARD;
       if (r < 0)
        return should_complete(r);
+
+      // Read data from waiting list safely. If this AioWrite created a
+      // CopyupRequest, m_read_data should be empty.
+      if (m_read_data.is_zero()) {
+        Mutex::Locker l(m_ictx->copyup_list_lock);
+        ldout(m_ictx->cct, 20) << __func__ << " releasing self pending, obj-"
+                               << m_object_no << dendl;
+        it = m_ictx->copyup_list.find(m_object_no);
+        assert(it != m_ictx->copyup_list.end());
+        assert(it->second->is_ready());
+        assert(m_entire_object);
+        m_read_data.append(*m_entire_object);
+      }
+
       send_copyup();
       finished = false;
       break;
index 8fbae736e5435665c2dd2f070e854c3b54e1cff8..4368c1214d1e0febe5ded3d5cbc7372d9ccacacf 100644 (file)
@@ -16,6 +16,7 @@ namespace librbd {
 
   struct AioCompletion;
   struct ImageCtx;
+  class CopyupRequest;
 
   /**
    * This class represents an I/O operation to a single RBD data object.
@@ -27,9 +28,9 @@ namespace librbd {
   public:
     AioRequest();
     AioRequest(ImageCtx *ictx, const std::string &oid,
-              uint64_t objectno, uint64_t off, uint64_t len,
-              librados::snap_t snap_id, Context *completion,
-              bool hide_enoent);
+               uint64_t objectno, uint64_t off, uint64_t len,
+               const ::SnapContext &snapc, librados::snap_t snap_id,
+               Context *completion, bool hide_enoent);
     virtual ~AioRequest();
 
     void complete(int r)
@@ -47,7 +48,6 @@ namespace librbd {
 
   protected:
     void read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents);
-    void read_from_parent_cor(vector<pair<uint64_t,uint64_t> >& image_extents);
 
     ImageCtx *m_ictx;
     librados::IoCtx *m_ioctx;
@@ -58,26 +58,27 @@ namespace librbd {
     AioCompletion *m_parent_completion;
     ceph::bufferlist m_read_data;
     bool m_hide_enoent;
-    ceph::bufferlist m_entire_object;//copy-on-read : store the entire object
-    AioCompletion *m_parent_completion_cor;//copy-on-read : AioCompletion for read from parent
+    std::vector<librados::snap_t> m_snaps;
+    ceph::bufferlist *m_entire_object;
   };
 
   class AioRead : public AioRequest {
   public:
     AioRead(ImageCtx *ictx, const std::string &oid,
            uint64_t objectno, uint64_t offset, uint64_t len,
-           vector<pair<uint64_t,uint64_t> >& be,
+           vector<pair<uint64_t,uint64_t> >& be, const ::SnapContext &snapc,
            librados::snap_t snap_id, bool sparse,
            Context *completion, int op_flags)
-      : AioRequest(ictx, oid, objectno, offset, len, snap_id, completion,
+      : AioRequest(ictx, oid, objectno, offset, len, snapc, snap_id, completion,
                   false),
        m_buffer_extents(be), m_tried_parent(false),
-       m_sparse(sparse), m_op_flags(op_flags) {
+       m_sparse(sparse), m_op_flags(op_flags), m_state(LIBRBD_AIO_READ_FLAT) {
+      guard_read();
     }
     virtual ~AioRead() {}
-    ssize_t write_cor();
     virtual bool should_complete(int r);
     virtual int send();
+    void guard_read();
 
     ceph::bufferlist &data() {
       return m_read_data;
@@ -91,6 +92,30 @@ namespace librbd {
     bool m_tried_parent;
     bool m_sparse;
     int m_op_flags;
+
+    /**
+     * Reads go through the following state machine to deal with
+     * layering:
+     *
+     *                          need copyup
+     * LIBRBD_AIO_READ_GUARD ---------------> LIBRBD_AIO_READ_COPYUP
+     *           |                                       |
+     *           v                                       |
+     *         done <------------------------------------/
+     *           ^
+     *           |
+     * LIBRBD_AIO_READ_FLAT
+     *
+     * Reads start in LIBRBD_AIO_READ_GUARD or _FLAT, depending on
+     * whether there is a parent or not.
+     */
+    enum read_state_d {
+      LIBRBD_AIO_READ_GUARD,
+      LIBRBD_AIO_READ_COPYUP,
+      LIBRBD_AIO_READ_FLAT
+    };
+
+    read_state_d m_state;
   };
 
   class AbstractWrite : public AioRequest {
@@ -144,7 +169,6 @@ namespace librbd {
     librados::ObjectWriteOperation m_write;
     librados::ObjectWriteOperation m_copyup;
     uint64_t m_snap_seq;
-    std::vector<librados::snap_t> m_snaps;
 
   private:
     void send_copyup();
diff --git a/src/librbd/CopyupRequest.cc b/src/librbd/CopyupRequest.cc
new file mode 100644 (file)
index 0000000..1afe71f
--- /dev/null
@@ -0,0 +1,150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/Mutex.h"
+
+#include "librbd/AioCompletion.h"
+#include "librbd/ImageCtx.h"
+
+#include "librbd/AioRequest.h"
+#include "librbd/CopyupRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::CopyupRequest: "
+
+namespace librbd {
+  CopyupRequest::CopyupRequest()
+    : m_ictx(NULL), m_object_no(0), m_lock("librbd::CopyupRequest::m_lock"),
+      m_ready(false), m_need_copyup(false), m_parent_completion(NULL),
+      m_copyup_completion(NULL) {}
+
+  CopyupRequest::CopyupRequest(ImageCtx *ictx, const std::string &oid,
+                               uint64_t objectno, bool need_copyup)
+    : m_ictx(ictx), m_oid(oid), m_object_no(objectno),
+      m_lock("librbd::CopyupRequest::m_lock"), m_ready(false),
+      m_need_copyup(need_copyup), m_parent_completion(NULL),
+      m_copyup_completion(NULL) {}
+
+  CopyupRequest::~CopyupRequest() {
+    assert(m_pending_requests.empty());
+
+    m_ictx->copyup_list_lock.Lock();
+    ldout(m_ictx->cct, 20) << __func__ << " removing the slot " << dendl;
+    map<uint64_t, CopyupRequest*>::iterator it =
+      m_ictx->copyup_list.find(m_object_no);
+    assert(it != m_ictx->copyup_list.end());
+    it->second = NULL;
+    m_ictx->copyup_list.erase(it);
+
+    if (m_ictx->copyup_list.empty())
+      m_ictx->copyup_list_cond.Signal();
+
+    ldout(m_ictx->cct, 20) <<  __func__ << " remove the slot " << m_object_no
+                           << " in copyup_list, size = " << m_ictx->copyup_list.size()
+                           << dendl;
+
+    m_ictx->copyup_list_lock.Unlock();
+  }
+
+  void CopyupRequest::set_ready() {
+    m_ready = true;
+  }
+
+  bool CopyupRequest::is_ready() {
+    return m_ready;
+  }
+
+  bool CopyupRequest::is_need_send_copyup() {
+    return m_need_copyup;
+  }
+
+  ceph::bufferlist& CopyupRequest::get_copyup_data() {
+    return m_copyup_data;
+  }
+
+  Mutex& CopyupRequest::get_lock() {
+    return m_lock;
+  }
+
+  void CopyupRequest::append_request(AioRequest *req) {
+    assert(!m_ready);
+    m_pending_requests.push_back(req);
+  }
+
+  void CopyupRequest::complete_all(int r) {
+    assert(m_ready);
+
+    while (!m_pending_requests.empty()) {
+      vector<AioRequest *>::iterator it = m_pending_requests.begin();
+      AioRequest *req = *it;
+      req->complete(r);
+      m_pending_requests.erase(it);
+    }
+  }
+
+  void CopyupRequest::send_copyup(int r) {
+    ldout(m_ictx->cct, 20) << __func__ << dendl;
+
+    m_ictx->snap_lock.get_read();
+    ::SnapContext snapc = m_ictx->snapc;
+    m_ictx->snap_lock.put_read();
+
+    std::vector<librados::snap_t> snaps;
+    for (std::vector<snapid_t>::const_iterator it = snapc.snaps.begin();
+         it != snapc.snaps.end(); ++it) {
+      snaps.push_back(it->val);
+    }
+
+    librados::ObjectWriteOperation copyup_op;
+    copyup_op.exec("rbd", "copyup", m_copyup_data);
+
+    m_copyup_completion = librados::Rados::aio_create_completion(this, NULL, rbd_copyup_cb);
+    m_ictx->md_ctx.aio_operate(m_oid, m_copyup_completion, &copyup_op,
+                               snapc.seq.val, snaps);
+    m_copyup_completion->release();
+  }
+
+  void CopyupRequest::read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents)
+  {
+    m_parent_completion = aio_create_completion_internal(this, rbd_read_from_parent_cb);
+    ldout(m_ictx->cct, 20) << __func__ << " this = " << this
+                           << " parent completion " << m_parent_completion
+                           << " extents " << image_extents
+                           << dendl;
+    aio_read(m_ictx->parent, image_extents, NULL, &m_copyup_data, m_parent_completion, 0);
+  }
+
+  void rbd_read_from_parent_cb(completion_t cb, void *arg)
+  {
+    CopyupRequest *req = reinterpret_cast<CopyupRequest *>(arg);
+    AioCompletion *comp = reinterpret_cast<AioCompletion *>(cb);
+
+    ldout(req->m_ictx->cct, 20) << __func__ << dendl;
+
+    req->get_lock().Lock();
+    req->set_ready();
+    req->complete_all(comp->get_return_value());
+    req->get_lock().Unlock();
+
+    // If this entry was created by a read request, the copyup operation will
+    // be performed asynchronously; cleanup happens in the copyup callback.
+    // If this entry was created by a write request, the copyup operation will
+    // be performed synchronously by AioWrite; after the data is extracted,
+    // cleanup happens here.
+    if (req->is_need_send_copyup())
+      req->send_copyup(comp->get_return_value());
+    else
+      delete req;
+  }
+
+  void rbd_copyup_cb(rados_completion_t c, void *arg)
+  {
+    CopyupRequest *req = reinterpret_cast<CopyupRequest *>(arg);
+
+    ldout(req->m_ictx->cct, 20) << __func__ << dendl;
+    delete req;
+  }
+}
diff --git a/src/librbd/CopyupRequest.h b/src/librbd/CopyupRequest.h
new file mode 100644 (file)
index 0000000..55813e2
--- /dev/null
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_COPYUPREQUEST_H
+#define CEPH_LIBRBD_COPYUPREQUEST_H
+
+#include "include/int_types.h"
+
+#include "common/Mutex.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+
+namespace librbd {
+
+  struct AioCompletion;
+
+  class CopyupRequest {
+  public:
+    CopyupRequest();
+    CopyupRequest(ImageCtx *ictx, const std::string &oid, uint64_t objectno,
+                  bool need_copyup);
+    ~CopyupRequest();
+
+    void set_ready();
+    bool is_ready();
+    bool is_need_send_copyup();
+    ceph::bufferlist& get_copyup_data();
+    Mutex& get_lock();
+    void append_request(AioRequest *req);
+    void complete_all(int r);
+    void send_copyup(int r);
+    void read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents);
+    AioCompletion *get_parent_completion() { return m_parent_completion; }
+    librados::AioCompletion *get_copyup_completion() { return m_copyup_completion; }
+    ImageCtx *m_ictx;
+
+  private:
+    std::string m_oid;
+    uint64_t m_object_no;
+    Mutex m_lock;
+    bool m_ready;
+    bool m_need_copyup;
+    AioCompletion *m_parent_completion;
+    librados::AioCompletion *m_copyup_completion;
+    ceph::bufferlist m_copyup_data;
+    vector<AioRequest *> m_pending_requests;
+  };
+
+  void rbd_read_from_parent_cb(completion_t cb, void *arg);
+  void rbd_copyup_cb(completion_t aio_completion_impl, void *arg);
+}
+
+#endif
index 622e8c420ba3e14d77d23b854179c001a1780b35..c49d05233838a6dffce7a6b2a740f00572151b9d 100644 (file)
@@ -47,7 +47,8 @@ namespace librbd {
       parent_lock("librbd::ImageCtx::parent_lock"),
       refresh_lock("librbd::ImageCtx::refresh_lock"),
       aio_lock("librbd::ImageCtx::aio_lock"),
-      cor_lock("librbd::ImageCtx::cor_lock"),
+      copyup_list_lock("librbd::ImageCtx::copyup_list_lock"),
+      copyup_list_cond(),
       extra_read_flags(0),
       old_format(true),
       order(0), size(0), features(0),
@@ -102,7 +103,6 @@ namespace librbd {
       object_set->return_enoent = true;
       object_cacher->start();
     }
-    cor_completions = new xlist<librados::AioCompletion*>();
   }
 
   ImageCtx::~ImageCtx() {
@@ -119,10 +119,6 @@ namespace librbd {
       delete object_set;
       object_set = NULL;
     }
-    if (cor_completions) {
-      delete cor_completions;
-      cor_completions = NULL;
-    }
     delete[] format_string;
   }
 
@@ -669,67 +665,12 @@ namespace librbd {
       pending_aio_cond.Wait(aio_lock);
     }
   }
- }
-
-  void ImageCtx::add_cor_completion(xlist<librados::AioCompletion*>::item *comp)
-  {
-    if(!comp)
-      return;
-
-    cor_lock.Lock();
-    cor_completions->push_back(comp);
-    cor_lock.Unlock();
-
-    ldout(cct, 10) << "add_cor_completion:: size = "<< cor_completions->size() << dendl;
-  }
 
-  void ImageCtx::wait_last_completions()
-  {
-    ldout(cct, 10) << "wait_last_completions:: cor_completions = " << cor_completions  << " size = " << cor_completions->size()  << dendl;
-    xlist<librados::AioCompletion*>::iterator itr;
-    xlist<librados::AioCompletion*>::item *ptr;
-
-    while (!cor_completions->empty()){
-      cor_lock.Lock();
-      librados::AioCompletion *comp = cor_completions->front();
-      comp->wait_for_complete();
-      itr = cor_completions->begin();
-      ptr = itr.get_cur();
-      cor_completions->pop_front();
-      delete ptr;
-      ptr = NULL;
-      cor_lock.Unlock();
+  void ImageCtx::wait_for_pending_copyup() {
+    Mutex::Locker l(copyup_list_lock);
+    while (!copyup_list.empty()) {
+      ldout(cct, 20) << __func__ << " waiting CopyupRequest to be completed" << dendl;
+      copyup_list_cond.Wait(copyup_list_lock);
     }
-    ldout(cct, 10) << "wait_last_completions:: after clear cor_completions = " << cor_completions  << " size = " << cor_completions->size() << dendl;
   }
-
-  void cor_completion_callback(librados::completion_t aio_completion_impl, void *arg)
-  {
-    librbd::ImageCtx * ictx = (librbd::ImageCtx *)arg;
-
-    ictx->cor_lock.Lock();
-    xlist<librados::AioCompletion*> *completions = ictx->cor_completions; 
-    ictx->cor_lock.Unlock();
-
-    ldout(ictx->cct, 10) << "cor_completion_callback:: cor_completions = " << completions << " size = "<< completions->size() << dendl;
-    if (!completions) 
-      return;
-
-    //find current AioCompletion item in xlist, and remove it
-    for (xlist<librados::AioCompletion*>::iterator itr = completions->begin(); !(itr.end()); ++itr) {
-       if (aio_completion_impl == (*itr)->pc){
-         xlist<librados::AioCompletion*>::item *ptr = itr.get_cur();
-
-         ictx->cor_lock.Lock();
-         completions->remove(ptr);
-         ictx->cor_lock.Unlock();
-
-         delete ptr;//delete xlist<librados::AioCompletion*>::item *
-         ptr = NULL;
-         break;
-       }
-    }
-    ldout(ictx->cct, 10) << "cor_completion_callback:: after remove item, size = " << completions->size() << dendl;
-  }
-
 }
index 33ae12d2d9de209159f1393fc71b645addd15b65..c451004c47d4823e1355059d9b71388c021734be 100644 (file)
@@ -32,6 +32,7 @@ class PerfCounters;
 namespace librbd {
 
   class ImageWatcher;
+  class CopyupRequest;
 
   struct ImageCtx {
     CephContext *cct;
@@ -74,7 +75,9 @@ namespace librbd {
     RWLock parent_lock; // protects parent_md and parent
     Mutex refresh_lock; // protects refresh_seq and last_refresh
     Mutex aio_lock; // protects pending_aio and pending_aio_cond
-    Mutex cor_lock; //protects cor_completions for copy-on-read
+    Mutex copyup_list_lock; // protects copyup_list
+
+    Cond copyup_list_cond; // protected by copyup_list_lock
 
     unsigned extra_read_flags;
 
@@ -98,10 +101,10 @@ namespace librbd {
 
     Readahead readahead;
     uint64_t total_bytes_read;
+    std::map<uint64_t, CopyupRequest*> copyup_list;
 
     Cond pending_aio_cond;
     uint64_t pending_aio;
-    xlist<librados::AioCompletion*> *cor_completions; //copy-on-read AioCompletions
 
     /**
      * Either image_name or image_id must be set.
@@ -167,11 +170,8 @@ namespace librbd {
     uint64_t prune_parent_extents(vector<pair<uint64_t,uint64_t> >& objectx,
                                  uint64_t overlap);
     void wait_for_pending_aio();
-
-    void add_cor_completion(xlist<librados::AioCompletion*>::item *comp);
-    void wait_last_completions();//wait for uncompleted asynchronous write which is still in xlist
+    void wait_for_pending_copyup();
   };
-  void cor_completion_callback(librados::completion_t aio_completion_impl, void *arg);
 }
 
 #endif
index 87b326f816bd09388d6ebd6b2516d660a9ce4ed3..7222a16d9af870d7caf25dce5139fe2723be3276 100644 (file)
@@ -1,6 +1,7 @@
 librbd_internal_la_SOURCES = \
        librbd/AioCompletion.cc \
        librbd/AioRequest.cc \
+       librbd/CopyupRequest.cc \
        librbd/ImageCtx.cc \
        librbd/ImageWatcher.cc \
        librbd/internal.cc \
@@ -35,6 +36,7 @@ lib_LTLIBRARIES += librbd.la
 noinst_HEADERS += \
        librbd/AioCompletion.h \
        librbd/AioRequest.h \
+       librbd/CopyupRequest.h \
        librbd/ImageCtx.h \
        librbd/ImageWatcher.h \
        librbd/internal.h \
index 7a755b5aa23f2cc9c5678267ed46e5df56bb389d..9745655264a28f4fc2da717a02e7272b76d79678 100644 (file)
@@ -16,6 +16,7 @@
 
 #include "librbd/AioCompletion.h"
 #include "librbd/AioRequest.h"
+#include "librbd/CopyupRequest.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
 
@@ -2203,6 +2204,8 @@ reprotect_and_return_err:
   {
     ldout(ictx->cct, 20) << "snap_set " << ictx << " snap = "
                         << (snap_name ? snap_name : "NULL") << dendl;
+
+    ictx->wait_for_pending_copyup();
     // ignore return value, since we may be set to a non-existent
     // snapshot and the user is trying to fix that
     ictx_check(ictx);
@@ -2278,10 +2281,8 @@ reprotect_and_return_err:
     if (ictx->image_watcher != NULL) {
       ictx->image_watcher->flush_aio_operations();
     }
-    if (ictx->cor_completions)
-      ictx->wait_last_completions();//copy-on-read: wait for unfinished AioCompletion requests
-
-    if (ictx->object_cacher)
+    ictx->wait_for_pending_copyup();
+    if (ictx->object_cacher) {
       ictx->shutdown_cache(); // implicitly flushes
     } else {
       flush(ictx);
@@ -3388,6 +3389,7 @@ reprotect_and_return_err:
 
     ictx->snap_lock.get_read();
     snap_t snap_id = ictx->snap_id;
+    ::SnapContext snapc = ictx->snapc;
     ictx->snap_lock.put_read();
 
     // readahead
@@ -3432,7 +3434,7 @@ reprotect_and_return_err:
        C_AioRead *req_comp = new C_AioRead(ictx->cct, c);
        AioRead *req = new AioRead(ictx, q->oid.name, 
                                   q->objectno, q->offset, q->length,
-                                  q->buffer_extents,
+                                  q->buffer_extents, snapc,
                                   snap_id, true, req_comp, op_flags);
        req_comp->set_req(req);
        c->add_request();
index 71648726bd5a6bc087fc13aa6358f6cda934110c..4723fe10275743f0ec6da740c6faed78169d1e4d 100644 (file)
@@ -1395,6 +1395,141 @@ TEST_F(TestLibRBD, TestClone2)
   rados_ioctx_destroy(ioctx);
 }
 
+TEST_F(TestLibRBD, TestCoR)
+{
+  if (!g_conf->rbd_clone_copy_on_read) {
+    std::cout << "SKIPPING due to disabled rbd_copy_on_read" << std::endl;
+    return;
+  }
+
+  rados_ioctx_t ioctx;
+  rados_ioctx_create(_cluster, m_pool_name.c_str(), &ioctx);
+
+  int features = RBD_FEATURE_LAYERING;
+  rbd_image_t parent, child;
+  int order = 12; // smallest object size is 4K
+  const uint64_t image_size = 4<<20;
+  const int object_size = 1<<12;
+  const int object_num = image_size / object_size;
+  map<uint64_t, uint64_t> write_tracker;
+  set<string> obj_checker;
+  rbd_image_info_t p_info, c_info;
+  rados_list_ctx_t list_ctx;
+  const char *entry;
+
+  // make a parent to clone from
+  ASSERT_EQ(0, create_image_full(ioctx, "parent", image_size, &order, false, features));
+  ASSERT_EQ(0, rbd_open(ioctx, "parent", &parent, NULL));
+  printf("made parent image \"parent\": %ldK (%d * %dK)\n", 
+         image_size, object_num, object_size/1024);
+
+  // write something into parent
+  char test_data[TEST_IO_SIZE + 1];
+  char zero_data[TEST_IO_SIZE + 1];
+  int i;
+  int count = 0;
+
+  for (i = 0; i < TEST_IO_SIZE; ++i) 
+    test_data[i] = (char) (rand() % (126 - 33) + 33);
+  test_data[TEST_IO_SIZE] = '\0';
+  memset(zero_data, 0, sizeof(zero_data));
+
+  // generate a random map which covers every object with a random offset
+  while (count < 100) {
+    uint64_t ono = rand() % object_num;
+    if (write_tracker.find(ono) == write_tracker.end()) {
+      uint64_t offset = rand() % (object_size - TEST_IO_SIZE);
+      write_tracker.insert(pair<uint64_t, uint64_t>(ono, offset));
+      count++;
+    }
+  }
+
+  printf("generated random write map:\n");
+  for (map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
+       itr != write_tracker.end(); ++itr)
+    printf("\t [%-8ld, %-8ld]\n", itr->first, itr->second);
+
+  printf("write data based on random map\n");
+  for (map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
+       itr != write_tracker.end(); ++itr) {
+    printf("\twrite object-%-4ld\t", itr->first);
+    ASSERT_PASSED(write_test_data, parent, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
+  }
+
+  for (map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
+         itr != write_tracker.end(); ++itr) {
+    printf("\tread object-%-4ld\t", itr->first);
+    ASSERT_PASSED(read_test_data, parent, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
+  }
+
+  // find out what objects the parent image has generated
+  ASSERT_EQ(0, rbd_stat(parent, &p_info, sizeof(p_info)));
+  ASSERT_EQ(0, rados_objects_list_open(ioctx, &list_ctx));
+  while (rados_objects_list_next(list_ctx, &entry, NULL) != -ENOENT) {
+    if (strstr(entry, p_info.block_name_prefix)) {
+      const char *block_name_suffix = entry + strlen(p_info.block_name_prefix) + 1;
+      obj_checker.insert(block_name_suffix);
+    }
+  }
+  rados_objects_list_close(list_ctx);
+  ASSERT_EQ(obj_checker.size(), write_tracker.size());
+
+  // create a snapshot, reopen as the parent we're interested in and protect it
+  ASSERT_EQ(0, rbd_snap_create(parent, "parent_snap"));
+  ASSERT_EQ(0, rbd_close(parent));
+  ASSERT_EQ(0, rbd_open(ioctx, "parent", &parent, "parent_snap"));
+  ASSERT_EQ(0, rbd_snap_protect(parent, "parent_snap"));
+  ASSERT_EQ(0, rbd_close(parent));
+  printf("made snapshot \"parent@parent_snap\" and protect it\n");
+
+  // create a copy-on-read clone and open it
+  ASSERT_EQ(0, rbd_clone(ioctx, "parent", "parent_snap", ioctx, "child",
+           features, &order));
+  ASSERT_EQ(0, rbd_open(ioctx, "child", &child, NULL));
+  printf("made and opened clone \"child\"\n");
+
+  printf("read from \"child\"\n");
+  {
+    map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
+    printf("\tread object-%-4ld\t", itr->first);
+    ASSERT_PASSED(read_test_data, child, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
+  }
+
+  for (map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
+       itr != write_tracker.end(); ++itr) {
+    printf("\tread object-%-4ld\t", itr->first);
+    ASSERT_PASSED(read_test_data, child, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
+  }
+
+  printf("read again reversely\n");
+  for (map<uint64_t, uint64_t>::iterator itr = --write_tracker.end();
+     itr != write_tracker.begin(); --itr) {
+    printf("\tread object-%-4ld\t", itr->first);
+    ASSERT_PASSED(read_test_data, child, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
+  }
+
+  // close child to flush all copy-on-read
+  ASSERT_EQ(0, rbd_close(child));
+
+  printf("check whether child image has the same set of objects as parent\n");
+  ASSERT_EQ(0, rbd_open(ioctx, "child", &child, NULL));
+  ASSERT_EQ(0, rbd_stat(child, &c_info, sizeof(c_info)));
+  ASSERT_EQ(0, rados_objects_list_open(ioctx, &list_ctx));
+  while (rados_objects_list_next(list_ctx, &entry, NULL) != -ENOENT) {
+    if (strstr(entry, c_info.block_name_prefix)) {
+      const char *block_name_suffix = entry + strlen(c_info.block_name_prefix) + 1;
+      set<string>::iterator it = obj_checker.find(block_name_suffix);
+      ASSERT_TRUE(it != obj_checker.end());
+      obj_checker.erase(it);
+    }
+  }
+  rados_objects_list_close(list_ctx);
+  ASSERT_TRUE(obj_checker.empty());
+  ASSERT_EQ(0, rbd_close(child));
+
+  rados_ioctx_destroy(ioctx);
+}
+
 static void test_list_children(rbd_image_t image, ssize_t num_expected, ...)
 {
   va_list ap;