Asychronously perform copyup on read path.
Sychronously perform copyup on write path.
Signed-off-by: Cheng Cheng <ccheng.leo@gmail.com>
OPTION(rbd_readahead_trigger_requests, OPT_INT, 10) // number of sequential requests necessary to trigger readahead
OPTION(rbd_readahead_max_bytes, OPT_LONGLONG, 512 * 1024) // set to 0 to disable readahead
OPTION(rbd_readahead_disable_after_bytes, OPT_LONGLONG, 50 * 1024 * 1024) // how many bytes are read in total before readahead is disabled
-OPTION(rbd_clone_copy_on_read, OPT_BOOL, false)//copy-on-read option for rbd clones
+OPTION(rbd_clone_copy_on_read, OPT_BOOL, false)
/*
* The following options change the behavior for librbd's image creation methods that
return *this;
}
bool end() const { return cur == 0; }
- item *get_cur() const { return cur; }
};
iterator begin() { return iterator(_front); }
#include "librbd/internal.h"
#include "librbd/AioRequest.h"
+#include "librbd/CopyupRequest.h"
#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
m_ictx(NULL), m_ioctx(NULL),
m_object_no(0), m_object_off(0), m_object_len(0),
m_snap_id(CEPH_NOSNAP), m_completion(NULL), m_parent_completion(NULL),
- m_hide_enoent(false), m_parent_completion_cor(NULL){}
+ m_hide_enoent(false), m_entire_object(NULL) {}
AioRequest::AioRequest(ImageCtx *ictx, const std::string &oid,
uint64_t objectno, uint64_t off, uint64_t len,
- librados::snap_t snap_id,
+ const ::SnapContext &snapc, librados::snap_t snap_id,
Context *completion,
bool hide_enoent) :
m_ictx(ictx), m_ioctx(&ictx->data_ctx), m_oid(oid), m_object_no(objectno),
m_object_off(off), m_object_len(len), m_snap_id(snap_id),
m_completion(completion), m_parent_completion(NULL),
- m_hide_enoent(hide_enoent), m_parent_completion_cor(NULL) {}
+ m_hide_enoent(hide_enoent), m_entire_object(NULL) {
+ for (std::vector<snapid_t>::const_iterator it = snapc.snaps.begin();
+ it != snapc.snaps.end(); ++it)
+ m_snaps.push_back(it->val);
+ }
AioRequest::~AioRequest() {
if (m_parent_completion) {
m_parent_completion->release();
m_parent_completion = NULL;
}
- if (m_parent_completion_cor) {
- m_parent_completion_cor->release();
- m_parent_completion_cor = NULL;
- }
}
void AioRequest::read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents)
m_parent_completion, 0);
}
- //copy-on-read : read the entire object from parent, using bufferlist m_entire_object
- void AioRequest::read_from_parent_cor(vector<pair<uint64_t,uint64_t> >& image_extents)
- {
- assert(!m_parent_completion_cor);
- m_parent_completion_cor = aio_create_completion_internal(this, rbd_req_cb);
- ldout(m_ictx->cct, 20) << "read_from_parent_cor this = " << this
- << " parent completion cor " << m_parent_completion_cor
- << " extents " << image_extents
- << dendl;
- aio_read(m_ictx->parent, image_extents, NULL, &m_entire_object,
- m_parent_completion_cor);
+ static inline bool is_cor(ImageCtx *ictx, librados::snap_t snap_id) {
+ return (ictx->cct->_conf->rbd_clone_copy_on_read) &&
+ (!ictx->read_only) && (snap_id == CEPH_NOSNAP);
}
/** read **/
- //copy-on-read: after read entire object, just write it into child
- ssize_t AioRead::write_cor()
+ void AioRead::guard_read()
{
- ldout(m_ictx->cct, 20) << "write_cor" << dendl;
- int ret = 0;
-
- m_ictx->snap_lock.get_read();
- ::SnapContext snapc = m_ictx->snapc;
- m_ictx->snap_lock.put_read();
+ RWLock::RLocker l(m_ictx->snap_lock);
+ RWLock::RLocker l2(m_ictx->parent_lock);
- librados::ObjectWriteOperation copyup_cor;
- copyup_cor.exec("rbd", "copyup", m_entire_object);
+ vector<pair<uint64_t,uint64_t> > image_extents;
+ Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
+ m_object_no, 0, m_ictx->layout.fl_object_size,
+ image_extents);
- std::vector<librados::snap_t> m_snaps;
- for (std::vector<snapid_t>::const_iterator it = snapc.snaps.begin();
- it != snapc.snaps.end(); ++it) {
- m_snaps.push_back(it->val);
+ uint64_t image_overlap = 0;
+ m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
+ uint64_t object_overlap =
+ m_ictx->prune_parent_extents(image_extents, image_overlap);
+ if (object_overlap) {
+ ldout(m_ictx->cct, 20) << __func__ << " guarding read" << dendl;
+ m_state = LIBRBD_AIO_READ_GUARD;
}
-
- librados::AioCompletion *cor_completion =
- librados::Rados::aio_create_completion(m_ictx, librbd::cor_completion_callback, NULL);
-
- xlist<librados::AioCompletion *>::item *comp =
- new xlist<librados::AioCompletion *>::item(cor_completion);
-
- m_ictx->add_cor_completion(comp);//add cor_completion to xlist
- //asynchronously write object
- ret = m_ictx->md_ctx.aio_operate(m_oid, cor_completion, ©up_cor, snapc.seq.val, m_snaps);
-
- return ret;
}
bool AioRead::should_complete(int r)
{
- ldout(m_ictx->cct, 20) << "should_complete " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len
- << " r = " << r << dendl;
-
- //get copy-on-read option and check image if read_only
- bool cor = (m_ictx->cct->_conf->rbd_clone_copy_on_read) && (!m_ictx->read_only);
- ldout(m_ictx->cct, 20) << "should_complete cor = " << cor << " read_only = " << m_ictx->read_only << dendl;
+ ldout(m_ictx->cct, 20) << "AioRead::should_complete " << this
+ << " " << m_oid << " " << m_object_off
+ << "~" << m_object_len << " r = " << r
+ << " cor = " << is_cor(m_ictx, m_snap_id)
+ << " readonly = " << m_ictx->read_only
+ << dendl;
- if (!m_tried_parent && r == -ENOENT) {
- RWLock::RLocker l(m_ictx->snap_lock);
- RWLock::RLocker l2(m_ictx->parent_lock);
+ bool finished = true;
- // calculate reverse mapping onto the image
- vector<pair<uint64_t,uint64_t> > image_extents;
- Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
- m_object_no, m_object_off, m_object_len,
- image_extents);
+ switch (m_state) {
+ case LIBRBD_AIO_READ_GUARD:
+ ldout(m_ictx->cct, 20) << "should_complete " << this
+ << " READ_CHECK_GUARD" << dendl;
- uint64_t image_overlap = 0;
- r = m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
- if (r < 0) {
- assert(0 == "FIXME");
- }
- uint64_t object_overlap = m_ictx->prune_parent_extents(image_extents, image_overlap);
- if (object_overlap) {
- m_tried_parent = true;
- if (cor) {//copy-on-read option
- vector<pair<uint64_t,uint64_t> > extend_image_extents;
- //extend range to entire object
- Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
- m_object_no, 0, m_ictx->layout.fl_object_size,
- extend_image_extents);
- //read entire object from parent , and put it in m_entire_object
- read_from_parent_cor(extend_image_extents);
- } else {
- read_from_parent(image_extents);
- }
- return false;
- }
- }
+ // This is the step to read from parent
+ if (!m_tried_parent && r == -ENOENT) {
+ RWLock::RLocker l(m_ictx->snap_lock);
+ RWLock::RLocker l2(m_ictx->parent_lock);
- if (cor) {//copy-on-read option
- //if read entire object from parent success
- if (m_tried_parent && r > 0) {
+ // calculate reverse mapping onto the image
vector<pair<uint64_t,uint64_t> > image_extents;
Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
- m_object_no, m_object_off, m_object_len,
- image_extents);
+ m_object_no, m_object_off, m_object_len,
+ image_extents);
+
uint64_t image_overlap = 0;
- int r = m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
+ r = m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
if (r < 0) {
assert(0 == "FIXME");
}
- m_ictx->prune_parent_extents(image_extents, image_overlap);
- // copy the read range to m_read_data
- m_read_data.substr_of(m_entire_object, m_object_off, m_object_len);
- write_cor();
+ uint64_t object_overlap = m_ictx->prune_parent_extents(image_extents, image_overlap);
+ if (object_overlap) {
+ m_tried_parent = true;
+
+ if (is_cor(m_ictx, m_snap_id)) {
+ // If there is a valid object being coyping up, directly extract
+ // content and finish up.
+ Mutex::Locker l3(m_ictx->copyup_list_lock);
+
+ map<uint64_t, CopyupRequest*>::iterator it =
+ m_ictx->copyup_list.find(m_object_no);
+ if (it != m_ictx->copyup_list.end()) {
+ Mutex::Locker l4(it->second->get_lock());
+ if (it->second->is_ready()) {
+ ceph::bufferlist ©up_data = it->second->get_copyup_data();
+ m_read_data.substr_of(copyup_data, m_object_off, m_object_len);
+ ldout(m_ictx->cct, 20) << __func__ << " extract content from copyup_list, obj-"
+ << m_object_no << dendl;
+ finished = true;
+ break;
+ }
+ }
+ }
+
+ read_from_parent(image_extents);
+
+ if (is_cor(m_ictx, m_snap_id))
+ m_state = LIBRBD_AIO_READ_COPYUP;
+ else
+ m_state = LIBRBD_AIO_READ_GUARD;
+
+ finished = false;
+ }
}
+ break;
+ case LIBRBD_AIO_READ_COPYUP:
+ ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_COPYUP" << dendl;
+ // This is the extra step for copy-on-read: kick off an asynchronous copyup.
+ // It is different from copy-on-write as asynchronous copyup will finish
+ // by itself so state won't go back to LIBRBD_AIO_READ_GUARD.
+
+ if (m_tried_parent && r > 0) {
+ // If read entire object from parent success and CoR is possible, kick
+ // off a asynchronous copyup. This approach minimizes the latency
+ // impact.
+ m_ictx->copyup_list_lock.Lock();
+ map<uint64_t, CopyupRequest*>::iterator it =
+ m_ictx->copyup_list.find(m_object_no);
+ if (it == m_ictx->copyup_list.end()) {
+ // create and kick off a CopyupRequest
+ CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
+ m_object_no, true);
+ pair<uint64_t, CopyupRequest*> new_slot =
+ pair<uint64_t, CopyupRequest*>(m_object_no, new_req);
+ m_ictx->copyup_list.insert(new_slot);
+ m_ictx->copyup_list_lock.Unlock();
+
+ RWLock::RLocker l(m_ictx->snap_lock);
+ RWLock::RLocker l2(m_ictx->parent_lock);
+
+ vector<pair<uint64_t,uint64_t> > extend_image_extents;
+ //extend range to entire object
+ Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
+ 0, m_ictx->layout.fl_object_size,
+ extend_image_extents);
+ uint64_t image_overlap = 0;
+ r = m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
+ if (r < 0) {
+ assert(0 == "FIXME");
+ }
+
+ m_ictx->prune_parent_extents(extend_image_extents, image_overlap);
+ m_ictx->copyup_list[m_object_no]->read_from_parent(extend_image_extents);
+ } else {
+ m_ictx->copyup_list_lock.Unlock();
+ }
+
+ finished = true;
+ }
+
+ if (r < 0) {
+ ldout(m_ictx->cct, 20) << "error checking for object existence" << dendl;
+ }
+
+ break;
+ case LIBRBD_AIO_READ_FLAT:
+ ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_FLAT" << dendl;
+ // The read contect should be deposit in m_read_data
+ break;
+ default:
+ lderr(m_ictx->cct) << "invalid request state: " << m_state << dendl;
+ assert(0);
}
- return true;
+ return finished;
}
int AioRead::send() {
const ::SnapContext &snapc, librados::snap_t snap_id,
Context *completion,
bool hide_enoent)
- : AioRequest(ictx, oid, object_no, object_off, len, snap_id, completion,
- hide_enoent),
+ : AioRequest(ictx, oid, object_no, object_off, len, snapc, snap_id,
+ completion, hide_enoent),
m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val)
{
m_object_image_extents = objectx;
m_parent_overlap = object_overlap;
-
- // TODO: find a way to make this less stupid
- for (std::vector<snapid_t>::const_iterator it = snapc.snaps.begin();
- it != snapc.snaps.end(); ++it) {
- m_snaps.push_back(it->val);
- }
}
void AbstractWrite::guard_write()
ldout(m_ictx->cct, 20) << "write " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len
<< " should_complete: r = " << r << dendl;
+ map<uint64_t, CopyupRequest*>::iterator it;
bool finished = true;
switch (m_state) {
case LIBRBD_AIO_WRITE_GUARD:
<< m_object_image_extents << dendl;
m_state = LIBRBD_AIO_WRITE_COPYUP;
- read_from_parent(m_object_image_extents);
+
+ if (is_cor(m_ictx, m_snap_id)) {
+ m_ictx->copyup_list_lock.Lock();
+ it = m_ictx->copyup_list.find(m_object_no);
+ if (it == m_ictx->copyup_list.end()) {
+ // If it is not in the list, create a CopyupRequest and wait for it.
+ CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
+ m_object_no, false);
+ // make sure to wait on this CopyupRequest
+ new_req->append_request(this);
+ pair<uint64_t, CopyupRequest*> new_slot =
+ pair<uint64_t, CopyupRequest*>(m_object_no, new_req);
+ m_ictx->copyup_list.insert(new_slot);
+ m_ictx->copyup_list_lock.Unlock();
+
+ m_entire_object = &(m_ictx->copyup_list[m_object_no]->get_copyup_data());
+ ldout(m_ictx->cct, 20) << __func__ << " creating new Copyup for AioWrite, obj-"
+ << m_object_no << dendl;
+ m_ictx->copyup_list[m_object_no]->read_from_parent(m_object_image_extents);
+ ldout(m_ictx->cct, 20) << __func__ << " issuing read_from_parent" << dendl;
+ } else {
+ Mutex::Locker l3(it->second->get_lock());
+ if (it->second->is_ready()) {
+ ldout(m_ictx->cct, 20) << __func__ << " extracting contect from copyup_list, obj-"
+ << m_object_no << " length=" << it->second->get_copyup_data().length()
+ << dendl;
+ m_read_data.append(it->second->get_copyup_data());
+ ldout(m_ictx->cct, 20) << __func__ << " extracted contect from copyup_list, obj-"
+ << m_object_no << " length=" << m_read_data.length()
+ << dendl;
+ m_ictx->copyup_list_lock.Unlock();
+ return should_complete(1);
+ } else {
+ ldout(m_ictx->cct, 20) << __func__ << " someone is reading back from parent" << dendl;
+ it->second->append_request(this);
+ m_ictx->copyup_list_lock.Unlock();
+
+ m_entire_object = &(m_ictx->copyup_list[m_object_no]->get_copyup_data());
+ }
+ }
+ } else
+ read_from_parent(m_object_image_extents);
} else {
ldout(m_ictx->cct, 20) << "should_complete(" << this
<< "): parent overlap now 0" << dendl;
m_state = LIBRBD_AIO_WRITE_GUARD;
if (r < 0)
return should_complete(r);
+
+ // Read data from waiting list safely. If this AioWrite created a
+ // CopyupRequest, m_read_data should be empty.
+ if (m_read_data.is_zero()) {
+ Mutex::Locker l(m_ictx->copyup_list_lock);
+ ldout(m_ictx->cct, 20) << __func__ << " releasing self pending, obj-"
+ << m_object_no << dendl;
+ it = m_ictx->copyup_list.find(m_object_no);
+ assert(it != m_ictx->copyup_list.end());
+ assert(it->second->is_ready());
+ assert(m_entire_object);
+ m_read_data.append(*m_entire_object);
+ }
+
send_copyup();
finished = false;
break;
struct AioCompletion;
struct ImageCtx;
+ class CopyupRequest;
/**
* This class represents an I/O operation to a single RBD data object.
public:
AioRequest();
AioRequest(ImageCtx *ictx, const std::string &oid,
- uint64_t objectno, uint64_t off, uint64_t len,
- librados::snap_t snap_id, Context *completion,
- bool hide_enoent);
+ uint64_t objectno, uint64_t off, uint64_t len,
+ const ::SnapContext &snapc, librados::snap_t snap_id,
+ Context *completion, bool hide_enoent);
virtual ~AioRequest();
void complete(int r)
protected:
void read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents);
- void read_from_parent_cor(vector<pair<uint64_t,uint64_t> >& image_extents);
ImageCtx *m_ictx;
librados::IoCtx *m_ioctx;
AioCompletion *m_parent_completion;
ceph::bufferlist m_read_data;
bool m_hide_enoent;
- ceph::bufferlist m_entire_object;//copy-on-read : store the entire object
- AioCompletion *m_parent_completion_cor;//copy-on-read : AioCompletion for read from parent
+ std::vector<librados::snap_t> m_snaps;
+ ceph::bufferlist *m_entire_object;
};
class AioRead : public AioRequest {
public:
AioRead(ImageCtx *ictx, const std::string &oid,
uint64_t objectno, uint64_t offset, uint64_t len,
- vector<pair<uint64_t,uint64_t> >& be,
+ vector<pair<uint64_t,uint64_t> >& be, const ::SnapContext &snapc,
librados::snap_t snap_id, bool sparse,
Context *completion, int op_flags)
- : AioRequest(ictx, oid, objectno, offset, len, snap_id, completion,
+ : AioRequest(ictx, oid, objectno, offset, len, snapc, snap_id, completion,
false),
m_buffer_extents(be), m_tried_parent(false),
- m_sparse(sparse), m_op_flags(op_flags) {
+ m_sparse(sparse), m_op_flags(op_flags), m_state(LIBRBD_AIO_READ_FLAT) {
+ guard_read();
}
virtual ~AioRead() {}
- ssize_t write_cor();
virtual bool should_complete(int r);
virtual int send();
+ void guard_read();
ceph::bufferlist &data() {
return m_read_data;
bool m_tried_parent;
bool m_sparse;
int m_op_flags;
+
+ /**
+ * Reads go through the following state machine to deal with
+ * layering:
+ *
+ * need copyup
+ * LIBRBD_AIO_READ_GUARD ---------------> LIBRBD_AIO_READ_COPYUP
+ * | |
+ * v |
+ * done <------------------------------------/
+ * ^
+ * |
+ * LIBRBD_AIO_READ_FLAT
+ *
+ * Reads start in LIBRBD_AIO_READ_GUARD or _FLAT, depending on
+ * whether there is a parent or not.
+ */
+ enum read_state_d {
+ LIBRBD_AIO_READ_GUARD,
+ LIBRBD_AIO_READ_COPYUP,
+ LIBRBD_AIO_READ_FLAT
+ };
+
+ read_state_d m_state;
};
class AbstractWrite : public AioRequest {
librados::ObjectWriteOperation m_write;
librados::ObjectWriteOperation m_copyup;
uint64_t m_snap_seq;
- std::vector<librados::snap_t> m_snaps;
private:
void send_copyup();
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/Mutex.h"
+
+#include "librbd/AioCompletion.h"
+#include "librbd/ImageCtx.h"
+
+#include "librbd/AioRequest.h"
+#include "librbd/CopyupRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::CopyupRequest: "
+
+namespace librbd {
+ CopyupRequest::CopyupRequest()
+ : m_ictx(NULL), m_object_no(0), m_lock("librbd::CopyupRequest::m_lock"),
+ m_ready(false), m_need_copyup(false), m_parent_completion(NULL),
+ m_copyup_completion(NULL) {}
+
+ CopyupRequest::CopyupRequest(ImageCtx *ictx, const std::string &oid,
+ uint64_t objectno, bool need_copyup)
+ : m_ictx(ictx), m_oid(oid), m_object_no(objectno),
+ m_lock("librbd::CopyupRequest::m_lock"), m_ready(false),
+ m_need_copyup(need_copyup), m_parent_completion(NULL),
+ m_copyup_completion(NULL) {}
+
+ CopyupRequest::~CopyupRequest() {
+ assert(m_pending_requests.empty());
+
+ m_ictx->copyup_list_lock.Lock();
+ ldout(m_ictx->cct, 20) << __func__ << " removing the slot " << dendl;
+ map<uint64_t, CopyupRequest*>::iterator it =
+ m_ictx->copyup_list.find(m_object_no);
+ assert(it != m_ictx->copyup_list.end());
+ it->second = NULL;
+ m_ictx->copyup_list.erase(it);
+
+ if (m_ictx->copyup_list.empty())
+ m_ictx->copyup_list_cond.Signal();
+
+ ldout(m_ictx->cct, 20) << __func__ << " remove the slot " << m_object_no
+ << " in copyup_list, size = " << m_ictx->copyup_list.size()
+ << dendl;
+
+ m_ictx->copyup_list_lock.Unlock();
+ }
+
+ void CopyupRequest::set_ready() {
+ m_ready = true;
+ }
+
+ bool CopyupRequest::is_ready() {
+ return m_ready;
+ }
+
+ bool CopyupRequest::is_need_send_copyup() {
+ return m_need_copyup;
+ }
+
+ ceph::bufferlist& CopyupRequest::get_copyup_data() {
+ return m_copyup_data;
+ }
+
+ Mutex& CopyupRequest::get_lock() {
+ return m_lock;
+ }
+
+ void CopyupRequest::append_request(AioRequest *req) {
+ assert(!m_ready);
+ m_pending_requests.push_back(req);
+ }
+
+ void CopyupRequest::complete_all(int r) {
+ assert(m_ready);
+
+ while (!m_pending_requests.empty()) {
+ vector<AioRequest *>::iterator it = m_pending_requests.begin();
+ AioRequest *req = *it;
+ req->complete(r);
+ m_pending_requests.erase(it);
+ }
+ }
+
+ void CopyupRequest::send_copyup(int r) {
+ ldout(m_ictx->cct, 20) << __func__ << dendl;
+
+ m_ictx->snap_lock.get_read();
+ ::SnapContext snapc = m_ictx->snapc;
+ m_ictx->snap_lock.put_read();
+
+ std::vector<librados::snap_t> snaps;
+ for (std::vector<snapid_t>::const_iterator it = snapc.snaps.begin();
+ it != snapc.snaps.end(); ++it) {
+ snaps.push_back(it->val);
+ }
+
+ librados::ObjectWriteOperation copyup_op;
+ copyup_op.exec("rbd", "copyup", m_copyup_data);
+
+ m_copyup_completion = librados::Rados::aio_create_completion(this, NULL, rbd_copyup_cb);
+ m_ictx->md_ctx.aio_operate(m_oid, m_copyup_completion, ©up_op,
+ snapc.seq.val, snaps);
+ m_copyup_completion->release();
+ }
+
+ void CopyupRequest::read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents)
+ {
+ m_parent_completion = aio_create_completion_internal(this, rbd_read_from_parent_cb);
+ ldout(m_ictx->cct, 20) << __func__ << " this = " << this
+ << " parent completion " << m_parent_completion
+ << " extents " << image_extents
+ << dendl;
+ aio_read(m_ictx->parent, image_extents, NULL, &m_copyup_data, m_parent_completion, 0);
+ }
+
+ void rbd_read_from_parent_cb(completion_t cb, void *arg)
+ {
+ CopyupRequest *req = reinterpret_cast<CopyupRequest *>(arg);
+ AioCompletion *comp = reinterpret_cast<AioCompletion *>(cb);
+
+ ldout(req->m_ictx->cct, 20) << __func__ << dendl;
+
+ req->get_lock().Lock();
+ req->set_ready();
+ req->complete_all(comp->get_return_value());
+ req->get_lock().Unlock();
+
+ // If this entry is created by a read request, then copyup operation will
+ // be performed asynchronously. Perform cleaning up from copyup callback.
+ // If this entry is created by a write request, then copyup operation will
+ // be performed synchronously by AioWrite. After extracting data, perform
+ // cleaning up here
+ if (req->is_need_send_copyup())
+ req->send_copyup(comp->get_return_value());
+ else
+ delete req;
+ }
+
+ void rbd_copyup_cb(rados_completion_t c, void *arg)
+ {
+ CopyupRequest *req = reinterpret_cast<CopyupRequest *>(arg);
+
+ ldout(req->m_ictx->cct, 20) << __func__ << dendl;
+ delete req;
+ }
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_COPYUPREQUEST_H
+#define CEPH_LIBRBD_COPYUPREQUEST_H
+
+#include "include/int_types.h"
+
+#include "common/Mutex.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+
+namespace librbd {
+
+ struct AioCompletion;
+
+ class CopyupRequest {
+ public:
+ CopyupRequest();
+ CopyupRequest(ImageCtx *ictx, const std::string &oid, uint64_t objectno,
+ bool need_copyup);
+ ~CopyupRequest();
+
+ void set_ready();
+ bool is_ready();
+ bool is_need_send_copyup();
+ ceph::bufferlist& get_copyup_data();
+ Mutex& get_lock();
+ void append_request(AioRequest *req);
+ void complete_all(int r);
+ void send_copyup(int r);
+ void read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents);
+ AioCompletion *get_parent_completion() { return m_parent_completion; }
+ librados::AioCompletion *get_copyup_completion() { return m_copyup_completion; }
+ ImageCtx *m_ictx;
+
+ private:
+ std::string m_oid;
+ uint64_t m_object_no;
+ Mutex m_lock;
+ bool m_ready;
+ bool m_need_copyup;
+ AioCompletion *m_parent_completion;
+ librados::AioCompletion *m_copyup_completion;
+ ceph::bufferlist m_copyup_data;
+ vector<AioRequest *> m_pending_requests;
+ };
+
+ void rbd_read_from_parent_cb(completion_t cb, void *arg);
+ void rbd_copyup_cb(completion_t aio_completion_impl, void *arg);
+}
+
+#endif
parent_lock("librbd::ImageCtx::parent_lock"),
refresh_lock("librbd::ImageCtx::refresh_lock"),
aio_lock("librbd::ImageCtx::aio_lock"),
- cor_lock("librbd::ImageCtx::cor_lock"),
+ copyup_list_lock("librbd::ImageCtx::copyup_list_lock"),
+ copyup_list_cond(),
extra_read_flags(0),
old_format(true),
order(0), size(0), features(0),
object_set->return_enoent = true;
object_cacher->start();
}
- cor_completions = new xlist<librados::AioCompletion*>();
}
ImageCtx::~ImageCtx() {
delete object_set;
object_set = NULL;
}
- if (cor_completions) {
- delete cor_completions;
- cor_completions = NULL;
- }
delete[] format_string;
}
pending_aio_cond.Wait(aio_lock);
}
}
- }
-
- void ImageCtx::add_cor_completion(xlist<librados::AioCompletion*>::item *comp)
- {
- if(!comp)
- return;
-
- cor_lock.Lock();
- cor_completions->push_back(comp);
- cor_lock.Unlock();
-
- ldout(cct, 10) << "add_cor_completion:: size = "<< cor_completions->size() << dendl;
- }
- void ImageCtx::wait_last_completions()
- {
- ldout(cct, 10) << "wait_last_completions:: cor_completions = " << cor_completions << " size = " << cor_completions->size() << dendl;
- xlist<librados::AioCompletion*>::iterator itr;
- xlist<librados::AioCompletion*>::item *ptr;
-
- while (!cor_completions->empty()){
- cor_lock.Lock();
- librados::AioCompletion *comp = cor_completions->front();
- comp->wait_for_complete();
- itr = cor_completions->begin();
- ptr = itr.get_cur();
- cor_completions->pop_front();
- delete ptr;
- ptr = NULL;
- cor_lock.Unlock();
+ void ImageCtx::wait_for_pending_copyup() {
+ Mutex::Locker l(copyup_list_lock);
+ while (!copyup_list.empty()) {
+ ldout(cct, 20) << __func__ << " waiting CopyupRequest to be completed" << dendl;
+ copyup_list_cond.Wait(copyup_list_lock);
}
- ldout(cct, 10) << "wait_last_completions:: after clear cor_completions = " << cor_completions << " size = " << cor_completions->size() << dendl;
}
-
- void cor_completion_callback(librados::completion_t aio_completion_impl, void *arg)
- {
- librbd::ImageCtx * ictx = (librbd::ImageCtx *)arg;
-
- ictx->cor_lock.Lock();
- xlist<librados::AioCompletion*> *completions = ictx->cor_completions;
- ictx->cor_lock.Unlock();
-
- ldout(ictx->cct, 10) << "cor_completion_callback:: cor_completions = " << completions << " size = "<< completions->size() << dendl;
- if (!completions)
- return;
-
- //find current AioCompletion item in xlist, and remove it
- for (xlist<librados::AioCompletion*>::iterator itr = completions->begin(); !(itr.end()); ++itr) {
- if (aio_completion_impl == (*itr)->pc){
- xlist<librados::AioCompletion*>::item *ptr = itr.get_cur();
-
- ictx->cor_lock.Lock();
- completions->remove(ptr);
- ictx->cor_lock.Unlock();
-
- delete ptr;//delete xlist<librados::AioCompletion*>::item *
- ptr = NULL;
- break;
- }
- }
- ldout(ictx->cct, 10) << "cor_completion_callback:: after remove item, size = " << completions->size() << dendl;
- }
-
}
namespace librbd {
class ImageWatcher;
+ class CopyupRequest;
struct ImageCtx {
CephContext *cct;
RWLock parent_lock; // protects parent_md and parent
Mutex refresh_lock; // protects refresh_seq and last_refresh
Mutex aio_lock; // protects pending_aio and pending_aio_cond
- Mutex cor_lock; //protects cor_completions for copy-on-read
+ Mutex copyup_list_lock; // protects copyup_waiting_list
+
+ Cond copyup_list_cond; // protected by copyup_waiting_list_lock
unsigned extra_read_flags;
Readahead readahead;
uint64_t total_bytes_read;
+ std::map<uint64_t, CopyupRequest*> copyup_list;
Cond pending_aio_cond;
uint64_t pending_aio;
- xlist<librados::AioCompletion*> *cor_completions; //copy-on-read AioCompletions
/**
* Either image_name or image_id must be set.
uint64_t prune_parent_extents(vector<pair<uint64_t,uint64_t> >& objectx,
uint64_t overlap);
void wait_for_pending_aio();
-
- void add_cor_completion(xlist<librados::AioCompletion*>::item *comp);
- void wait_last_completions();//wait for uncompleted asynchronous write which is still in xlist
+ void wait_for_pending_copyup();
};
- void cor_completion_callback(librados::completion_t aio_completion_impl, void *arg);
}
#endif
librbd_internal_la_SOURCES = \
librbd/AioCompletion.cc \
librbd/AioRequest.cc \
+ librbd/CopyupRequest.cc \
librbd/ImageCtx.cc \
librbd/ImageWatcher.cc \
librbd/internal.cc \
noinst_HEADERS += \
librbd/AioCompletion.h \
librbd/AioRequest.h \
+ librbd/CopyupRequest.h \
librbd/ImageCtx.h \
librbd/ImageWatcher.h \
librbd/internal.h \
#include "librbd/AioCompletion.h"
#include "librbd/AioRequest.h"
+#include "librbd/CopyupRequest.h"
#include "librbd/ImageCtx.h"
#include "librbd/ImageWatcher.h"
{
ldout(ictx->cct, 20) << "snap_set " << ictx << " snap = "
<< (snap_name ? snap_name : "NULL") << dendl;
+
+ ictx->wait_for_pending_copyup();
// ignore return value, since we may be set to a non-existent
// snapshot and the user is trying to fix that
ictx_check(ictx);
if (ictx->image_watcher != NULL) {
ictx->image_watcher->flush_aio_operations();
}
- if (ictx->cor_completions)
- ictx->wait_last_completions();//copy-on-read: wait for unfinished AioCompletion requests
-
- if (ictx->object_cacher)
+ ictx->wait_for_pending_copyup();
+ if (ictx->object_cacher) {
ictx->shutdown_cache(); // implicitly flushes
} else {
flush(ictx);
ictx->snap_lock.get_read();
snap_t snap_id = ictx->snap_id;
+ ::SnapContext snapc = ictx->snapc;
ictx->snap_lock.put_read();
// readahead
C_AioRead *req_comp = new C_AioRead(ictx->cct, c);
AioRead *req = new AioRead(ictx, q->oid.name,
q->objectno, q->offset, q->length,
- q->buffer_extents,
+ q->buffer_extents, snapc,
snap_id, true, req_comp, op_flags);
req_comp->set_req(req);
c->add_request();
rados_ioctx_destroy(ioctx);
}
+TEST_F(TestLibRBD, TestCoR)
+{
+ if (!g_conf->rbd_clone_copy_on_read) {
+ std::cout << "SKIPPING due to disabled rbd_copy_on_read" << std::endl;
+ return;
+ }
+
+ rados_ioctx_t ioctx;
+ rados_ioctx_create(_cluster, m_pool_name.c_str(), &ioctx);
+
+ int features = RBD_FEATURE_LAYERING;
+ rbd_image_t parent, child;
+ int order = 12; // smallest object size is 4K
+ const uint64_t image_size = 4<<20;
+ const int object_size = 1<<12;
+ const int object_num = image_size / object_size;
+ map<uint64_t, uint64_t> write_tracker;
+ set<string> obj_checker;
+ rbd_image_info_t p_info, c_info;
+ rados_list_ctx_t list_ctx;
+ const char *entry;
+
+ // make a parent to clone from
+ ASSERT_EQ(0, create_image_full(ioctx, "parent", image_size, &order, false, features));
+ ASSERT_EQ(0, rbd_open(ioctx, "parent", &parent, NULL));
+ printf("made parent image \"parent\": %ldK (%d * %dK)\n",
+ image_size, object_num, object_size/1024);
+
+ // write something into parent
+ char test_data[TEST_IO_SIZE + 1];
+ char zero_data[TEST_IO_SIZE + 1];
+ int i;
+ int count = 0;
+
+ for (i = 0; i < TEST_IO_SIZE; ++i)
+ test_data[i] = (char) (rand() % (126 - 33) + 33);
+ test_data[TEST_IO_SIZE] = '\0';
+ memset(zero_data, 0, sizeof(zero_data));
+
+ // generate a random map which covers every objects with random offset
+ while (count < 100) {
+ uint64_t ono = rand() % object_num;
+ if (write_tracker.find(ono) == write_tracker.end()) {
+ uint64_t offset = rand() % (object_size - TEST_IO_SIZE);
+ write_tracker.insert(pair<uint64_t, uint64_t>(ono, offset));
+ count++;
+ }
+ }
+
+ printf("generated random write map:\n");
+ for (map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
+ itr != write_tracker.end(); ++itr)
+ printf("\t [%-8ld, %-8ld]\n", itr->first, itr->second);
+
+ printf("write data based on random map\n");
+ for (map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
+ itr != write_tracker.end(); ++itr) {
+ printf("\twrite object-%-4ld\t", itr->first);
+ ASSERT_PASSED(write_test_data, parent, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
+ }
+
+ for (map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
+ itr != write_tracker.end(); ++itr) {
+ printf("\tread object-%-4ld\t", itr->first);
+ ASSERT_PASSED(read_test_data, parent, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
+ }
+
+ // find out what objects the parent image has generated
+ ASSERT_EQ(0, rbd_stat(parent, &p_info, sizeof(p_info)));
+ ASSERT_EQ(0, rados_objects_list_open(ioctx, &list_ctx));
+ while (rados_objects_list_next(list_ctx, &entry, NULL) != -ENOENT) {
+ if (strstr(entry, p_info.block_name_prefix)) {
+ const char *block_name_suffix = entry + strlen(p_info.block_name_prefix) + 1;
+ obj_checker.insert(block_name_suffix);
+ }
+ }
+ rados_objects_list_close(list_ctx);
+ ASSERT_EQ(obj_checker.size(), write_tracker.size());
+
+ // create a snapshot, reopen as the parent we're interested in and protect it
+ ASSERT_EQ(0, rbd_snap_create(parent, "parent_snap"));
+ ASSERT_EQ(0, rbd_close(parent));
+ ASSERT_EQ(0, rbd_open(ioctx, "parent", &parent, "parent_snap"));
+ ASSERT_EQ(0, rbd_snap_protect(parent, "parent_snap"));
+ ASSERT_EQ(0, rbd_close(parent));
+ printf("made snapshot \"parent@parent_snap\" and protect it\n");
+
+ // create a copy-on-read clone and open it
+ ASSERT_EQ(0, rbd_clone(ioctx, "parent", "parent_snap", ioctx, "child",
+ features, &order));
+ ASSERT_EQ(0, rbd_open(ioctx, "child", &child, NULL));
+ printf("made and opened clone \"child\"\n");
+
+ printf("read from \"child\"\n");
+ {
+ map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
+ printf("\tread object-%-4ld\t", itr->first);
+ ASSERT_PASSED(read_test_data, child, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
+ }
+
+ for (map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
+ itr != write_tracker.end(); ++itr) {
+ printf("\tread object-%-4ld\t", itr->first);
+ ASSERT_PASSED(read_test_data, child, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
+ }
+
+ printf("read again reversely\n");
+ for (map<uint64_t, uint64_t>::iterator itr = --write_tracker.end();
+ itr != write_tracker.begin(); --itr) {
+ printf("\tread object-%-4ld\t", itr->first);
+ ASSERT_PASSED(read_test_data, child, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
+ }
+
+ // close child to flush all copy-on-read
+ ASSERT_EQ(0, rbd_close(child));
+
+ printf("check whether child image has the same set of objects as parent\n");
+ ASSERT_EQ(0, rbd_open(ioctx, "child", &child, NULL));
+ ASSERT_EQ(0, rbd_stat(child, &c_info, sizeof(c_info)));
+ ASSERT_EQ(0, rados_objects_list_open(ioctx, &list_ctx));
+ while (rados_objects_list_next(list_ctx, &entry, NULL) != -ENOENT) {
+ if (strstr(entry, c_info.block_name_prefix)) {
+ const char *block_name_suffix = entry + strlen(c_info.block_name_prefix) + 1;
+ set<string>::iterator it = obj_checker.find(block_name_suffix);
+ ASSERT_TRUE(it != obj_checker.end());
+ obj_checker.erase(it);
+ }
+ }
+ rados_objects_list_close(list_ctx);
+ ASSERT_TRUE(obj_checker.empty());
+ ASSERT_EQ(0, rbd_close(child));
+
+ rados_ioctx_destroy(ioctx);
+}
+
static void test_list_children(rbd_image_t image, ssize_t num_expected, ...)
{
va_list ap;