From a871651857afc11da440a5f5ffcce53bc4b050a3 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Tue, 27 Jan 2015 06:07:13 -0500 Subject: [PATCH] librbd: object map updates should use AIO Update the existing AbstractWrite state machine to handle updating the object map before and after the object update. Also convert the resize/trim maintenance operations to use new state machines which handle the pre-/post-operation updates to the object map. Signed-off-by: Jason Dillaman --- src/cls/rbd/cls_rbd_client.cc | 23 +- src/cls/rbd/cls_rbd_client.h | 6 +- src/include/Context.h | 6 +- src/librbd/AioRequest.cc | 170 +++++++-- src/librbd/AioRequest.h | 126 +++++-- src/librbd/AsyncFlattenRequest.cc | 210 +++++++++++ src/librbd/AsyncFlattenRequest.h | 77 ++++ src/librbd/AsyncObjectThrottle.cc | 16 +- src/librbd/AsyncObjectThrottle.h | 4 +- src/librbd/AsyncRequest.cc | 19 + src/librbd/AsyncRequest.h | 62 ++++ src/librbd/AsyncResizeRequest.cc | 216 +++++++++++ src/librbd/AsyncResizeRequest.h | 75 ++++ src/librbd/AsyncTrimRequest.cc | 298 +++++++++++++++ src/librbd/AsyncTrimRequest.h | 77 ++++ src/librbd/CopyupRequest.cc | 165 ++++++--- src/librbd/CopyupRequest.h | 47 ++- src/librbd/ImageCtx.cc | 2 +- src/librbd/ImageCtx.h | 4 +- src/librbd/Makefile.am | 8 + src/librbd/ObjectMap.cc | 265 +++++++++----- src/librbd/ObjectMap.h | 92 ++++- src/librbd/internal.cc | 587 ++---------------------------- src/librbd/internal.h | 10 +- src/osdc/ObjectCacher.h | 6 +- src/test/cls_rbd/test_cls_rbd.cc | 8 +- 26 files changed, 1758 insertions(+), 821 deletions(-) create mode 100644 src/librbd/AsyncFlattenRequest.cc create mode 100644 src/librbd/AsyncFlattenRequest.h create mode 100644 src/librbd/AsyncRequest.cc create mode 100644 src/librbd/AsyncRequest.h create mode 100644 src/librbd/AsyncResizeRequest.cc create mode 100644 src/librbd/AsyncResizeRequest.h create mode 100644 src/librbd/AsyncTrimRequest.cc create mode 100644 src/librbd/AsyncTrimRequest.h diff --git a/src/cls/rbd/cls_rbd_client.cc b/src/cls/rbd/cls_rbd_client.cc index 7b144ffe7ea7..faef6f3ceb35 100644 --- a/src/cls/rbd/cls_rbd_client.cc +++ b/src/cls/rbd/cls_rbd_client.cc @@ -265,15 +265,13 @@ namespace librbd { return 0; } - int set_flags(librados::IoCtx *ioctx, const std::string &oid, - uint64_t flags, uint64_t mask) + void set_flags(librados::ObjectWriteOperation *op, uint64_t flags, + uint64_t mask) { bufferlist inbl; ::encode(flags, inbl); ::encode(mask, inbl); - - bufferlist outbl; - return ioctx->exec(oid, "rbd", "set_flags", inbl, outbl); + op->exec("rbd", "set_flags", inbl); } int remove_parent(librados::IoCtx *ioctx, const std::string &oid) @@ -301,16 +299,23 @@ namespace librbd { return ioctx->exec(oid, "rbd", "add_child", in, out); } - int remove_child(librados::IoCtx *ioctx, const std::string &oid, - parent_spec pspec, const std::string &c_imageid) + void remove_child(librados::ObjectWriteOperation *op, + parent_spec pspec, const std::string &c_imageid) { - bufferlist in, out; + bufferlist in; ::encode(pspec.pool_id, in); ::encode(pspec.image_id, in); ::encode(pspec.snap_id, in); ::encode(c_imageid, in); + op->exec("rbd", "remove_child", in); + } - return ioctx->exec(oid, "rbd", "remove_child", in, out); + int remove_child(librados::IoCtx *ioctx, const std::string &oid, + parent_spec pspec, const std::string &c_imageid) + { + librados::ObjectWriteOperation op; + remove_child(&op, pspec, c_imageid); + return ioctx->operate(oid, &op); } int get_children(librados::IoCtx *ioctx, const std::string &oid, diff --git a/src/cls/rbd/cls_rbd_client.h b/src/cls/rbd/cls_rbd_client.h index d78175fb67cf..0e21da33a6fa 100644 --- a/src/cls/rbd/cls_rbd_client.h +++ b/src/cls/rbd/cls_rbd_client.h @@ -49,12 +49,14 @@ namespace librbd { parent_spec pspec, uint64_t parent_overlap); int get_flags(librados::IoCtx *ioctx, const std::string &oid, snapid_t snap_id, uint64_t *flags); - int set_flags(librados::IoCtx *ioctx, const std::string &oid, - uint64_t flags, uint64_t mask); + void set_flags(librados::ObjectWriteOperation *op, + uint64_t flags, uint64_t mask); int remove_parent(librados::IoCtx *ioctx, const std::string &oid); void remove_parent(librados::ObjectWriteOperation *op); int add_child(librados::IoCtx *ioctx, const std::string &oid, parent_spec pspec, const std::string &c_imageid); + void remove_child(librados::ObjectWriteOperation *op, + parent_spec pspec, const std::string &c_imageid); int remove_child(librados::IoCtx *ioctx, const std::string &oid, parent_spec pspec, const std::string &c_imageid); int get_children(librados::IoCtx *ioctx, const std::string &oid, diff --git a/src/include/Context.h b/src/include/Context.h index 0b91e70c3d72..2ae92214b724 100644 --- a/src/include/Context.h +++ b/src/include/Context.h @@ -452,16 +452,16 @@ typedef C_GatherBuilderBase C_GatherBuilder; class FunctionContext : public Context { public: - FunctionContext(const boost::function &callback) + FunctionContext(const boost::function &callback) : m_callback(callback) { } virtual void finish(int r) { - m_callback(); + m_callback(r); } private: - boost::function m_callback; + boost::function m_callback; }; #undef mydout diff --git a/src/librbd/AioRequest.cc b/src/librbd/AioRequest.cc index 2efde981d627..907367a192e1 100644 --- a/src/librbd/AioRequest.cc +++ b/src/librbd/AioRequest.cc @@ -3,16 +3,20 @@ #include "common/ceph_context.h" #include "common/dout.h" +#include "common/errno.h" #include "common/Mutex.h" #include "common/RWLock.h" #include "librbd/AioCompletion.h" #include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" #include "librbd/internal.h" #include "librbd/AioRequest.h" #include "librbd/CopyupRequest.h" -#include "librbd/ObjectMap.h" + +#include +#include #define dout_subsys ceph_subsys_rbd #undef dout_prefix @@ -21,7 +25,7 @@ namespace librbd { AioRequest::AioRequest() : - m_ictx(NULL), m_ioctx(NULL), + m_ictx(NULL), m_object_no(0), m_object_off(0), m_object_len(0), m_snap_id(CEPH_NOSNAP), m_completion(NULL), m_parent_completion(NULL), m_hide_enoent(false) {} @@ -30,7 +34,7 @@ namespace librbd { const ::SnapContext &snapc, librados::snap_t snap_id, Context *completion, bool hide_enoent) : - m_ictx(ictx), m_ioctx(&ictx->data_ctx), m_oid(oid), m_object_no(objectno), + m_ictx(ictx), m_oid(oid), m_object_no(objectno), m_object_off(off), m_object_len(len), m_snap_id(snap_id), m_completion(completion), m_parent_completion(NULL), m_hide_enoent(hide_enoent) { @@ -176,7 +180,7 @@ namespace librbd { m_object_no, m_image_extents); m_ictx->copyup_list[m_object_no] = new_req; - new_req->queue_read_from_parent(); + new_req->queue_send(); } } } @@ -219,8 +223,7 @@ namespace librbd { } op.set_op_flags2(m_op_flags); - r = m_ioctx->aio_operate(m_oid, rados_completion, &op, flags, NULL); - + r = m_ictx->data_ctx.aio_operate(m_oid, rados_completion, &op, flags, NULL); rados_completion->release(); return r; } @@ -240,8 +243,12 @@ namespace librbd { bool hide_enoent) : AioRequest(ictx, oid, object_no, object_off, len, snapc, snap_id, completion, hide_enoent), - m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val), m_entire_object(NULL) + m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val), + m_entire_object(NULL) { + m_io_ctx.dup(ictx->data_ctx); + m_io_ctx.snap_set_read(CEPH_NOSNAP); + m_object_image_extents = objectx; m_parent_overlap = object_overlap; } @@ -252,6 +259,8 @@ namespace librbd { m_state = LIBRBD_AIO_WRITE_GUARD; m_write.assert_exists(); ldout(m_ictx->cct, 20) << __func__ << " guarding write" << dendl; + } else { + m_state = LIBRBD_AIO_WRITE_FLAT; } } @@ -263,6 +272,21 @@ namespace librbd { map::iterator it; bool finished = true; switch (m_state) { + case LIBRBD_AIO_WRITE_PRE: + ldout(m_ictx->cct, 20) << "WRITE_PRE" << dendl; + if (r < 0) { + return true; + } + + send_write(); + finished = false; + break; + + case LIBRBD_AIO_WRITE_POST: + ldout(m_ictx->cct, 20) << "WRITE_POST" << dendl; + finished = true; + break; + case LIBRBD_AIO_WRITE_GUARD: ldout(m_ictx->cct, 20) << "WRITE_CHECK_GUARD" << dendl; @@ -314,7 +338,7 @@ namespace librbd { m_entire_object = &(new_req->get_copyup_data()); m_ictx->copyup_list_lock.Unlock(); - new_req->read_from_parent(); + new_req->send(); } else { it->second->append_request(this); m_entire_object = &it->second->get_copyup_data(); @@ -332,25 +356,27 @@ namespace librbd { } finished = false; break; - } - if (r < 0) { - ldout(m_ictx->cct, 20) << "error checking for object existence" << dendl; + } else if (r < 0) { + // pass the error code to the finish context + m_state = LIBRBD_AIO_WRITE_ERROR; + complete(r); + finished = false; break; } + + finished = send_post(); break; case LIBRBD_AIO_WRITE_COPYUP: ldout(m_ictx->cct, 20) << "WRITE_COPYUP" << dendl; m_state = LIBRBD_AIO_WRITE_GUARD; - if (r < 0) + if (r < 0) { return should_complete(r); + } // Read data from waiting list safely. If this AioWrite created a // CopyupRequest, m_read_data should be empty. if (m_entire_object != NULL) { - assert(m_ictx->copyup_list_lock.is_locked()); - assert(m_ictx->copyup_list.find(m_object_no) != - m_ictx->copyup_list.end()); assert(m_read_data.length() == 0); m_read_data.append(*m_entire_object); } @@ -361,7 +387,14 @@ namespace librbd { case LIBRBD_AIO_WRITE_FLAT: ldout(m_ictx->cct, 20) << "WRITE_FLAT" << dendl; - // nothing to do + + finished = send_post(); + break; + + case LIBRBD_AIO_WRITE_ERROR: + assert(r < 0); + lderr(m_ictx->cct) << "WRITE_ERROR: " << cpp_strerror(r) + << dendl; break; default: @@ -373,32 +406,111 @@ namespace librbd { } int AbstractWrite::send() { - ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len << dendl; + ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " " + << m_object_off << "~" << m_object_len << dendl; + + if (send_pre()) { + return 0; + } else { + send_write(); + } + return 0; + } + + bool AbstractWrite::send_pre() { + bool lost_exclusive_lock = false; + { + RWLock::RLocker l(m_ictx->owner_lock); + RWLock::RLocker l2(m_ictx->md_lock); + if (m_ictx->object_map == NULL) { + return false; + } + + if (!m_ictx->image_watcher->is_lock_owner()) { + ldout(m_ictx->cct, 1) << "lost exclusive lock during write" << dendl; + lost_exclusive_lock = true; + } else { + ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " " + << m_object_off << "~" << m_object_len << dendl; + + uint8_t new_state; + boost::optional current_state; + pre_object_map_update(&new_state); + + m_state = LIBRBD_AIO_WRITE_PRE; + FunctionContext *ctx = new FunctionContext( + boost::bind(&AioRequest::complete, this, _1)); + m_ictx->object_map->aio_update(m_object_no, new_state, + current_state, ctx); + } + } + + if (lost_exclusive_lock) { + complete(-ERESTART); + } + return true; + } + + bool AbstractWrite::send_post() { + ldout(m_ictx->cct, 20) << "send_post " << this << " " << m_oid << " " + << m_object_off << "~" << m_object_len << dendl; + + RWLock::RLocker l(m_ictx->owner_lock); + RWLock::RLocker l2(m_ictx->md_lock); + if (m_ictx->object_map == NULL || !post_object_map_update()) { + return true; + } + + if (m_ictx->image_watcher->is_lock_supported() && + !m_ictx->image_watcher->is_lock_owner()) { + // leave the object flagged as pending + ldout(m_ictx->cct, 1) << "lost exclusive lock during write" << dendl; + return true; + } + + m_state = LIBRBD_AIO_WRITE_POST; + FunctionContext *ctx = new FunctionContext( + boost::bind(&AioRequest::complete, this, _1)); + m_ictx->object_map->aio_update(m_object_no, OBJECT_NONEXISTENT, + OBJECT_PENDING, ctx); + return false; + } + + void AbstractWrite::send_write() { + ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " " + << m_object_off << "~" << m_object_len << dendl; + + guard_write(); + add_write_ops(&m_write); + assert(m_write.size() != 0); + librados::AioCompletion *rados_completion = librados::Rados::aio_create_completion(this, NULL, rados_req_cb); - int r; - assert(m_write.size()); - r = m_ioctx->aio_operate(m_oid, rados_completion, &m_write, - m_snap_seq, m_snaps); + int r = m_ictx->data_ctx.aio_operate(m_oid, rados_completion, &m_write, + m_snap_seq, m_snaps); + assert(r == 0); rados_completion->release(); - return r; } void AbstractWrite::send_copyup() { ldout(m_ictx->cct, 20) << "send_copyup " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len << dendl; - if (!m_read_data.is_zero()) - m_copyup.exec("rbd", "copyup", m_read_data); - add_copyup_ops(); + librados::ObjectWriteOperation op; + if (!m_read_data.is_zero()) { + op.exec("rbd", "copyup", m_read_data); + } + add_write_ops(&op); + assert(op.size() != 0); librados::AioCompletion *rados_completion = librados::Rados::aio_create_completion(this, NULL, rados_req_cb); - m_ictx->md_ctx.aio_operate(m_oid, rados_completion, &m_copyup, + m_ictx->md_ctx.aio_operate(m_oid, rados_completion, &op, m_snap_seq, m_snaps); rados_completion->release(); } - void AioWrite::add_write_ops(librados::ObjectWriteOperation &wr) { - wr.set_alloc_hint(m_ictx->get_object_size(), m_ictx->get_object_size()); - wr.write(m_object_off, m_write_data); + void AioWrite::add_write_ops(librados::ObjectWriteOperation *wr) { + wr->set_alloc_hint(m_ictx->get_object_size(), m_ictx->get_object_size()); + wr->write(m_object_off, m_write_data); + wr->set_op_flags2(m_op_flags); } } diff --git a/src/librbd/AioRequest.h b/src/librbd/AioRequest.h index 4e29feb82aa6..4ffc7491838f 100644 --- a/src/librbd/AioRequest.h +++ b/src/librbd/AioRequest.h @@ -11,6 +11,7 @@ #include "include/buffer.h" #include "include/Context.h" #include "include/rados/librados.hpp" +#include "librbd/ObjectMap.h" namespace librbd { @@ -50,7 +51,6 @@ namespace librbd { void read_from_parent(vector >& image_extents); ImageCtx *m_ictx; - librados::IoCtx *m_ioctx; std::string m_oid; uint64_t m_object_no, m_object_off, m_object_len; librados::snap_t m_snap_id; @@ -125,7 +125,6 @@ namespace librbd { virtual ~AbstractWrite() {} virtual bool should_complete(int r); virtual int send(); - void guard_write(); bool has_parent() const { return !m_object_image_extents.empty(); @@ -134,38 +133,65 @@ namespace librbd { private: /** * Writes go through the following state machine to deal with - * layering: + * layering and the object map: * - * need copyup - * LIBRBD_AIO_WRITE_GUARD ---------------> LIBRBD_AIO_WRITE_COPYUP - * | ^ | - * v \------------------------------/ - * done - * ^ - * | - * LIBRBD_AIO_WRITE_FLAT + * + * . | + * . | + * . \---> LIBRBD_AIO_WRITE_PRE + * . | | + * . . . . . . | . . . . | . . . . . . . . . . . + * . | -or- | . + * . | | v + * . | \----------------> LIBRBD_AIO_WRITE_FLAT . . . + * . | | . + * v v need copyup | . + * LIBRBD_AIO_WRITE_GUARD -----------> LIBRBD_AIO_WRITE_COPYUP | . + * . | ^ | | . + * . | | | | . + * . | \---------------------------/ | . + * . | | . + * . \-------------------\ /-------------------/ . + * . | | . + * . LIBRBD_AIO_WRITE_POST . + * . | . + * . v . + * . . . . . . . . . . . . . . > < . . . . . . . . . . . . . . * - * Writes start in LIBRBD_AIO_WRITE_GUARD or _FLAT, depending on whether - * there is a parent or not. + * The _PRE_REMOVE/_POST_REMOVE states are skipped if the object map + * is disabled. The write starts in _WRITE_GUARD or _FLAT depending on + * whether or not there is a parent overlap. */ enum write_state_d { LIBRBD_AIO_WRITE_GUARD, LIBRBD_AIO_WRITE_COPYUP, - LIBRBD_AIO_WRITE_FLAT + LIBRBD_AIO_WRITE_FLAT, + LIBRBD_AIO_WRITE_PRE, + LIBRBD_AIO_WRITE_POST, + LIBRBD_AIO_WRITE_ERROR }; protected: - virtual void add_copyup_ops() = 0; - write_state_d m_state; vector > m_object_image_extents; uint64_t m_parent_overlap; librados::ObjectWriteOperation m_write; - librados::ObjectWriteOperation m_copyup; uint64_t m_snap_seq; ceph::bufferlist *m_entire_object; + virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0; + virtual void guard_write(); + virtual void pre_object_map_update(uint8_t *new_state) = 0; + virtual bool post_object_map_update() { + return false; + } + private: + librados::IoCtx m_io_ctx; + + bool send_pre(); + bool send_post(); + void send_write(); void send_copyup(); }; @@ -182,23 +208,22 @@ namespace librbd { objectx, object_overlap, snapc, snap_id, completion, false), - m_write_data(data) { - guard_write(); - add_write_ops(m_write); + m_write_data(data), m_op_flags(0) { } virtual ~AioWrite() {} void set_op_flags(int op_flags) { - m_write.set_op_flags2(op_flags); + m_op_flags = op_flags; } protected: - virtual void add_copyup_ops() { - add_write_ops(m_copyup); + virtual void add_write_ops(librados::ObjectWriteOperation *wr); + virtual void pre_object_map_update(uint8_t *new_state) { + *new_state = OBJECT_EXISTS; } private: - void add_write_ops(librados::ObjectWriteOperation &wr); ceph::bufferlist m_write_data; + int m_op_flags; }; class AioRemove : public AbstractWrite { @@ -213,18 +238,37 @@ namespace librbd { objectx, object_overlap, snapc, snap_id, completion, true) { - if (has_parent()) - m_write.truncate(0); - else - m_write.remove(); } virtual ~AioRemove() {} protected: - virtual void add_copyup_ops() { - // removing an object never needs to copyup - assert(0); + virtual void add_write_ops(librados::ObjectWriteOperation *wr) { + if (has_parent()) { + m_object_state = OBJECT_EXISTS; + wr->truncate(0); + } else { + m_object_state = OBJECT_PENDING; + wr->remove(); + } + } + + virtual void pre_object_map_update(uint8_t *new_state) { + *new_state = m_object_state; } + + virtual bool post_object_map_update() { + if (m_object_state == OBJECT_EXISTS) { + return false; + } + return true; + } + + virtual void guard_write() { + // do nothing to disable write guard + } + + private: + uint8_t m_object_state; }; class AioTruncate : public AbstractWrite { @@ -239,14 +283,16 @@ namespace librbd { objectx, object_overlap, snapc, snap_id, completion, true) { - guard_write(); - m_write.truncate(object_off); } virtual ~AioTruncate() {} protected: - virtual void add_copyup_ops() { - m_copyup.truncate(m_object_off); + virtual void add_write_ops(librados::ObjectWriteOperation *wr) { + wr->truncate(m_object_off); + } + + virtual void pre_object_map_update(uint8_t *new_state) { + *new_state = OBJECT_EXISTS; } }; @@ -262,14 +308,16 @@ namespace librbd { objectx, object_overlap, snapc, snap_id, completion, true) { - guard_write(); - m_write.zero(object_off, object_len); } virtual ~AioZero() {} protected: - virtual void add_copyup_ops() { - m_copyup.zero(m_object_off, m_object_len); + virtual void add_write_ops(librados::ObjectWriteOperation *wr) { + wr->zero(m_object_off, m_object_len); + } + + virtual void pre_object_map_update(uint8_t *new_state) { + *new_state = OBJECT_EXISTS; } }; diff --git a/src/librbd/AsyncFlattenRequest.cc b/src/librbd/AsyncFlattenRequest.cc new file mode 100644 index 000000000000..eacdc4bdb556 --- /dev/null +++ b/src/librbd/AsyncFlattenRequest.cc @@ -0,0 +1,210 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/AsyncFlattenRequest.h" +#include "librbd/AioRequest.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/ObjectMap.h" +#include "common/dout.h" +#include "common/errno.h" +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::AsyncFlattenRequest: " + +namespace librbd { + +class AsyncFlattenObjectContext : public C_AsyncObjectThrottle { +public: + AsyncFlattenObjectContext(AsyncObjectThrottle &throttle, ImageCtx *image_ctx, + uint64_t object_size, ::SnapContext snapc, + uint64_t object_no) + : C_AsyncObjectThrottle(throttle), m_image_ctx(*image_ctx), + m_object_size(object_size), m_snapc(snapc), m_object_no(object_no) + { + } + + virtual int send() { + CephContext *cct = m_image_ctx.cct; + + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.image_watcher->is_lock_supported() && + !m_image_ctx.image_watcher->is_lock_owner()) { + ldout(cct, 1) << "lost exclusive lock during flatten" << dendl; + return -ERESTART; + } + + RWLock::RLocker l2(m_image_ctx.md_lock); + uint64_t overlap; + { + RWLock::RLocker l3(m_image_ctx.parent_lock); + // stop early if the parent went away - it just means + // another flatten finished first, so this one is useless. + if (!m_image_ctx.parent) { + return 1; + } + + // resize might have occurred while flatten is running + overlap = min(m_image_ctx.size, m_image_ctx.parent_md.overlap); + } + + // map child object onto the parent + vector > objectx; + Striper::extent_to_file(cct, &m_image_ctx.layout, m_object_no, + 0, m_object_size, objectx); + uint64_t object_overlap = m_image_ctx.prune_parent_extents(objectx, overlap); + assert(object_overlap <= m_object_size); + if (object_overlap == 0) { + // resize shrunk image while flattening + return 1; + } + + bufferlist bl; + string oid = m_image_ctx.get_object_name(m_object_no); + AioWrite *req = new AioWrite(&m_image_ctx, oid, m_object_no, 0, objectx, + object_overlap, bl, m_snapc, CEPH_NOSNAP, + this); + int r = req->send(); + assert(r == 0); + return 0; + } + +private: + ImageCtx &m_image_ctx; + uint64_t m_object_size; + ::SnapContext m_snapc; + uint64_t m_object_no; +}; + +bool AsyncFlattenRequest::should_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl; + if (r < 0 && !(r == -ENOENT && m_ignore_enoent) ) { + lderr(cct) << "flatten encountered an error: " << cpp_strerror(r) << dendl; + return true; + } + + switch (m_state) { + case STATE_FLATTEN_OBJECTS: + ldout(cct, 5) << "FLATTEN_OBJECTS" << dendl; + return send_update_header(); + + case STATE_UPDATE_HEADER: + ldout(cct, 5) << "UPDATE_HEADER" << dendl; + return send_update_children(); + + case STATE_UPDATE_CHILDREN: + ldout(cct, 5) << "UPDATE_CHILDREN" << dendl; + return true; + + default: + lderr(cct) << "invalid state: " << m_state << dendl; + assert(false); + break; + } + return false; +} + +void AsyncFlattenRequest::send() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " send" << dendl; + + m_state = STATE_FLATTEN_OBJECTS; + AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr(), + boost::lambda::_1, &m_image_ctx, m_object_size, m_snapc, + boost::lambda::_2)); + AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + context_factory, create_callback_context(), m_prog_ctx, 0, + m_overlap_objects); + throttle->start_ops(cct->_conf->rbd_concurrent_management_ops); +} + +bool AsyncFlattenRequest::send_update_header() { + CephContext *cct = m_image_ctx.cct; + bool lost_exclusive_lock = false; + + m_state = STATE_UPDATE_HEADER; + { + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.image_watcher->is_lock_supported() && + !m_image_ctx.image_watcher->is_lock_owner()) { + ldout(cct, 1) << "lost exclusive lock during header update" << dendl; + lost_exclusive_lock = true; + } else { + ldout(cct, 5) << this << " send_update_header" << dendl; + + RWLock::RLocker l2(m_image_ctx.parent_lock); + // stop early if the parent went away - it just means + // another flatten finished first, so this one is useless. + if (!m_image_ctx.parent) { + ldout(cct, 5) << "image already flattened" << dendl; + return true; + } + m_ignore_enoent = true; + m_parent_spec = m_image_ctx.parent_md.spec; + + // remove parent from this (base) image + librados::ObjectWriteOperation op; + if (m_image_ctx.image_watcher->is_lock_supported()) { + m_image_ctx.image_watcher->assert_header_locked(&op); + } + cls_client::remove_parent(&op); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, + rados_completion, &op); + assert(r == 0); + rados_completion->release(); + } + } + + if (lost_exclusive_lock) { + complete(-ERESTART); + } + return false; +} + +bool AsyncFlattenRequest::send_update_children() { + CephContext *cct = m_image_ctx.cct; + bool lost_exclusive_lock = false; + + m_state = STATE_UPDATE_CHILDREN; + { + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.image_watcher->is_lock_supported() && + !m_image_ctx.image_watcher->is_lock_owner()) { + ldout(cct, 1) << "lost exclusive lock during children update" << dendl; + lost_exclusive_lock = true; + } else { + // if there are no snaps, remove from the children object as well + // (if snapshots remain, they have their own parent info, and the child + // will be removed when the last snap goes away) + RWLock::RLocker l2(m_image_ctx.snap_lock); + if (!m_image_ctx.snaps.empty()) { + return true; + } + + ldout(cct, 2) << "removing child from children list..." << dendl; + librados::ObjectWriteOperation op; + cls_client::remove_child(&op, m_parent_spec, m_image_ctx.id); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(RBD_CHILDREN, rados_completion, + &op); + assert(r == 0); + rados_completion->release(); + } + } + + if (lost_exclusive_lock) { + complete(-ERESTART); + } + return false; +} + +} // namespace librbd diff --git a/src/librbd/AsyncFlattenRequest.h b/src/librbd/AsyncFlattenRequest.h new file mode 100644 index 000000000000..3f0c61262968 --- /dev/null +++ b/src/librbd/AsyncFlattenRequest.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_ASYNC_FLATTEN_REQUEST_H +#define CEPH_LIBRBD_ASYNC_FLATTEN_REQUEST_H + +#include "librbd/AsyncRequest.h" +#include "librbd/parent_types.h" +#include "common/snap_types.h" + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +class AsyncFlattenRequest : public AsyncRequest +{ +public: + AsyncFlattenRequest(ImageCtx &image_ctx, Context *on_finish, + uint64_t object_size, uint64_t overlap_objects, + const ::SnapContext &snapc, ProgressContext &prog_ctx) + : AsyncRequest(image_ctx, on_finish), m_object_size(object_size), + m_overlap_objects(overlap_objects), m_snapc(snapc), m_prog_ctx(prog_ctx), + m_ignore_enoent(false) + { + } + + virtual void send(); + +protected: + virtual bool should_complete(int r); + +private: + /** + * Flatten goes through the following state machine to copyup objects + * from the parent image: + * + * + * | + * v + * STATE_FLATTEN_OBJECTS ---> STATE_UPDATE_HEADER . . . . . + * . | . + * . | . + * . v . + * . STATE_UPDATE_CHILDREN . + * . | . + * . | . + * . \---> < . . + * . ^ + * . . + * . . . . . . . . . . . . . . . . . . . + * + * The _UPDATE_CHILDREN state will be skipped if the image has one or + * more snapshots. The _UPDATE_HEADER state will be skipped if the + * image was concurrently flattened by another client. + */ + enum State { + STATE_FLATTEN_OBJECTS, + STATE_UPDATE_HEADER, + STATE_UPDATE_CHILDREN + }; + + uint64_t m_object_size; + uint64_t m_overlap_objects; + ::SnapContext m_snapc; + ProgressContext &m_prog_ctx; + State m_state; + + parent_spec m_parent_spec; + bool m_ignore_enoent; + + bool send_update_header(); + bool send_update_children(); +}; + +} // namespace librbd + +#endif // CEPH_LIBRBD_ASYNC_FLATTEN_REQUEST_H diff --git a/src/librbd/AsyncObjectThrottle.cc b/src/librbd/AsyncObjectThrottle.cc index c0ab54edcf39..28a6a348d1ea 100644 --- a/src/librbd/AsyncObjectThrottle.cc +++ b/src/librbd/AsyncObjectThrottle.cc @@ -17,14 +17,14 @@ AsyncObjectThrottle::AsyncObjectThrottle(const ContextFactory& context_factory, { } -int AsyncObjectThrottle::start_ops(uint64_t max_concurrent) { +void AsyncObjectThrottle::start_ops(uint64_t max_concurrent) { bool complete; { Mutex::Locker l(m_lock); for (uint64_t i = 0; i < max_concurrent; ++i) { - int r = start_next_op(); - if (r < 0 && m_current_ops == 0) { - return r; + start_next_op(); + if (m_ret < 0 && m_current_ops == 0) { + break; } } complete = (m_current_ops == 0); @@ -33,7 +33,6 @@ int AsyncObjectThrottle::start_ops(uint64_t max_concurrent) { m_ctx->complete(m_ret); delete this; } - return 0; } void AsyncObjectThrottle::finish_op(int r) { @@ -54,11 +53,11 @@ void AsyncObjectThrottle::finish_op(int r) { } } -int AsyncObjectThrottle::start_next_op() { +void AsyncObjectThrottle::start_next_op() { bool done = false; while (!done) { if (m_ret != 0 || m_object_no >= m_end_object_no) { - return m_ret; + return; } uint64_t ono = m_object_no++; @@ -68,7 +67,7 @@ int AsyncObjectThrottle::start_next_op() { if (r < 0) { m_ret = r; delete ctx; - return m_ret; + return; } else if (r > 0) { // op completed immediately delete ctx; @@ -78,7 +77,6 @@ int AsyncObjectThrottle::start_next_op() { } m_prog_ctx.update_progress(ono, m_end_object_no); } - return 0; } } // namespace librbd diff --git a/src/librbd/AsyncObjectThrottle.h b/src/librbd/AsyncObjectThrottle.h index fdbab5acf6ec..c7b5bf69a627 100644 --- a/src/librbd/AsyncObjectThrottle.h +++ b/src/librbd/AsyncObjectThrottle.h @@ -46,7 +46,7 @@ public: ProgressContext &prog_ctx, uint64_t object_no, uint64_t end_object_no); - int start_ops(uint64_t max_concurrent); + void start_ops(uint64_t max_concurrent); virtual void finish_op(int r); private: @@ -59,7 +59,7 @@ private: uint64_t m_current_ops; int m_ret; - int start_next_op(); + void start_next_op(); }; } // namespace librbd diff --git a/src/librbd/AsyncRequest.cc b/src/librbd/AsyncRequest.cc new file mode 100644 index 000000000000..332d883cbed2 --- /dev/null +++ b/src/librbd/AsyncRequest.cc @@ -0,0 +1,19 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "librbd/AsyncRequest.h" +#include "librbd/internal.h" +#include + +namespace librbd +{ + +librados::AioCompletion *AsyncRequest::create_callback_completion() { + return librados::Rados::aio_create_completion(create_callback_context(), + NULL, rados_ctx_cb); +} + +Context *AsyncRequest::create_callback_context() { + return new FunctionContext(boost::bind(&AsyncRequest::complete, this, _1)); +} + +} // namespace librbd diff --git a/src/librbd/AsyncRequest.h b/src/librbd/AsyncRequest.h new file mode 100644 index 000000000000..3fdc85c031b4 --- /dev/null +++ b/src/librbd/AsyncRequest.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_ASYNC_REQUEST_H +#define CEPH_LIBRBD_ASYNC_REQUEST_H + +#include "include/int_types.h" +#include "include/Context.h" +#include "include/rados/librados.hpp" + +namespace librbd { + +class ImageCtx; + +class AsyncRequest +{ +public: + AsyncRequest(ImageCtx &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) + { + } + + virtual ~AsyncRequest() {} + + void complete(int r) { + if (should_complete(r)) { + m_on_finish->complete(r); + delete this; + } + } + + virtual void send() = 0; + +protected: + ImageCtx &m_image_ctx; + Context *m_on_finish; + + librados::AioCompletion *create_callback_completion(); + Context *create_callback_context(); + + virtual bool should_complete(int r) = 0; +}; + +class C_AsyncRequest : public Context +{ +public: + C_AsyncRequest(AsyncRequest *req) + : m_req(req) + { + } + +protected: + virtual void finish(int r) { + m_req->complete(r); + } + +private: + AsyncRequest *m_req; +}; + +} // namespace librbd + +#endif //CEPH_LIBRBD_ASYNC_REQUEST_H diff --git a/src/librbd/AsyncResizeRequest.cc b/src/librbd/AsyncResizeRequest.cc new file mode 100644 index 000000000000..d0e330775abd --- /dev/null +++ b/src/librbd/AsyncResizeRequest.cc @@ -0,0 +1,216 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "librbd/AsyncResizeRequest.h" +#include "librbd/AsyncTrimRequest.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/internal.h" +#include "librbd/ObjectMap.h" +#include "common/dout.h" +#include "common/errno.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::AsyncResizeRequest: " + +namespace librbd +{ + +bool AsyncResizeRequest::should_complete(int r) +{ + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "resize encountered an error: " << cpp_strerror(r) << dendl; + RWLock::WLocker l(m_image_ctx.md_lock); + if (m_image_ctx.size == m_new_size) { + m_image_ctx.size = m_original_size; + } + return true; + } + + switch (m_state) { + case STATE_TRIM_IMAGE: + ldout(cct, 5) << "TRIM_IMAGE" << dendl; + send_grow_object_map(); + break; + + case STATE_GROW_OBJECT_MAP: + ldout(cct, 5) << "GROW_OBJECT_MAP" << dendl; + send_update_header(); + break; + + case STATE_UPDATE_HEADER: + ldout(cct, 5) << "UPDATE_HEADER" << dendl; + if (send_shrink_object_map()) { + return true; + } + break; + + case STATE_SHRINK_OBJECT_MAP: + ldout(cct, 5) << "SHRINK_OBJECT_MAP" << dendl; + return true; + + case STATE_FINISHED: + ldout(cct, 5) << "FINISHED" << dendl; + return true; + + default: + lderr(cct) << "invalid state: " << m_state << dendl; + assert(false); + break; + } + return false; +} + +void AsyncResizeRequest::send() { + CephContext *cct = m_image_ctx.cct; + if (m_original_size == m_new_size) { + ldout(cct, 2) << this << " no change in size (" << m_original_size + << " -> " << m_new_size << ")" << dendl; + m_state = STATE_FINISHED; + complete(0); + } else if (m_new_size > m_original_size) { + ldout(cct, 2) << this << " expanding image (" << m_original_size + << " -> " << m_new_size << ")" << dendl; + send_grow_object_map(); + } else { + ldout(cct, 2) << this << " shrinking image (" << m_original_size + << " -> " << m_new_size << ")" << dendl; + send_trim_image(); + } +} + +void AsyncResizeRequest::send_trim_image() { + ldout(m_image_ctx.cct, 5) << this << " send_trim_image: " + << " original_size=" << m_original_size + << " new_size=" << m_new_size << dendl; + m_state = STATE_TRIM_IMAGE; + + { + // update in-memory size to clip concurrent IO operations + RWLock::WLocker l(m_image_ctx.md_lock); + m_image_ctx.size = m_new_size; + } + + AsyncTrimRequest *req = new AsyncTrimRequest(m_image_ctx, + create_callback_context(), + m_original_size, m_new_size, + m_prog_ctx); + req->send(); +} + +void AsyncResizeRequest::send_grow_object_map() { + bool lost_exclusive_lock = false; + bool object_map_enabled = true; + { + RWLock::RLocker l(m_image_ctx.owner_lock); + RWLock::RLocker l2(m_image_ctx.md_lock); + if (m_image_ctx.object_map == NULL) { + object_map_enabled = false; + } else { + ldout(m_image_ctx.cct, 5) << this << " send_grow_object_map: " + << " original_size=" << m_original_size + << " new_size=" << m_new_size << dendl; + m_state = STATE_GROW_OBJECT_MAP; + + if (m_image_ctx.image_watcher->is_lock_supported() && + !m_image_ctx.image_watcher->is_lock_owner()) { + ldout(m_image_ctx.cct, 1) << "lost exclusive lock during grow object map" << dendl; + lost_exclusive_lock = true; + } else { + m_image_ctx.object_map->aio_resize(m_new_size, OBJECT_NONEXISTENT, + create_callback_context()); + object_map_enabled = true; + } + } + } + + if (!object_map_enabled) { + send_update_header(); + } else if (lost_exclusive_lock) { + // only complete when not holding locks + complete(-ERESTART); + } +} + +bool AsyncResizeRequest::send_shrink_object_map() { + bool lost_exclusive_lock = false; + { + RWLock::RLocker l(m_image_ctx.owner_lock); + RWLock::RLocker l2(m_image_ctx.md_lock); + if (m_image_ctx.object_map == NULL || + m_new_size > m_original_size) { + return true; + } + + ldout(m_image_ctx.cct, 5) << this << " send_shrink_object_map: " + << " original_size=" << m_original_size + << " new_size=" << m_new_size << dendl; + m_state = STATE_SHRINK_OBJECT_MAP; + + if (m_image_ctx.image_watcher->is_lock_supported() && + !m_image_ctx.image_watcher->is_lock_owner()) { + ldout(m_image_ctx.cct, 1) << "lost exclusive lock during shrink object map" << dendl; + lost_exclusive_lock = true; + } else { + m_image_ctx.object_map->aio_resize(m_new_size, OBJECT_NONEXISTENT, + create_callback_context()); + } + } + + if (lost_exclusive_lock) { + // only complete when not holding locks + complete(-ERESTART); + } + return false; +} + +void AsyncResizeRequest::send_update_header() { + bool lost_exclusive_lock = false; + + ldout(m_image_ctx.cct, 5) << this << " send_update_header: " + << " original_size=" << m_original_size + << " new_size=" << m_new_size << dendl; + m_state = STATE_UPDATE_HEADER; + + { + RWLock::RLocker l(m_image_ctx.owner_lock); + RWLock::WLocker l2(m_image_ctx.md_lock); + if (m_image_ctx.image_watcher->is_lock_supported() && + !m_image_ctx.image_watcher->is_lock_owner()) { + ldout(m_image_ctx.cct, 1) << "lost exclusive lock during header update" << dendl; + lost_exclusive_lock = true; + } else { + m_image_ctx.size = m_new_size; + + librados::ObjectWriteOperation op; + if (m_image_ctx.old_format) { + // rewrite header + bufferlist bl; + m_image_ctx.header.image_size = m_new_size; + bl.append((const char *)&m_image_ctx.header, sizeof(m_image_ctx.header)); + op.write(0, bl); + } else { + if (m_image_ctx.image_watcher->is_lock_supported()) { + m_image_ctx.image_watcher->assert_header_locked(&op); + } + cls_client::set_size(&op, m_new_size); + } + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, + rados_completion, &op); + assert(r == 0); + rados_completion->release(); + } + } + + if (lost_exclusive_lock) { + // only complete when not holding locks + complete(-ERESTART); + } +} + +} // namespace librbd diff --git a/src/librbd/AsyncResizeRequest.h b/src/librbd/AsyncResizeRequest.h new file mode 100644 index 000000000000..0e72a101e7ff --- /dev/null +++ b/src/librbd/AsyncResizeRequest.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_ASYNC_RESIZE_REQUEST_H +#define CEPH_LIBRBD_ASYNC_RESIZE_REQUEST_H + +#include "librbd/AsyncRequest.h" + +namespace librbd +{ + +class ImageCtx; +class ProgressContext; + +class AsyncResizeRequest : public AsyncRequest +{ +public: + AsyncResizeRequest(ImageCtx &image_ctx, Context *on_finish, + uint64_t original_size, uint64_t new_size, + ProgressContext &prog_ctx) + : AsyncRequest(image_ctx, on_finish), + m_original_size(original_size), m_new_size(new_size), + m_prog_ctx(prog_ctx) + { + } + + virtual void send(); + +protected: + /** + * Resize goes through the following state machine to resize the image + * and update the object map: + * + * ----> STATE_FINISHED --------------------------------\ + * | . | + * | . . . . . . . . . . . . . . . . . . | + * | . | + * | v | + * |---> STATE_GROW_OBJECT_MAP ---> STATE_UPDATE_HEADER -------| + * | | + * | | + * \---> STATE_TRIM_IMAGE --------> STATE_UPDATE_HEADER . . . | + * | . | + * | . | + * v v v + * STATE_SHRINK_OBJECT_MAP ---> + * + * The _OBJECT_MAP states are skipped if the object map isn't enabled. + * The state machine will immediately transition to _FINISHED if there + * are no objects to trim. + */ + enum State { + STATE_TRIM_IMAGE, + STATE_GROW_OBJECT_MAP, + STATE_UPDATE_HEADER, + STATE_SHRINK_OBJECT_MAP, + STATE_FINISHED + }; + + State m_state; + uint64_t m_original_size; + uint64_t m_new_size; + ProgressContext &m_prog_ctx; + + virtual bool should_complete(int r); + + void send_trim_image(); + void send_grow_object_map(); + bool send_shrink_object_map(); + void send_update_header(); + +}; + +} // namespace librbd + +#endif // CEPH_LIBRBD_ASYNC_RESIZE_REQUEST_H diff --git a/src/librbd/AsyncTrimRequest.cc b/src/librbd/AsyncTrimRequest.cc new file mode 100644 index 000000000000..610d2d011a1e --- /dev/null +++ b/src/librbd/AsyncTrimRequest.cc @@ -0,0 +1,298 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "librbd/AsyncTrimRequest.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/AioRequest.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/internal.h" +#include "librbd/ObjectMap.h" +#include "common/ContextCompletion.h" +#include "common/dout.h" +#include "common/errno.h" +#include "osdc/Striper.h" + +#include +#include +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::AsyncTrimRequest: " + +namespace librbd +{ + +class AsyncTrimObjectContext : public C_AsyncObjectThrottle { +public: + AsyncTrimObjectContext(AsyncObjectThrottle &throttle, ImageCtx *image_ctx, + uint64_t object_no) + : C_AsyncObjectThrottle(throttle), m_image_ctx(*image_ctx), + m_object_no(object_no) + { + } + + virtual int send() { + { + RWLock::RLocker l(m_image_ctx.md_lock); + if (m_image_ctx.object_map != NULL && + !m_image_ctx.object_map->object_may_exist(m_object_no)) { + return 1; + } + } + + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.image_watcher->is_lock_supported() && + !m_image_ctx.image_watcher->is_lock_owner()) { + return -ERESTART; + } + + string oid = m_image_ctx.get_object_name(m_object_no); + ldout(m_image_ctx.cct, 10) << "removing " << oid << dendl; + + librados::AioCompletion *rados_completion = + librados::Rados::aio_create_completion(this, NULL, rados_ctx_cb); + int r = m_image_ctx.data_ctx.aio_remove(oid, rados_completion); + assert(r == 0); + rados_completion->release(); + return 0; + } + +private: + ImageCtx &m_image_ctx; + uint64_t m_object_no; +}; + +AsyncTrimRequest::AsyncTrimRequest(ImageCtx &image_ctx, Context *on_finish, + uint64_t original_size, uint64_t new_size, + ProgressContext &prog_ctx) + : AsyncRequest(image_ctx, on_finish), m_new_size(new_size), m_prog_ctx(prog_ctx) +{ + uint64_t period = m_image_ctx.get_stripe_period(); + uint64_t new_num_periods = ((m_new_size + period - 1) / period); + m_delete_off = MIN(new_num_periods * period, original_size); + // first object we can delete free and clear + m_delete_start = new_num_periods * m_image_ctx.get_stripe_count(); + m_num_objects = Striper::get_num_objects(m_image_ctx.layout, original_size); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " trim image " << original_size << " -> " + << m_new_size << " periods " << new_num_periods + << " discard to offset " << m_delete_off + << " delete objects " << m_delete_start + << " to " << m_num_objects << dendl; +} + + +bool AsyncTrimRequest::should_complete(int r) +{ + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " should_complete: r=" << r << dendl; + if (r < 0) { + lderr(cct) << "trim encountered an error: " << cpp_strerror(r) << dendl; + return true; + } + + switch (m_state) { + case STATE_PRE_REMOVE: + ldout(cct, 5) << " PRE_REMOVE" << dendl; + send_remove_objects(); + break; + + case STATE_REMOVE_OBJECTS: + ldout(cct, 5) << " REMOVE_OBJECTS" << dendl; + if (send_post_remove()) { + return true; + } + break; + + case STATE_POST_REMOVE: + ldout(cct, 5) << " POST_OBJECTS" << dendl; + if (send_clean_boundary()) { + return true; + } + break; + + case STATE_CLEAN_BOUNDARY: + ldout(cct, 5) << "CLEAN_BOUNDARY" << dendl; + return true; + + case STATE_FINISHED: + ldout(cct, 5) << "FINISHED" << dendl; + return true; + + default: + lderr(cct) << "invalid state: " << m_state << dendl; + assert(false); + break; + } + return false; +} + +void AsyncTrimRequest::send() { + if (m_delete_start < m_num_objects) { + send_pre_remove(); + } else { + bool finished = send_clean_boundary(); + if (finished) { + m_state = STATE_FINISHED; + complete(0); + } + } +} + +void AsyncTrimRequest::send_remove_objects() { + CephContext *cct = m_image_ctx.cct; + ldout(m_image_ctx.cct, 5) << this << " send_remove_objects: " + << " delete_start=" << m_delete_start + << " num_objects=" << m_num_objects << dendl; + m_state = STATE_REMOVE_OBJECTS; + + Context *ctx = create_callback_context(); + AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr(), + boost::lambda::_1, &m_image_ctx, boost::lambda::_2)); + AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + context_factory, ctx, m_prog_ctx, m_delete_start, m_num_objects); + throttle->start_ops(cct->_conf->rbd_concurrent_management_ops); +} + +void AsyncTrimRequest::send_pre_remove() { + bool lost_exclusive_lock = false; + { + RWLock::RLocker l(m_image_ctx.owner_lock); + RWLock::RLocker l2(m_image_ctx.md_lock); + if (m_image_ctx.object_map == NULL) { + send_remove_objects(); + return; + } + + ldout(m_image_ctx.cct, 5) << this << " send_pre_remove: " + << " delete_start=" << m_delete_start + << " num_objects=" << m_num_objects << dendl; + m_state = STATE_PRE_REMOVE; + + if (!m_image_ctx.image_watcher->is_lock_owner()) { + ldout(m_image_ctx.cct, 1) << "lost exclusive lock during trim" << dendl; + lost_exclusive_lock = true; + } else { + // flag the objects as pending deletion + m_image_ctx.object_map->aio_update( + m_delete_start, m_num_objects, OBJECT_PENDING, OBJECT_EXISTS, + create_callback_context()); + } + } + + if (lost_exclusive_lock) { + complete(-ERESTART); + } +} + +bool AsyncTrimRequest::send_post_remove() { + bool lost_exclusive_lock = false; + { + RWLock::RLocker l(m_image_ctx.owner_lock); + RWLock::RLocker l2(m_image_ctx.md_lock); + if (m_image_ctx.object_map == NULL) { + return send_clean_boundary(); + } + + ldout(m_image_ctx.cct, 5) << this << " send_post_remove: " + << " delete_start=" << m_delete_start + << " num_objects=" << m_num_objects << dendl; + m_state = STATE_POST_REMOVE; + + if (!m_image_ctx.image_watcher->is_lock_owner()) { + ldout(m_image_ctx.cct, 1) << "lost exclusive lock during trim" << dendl; + } else { + // flag the pending objects as removed + m_image_ctx.object_map->aio_update( + m_delete_start, m_num_objects, OBJECT_NONEXISTENT, OBJECT_PENDING, + create_callback_context()); + } + } + + if (lost_exclusive_lock) { + complete(-ERESTART); + } + return false; +} + +bool AsyncTrimRequest::send_clean_boundary() { + CephContext *cct = m_image_ctx.cct; + if (m_delete_off <= m_new_size) { + return true; + } + + bool lost_exclusive_lock = false; + ContextCompletion *completion = NULL; + { + ldout(m_image_ctx.cct, 5) << this << " send_clean_boundary: " + << " delete_start=" << m_delete_start + << " num_objects=" << m_num_objects << dendl; + m_state = STATE_CLEAN_BOUNDARY; + + RWLock::RLocker l(m_image_ctx.owner_lock); + if (m_image_ctx.image_watcher->is_lock_supported() && + !m_image_ctx.image_watcher->is_lock_owner()) { + ldout(m_image_ctx.cct, 1) << "lost exclusive lock during trim" << dendl; + lost_exclusive_lock = true; + } else { + ::SnapContext snapc; + uint64_t parent_overlap; + { + RWLock::RLocker l2(m_image_ctx.snap_lock); + RWLock::RLocker l3(m_image_ctx.parent_lock); + snapc = m_image_ctx.snapc; + parent_overlap = m_image_ctx.parent_md.overlap; + } + + // discard the weird boundary, if any + vector extents; + Striper::file_to_extents(cct, m_image_ctx.format_string, + &m_image_ctx.layout, m_new_size, + m_delete_off - m_new_size, 0, extents); + + completion = new ContextCompletion(create_callback_context(), true); + for (vector::iterator p = extents.begin(); + p != extents.end(); ++p) { + ldout(cct, 20) << " ex " << *p << dendl; + Context *req_comp = new C_ContextCompletion(*completion); + + // reverse map this object extent onto the parent + vector > objectx; + Striper::extent_to_file(cct, &m_image_ctx.layout, p->objectno, 0, + m_image_ctx.layout.fl_object_size, objectx); + uint64_t object_overlap = + m_image_ctx.prune_parent_extents(objectx, parent_overlap); + + AbstractWrite *req; + if (p->offset == 0) { + req = new AioRemove(&m_image_ctx, p->oid.name, p->objectno, objectx, + object_overlap, snapc, CEPH_NOSNAP, req_comp); + } else { + req = new AioTruncate(&m_image_ctx, p->oid.name, p->objectno, p->offset, + objectx, object_overlap, snapc, CEPH_NOSNAP, + req_comp); + } + int r = req->send(); + if (r < 0) { + req_comp->complete(r); + delete req; + break; + } + } + } + + } + + if (lost_exclusive_lock) { + complete(-ERESTART); + } else if (completion != NULL) { + completion->finish_adding_requests(); + } + return false; +} + +} // namespace librbd diff --git a/src/librbd/AsyncTrimRequest.h b/src/librbd/AsyncTrimRequest.h new file mode 100644 index 000000000000..7a89a119bb95 --- /dev/null +++ b/src/librbd/AsyncTrimRequest.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_ASYNC_TRIM_REQUEST_H +#define CEPH_LIBRBD_ASYNC_TRIM_REQUEST_H + +#include "librbd/AsyncRequest.h" + +namespace librbd +{ + +class ImageCtx; +class ProgressContext; + +class AsyncTrimRequest : public AsyncRequest +{ +public: + AsyncTrimRequest(ImageCtx &image_ctx, Context *on_finish, + uint64_t original_size, uint64_t new_size, + ProgressContext &prog_ctx); + + virtual void send(); + +protected: + /** + * Trim goes through the following state machine to remove whole objects, + * clean partially trimmed objects, and update the object map: + * + * . . . . > STATE_FINISHED . . . . . . . . . + * | . . + * | . . . . . . . . . . . . . + * | . . + * v v . + * STATE_PRE_REMOVE ---> STATE_REMOVE_OBJECTS . + * | . . . + * /-----------------------/ . . . . . . . . + * | . . . + * v v v v + * STATE_POST_REMOVE --> STATE_CLEAN_BOUNDARY ---> + * . ^ + * . . + * . . . . . . . . . . . . . . . . . . . . . . . + * + * The _PRE_REMOVE/_POST_REMOVE states are skipped if the object map + * isn't enabled. The _REMOVE_OBJECTS state is skipped if no whole objects + * are removed. The _CLEAN_BOUNDARY state is skipped if no boundary + * objects are cleaned. The state machine will immediately transition + * to _FINISHED state if there are no bytes to trim. + */ + + enum State { + STATE_PRE_REMOVE, + STATE_REMOVE_OBJECTS, + STATE_POST_REMOVE, + STATE_CLEAN_BOUNDARY, + STATE_FINISHED + }; + + virtual bool should_complete(int r); + + State m_state; + +private: + uint64_t m_delete_start; + uint64_t m_num_objects; + uint64_t m_delete_off; + uint64_t m_new_size; + ProgressContext &m_prog_ctx; + + void send_remove_objects(); + void send_pre_remove(); + bool send_post_remove(); + bool send_clean_boundary(); +}; + +} // namespace librbd + +#endif // CEPH_LIBRBD_ASYNC_TRIM_REQUEST_H diff --git a/src/librbd/CopyupRequest.cc b/src/librbd/CopyupRequest.cc index 8328fa556ec6..0a6df6a60c3d 100644 --- a/src/librbd/CopyupRequest.cc +++ b/src/librbd/CopyupRequest.cc @@ -7,12 +7,14 @@ #include "common/Mutex.h" #include "librbd/AioCompletion.h" -#include "librbd/ImageCtx.h" - #include "librbd/AioRequest.h" #include "librbd/CopyupRequest.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" #include "librbd/ObjectMap.h" +#include + #define dout_subsys ceph_subsys_rbd #undef dout_prefix #define dout_prefix *_dout << "librbd::CopyupRequest: " @@ -23,22 +25,12 @@ namespace librbd { uint64_t objectno, vector >& image_extents) : m_ictx(ictx), m_oid(oid), m_object_no(objectno), - m_image_extents(image_extents) + m_image_extents(image_extents), m_state(STATE_READ_FROM_PARENT) { } CopyupRequest::~CopyupRequest() { - assert(m_ictx->copyup_list_lock.is_locked()); assert(m_pending_requests.empty()); - - map::iterator it = - m_ictx->copyup_list.find(m_object_no); - assert(it != m_ictx->copyup_list.end()); - m_ictx->copyup_list.erase(it); - - if (m_ictx->copyup_list.empty()) { - m_ictx->copyup_list_cond.Signal(); - } } ceph::bufferlist& CopyupRequest::get_copyup_data() { @@ -50,7 +42,11 @@ namespace librbd { m_pending_requests.push_back(req); } - void CopyupRequest::complete_all(int r) { + bool CopyupRequest::complete_requests(int r) { + if (m_pending_requests.empty()) { + return false; + } + while (!m_pending_requests.empty()) { vector::iterator it = m_pending_requests.begin(); AioRequest *req = *it; @@ -59,12 +55,12 @@ namespace librbd { req->complete(r); m_pending_requests.erase(it); } + return true; } - void CopyupRequest::send_copyup(int r) { + void CopyupRequest::send_copyup() { ldout(m_ictx->cct, 20) << __func__ << " " << this - << ": oid " << m_oid - << ", r " << r << dendl; + << ": oid " << m_oid << dendl; m_ictx->snap_lock.get_read(); ::SnapContext snapc = m_ictx->snapc; @@ -73,20 +69,6 @@ namespace librbd { std::vector snaps; snaps.insert(snaps.end(), snapc.snaps.begin(), snapc.snaps.end()); - { - RWLock::RLocker l(m_ictx->md_lock); - if (m_ictx->object_map != NULL) { - r = m_ictx->object_map->update(m_object_no, OBJECT_EXISTS, - boost::optional()); - if (r < 0) { - lderr(m_ictx->cct) << __func__ << " " << this - << ": failed to update object map:" - << cpp_strerror(r) << dendl; - return; - } - } - } - librados::ObjectWriteOperation copyup_op; copyup_op.exec("rbd", "copyup", m_copyup_data); @@ -96,55 +78,126 @@ namespace librbd { comp->release(); } - void CopyupRequest::read_from_parent() + void CopyupRequest::send() { + m_state = STATE_READ_FROM_PARENT; AioCompletion *comp = aio_create_completion_internal( - this, &CopyupRequest::read_from_parent_cb); + create_callback_context(), rbd_ctx_cb); + ldout(m_ictx->cct, 20) << __func__ << " " << this << ": completion " << comp << ", oid " << m_oid << ", extents " << m_image_extents << dendl; - int r = aio_read(m_ictx->parent, m_image_extents, NULL, &m_copyup_data, comp, 0); if (r < 0) { comp->release(); - delete this; + + remove_from_list(); + complete_requests(r); } } - void CopyupRequest::queue_read_from_parent() + void CopyupRequest::queue_send() { // TODO: once the ObjectCacher allows reentrant read requests, the finisher // should be eliminated ldout(m_ictx->cct, 20) << __func__ << " " << this << ": oid " << m_oid << " " << ", extents " << m_image_extents << dendl; - C_ReadFromParent *ctx = new C_ReadFromParent(this); + FunctionContext *ctx = new FunctionContext( + boost::bind(&CopyupRequest::send, this)); m_ictx->copyup_finisher->queue(ctx); } - void CopyupRequest::read_from_parent_cb(completion_t cb, void *arg) + void CopyupRequest::complete(int r) { - CopyupRequest *req = reinterpret_cast(arg); - AioCompletion *comp = reinterpret_cast(cb); - - ldout(req->m_ictx->cct, 20) << __func__ << " " << req - << ": oid " << req->m_oid - << ", extents " << req->m_image_extents << dendl; - - // If this entry is created by a read request, then copyup operation will - // be performed asynchronously. Perform cleaning up from copyup callback. - // If this entry is created by a write request, then copyup operation will - // be performed synchronously by AioWrite. After extracting data, perform - // cleaning up here - Mutex::Locker l(req->m_ictx->copyup_list_lock); - if (req->m_pending_requests.empty()) { - req->send_copyup(comp->get_return_value()); - } else { - req->complete_all(comp->get_return_value()); + if (should_complete(r)) { + delete this; } - delete req; + } + + bool CopyupRequest::should_complete(int r) + { + CephContext *cct = m_ictx->cct; + ldout(cct, 20) << __func__ << " " + << ": oid " << m_oid + << ", extents " << m_image_extents + << ", r " << r << dendl; + + switch (m_state) { + case STATE_READ_FROM_PARENT: + ldout(cct, 20) << "READ_FROM_PARENT" << dendl; + remove_from_list(); + if (complete_requests(r)) { + // pending write operation: it will handle object map / copyup + return true; + } else if (r < 0) { + // nothing to copyup + return true; + } else if (send_object_map()) { + return true; + } + break; + + case STATE_OBJECT_MAP: + ldout(cct, 20) << "OBJECT_MAP" << dendl; + if (r == 0) { + send_copyup(); + } + return true; + + default: + lderr(cct) << "invalid state: " << m_state << dendl; + assert(false); + break; + } + return false; + } + + void CopyupRequest::remove_from_list() + { + Mutex::Locker l(m_ictx->copyup_list_lock); + + map::iterator it = + m_ictx->copyup_list.find(m_object_no); + assert(it != m_ictx->copyup_list.end()); + m_ictx->copyup_list.erase(it); + + if (m_ictx->copyup_list.empty()) { + m_ictx->copyup_list_cond.Signal(); + } + } + + bool CopyupRequest::send_object_map() { + bool object_map_enabled = true; + { + RWLock::RLocker l(m_ictx->owner_lock); + RWLock::RLocker l2(m_ictx->md_lock); + if (m_ictx->object_map == NULL) { + object_map_enabled = false; + } else if (!m_ictx->image_watcher->is_lock_owner()) { + ldout(m_ictx->cct, 20) << "exclusive lock not held for copy-on-read" + << dendl; + return true; + } else { + m_state = STATE_OBJECT_MAP; + m_ictx->object_map->aio_update(m_object_no, OBJECT_EXISTS, + boost::optional(), + create_callback_context()); + } + } + + if (!object_map_enabled) { + send_copyup(); + return true; + } + return false; + } + + Context *CopyupRequest::create_callback_context() + { + return new FunctionContext(boost::bind(&CopyupRequest::complete, this, _1)); } } diff --git a/src/librbd/CopyupRequest.h b/src/librbd/CopyupRequest.h index 107189f1f69b..6ec13071bbb9 100644 --- a/src/librbd/CopyupRequest.h +++ b/src/librbd/CopyupRequest.h @@ -21,33 +21,50 @@ namespace librbd { ceph::bufferlist& get_copyup_data(); void append_request(AioRequest *req); - void read_from_parent(); - void queue_read_from_parent(); - private: - class C_ReadFromParent : public Context { - public: - C_ReadFromParent(CopyupRequest *c) : m_req(c) {} - - virtual void finish(int r) { - m_req->read_from_parent(); - } + void send(); + void queue_send(); - private: - CopyupRequest *m_req; + private: + /** + * Copyup requests go through the following state machine to read from the + * parent image, update the object map, and copyup the object: + * + * + * | + * v + * STATE_READ_FROM_PARENT ---> STATE_OBJECT_MAP + * . | + * . . . . . . . . . . . . . | + * . | + * v v + * + * The _OBJECT_MAP state is skipped if the object map isn't enabled. + */ + enum State { + STATE_READ_FROM_PARENT, + STATE_OBJECT_MAP }; ImageCtx *m_ictx; std::string m_oid; uint64_t m_object_no; vector > m_image_extents; + State m_state; ceph::bufferlist m_copyup_data; vector m_pending_requests; - void complete_all(int r); - void send_copyup(int r); - static void read_from_parent_cb(completion_t cb, void *arg); + bool complete_requests(int r); + + void complete(int r); + bool should_complete(int r); + + void remove_from_list(); + + bool send_object_map(); + void send_copyup(); + Context *create_callback_context(); }; } diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc index 9fa8689b4ad0..0bd2372e1114 100644 --- a/src/librbd/ImageCtx.cc +++ b/src/librbd/ImageCtx.cc @@ -505,7 +505,7 @@ namespace librbd { onfinish->complete(r); } - void ImageCtx::write_to_cache(object_t o, bufferlist& bl, size_t len, + void ImageCtx::write_to_cache(object_t o, const bufferlist& bl, size_t len, uint64_t off, Context *onfinish) { snap_lock.get_read(); ObjectCacher::OSDWrite *wr = object_cacher->prepare_write(snapc, bl, diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h index c7aa8cc3e36c..16a46aaf13f5 100644 --- a/src/librbd/ImageCtx.h +++ b/src/librbd/ImageCtx.h @@ -163,8 +163,8 @@ namespace librbd { uint64_t *overlap) const; void aio_read_from_cache(object_t o, uint64_t object_no, bufferlist *bl, size_t len, uint64_t off, Context *onfinish); - void write_to_cache(object_t o, bufferlist& bl, size_t len, uint64_t off, - Context *onfinish); + void write_to_cache(object_t o, const bufferlist& bl, size_t len, + uint64_t off, Context *onfinish); int read_from_cache(object_t o, uint64_t object_no, bufferlist *bl, size_t len, uint64_t off); void user_flushed(); diff --git a/src/librbd/Makefile.am b/src/librbd/Makefile.am index 2880bedb6912..2836c696da03 100644 --- a/src/librbd/Makefile.am +++ b/src/librbd/Makefile.am @@ -1,7 +1,11 @@ librbd_internal_la_SOURCES = \ librbd/AioCompletion.cc \ librbd/AioRequest.cc \ + librbd/AsyncFlattenRequest.cc \ librbd/AsyncObjectThrottle.cc \ + librbd/AsyncRequest.cc \ + librbd/AsyncResizeRequest.cc \ + librbd/AsyncTrimRequest.cc \ librbd/CopyupRequest.cc \ librbd/ImageCtx.cc \ librbd/ImageWatcher.cc \ @@ -38,7 +42,11 @@ lib_LTLIBRARIES += librbd.la noinst_HEADERS += \ librbd/AioCompletion.h \ librbd/AioRequest.h \ + librbd/AsyncFlattenRequest.h \ librbd/AsyncObjectThrottle.h \ + librbd/AsyncRequest.h \ + librbd/AsyncResizeRequest.h \ + librbd/AsyncTrimRequest.h \ librbd/CopyupRequest.h \ librbd/ImageCtx.h \ librbd/ImageWatcher.h \ diff --git a/src/librbd/ObjectMap.cc b/src/librbd/ObjectMap.cc index ada967bd87de..e4c736e2b939 100644 --- a/src/librbd/ObjectMap.cc +++ b/src/librbd/ObjectMap.cc @@ -2,6 +2,7 @@ // vim: ts=8 sw=2 smarttab #include "librbd/ObjectMap.h" #include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" #include "librbd/internal.h" #include "common/dout.h" #include "common/errno.h" @@ -131,8 +132,8 @@ int ObjectMap::refresh() ldout(cct, 20) << "refreshed object map: " << object_map.size() << dendl; - uint64_t num_objs = Striper::get_num_objects(m_image_ctx.layout, - m_image_ctx.get_current_size()); + uint64_t num_objs = Striper::get_num_objects( + m_image_ctx.layout, m_image_ctx.get_image_size(m_image_ctx.snap_id)); if (object_map.size() != num_objs) { // resize op might have been interrupted lderr(cct) << "incorrect object map size: " << object_map.size() @@ -142,115 +143,205 @@ int ObjectMap::refresh() } return 0; } -int ObjectMap::resize(uint8_t default_object_state) -{ - if ((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) == 0) { - return 0; - } - RWLock::WLocker l(m_image_ctx.object_map_lock); - CephContext *cct = m_image_ctx.cct; - uint64_t num_objs = Striper::get_num_objects(m_image_ctx.layout, - m_image_ctx.get_current_size()); - ldout(cct, 20) << "resizing object map: " << num_objs << dendl; - librados::ObjectWriteOperation op; - rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", ""); - cls_client::object_map_resize(&op, num_objs, default_object_state); - int r = m_image_ctx.data_ctx.operate(object_map_name(m_image_ctx.id), &op); - if (r == -EBUSY) { - lderr(cct) << "object map lock not owned by client" << dendl; - return r; - } else if (r < 0) { - lderr(cct) << "error resizing object map: size=" << num_objs << ", " - << "state=" << default_object_state << ", " - << "error=" << cpp_strerror(r) << dendl; - invalidate(); - return 0; - } - size_t orig_object_map_size = object_map.size(); - object_map.resize(num_objs); - for (uint64_t i = orig_object_map_size; i < object_map.size(); ++i) { - object_map[i] = default_object_state; - } - return 0; +void ObjectMap::aio_resize(uint64_t new_size, uint8_t default_object_state, + Context *on_finish) { + assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0); + assert(m_image_ctx.owner_lock.is_locked()); + assert(m_image_ctx.image_watcher->is_lock_owner()); + + ResizeRequest *req = new ResizeRequest( + m_image_ctx, new_size, default_object_state, on_finish); + req->send(); } -int ObjectMap::update(uint64_t object_no, uint8_t new_state, - const boost::optional ¤t_state) +void ObjectMap::aio_update(uint64_t object_no, uint8_t new_state, + const boost::optional ¤t_state, + Context *on_finish) { - return update(object_no, object_no + 1, new_state, current_state); + aio_update(object_no, object_no + 1, new_state, current_state, on_finish); } -int ObjectMap::update(uint64_t start_object_no, - uint64_t end_object_no, uint8_t new_state, - const boost::optional ¤t_state) +void ObjectMap::aio_update(uint64_t start_object_no, uint64_t end_object_no, + uint8_t new_state, + const boost::optional ¤t_state, + Context *on_finish) { - if ((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) == 0) { - return 0; - } - - RWLock::WLocker l(m_image_ctx.object_map_lock); - CephContext *cct = m_image_ctx.cct; - assert(start_object_no <= end_object_no); - assert(end_object_no <= object_map.size() || - (m_image_ctx.flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0); - if (end_object_no > object_map.size()) { - ldout(cct, 20) << "skipping update of invalid object map" << dendl; - return 0; - } + assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0); + assert(m_image_ctx.owner_lock.is_locked()); + assert(m_image_ctx.image_watcher->is_lock_owner()); bool update_required = false; - for (uint64_t object_no = start_object_no; object_no < end_object_no; - ++object_no) { - if ((!current_state || object_map[object_no] == *current_state) && - object_map[object_no] != new_state) { - update_required = true; - break; + { + RWLock::WLocker l(m_image_ctx.object_map_lock); + assert(start_object_no < end_object_no); + + CephContext *cct = m_image_ctx.cct; + if (end_object_no > object_map.size()) { + ldout(cct, 20) << "skipping update of invalid object map" << dendl; + return; + } + + for (uint64_t object_no = start_object_no; object_no < end_object_no; + ++object_no) { + if ((!current_state || object_map[object_no] == *current_state) && + object_map[object_no] != new_state) { + update_required = true; + break; + } + } + + if (update_required) { + UpdateRequest *req = new UpdateRequest(m_image_ctx, start_object_no, + end_object_no, new_state, + current_state, on_finish); + req->send(); } } if (!update_required) { - return 0; + on_finish->complete(0); } +} - ldout(cct, 20) << "updating object map: [" << start_object_no << "," - << end_object_no << ") = " - << static_cast(new_state) << dendl; +void ObjectMap::invalidate() { + CephContext *cct = m_image_ctx.cct; + lderr(cct) << this << " invalidating object map" << dendl; + m_image_ctx.flags |= RBD_FLAG_OBJECT_MAP_INVALID; librados::ObjectWriteOperation op; - rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", ""); - cls_client::object_map_update(&op, start_object_no, end_object_no, - new_state, current_state); - int r = m_image_ctx.data_ctx.operate(object_map_name(m_image_ctx.id), &op); - if (r == -EBUSY) { - lderr(cct) << "object map lock not owned by client" << dendl; - return r; - } else if (r < 0) { - lderr(cct) << "object map update failed: " << cpp_strerror(r) << dendl; - invalidate(); - } else { - for (uint64_t object_no = start_object_no; object_no < end_object_no; - ++object_no) { - if (!current_state || object_map[object_no] == *current_state) { - object_map[object_no] = new_state; + cls_client::set_flags(&op, m_image_ctx.flags, RBD_FLAG_OBJECT_MAP_INVALID); + + int r = m_image_ctx.md_ctx.operate(m_image_ctx.header_oid, &op); + if (r < 0) { + lderr(cct) << "failed to invalidate object map: " << cpp_strerror(r) + << dendl; + } +} + +bool ObjectMap::Request::should_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " should_complete: r=" << r << dendl; + + switch (m_state) + { + case STATE_REQUEST: + if (r == -EBUSY) { + lderr(cct) << "object map lock not owned by client" << dendl; + return true; + } else if (r < 0) { + lderr(cct) << "failed to update object map: " << cpp_strerror(r) + << dendl; + invalidate(); + return false; + } + + { + RWLock::RLocker l(m_image_ctx.md_lock); + RWLock::WLocker l2(m_image_ctx.object_map_lock); + ObjectMap *object_map = m_image_ctx.object_map; + if (object_map != NULL) { + finish(object_map); } } + return true; + + case STATE_INVALIDATE: + ldout(cct, 20) << "INVALIDATE" << dendl; + if (r < 0) { + lderr(cct) << "failed to invalidate object map: " << cpp_strerror(r) + << dendl; + return true; + } + break; + + default: + lderr(cct) << "invalid state: " << m_state << dendl; + assert(false); + break; } - return 0; + return false; } -void ObjectMap::invalidate() -{ - // TODO: md_lock +void ObjectMap::Request::invalidate() { + CephContext *cct = m_image_ctx.cct; + RWLock::WLocker l(m_image_ctx.md_lock); + + lderr(cct) << this << " invalidating object map" << dendl; + m_state = STATE_INVALIDATE; m_image_ctx.flags |= RBD_FLAG_OBJECT_MAP_INVALID; - int r = cls_client::set_flags(&m_image_ctx.md_ctx, - m_image_ctx.header_oid, - m_image_ctx.flags, - RBD_FLAG_OBJECT_MAP_INVALID); - if (r < 0) { - lderr(m_image_ctx.cct) << "Failed to invalidate object map: " - << cpp_strerror(r) << dendl; + + librados::ObjectWriteOperation op; + cls_client::set_flags(&op, m_image_ctx.flags, RBD_FLAG_OBJECT_MAP_INVALID); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, + rados_completion, &op); + assert(r == 0); + rados_completion->release(); +} + +void ObjectMap::ResizeRequest::send() { + CephContext *cct = m_image_ctx.cct; + + RWLock::WLocker l(m_image_ctx.object_map_lock); + m_num_objs = Striper::get_num_objects(m_image_ctx.layout, m_new_size); + + ldout(cct, 5) << this << " resizing on-disk object map: " << m_num_objs << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", ""); + cls_client::object_map_resize(&op, m_num_objs, m_default_object_state); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.data_ctx.aio_operate(object_map_name(m_image_ctx.id), + rados_completion, &op); + assert(r == 0); + rados_completion->release(); +} + +void ObjectMap::ResizeRequest::finish(ObjectMap *object_map) { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 5) << this << " resizing in-memory object map: " << m_num_objs << dendl; + size_t orig_object_map_size = object_map->object_map.size(); + object_map->object_map.resize(m_num_objs); + for (uint64_t i = orig_object_map_size; i < object_map->object_map.size(); ++i) { + object_map->object_map[i] = m_default_object_state; + } +} + +void ObjectMap::UpdateRequest::send() { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 20) << this << " updating on-disk object map: [" + << m_start_object_no << "," << m_end_object_no << ") = " + << static_cast(m_new_state) << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", ""); + cls_client::object_map_update(&op, m_start_object_no, m_end_object_no, + m_new_state, m_current_state); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.data_ctx.aio_operate(object_map_name(m_image_ctx.id), + rados_completion, &op); + assert(r == 0); + rados_completion->release(); +} + +void ObjectMap::UpdateRequest::finish(ObjectMap *object_map) { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 20) << this << " updating in-memory object map" << dendl; + for (uint64_t object_no = m_start_object_no; + object_no < MIN(m_end_object_no, object_map->object_map.size()); + ++object_no) { + if (!m_current_state || + object_map->object_map[object_no] == *m_current_state) { + object_map->object_map[object_no] = m_new_state; + } } } diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h index da426d18f3b1..648043d38f96 100644 --- a/src/librbd/ObjectMap.h +++ b/src/librbd/ObjectMap.h @@ -4,11 +4,19 @@ #define CEPH_LIBRBD_OBJECT_MAP_H #include "include/int_types.h" +#include "include/rados/librados.hpp" #include "common/bit_vector.hpp" +#include "librbd/AsyncRequest.h" #include +class Context; + namespace librbd { +static const uint8_t OBJECT_NONEXISTENT = 0; +static const uint8_t OBJECT_EXISTS = 1; +static const uint8_t OBJECT_PENDING = 2; + class ImageCtx; class ObjectMap { @@ -21,16 +29,88 @@ public: bool object_may_exist(uint64_t object_no) const; - int refresh(); - int resize(uint8_t default_object_state); + void aio_resize(uint64_t new_size, uint8_t default_object_state, + Context *on_finish); + void aio_update(uint64_t object_no, uint8_t new_state, + const boost::optional ¤t_state, + Context *on_finish); + void aio_update(uint64_t start_object_no, uint64_t end_object_no, + uint8_t new_state, + const boost::optional ¤t_state, + Context *on_finish); - int update(uint64_t object_no, uint8_t new_state, - const boost::optional ¤t_state); - int update(uint64_t start_object_no, uint64_t end_object_no, - uint8_t new_state, const boost::optional ¤t_state); + int refresh(); private: + class Request : public AsyncRequest { + public: + Request(ImageCtx &image_ctx, Context *on_finish) + : AsyncRequest(image_ctx, on_finish), m_state(STATE_REQUEST) + { + } + + protected: + virtual bool should_complete(int r); + virtual void finish(ObjectMap *object_map) = 0; + + private: + /** + * ---> STATE_REQUEST ---> + * | ^ + * v | + * STATE_INVALIDATE -------/ + */ + enum State { + STATE_REQUEST, + STATE_INVALIDATE + }; + + State m_state; + + void invalidate(); + }; + + class ResizeRequest : public Request { + public: + ResizeRequest(ImageCtx &image_ctx, uint64_t new_size, + uint8_t default_object_state, Context *on_finish) + : Request(image_ctx, on_finish), m_num_objs(0), m_new_size(new_size), + m_default_object_state(default_object_state) + { + } + + virtual void send(); + protected: + virtual void finish(ObjectMap *object_map); + private: + uint64_t m_num_objs; + uint64_t m_new_size; + uint8_t m_default_object_state; + }; + + class UpdateRequest : public Request { + public: + UpdateRequest(ImageCtx &image_ctx, uint64_t start_object_no, + uint64_t end_object_no, uint8_t new_state, + const boost::optional ¤t_state, + Context *on_finish) + : Request(image_ctx, on_finish), m_start_object_no(start_object_no), + m_end_object_no(end_object_no), m_new_state(new_state), + m_current_state(current_state) + { + } + + virtual void send(); + protected: + virtual void finish(ObjectMap *object_map); + private: + uint64_t m_start_object_no; + uint64_t m_end_object_no; + uint8_t m_new_state; + boost::optional m_current_state; + }; + ImageCtx &m_image_ctx; ceph::BitVector<2> object_map; diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc index a200d183e64d..f3966362a20d 100644 --- a/src/librbd/internal.cc +++ b/src/librbd/internal.cc @@ -17,7 +17,9 @@ #include "librbd/AioCompletion.h" #include "librbd/AioRequest.h" -#include "librbd/AsyncObjectThrottle.h" +#include "librbd/AsyncFlattenRequest.h" +#include "librbd/AsyncResizeRequest.h" +#include "librbd/AsyncTrimRequest.h" #include "librbd/CopyupRequest.h" #include "librbd/ImageCtx.h" #include "librbd/ImageWatcher.h" @@ -30,8 +32,6 @@ #include "librados/snap_set_diff.h" #include -#include -#include #include #include "include/assert.h" @@ -167,30 +167,15 @@ namespace librbd { assert(!ictx->image_watcher->is_lock_supported() || ictx->image_watcher->is_lock_owner()); - Mutex my_lock("librbd::trim_image::my_lock"); - Cond cond; - bool done; - int ret; - - CephContext *cct = ictx->cct; - Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &ret); - ret = async_trim_image(ictx, ctx, ictx->size, newsize, prog_ctx); - if (ret < 0) { - lderr(cct) << "warning: failed to remove object(s): " - << cpp_strerror(ret) << dendl; - delete ctx; - return; - } - - my_lock.Lock(); - while (!done) { - cond.Wait(my_lock); - } - my_lock.Unlock(); + C_SaferCond *ctx = new C_SaferCond(); + AsyncTrimRequest *req = new AsyncTrimRequest(*ictx, ctx, ictx->size, + newsize, prog_ctx); + req->send(); - if (ret < 0) { - lderr(cct) << "warning: failed to remove some object(s): " - << cpp_strerror(ret) << dendl; + int r = ctx->wait(); + if (r < 0) { + lderr(ictx->cct) << "warning: failed to remove some object(s): " + << cpp_strerror(r) << dendl; } } @@ -1606,9 +1591,7 @@ reprotect_and_return_err: int r; do { - Mutex my_lock("librbd::resize::my_lock"); - Cond cond; - bool done; + C_SaferCond *ctx; { RWLock::RLocker l(ictx->owner_lock); while (ictx->image_watcher->is_lock_supported()) { @@ -1626,7 +1609,7 @@ reprotect_and_return_err: ldout(ictx->cct, 5) << "resize timed out notifying lock owner" << dendl; } - Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &r); + ctx = new C_SaferCond(); r = async_resize(ictx, ctx, size, prog_ctx); if (r < 0) { delete ctx; @@ -1634,40 +1617,17 @@ reprotect_and_return_err: } } - my_lock.Lock(); - while (!done) { - cond.Wait(my_lock); - } - my_lock.Unlock(); - + r = ctx->wait(); if (r == -ERESTART) { ldout(ictx->cct, 5) << "resize interrupted: restarting" << dendl; } } while (r == -ERESTART); + ictx->perfcounter->inc(l_librbd_resize); notify_change(ictx->md_ctx, ictx->header_oid, ictx); ldout(cct, 2) << "resize finished" << dendl; return r; } - - class AsyncResizeFinishContext : public Context { - public: - AsyncResizeFinishContext(ImageCtx *ictx, Context *ctx) - : m_ictx(ictx), m_ctx(ctx) - { - } - - virtual void finish(int r) { - ldout(m_ictx->cct, 2) << "async_resize finished" << dendl; - m_ictx->perfcounter->inc(l_librbd_resize); - m_ctx->complete(r); - } - - private: - ImageCtx *m_ictx; - Context *m_ctx; - }; - int async_resize(ImageCtx *ictx, Context *ctx, uint64_t size, ProgressContext &prog_ctx) { @@ -1705,297 +1665,17 @@ reprotect_and_return_err: } } - AsyncResizeFinishContext *finish_ctx = - new AsyncResizeFinishContext(ictx, ctx); - r = async_resize_helper(ictx, finish_ctx, original_size, size, prog_ctx); - if (r < 0) { - delete ctx; - return r; - } + async_resize_helper(ictx, ctx, original_size, size, prog_ctx); return 0; } - class AsyncResizeHelperFinishContext : public Context { - public: - AsyncResizeHelperFinishContext(ImageCtx *ictx, Context *ctx, - uint64_t original_size, uint64_t new_size) - : m_ictx(ictx), m_ctx(ctx), m_original_size(original_size), - m_new_size(new_size) - { - } - - virtual void finish(int r) { - BOOST_SCOPE_EXIT((m_ctx) (m_ictx) (m_new_size) (m_original_size) (&r)) { - ldout(m_ictx->cct, 2) << "async_resize_helper finished (" - << r << ")" << dendl; - - if (r < 0) { - RWLock::WLocker l(m_ictx->md_lock); - if (m_ictx->size == m_new_size) { - m_ictx->size = m_original_size; - } - m_ctx->complete(r); - } - } BOOST_SCOPE_EXIT_END - - if (r < 0) { - return; - } - - RWLock::RLocker l(m_ictx->owner_lock); - if (m_ictx->image_watcher->is_lock_supported() && - !m_ictx->image_watcher->is_lock_owner()) { - r = -ERESTART; - return; - } - - RWLock::WLocker l2(m_ictx->md_lock); - m_ictx->size = m_new_size; - - librados::ObjectWriteOperation op; - if (m_ictx->old_format) { - // rewrite header - bufferlist bl; - m_ictx->header.image_size = m_new_size; - bl.append((const char *)&m_ictx->header, sizeof(m_ictx->header)); - op.write(0, bl); - } else { - if (m_ictx->image_watcher->is_lock_supported()) { - m_ictx->image_watcher->assert_header_locked(&op); - } - cls_client::set_size(&op, m_new_size); - } - - librados::AioCompletion *rados_completion = - librados::Rados::aio_create_completion(m_ctx, NULL, rados_ctx_cb); - r = m_ictx->md_ctx.aio_operate(m_ictx->header_oid, rados_completion, &op); - rados_completion->release(); - if (r < 0) { - lderr(m_ictx->cct) << "error writing header: " << cpp_strerror(r) - << dendl; - return; - } - - if (m_ictx->object_map != NULL) { - m_ictx->size = m_new_size; - m_ictx->object_map->resize(OBJECT_NONEXISTENT); - } - } - - private: - ImageCtx *m_ictx; - Context *m_ctx; - uint64_t m_original_size; - uint64_t m_new_size; - }; - - int async_resize_helper(ImageCtx *ictx, Context *ctx, uint64_t original_size, - uint64_t new_size, ProgressContext& prog_ctx) + void async_resize_helper(ImageCtx *ictx, Context *ctx, uint64_t original_size, + uint64_t new_size, ProgressContext& prog_ctx) { - CephContext *cct = ictx->cct; - if (original_size == new_size) { - ldout(cct, 2) << "no change in size (" << original_size << " -> " - << new_size << ")" << dendl; - ctx->complete(0); - return 0; - } - - AsyncResizeHelperFinishContext *finish_ctx = - new AsyncResizeHelperFinishContext(ictx, ctx, original_size, new_size); - if (new_size > original_size) { - ldout(cct, 2) << "expanding image " << original_size << " -> " - << new_size << dendl; - finish_ctx->complete(0); - } else { - ldout(cct, 2) << "shrinking image " << original_size << " -> " - << new_size << dendl; - - { - // update in-memory size to clip concurrent IO operations - RWLock::WLocker l(ictx->md_lock); - ictx->size = new_size; - } - - int r = async_trim_image(ictx, finish_ctx, original_size, new_size, - prog_ctx); - if (r < 0) { - delete finish_ctx; - return r; - } - } - return 0; - } - - class AsyncTrimObjectContext : public C_AsyncObjectThrottle { - public: - AsyncTrimObjectContext(AsyncObjectThrottle &throttle, ImageCtx *ictx, - uint64_t object_no) - : C_AsyncObjectThrottle(throttle), m_ictx(ictx), m_object_no(object_no) - { - } - - virtual int send() { - { - RWLock::RLocker l(m_ictx->md_lock); - if (m_ictx->object_map != NULL && - !m_ictx->object_map->object_may_exist(m_object_no)) { - return 1; - } - } - - RWLock::RLocker l(m_ictx->owner_lock); - if (m_ictx->image_watcher->is_lock_supported() && - !m_ictx->image_watcher->is_lock_owner()) { - return -ERESTART; - } - - string oid = m_ictx->get_object_name(m_object_no); - librados::AioCompletion *rados_completion = - librados::Rados::aio_create_completion(this, NULL, rados_ctx_cb); - m_ictx->data_ctx.aio_remove(oid, rados_completion); - rados_completion->release(); - return 0; - } - - private: - ImageCtx *m_ictx; - uint64_t m_object_no; - }; - - class AsyncTrimFinishContext : public Context { - public: - - AsyncTrimFinishContext(ImageCtx *ictx, Context *ctx, uint64_t delete_start, - uint64_t num_objects, uint64_t delete_offset, - uint64_t new_size) - : m_ictx(ictx), m_ctx(ctx), m_delete_start(delete_start), - m_num_objects(num_objects), m_delete_offset(delete_offset), - m_new_size(new_size) - { - } - - virtual void finish(int r) { - if (r < 0) { - m_ctx->complete(r); - return; - } - - RWLock::RLocker l(m_ictx->owner_lock); - if (m_ictx->image_watcher->is_lock_supported() && - !m_ictx->image_watcher->is_lock_owner()) { - m_ctx->complete(-ERESTART); - return; - } - - RWLock::RLocker l2(m_ictx->md_lock); - if (m_ictx->object_map != NULL) { - m_ictx->object_map->update(m_delete_start, m_num_objects, - OBJECT_NONEXISTENT, OBJECT_PENDING); - } - - if (m_delete_offset <= m_new_size) { - m_ctx->complete(r); - return; - } - - // discard the weird boundary, if any - vector extents; - Striper::file_to_extents(m_ictx->cct, m_ictx->format_string, - &m_ictx->layout, m_new_size, - m_delete_offset - m_new_size, 0, extents); - - ContextCompletion *completion = new ContextCompletion(m_ctx, true); - for (vector::iterator p = extents.begin(); - p != extents.end(); ++p) { - ldout(m_ictx->cct, 20) << " ex " << *p << dendl; - Context *req_comp = new C_ContextCompletion(*completion); - librados::AioCompletion *rados_completion = - librados::Rados::aio_create_completion(req_comp, NULL, rados_ctx_cb); - - bool flag_nonexistent = false; - if (p->offset == 0) { - flag_nonexistent = true; - if (m_ictx->object_map != NULL) { - m_ictx->object_map->update(p->objectno, OBJECT_PENDING, - OBJECT_EXISTS); - } - m_ictx->data_ctx.aio_remove(p->oid.name, rados_completion); - } else { - if (m_ictx->object_map != NULL) { - m_ictx->object_map->update(p->objectno, OBJECT_EXISTS, - boost::optional()); - } - librados::ObjectWriteOperation op; - op.truncate(p->offset); - m_ictx->data_ctx.aio_operate(p->oid.name, rados_completion, &op); - } - rados_completion->release(); - - if (flag_nonexistent && m_ictx->object_map != NULL) { - m_ictx->object_map->update(p->objectno, OBJECT_NONEXISTENT, - OBJECT_PENDING); - } - } - completion->finish_adding_requests(); - } - - private: - ImageCtx *m_ictx; - Context *m_ctx; - uint64_t m_delete_start; - uint64_t m_num_objects; - uint64_t m_delete_offset; - uint64_t m_new_size; - }; - - int async_trim_image(ImageCtx *ictx, Context *ctx, uint64_t original_size, - uint64_t new_size, ProgressContext& prog_ctx) - { - CephContext *cct = (CephContext *)ictx->data_ctx.cct(); - - uint64_t period = ictx->get_stripe_period(); - uint64_t new_num_periods = ((new_size + period - 1) / period); - uint64_t delete_off = MIN(new_num_periods * period, original_size); - // first object we can delete free and clear - uint64_t delete_start = new_num_periods * ictx->get_stripe_count(); - uint64_t num_objects = Striper::get_num_objects(ictx->layout, original_size); - - ldout(cct, 10) << "trim_image " << original_size << " -> " << new_size - << " periods " << new_num_periods - << " discard to offset " << delete_off - << " delete objects " << delete_start - << " to " << (num_objects-1) - << dendl; - - AsyncTrimFinishContext *finish_ctx = - new AsyncTrimFinishContext(ictx, ctx, delete_start, num_objects, - delete_off, new_size); - if (delete_start < num_objects) { - ldout(cct, 2) << "trim_image objects " << delete_start << " to " - << (num_objects - 1) << dendl; - - { - RWLock::RLocker l(ictx->md_lock); - if (ictx->object_map != NULL) { - ictx->object_map->update(delete_start, num_objects, OBJECT_PENDING, - OBJECT_EXISTS); - } - } - - AsyncObjectThrottle::ContextFactory context_factory( - boost::lambda::bind(boost::lambda::new_ptr(), - boost::lambda::_1, ictx, boost::lambda::_2)); - AsyncObjectThrottle *throttle = new AsyncObjectThrottle( - context_factory, finish_ctx, prog_ctx, delete_start, num_objects); - int r = throttle->start_ops(cct->_conf->rbd_concurrent_management_ops); - if (r < 0) { - delete throttle; - return r; - } - } else { - finish_ctx->complete(0); - } - return 0; + assert(ictx->owner_lock.is_locked()); + AsyncResizeRequest *req = new AsyncResizeRequest(*ictx, ctx, original_size, + new_size, prog_ctx); + req->send(); } int snap_list(ImageCtx *ictx, vector& snaps) @@ -2403,25 +2083,12 @@ reprotect_and_return_err: } } - Mutex my_lock("librbd::snap_rollback::my_lock"); - Cond cond; - bool done; - Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &r); - ldout(cct, 2) << "resizing to snapshot size..." << dendl; NoOpProgressContext no_op; - r = async_resize_helper(ictx, ctx, original_size, new_size, no_op); - if (r < 0) { - delete ctx; - return r; - } - - my_lock.Lock(); - while (!done) { - cond.Wait(my_lock); - } - my_lock.Unlock(); + C_SaferCond *ctx = new C_SaferCond(); + async_resize_helper(ictx, ctx, original_size, new_size, no_op); + r = ctx->wait(); if (r < 0) { lderr(cct) << "Error resizing to snapshot size: " << cpp_strerror(r) << dendl; @@ -2707,7 +2374,6 @@ reprotect_and_return_err: ictx->shutdown_cache(); // implicitly flushes } else { flush(ictx); - ictx->wait_for_pending_aio(); } if (ictx->copyup_finisher != NULL) { @@ -2738,140 +2404,6 @@ reprotect_and_return_err: delete ictx; } - class AsyncFlattenObjectContext : public C_AsyncObjectThrottle { - public: - AsyncFlattenObjectContext(AsyncObjectThrottle &throttle, ImageCtx *ictx, - uint64_t object_size, ::SnapContext snapc, - uint64_t object_no) - : C_AsyncObjectThrottle(throttle), m_ictx(ictx), - m_object_size(object_size), m_snapc(snapc), m_object_no(object_no) - { - } - - virtual int send() { - int r = ictx_check(m_ictx); - if (r < 0) { - return r; - } - - RWLock::RLocker l(m_ictx->owner_lock); - if (m_ictx->image_watcher->is_lock_supported() && - !m_ictx->image_watcher->is_lock_owner()) { - return -ERESTART; - } - - RWLock::RLocker l2(m_ictx->md_lock); - uint64_t overlap; - { - RWLock::RLocker l3(m_ictx->parent_lock); - // stop early if the parent went away - it just means - // another flatten finished first, so this one is useless. - if (!m_ictx->parent) { - return 1; - } - - // resize might have occurred while flatten is running - overlap = min(m_ictx->size, m_ictx->parent_md.overlap); - } - - // map child object onto the parent - vector > objectx; - Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no, 0, - m_object_size, objectx); - uint64_t object_overlap = m_ictx->prune_parent_extents(objectx, overlap); - assert(object_overlap <= m_object_size); - if (object_overlap == 0) { - // resize shrunk image while flattening - return 1; - } - - bufferlist bl; - string oid = m_ictx->get_object_name(m_object_no); - AioWrite *req = new AioWrite(m_ictx, oid, m_object_no, 0, objectx, - object_overlap, bl, m_snapc, CEPH_NOSNAP, - this); - r = req->send(); - if (r < 0) { - lderr(m_ictx->cct) << "failed to flatten object " << oid << dendl; - delete req; - return r; - } - return 0; - } - - private: - ImageCtx *m_ictx; - uint64_t m_object_size; - ::SnapContext m_snapc; - uint64_t m_object_no; - - }; - - class AsyncFlattenFinishContext : public Context { - public: - AsyncFlattenFinishContext(ImageCtx *ictx, Context *ctx, - uint64_t overlap_objects) - : m_ictx(ictx), m_ctx(ctx), m_overlap_objects(overlap_objects) - { - } - - virtual void finish(int r) { - BOOST_SCOPE_EXIT((&m_ctx) (&r)) { - m_ctx->complete(r); - } BOOST_SCOPE_EXIT_END - - CephContext *cct = m_ictx->cct; - if (r < 0) { - lderr(cct) << "failed to flatten at least one object: " - << cpp_strerror(r) << dendl; - return; - } - - RWLock::RLocker l(m_ictx->owner_lock); - if (m_ictx->image_watcher->is_lock_supported() && - !m_ictx->image_watcher->is_lock_owner()) { - r = -ERESTART; - return; - } - - // remove parent from this (base) image - librados::ObjectWriteOperation op; - if (m_ictx->image_watcher->is_lock_supported()) { - m_ictx->image_watcher->assert_header_locked(&op); - } - cls_client::remove_parent(&op); - r = m_ictx->md_ctx.operate(m_ictx->header_oid, &op); - if (r < 0) { - lderr(cct) << "error removing parent" << dendl; - return; - } - - // and if there are no snaps, remove from the children object as well - // (if snapshots remain, they have their own parent info, and the child - // will be removed when the last snap goes away) - m_ictx->snap_lock.get_read(); - if (m_ictx->snaps.empty()) { - ldout(cct, 2) << "removing child from children list..." << dendl; - int r = cls_client::remove_child(&m_ictx->md_ctx, RBD_CHILDREN, - m_ictx->parent_md.spec, m_ictx->id); - if (r < 0) { - lderr(cct) << "error removing child from children list" << dendl; - m_ictx->snap_lock.put_read(); - return; - } - } - m_ictx->snap_lock.put_read(); - - ldout(cct, 20) << "finished flattening" << dendl; - return; - } - - private: - ImageCtx *m_ictx; - Context *m_ctx; - uint64_t m_overlap_objects; - }; - // 'flatten' child image by copying all parent's blocks int flatten(ImageCtx *ictx, ProgressContext &prog_ctx) { @@ -2884,9 +2416,7 @@ reprotect_and_return_err: int r; do { - Mutex my_lock("librbd::flatten:my_lock"); - Cond cond; - bool done; + C_SaferCond *ctx; { RWLock::RLocker l(ictx->owner_lock); while (ictx->image_watcher->is_lock_supported()) { @@ -2904,7 +2434,7 @@ reprotect_and_return_err: ldout(ictx->cct, 5) << "flatten timed out notifying lock owner" << dendl; } - Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &r); + ctx = new C_SaferCond(); r = async_flatten(ictx, ctx, prog_ctx); if (r < 0) { delete ctx; @@ -2912,12 +2442,7 @@ reprotect_and_return_err: } } - my_lock.Lock(); - while (!done) { - cond.Wait(my_lock); - } - my_lock.Unlock(); - + r = ctx->wait(); if (r == -ERESTART) { ldout(ictx->cct, 5) << "flatten interrupted: restarting" << dendl; } @@ -2977,20 +2502,10 @@ reprotect_and_return_err: overlap_objects = Striper::get_num_objects(ictx->layout, overlap); } - AsyncObjectThrottle::ContextFactory context_factory( - boost::lambda::bind(boost::lambda::new_ptr(), - boost::lambda::_1, ictx, object_size, snapc, - boost::lambda::_2)); - AsyncFlattenFinishContext *finish_ctx = - new AsyncFlattenFinishContext(ictx, ctx, overlap_objects); - AsyncObjectThrottle *throttle = new AsyncObjectThrottle( - context_factory, finish_ctx, prog_ctx, 0, overlap_objects); - r = throttle->start_ops(cct->_conf->rbd_concurrent_management_ops); - if (r < 0) { - delete throttle; - return r; - } - + AsyncFlattenRequest *req = + new AsyncFlattenRequest(*ictx, ctx, object_size, overlap_objects, + snapc, prog_ctx); + req->send(); return 0; } @@ -3635,6 +3150,7 @@ reprotect_and_return_err: r = ictx->flush_cache(); } else { r = ictx->data_ctx.aio_flush(); + ictx->wait_for_pending_aio(); } if (r) @@ -3724,17 +3240,6 @@ reprotect_and_return_err: bl.append(buf + q->first, q->second); } - { - RWLock::RLocker l(ictx->md_lock); - if (ictx->object_map != NULL) { - r = ictx->object_map->update(p->objectno, OBJECT_EXISTS, - boost::optional()); - if (r < 0) { - goto done; - } - } - } - C_AioWrite *req_comp = new C_AioWrite(cct, c); if (ictx->object_cacher) { c->add_request(); @@ -3754,8 +3259,10 @@ reprotect_and_return_err: req->set_op_flags(op_flags); r = req->send(); - if (r < 0) + if (r < 0) { + req->complete(r); goto done; + } } } done: @@ -3764,8 +3271,6 @@ reprotect_and_return_err: ictx->perfcounter->inc(l_librbd_aio_wr); ictx->perfcounter->inc(l_librbd_aio_wr_bytes, mylen); - - /* FIXME: cleanup all the allocated stuff */ return r; } @@ -3834,15 +3339,9 @@ reprotect_and_return_err: object_overlap = ictx->prune_parent_extents(objectx, overlap); } - RWLock::RLocker l(ictx->md_lock); - bool flag_nonexistent = false; if (p->offset == 0 && p->length == ictx->layout.fl_object_size) { req = new AioRemove(ictx, p->oid.name, p->objectno, objectx, object_overlap, snapc, snap_id, req_comp); - if (!req->has_parent() && ictx->object_map != NULL) { - ictx->object_map->update(p->objectno, OBJECT_PENDING, OBJECT_EXISTS); - flag_nonexistent = true; - } } else if (p->offset + p->length == ictx->layout.fl_object_size) { req = new AioTruncate(ictx, p->oid.name, p->objectno, p->offset, objectx, object_overlap, snapc, snap_id, req_comp); @@ -3852,18 +3351,10 @@ reprotect_and_return_err: snapc, snap_id, req_comp); } - if (!flag_nonexistent && ictx->object_map != NULL) { - ictx->object_map->update(p->objectno, OBJECT_EXISTS, - boost::optional()); - } - r = req->send(); - if (r < 0) + if (r < 0) { + req->complete(r); goto done; - - if (flag_nonexistent && ictx->object_map != NULL) { - ictx->object_map->update(p->objectno, OBJECT_NONEXISTENT, - OBJECT_PENDING); } } r = 0; @@ -3878,8 +3369,6 @@ reprotect_and_return_err: ictx->perfcounter->inc(l_librbd_aio_discard); ictx->perfcounter->inc(l_librbd_aio_discard_bytes, len); - - /* FIXME: cleanup all the allocated stuff */ return r; } diff --git a/src/librbd/internal.h b/src/librbd/internal.h index 48ddcfcb3e44..9984b4b663da 100644 --- a/src/librbd/internal.h +++ b/src/librbd/internal.h @@ -73,10 +73,6 @@ namespace librbd { } }; - static const uint8_t OBJECT_NONEXISTENT = 0; - static const uint8_t OBJECT_EXISTS = 1; - static const uint8_t OBJECT_PENDING = 2; - const std::string id_obj_name(const std::string &name); const std::string header_name(const std::string &image_id); const std::string old_header_name(const std::string &image_name); @@ -197,10 +193,8 @@ namespace librbd { int async_flatten(ImageCtx *ictx, Context *ctx, ProgressContext &prog_ctx); int async_resize(ImageCtx *ictx, Context *ctx, uint64_t size, ProgressContext &prog_ctx); - int async_resize_helper(ImageCtx *ictx, Context *ctx, uint64_t original_size, - uint64_t new_size, ProgressContext& prog_ctx); - int async_trim_image(ImageCtx *ictx, Context *ctx, uint64_t original_size, - uint64_t new_size, ProgressContext& prog_ctx); + void async_resize_helper(ImageCtx *ictx, Context *ctx, uint64_t original_size, + uint64_t new_size, ProgressContext& prog_ctx); int aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf, AioCompletion *c, int op_flags); diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h index f243cf2627d9..3b6561b96af6 100644 --- a/src/osdc/ObjectCacher.h +++ b/src/osdc/ObjectCacher.h @@ -71,10 +71,12 @@ class ObjectCacher { bufferlist bl; utime_t mtime; int flags; - OSDWrite(const SnapContext& sc, bufferlist& b, utime_t mt, int f) : snapc(sc), bl(b), mtime(mt), flags(f) {} + OSDWrite(const SnapContext& sc, const bufferlist& b, utime_t mt, int f) + : snapc(sc), bl(b), mtime(mt), flags(f) {} }; - OSDWrite *prepare_write(const SnapContext& sc, bufferlist &b, utime_t mt, int f) { + OSDWrite *prepare_write(const SnapContext& sc, const bufferlist &b, + utime_t mt, int f) { return new OSDWrite(sc, b, mt, f); } diff --git a/src/test/cls_rbd/test_cls_rbd.cc b/src/test/cls_rbd/test_cls_rbd.cc index 3642efe2f876..fb124189c005 100644 --- a/src/test/cls_rbd/test_cls_rbd.cc +++ b/src/test/cls_rbd/test_cls_rbd.cc @@ -1041,7 +1041,9 @@ TEST_F(TestClsRbd, flags) ASSERT_EQ(0, get_flags(&ioctx, oid, CEPH_NOSNAP, &flags)); ASSERT_EQ(0U, flags); - ASSERT_EQ(0, set_flags(&ioctx, oid, 3, 2)); + librados::ObjectWriteOperation op1; + set_flags(&op1, 3, 2); + ASSERT_EQ(0, ioctx.operate(oid, &op1)); ASSERT_EQ(0, get_flags(&ioctx, oid, CEPH_NOSNAP, &flags)); ASSERT_EQ(2U, flags); @@ -1049,7 +1051,9 @@ TEST_F(TestClsRbd, flags) ASSERT_EQ(-ENOENT, get_flags(&ioctx, oid, snap_id, &flags)); ASSERT_EQ(0, snapshot_add(&ioctx, oid, snap_id, "snap")); - ASSERT_EQ(0, set_flags(&ioctx, oid, 31, 4)); + librados::ObjectWriteOperation op2; + set_flags(&op2, 31, 4); + ASSERT_EQ(0, ioctx.operate(oid, &op2)); ASSERT_EQ(0, get_flags(&ioctx, oid, CEPH_NOSNAP, &flags)); ASSERT_EQ(6U, flags); -- 2.47.3