From: Jason Dillaman Date: Mon, 10 Nov 2014 17:25:50 +0000 (-0500) Subject: librbd: Create async versions of long-running maintenance operations X-Git-Tag: v0.93~193^2~7 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=398bc9648d48fa5659dd80001112aa69e3d31197;p=ceph.git librbd: Create async versions of long-running maintenance operations Resize and flatten now have async versions. The existing resize and flatten operations now use the async versions internally. The async operations will be used by the client holding the exclusive lock when it receives maintenance requests from other clients. Signed-off-by: Jason Dillaman --- diff --git a/src/common/ContextCompletion.cc b/src/common/ContextCompletion.cc new file mode 100644 index 0000000000000..d5ff9f2204c43 --- /dev/null +++ b/src/common/ContextCompletion.cc @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "common/ContextCompletion.h" + +namespace ceph +{ + +ContextCompletion::ContextCompletion(Context *ctx, bool ignore_enoent) + : m_lock("ceph::ContextCompletion::m_lock"), m_ctx(ctx), + m_ignore_enoent(ignore_enoent), m_ret(0), m_building(true), m_current_ops(0) +{ +} + +void ContextCompletion::finish_adding_requests() { + bool complete; + { + Mutex::Locker l(m_lock); + m_building = false; + complete = (m_current_ops == 0); + } + if (complete) { + m_ctx->complete(m_ret); + delete this; + } +} + +void ContextCompletion::start_op() { + Mutex::Locker l(m_lock); + ++m_current_ops; +} + +void ContextCompletion::finish_op(int r) { + bool complete; + { + Mutex::Locker l(m_lock); + if (r < 0 && m_ret == 0 && (!m_ignore_enoent || r != -ENOENT)) { + m_ret = r; + } + + --m_current_ops; + complete = (m_current_ops == 0 && !m_building); + } + if (complete) { + m_ctx->complete(m_ret); + delete this; + } +} + +} // namespace ceph diff --git a/src/common/ContextCompletion.h b/src/common/ContextCompletion.h new file mode 100644 index 0000000000000..2cf5d55cb0c05 --- /dev/null +++ b/src/common/ContextCompletion.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_ASYNC_COMPLETION_H +#define CEPH_ASYNC_COMPLETION_H + +#include "include/int_types.h" +#include "include/Context.h" + +namespace ceph { + +class ContextCompletion { +public: + ContextCompletion(Context *ctx, bool ignore_enoent); + + void finish_adding_requests(); + + void start_op(); + void finish_op(int r); + +private: + Mutex m_lock; + Context *m_ctx; + bool m_ignore_enoent; + int m_ret; + bool m_building; + uint64_t m_current_ops; +}; + +class C_ContextCompletion : public Context { +public: + C_ContextCompletion(ContextCompletion &context_completion) + : m_context_completion(context_completion) + { + m_context_completion.start_op(); + } + + virtual void finish(int r) { + m_context_completion.finish_op(r); + } + +private: + ContextCompletion &m_context_completion; +}; + +} // namespace ceph + +#endif // CEPH_ASYNC_COMPLETION_H diff --git a/src/common/Makefile.am b/src/common/Makefile.am index 7408345b6abd6..07a7f1c77ee9c 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -73,7 +73,8 @@ libcommon_internal_la_SOURCES = \ common/linux_version.c \ common/module.c \ common/Readahead.cc \ - common/Cycles.cc + common/Cycles.cc \ + common/ContextCompletion.cc if ENABLE_XIO libcommon_internal_la_SOURCES += \ @@ -225,7 +226,8 @@ noinst_HEADERS += \ common/Continuation.h \ common/Readahead.h \ common/Cycles.h \ - common/Initialize.h + common/Initialize.h \ + common/ContextCompletion.h if ENABLE_XIO noinst_HEADERS += \ diff --git a/src/librbd/AsyncObjectThrottle.cc b/src/librbd/AsyncObjectThrottle.cc new file mode 100644 index 0000000000000..c0ab54edcf393 --- /dev/null +++ b/src/librbd/AsyncObjectThrottle.cc @@ -0,0 +1,84 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "librbd/AsyncObjectThrottle.h" +#include "include/rbd/librbd.hpp" + +namespace librbd +{ + +AsyncObjectThrottle::AsyncObjectThrottle(const ContextFactory& context_factory, + Context *ctx, ProgressContext &prog_ctx, + uint64_t object_no, + uint64_t end_object_no) + : m_lock("librbd::AsyncThrottle::m_lock"), + m_context_factory(context_factory), m_ctx(ctx), m_prog_ctx(prog_ctx), + m_object_no(object_no), m_end_object_no(end_object_no), m_current_ops(0), + m_ret(0) +{ +} + +int AsyncObjectThrottle::start_ops(uint64_t max_concurrent) { + bool complete; + { + Mutex::Locker l(m_lock); + for (uint64_t i = 0; i < max_concurrent; ++i) { + int r = start_next_op(); + if (r < 0 && m_current_ops == 0) { + return r; + } + } + complete = (m_current_ops == 0); + } + if (complete) { + m_ctx->complete(m_ret); + delete this; + } + return 0; +} + +void AsyncObjectThrottle::finish_op(int r) { + bool complete; + { + Mutex::Locker l(m_lock); + --m_current_ops; + if (r < 0 && r != -ENOENT && m_ret == 0) { + m_ret = r; + } + + start_next_op(); + complete = (m_current_ops == 0); + } + if (complete) { + m_ctx->complete(m_ret); + delete this; + } +} + +int AsyncObjectThrottle::start_next_op() { + bool done = false; + while (!done) { + if (m_ret != 0 || m_object_no >= m_end_object_no) { + return m_ret; + } + + uint64_t ono = m_object_no++; + C_AsyncObjectThrottle *ctx = m_context_factory(*this, ono); + + int r = ctx->send(); + if (r < 0) { + m_ret = r; + delete ctx; + return m_ret; + } else if (r > 0) { + // op completed immediately + delete ctx; + } else { + ++m_current_ops; + done = true; + } + m_prog_ctx.update_progress(ono, m_end_object_no); + } + return 0; +} + +} // namespace librbd diff --git a/src/librbd/AsyncObjectThrottle.h b/src/librbd/AsyncObjectThrottle.h new file mode 100644 index 0000000000000..fdbab5acf6ec6 --- /dev/null +++ b/src/librbd/AsyncObjectThrottle.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H +#define CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H + +#include "include/int_types.h" +#include "include/Context.h" + +#include +#include "include/assert.h" + +namespace librbd +{ +class ProgressContext; + +class AsyncObjectThrottleFinisher { +public: + virtual ~AsyncObjectThrottleFinisher() {}; + virtual void finish_op(int r) = 0; +}; + +class C_AsyncObjectThrottle : public Context { +public: + C_AsyncObjectThrottle(AsyncObjectThrottleFinisher &finisher) + : m_finisher(finisher) + { + } + + virtual void finish(int r) + { + m_finisher.finish_op(r); + } + + virtual int send() = 0; + +private: + AsyncObjectThrottleFinisher &m_finisher; +}; + +class AsyncObjectThrottle : public AsyncObjectThrottleFinisher { +public: + typedef boost::function ContextFactory; + + AsyncObjectThrottle(const ContextFactory& context_factory, Context *ctx, + ProgressContext &prog_ctx, uint64_t object_no, + uint64_t end_object_no); + + int start_ops(uint64_t max_concurrent); + virtual void finish_op(int r); + +private: + Mutex m_lock; + ContextFactory m_context_factory; + Context *m_ctx; + ProgressContext &m_prog_ctx; + uint64_t m_object_no; + uint64_t m_end_object_no; + uint64_t m_current_ops; + int m_ret; + + int start_next_op(); +}; + +} // namespace librbd + +#endif // CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H diff --git a/src/librbd/Makefile.am b/src/librbd/Makefile.am index 7222a16d9af87..3a49a12df1b36 100644 --- a/src/librbd/Makefile.am +++ b/src/librbd/Makefile.am @@ -1,6 +1,7 @@ librbd_internal_la_SOURCES = \ librbd/AioCompletion.cc \ librbd/AioRequest.cc \ + librbd/AsyncObjectThrottle.cc \ librbd/CopyupRequest.cc \ librbd/ImageCtx.cc \ librbd/ImageWatcher.cc \ @@ -36,6 +37,7 @@ lib_LTLIBRARIES += librbd.la noinst_HEADERS += \ librbd/AioCompletion.h \ librbd/AioRequest.h \ + librbd/AsyncObjectThrottle.h \ librbd/CopyupRequest.h \ librbd/ImageCtx.h \ librbd/ImageWatcher.h \ diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc index 3617c5ac5e8b7..03da0ce68f9a2 100644 --- a/src/librbd/internal.cc +++ b/src/librbd/internal.cc @@ -8,6 +8,7 @@ #include "common/ceph_context.h" #include "common/dout.h" #include "common/errno.h" +#include "common/ContextCompletion.h" #include "common/Throttle.h" #include "cls/lock/cls_lock_client.h" #include "include/stringify.h" @@ -16,6 +17,7 @@ #include "librbd/AioCompletion.h" #include "librbd/AioRequest.h" +#include "librbd/AsyncObjectThrottle.h" #include "librbd/CopyupRequest.h" #include "librbd/ImageCtx.h" #include "librbd/ImageWatcher.h" @@ -27,6 +29,10 @@ #include "librados/snap_set_diff.h" #include +#include +#include +#include +#include "include/assert.h" #define dout_subsys ceph_subsys_rbd #undef dout_prefix @@ -151,66 +157,30 @@ namespace librbd { void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx) { - CephContext *cct = (CephContext *)ictx->data_ctx.cct(); - - uint64_t size = ictx->get_current_size(); - uint64_t period = ictx->get_stripe_period(); - uint64_t num_period = ((newsize + period - 1) / period); - uint64_t delete_off = MIN(num_period * period, size); - // first object we can delete free and clear - uint64_t delete_start = num_period * ictx->get_stripe_count(); - uint64_t num_objects = Striper::get_num_objects(ictx->layout, size); - uint64_t object_size = ictx->get_object_size(); - - ldout(cct, 10) << "trim_image " << size << " -> " << newsize - << " periods " << num_period - << " discard to offset " << delete_off - << " delete objects " << delete_start - << " to " << (num_objects-1) - << dendl; + Mutex my_lock("librbd::trim_image::my_lock"); + Cond cond; + bool done; + int ret; - SimpleThrottle throttle(cct->_conf->rbd_concurrent_management_ops, true); - if (delete_start < num_objects) { - ldout(cct, 2) << "trim_image objects " << delete_start << " to " - << (num_objects - 1) << dendl; - for (uint64_t i = delete_start; i < num_objects; ++i) { - string oid = ictx->get_object_name(i); - Context *req_comp = new C_SimpleThrottle(&throttle); - librados::AioCompletion *rados_completion = - librados::Rados::aio_create_completion(req_comp, NULL, rados_ctx_cb); - ictx->data_ctx.aio_remove(oid, rados_completion); - rados_completion->release(); - prog_ctx.update_progress((i - delete_start) * object_size, - (num_objects - delete_start) * object_size); - } + CephContext *cct = ictx->cct; + Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &ret); + ret = async_trim_image(ictx, ctx, ictx->size, newsize, prog_ctx); + if (ret < 0) { + lderr(cct) << "warning: failed to remove object(s): " + << cpp_strerror(ret) << dendl; + delete ctx; + return; } - // discard the weird boundary, if any - if (delete_off > newsize) { - vector extents; - Striper::file_to_extents(ictx->cct, ictx->format_string, &ictx->layout, - newsize, delete_off - newsize, 0, extents); - - for (vector::iterator p = extents.begin(); - p != extents.end(); ++p) { - ldout(ictx->cct, 20) << " ex " << *p << dendl; - Context *req_comp = new C_SimpleThrottle(&throttle); - librados::AioCompletion *rados_completion = - librados::Rados::aio_create_completion(req_comp, NULL, rados_ctx_cb); - if (p->offset == 0) { - ictx->data_ctx.aio_remove(p->oid.name, rados_completion); - } else { - librados::ObjectWriteOperation op; - op.truncate(p->offset); - ictx->data_ctx.aio_operate(p->oid.name, rados_completion, &op); - } - rados_completion->release(); - } + my_lock.Lock(); + while (!done) { + cond.Wait(my_lock); } - int r = throttle.wait_for_ret(); - if (r < 0) { + my_lock.Unlock(); + + if (ret < 0) { lderr(cct) << "warning: failed to remove some object(s): " - << cpp_strerror(r) << dendl; + << cpp_strerror(ret) << dendl; } } @@ -320,8 +290,13 @@ namespace librbd { int rollback_image(ImageCtx *ictx, uint64_t snap_id, ProgressContext& prog_ctx) { - uint64_t numseg = Striper::get_num_objects(ictx->layout, ictx->get_current_size()); uint64_t bsize = ictx->get_object_size(); + uint64_t numseg; + { + RWLock::RLocker l(ictx->md_lock); + numseg = Striper::get_num_objects(ictx->layout, ictx->get_current_size()); + } + int r; CephContext *cct = ictx->cct; SimpleThrottle throttle(cct->_conf->rbd_concurrent_management_ops, true); @@ -1550,59 +1525,61 @@ reprotect_and_return_err: return 0; } - int resize_helper(ImageCtx *ictx, uint64_t size, ProgressContext& prog_ctx) + int resize(ImageCtx *ictx, uint64_t size, ProgressContext& prog_ctx) { CephContext *cct = ictx->cct; + ldout(cct, 20) << "resize " << ictx << " " << ictx->size << " -> " + << size << dendl; - if (size == ictx->size) { - ldout(cct, 2) << "no change in size (" << ictx->size << " -> " << size - << ")" << dendl; - return 0; + Mutex my_lock("librbd::resize::my_lock"); + Cond cond; + bool done; + int ret; + + Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &ret); + ret = async_resize(ictx, ctx, size, prog_ctx); + if (ret < 0) { + delete ctx; + return ret; } - if (size > ictx->size) { - ldout(cct, 2) << "expanding image " << ictx->size << " -> " << size - << dendl; - // TODO: make ictx->set_size - } else { - ldout(cct, 2) << "shrinking image " << ictx->size << " -> " << size - << dendl; - trim_image(ictx, size, prog_ctx); + my_lock.Lock(); + while (!done) { + cond.Wait(my_lock); } - ictx->size = size; + my_lock.Unlock(); - int r; - if (ictx->old_format) { - // rewrite header - bufferlist bl; - ictx->header.image_size = size; - bl.append((const char *)&(ictx->header), sizeof(ictx->header)); - r = ictx->md_ctx.write(ictx->header_oid, bl, bl.length(), 0); - } else { - r = cls_client::set_size(&(ictx->md_ctx), ictx->header_oid, size); + notify_change(ictx->md_ctx, ictx->header_oid, ictx); + ldout(cct, 2) << "resize finished" << dendl; + return ret; + } + + class AsyncResizeFinishContext : public Context { + public: + AsyncResizeFinishContext(ImageCtx *ictx, Context *ctx) + : m_ictx(ictx), m_ctx(ctx) + { } - // TODO: remove this useless check - if (r == -ERANGE) - lderr(cct) << "operation might have conflicted with another client!" - << dendl; - if (r < 0) { - lderr(cct) << "error writing header: " << cpp_strerror(-r) << dendl; - return r; - } else { - notify_change(ictx->md_ctx, ictx->header_oid, ictx); + virtual void finish(int r) { + ldout(m_ictx->cct, 2) << "async_resize finished" << dendl; + m_ictx->perfcounter->inc(l_librbd_resize); + m_ctx->complete(r); } - return 0; - } + private: + ImageCtx *m_ictx; + Context *m_ctx; + }; - int resize(ImageCtx *ictx, uint64_t size, ProgressContext& prog_ctx) + int async_resize(ImageCtx *ictx, Context *ctx, uint64_t size, + ProgressContext &prog_ctx) { CephContext *cct = ictx->cct; - ldout(cct, 20) << "resize " << ictx << " " << ictx->size << " -> " + ldout(cct, 20) << "async_resize " << ictx << " " << ictx->size << " -> " << size << dendl; - if (ictx->read_only) { + if (ictx->snap_id != CEPH_NOSNAP || ictx->read_only) { return -EROFS; } @@ -1611,34 +1588,260 @@ reprotect_and_return_err: return r; } - RWLock::RLocker l(ictx->owner_lock); - r = prepare_image_update(ictx); + uint64_t original_size; + { + RWLock::RLocker l(ictx->owner_lock); + r = prepare_image_update(ictx); + if (r < 0) { + return -EROFS; + } + if (ictx->image_watcher->is_lock_supported() && + !ictx->image_watcher->is_lock_owner()) { + return -EROFS; + } + + RWLock::RLocker l2(ictx->md_lock); + original_size = ictx->size; + if (size < ictx->size) { + ictx->wait_for_pending_copyup(); + if (ictx->object_cacher) { + // need to invalidate since we're deleting objects, and + // ObjectCacher doesn't track non-existent objects + r = ictx->invalidate_cache(); + if (r < 0) { + return r; + } + } + } + } + + AsyncResizeFinishContext *finish_ctx = + new AsyncResizeFinishContext(ictx, ctx); + r = async_resize_helper(ictx, finish_ctx, original_size, size, prog_ctx); if (r < 0) { - return -EROFS; + delete ctx; + return r; } - if (ictx->image_watcher->is_lock_supported() && - !ictx->image_watcher->is_lock_owner()) { - // TODO: temporary until request proxied to lock owner - return -EROFS; + return 0; + } + + class AsyncResizeHelperFinishContext : public Context { + public: + AsyncResizeHelperFinishContext(ImageCtx *ictx, Context *ctx, + uint64_t original_size, uint64_t new_size) + : m_ictx(ictx), m_ctx(ctx), m_original_size(original_size), + m_new_size(new_size) + { } - RWLock::WLocker l2(ictx->md_lock); - if (size < ictx->size) { - ictx->wait_for_pending_copyup(); - if (ictx->object_cacher) { - // need to invalidate since we're deleting objects, and - // ObjectCacher doesn't track non-existent objects - r = ictx->invalidate_cache(); - if (r < 0) { - return r; + virtual void finish(int r) { + BOOST_SCOPE_EXIT((m_ctx) (m_ictx) (m_new_size) (m_original_size) (&r)) { + ldout(m_ictx->cct, 2) << "async_resize_helper finished (" + << r << ")" << dendl; + + RWLock::WLocker l(m_ictx->md_lock); + if (r < 0 && m_ictx->size == m_new_size) { + m_ictx->size = m_original_size; + } + m_ctx->complete(r); + } BOOST_SCOPE_EXIT_END + + RWLock::RLocker l(m_ictx->owner_lock); + if (m_ictx->image_watcher->is_lock_supported() && + !m_ictx->image_watcher->is_lock_owner()) { + r = -EROFS; + return; + } + + RWLock::WLocker l2(m_ictx->md_lock); + m_ictx->size = m_new_size; + if (m_ictx->old_format) { + // rewrite header + bufferlist bl; + m_ictx->header.image_size = m_new_size; + bl.append((const char *)&m_ictx->header, sizeof(m_ictx->header)); + r = m_ictx->md_ctx.write(m_ictx->header_oid, bl, bl.length(), 0); + } else { + r = cls_client::set_size(&m_ictx->md_ctx, m_ictx->header_oid, + m_new_size); + } + + if (r < 0) { + lderr(m_ictx->cct) << "error writing header: " << cpp_strerror(r) + << dendl; + } + } + + private: + ImageCtx *m_ictx; + Context *m_ctx; + uint64_t m_original_size; + uint64_t m_new_size; + }; + + int async_resize_helper(ImageCtx *ictx, Context *ctx, uint64_t original_size, + uint64_t new_size, ProgressContext& prog_ctx) + { + CephContext *cct = ictx->cct; + if (original_size == new_size) { + ldout(cct, 2) << "no change in size (" << original_size << " -> " + << new_size << ")" << dendl; + ctx->complete(0); + return 0; + } + + AsyncResizeHelperFinishContext *finish_ctx = + new AsyncResizeHelperFinishContext(ictx, ctx, original_size, new_size); + if (new_size > original_size) { + ldout(cct, 2) << "expanding image " << original_size << " -> " + << new_size << dendl; + finish_ctx->complete(0); + } else { + ldout(cct, 2) << "shrinking image " << original_size << " -> " + << new_size << dendl; + + { + // update in-memory size to clip concurrent IO operations + RWLock::WLocker l(ictx->md_lock); + ictx->size = new_size; + } + + int r = async_trim_image(ictx, finish_ctx, original_size, new_size, + prog_ctx); + if (r < 0) { + delete finish_ctx; + return r; + } + } + return 0; + } + + class AsyncTrimObjectContext : public C_AsyncObjectThrottle { + public: + AsyncTrimObjectContext(AsyncObjectThrottle &throttle, ImageCtx *ictx, + uint64_t object_no) + : C_AsyncObjectThrottle(throttle), m_ictx(ictx), m_object_no(object_no) + { + } + + virtual int send() { + RWLock::RLocker l(m_ictx->owner_lock); + if (m_ictx->image_watcher->is_lock_supported() && + !m_ictx->image_watcher->is_lock_owner()) { + return -EROFS; + } + + string oid = m_ictx->get_object_name(m_object_no); + librados::AioCompletion *rados_completion = + librados::Rados::aio_create_completion(this, NULL, rados_ctx_cb); + m_ictx->data_ctx.aio_remove(oid, rados_completion); + rados_completion->release(); + return 0; + } + + private: + ImageCtx *m_ictx; + uint64_t m_object_no; + }; + + class AsyncTrimFinishContext : public Context { + public: + + AsyncTrimFinishContext(ImageCtx *ictx, Context *ctx, uint64_t delete_start, + uint64_t num_objects, uint64_t delete_offset, + uint64_t new_size) + : m_ictx(ictx), m_ctx(ctx), m_delete_start(delete_start), + m_num_objects(num_objects), m_delete_offset(delete_offset), + m_new_size(new_size) + { + } + + virtual void finish(int r) { + if (r < 0 || m_delete_offset <= m_new_size) { + m_ctx->complete(r); + return; + } + + RWLock::RLocker l(m_ictx->owner_lock); + if (m_ictx->image_watcher->is_lock_supported() && + !m_ictx->image_watcher->is_lock_owner()) { + r = -EROFS; + return; + } + + // discard the weird boundary, if any + vector extents; + Striper::file_to_extents(m_ictx->cct, m_ictx->format_string, + &m_ictx->layout, m_new_size, + m_delete_offset - m_new_size, 0, extents); + + ContextCompletion *completion = new ContextCompletion(m_ctx, true); + for (vector::iterator p = extents.begin(); + p != extents.end(); ++p) { + ldout(m_ictx->cct, 20) << " ex " << *p << dendl; + Context *req_comp = new C_ContextCompletion(*completion); + librados::AioCompletion *rados_completion = + librados::Rados::aio_create_completion(req_comp, NULL, rados_ctx_cb); + if (p->offset == 0) { + m_ictx->data_ctx.aio_remove(p->oid.name, rados_completion); + } else { + librados::ObjectWriteOperation op; + op.truncate(p->offset); + m_ictx->data_ctx.aio_operate(p->oid.name, rados_completion, &op); } + rados_completion->release(); } + completion->finish_adding_requests(); } - resize_helper(ictx, size, prog_ctx); - ldout(cct, 2) << "done." << dendl; + private: + ImageCtx *m_ictx; + Context *m_ctx; + uint64_t m_delete_start; + uint64_t m_num_objects; + uint64_t m_delete_offset; + uint64_t m_new_size; + }; + + int async_trim_image(ImageCtx *ictx, Context *ctx, uint64_t original_size, + uint64_t new_size, ProgressContext& prog_ctx) + { + CephContext *cct = (CephContext *)ictx->data_ctx.cct(); + + uint64_t period = ictx->get_stripe_period(); + uint64_t new_num_periods = ((new_size + period - 1) / period); + uint64_t delete_off = MIN(new_num_periods * period, original_size); + // first object we can delete free and clear + uint64_t delete_start = new_num_periods * ictx->get_stripe_count(); + uint64_t num_objects = Striper::get_num_objects(ictx->layout, original_size); - ictx->perfcounter->inc(l_librbd_resize); + ldout(cct, 10) << "trim_image " << original_size << " -> " << new_size + << " periods " << new_num_periods + << " discard to offset " << delete_off + << " delete objects " << delete_start + << " to " << (num_objects-1) + << dendl; + + AsyncTrimFinishContext *finish_ctx = + new AsyncTrimFinishContext(ictx, ctx, delete_start, num_objects, + delete_off, new_size); + if (delete_start < num_objects) { + ldout(cct, 2) << "trim_image objects " << delete_start << " to " + << (num_objects - 1) << dendl; + + AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr(), + boost::lambda::_1, ictx, boost::lambda::_2)); + AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + context_factory, finish_ctx, prog_ctx, delete_start, num_objects); + int r = throttle->start_ops(cct->_conf->rbd_concurrent_management_ops); + if (r < 0) { + delete throttle; + return r; + } + } else { + finish_ctx->complete(0); + } return 0; } @@ -1967,48 +2170,72 @@ reprotect_and_return_err: return r; RWLock::RLocker l(ictx->owner_lock); - RWLock::WLocker l2(ictx->md_lock); snap_t snap_id; + uint64_t original_size; uint64_t new_size; { - // need to drop snap_lock before invalidating cache - RWLock::RLocker l3(ictx->snap_lock); - if (!ictx->snap_exists) - return -ENOENT; + RWLock::WLocker l2(ictx->md_lock); + { + // need to drop snap_lock before invalidating cache + RWLock::RLocker l3(ictx->snap_lock); + if (!ictx->snap_exists) { + return -ENOENT; + } - if (ictx->snap_id != CEPH_NOSNAP || ictx->read_only) - return -EROFS; + if (ictx->snap_id != CEPH_NOSNAP || ictx->read_only) { + return -EROFS; + } - snap_id = ictx->get_snap_id(snap_name); - if (snap_id == CEPH_NOSNAP) { - lderr(cct) << "No such snapshot found." << dendl; - return -ENOENT; + snap_id = ictx->get_snap_id(snap_name); + if (snap_id == CEPH_NOSNAP) { + lderr(cct) << "No such snapshot found." << dendl; + return -ENOENT; + } + } + + r = prepare_image_update(ictx); + if (r < 0) { + return -EROFS; } + if (ictx->image_watcher->is_lock_supported() && + !ictx->image_watcher->is_lock_owner()) { + return -EROFS; + } + + original_size = ictx->size; new_size = ictx->get_image_size(snap_id); - } - r = prepare_image_update(ictx); - if (r < 0) { - return -EROFS; - } - if (ictx->image_watcher->is_lock_supported() && - !ictx->image_watcher->is_lock_owner()) { - return -EROFS; + // need to flush any pending writes before resizing and rolling back - + // writes might create new snapshots. Rolling back will replace + // the current version, so we have to invalidate that too. + r = ictx->invalidate_cache(); + if (r < 0) { + return r; + } } - // need to flush any pending writes before resizing and rolling back - - // writes might create new snapshots. Rolling back will replace - // the current version, so we have to invalidate that too. - r = ictx->invalidate_cache(); - if (r < 0) - return r; + Mutex my_lock("librbd::snap_rollback::my_lock"); + Cond cond; + bool done; + Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &r); ldout(cct, 2) << "resizing to snapshot size..." << dendl; NoOpProgressContext no_op; - r = resize_helper(ictx, new_size, no_op); + r = async_resize_helper(ictx, ctx, original_size, new_size, no_op); + if (r < 0) { + delete ctx; + return r; + } + + my_lock.Lock(); + while (!done) { + cond.Wait(my_lock); + } + my_lock.Unlock(); + if (r < 0) { lderr(cct) << "Error resizing to snapshot size: " - << cpp_strerror(-r) << dendl; + << cpp_strerror(r) << dendl; return r; } @@ -2319,14 +2546,172 @@ reprotect_and_return_err: delete ictx; } + class AsyncFlattenObjectContext : public C_AsyncObjectThrottle { + public: + AsyncFlattenObjectContext(AsyncObjectThrottle &throttle, ImageCtx *ictx, + uint64_t object_size, ::SnapContext snapc, + uint64_t object_no) + : C_AsyncObjectThrottle(throttle), m_ictx(ictx), + m_object_size(object_size), m_snapc(snapc), m_object_no(object_no) + { + } + + virtual int send() { + int r = ictx_check(m_ictx); + if (r < 0) { + return r; + } + + RWLock::RLocker l(m_ictx->owner_lock); + if (m_ictx->image_watcher->is_lock_supported() && + !m_ictx->image_watcher->is_lock_owner()) { + return -EROFS; + } + + RWLock::RLocker l2(m_ictx->md_lock); + uint64_t overlap; + { + RWLock::RLocker l3(m_ictx->parent_lock); + // stop early if the parent went away - it just means + // another flatten finished first, so this one is useless. + if (!m_ictx->parent) { + return 1; + } + + // resize might have occurred while flatten is running + overlap = min(m_ictx->size, m_ictx->parent_md.overlap); + } + + // map child object onto the parent + vector > objectx; + Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no, 0, + m_object_size, objectx); + uint64_t object_overlap = m_ictx->prune_parent_extents(objectx, overlap); + assert(object_overlap <= m_object_size); + if (object_overlap == 0) { + // resize shrunk image while flattening + return 1; + } + + bufferlist bl; + string oid = m_ictx->get_object_name(m_object_no); + AioWrite *req = new AioWrite(m_ictx, oid, m_object_no, 0, objectx, + object_overlap, bl, m_snapc, CEPH_NOSNAP, + this); + r = req->send(); + if (r < 0) { + lderr(m_ictx->cct) << "failed to flatten object " << oid << dendl; + delete req; + return r; + } + return 0; + } + + private: + ImageCtx *m_ictx; + uint64_t m_object_size; + ::SnapContext m_snapc; + uint64_t m_object_no; + + }; + + class AsyncFlattenFinishContext : public Context { + public: + AsyncFlattenFinishContext(ImageCtx *ictx, Context *ctx, + uint64_t overlap_objects) + : m_ictx(ictx), m_ctx(ctx), m_overlap_objects(overlap_objects) + { + } + + virtual void finish(int r) { + BOOST_SCOPE_EXIT((&m_ctx) (&r)) { + m_ctx->complete(r); + } BOOST_SCOPE_EXIT_END + + CephContext *cct = m_ictx->cct; + if (r < 0) { + lderr(cct) << "failed to flatten at least one object: " + << cpp_strerror(r) << dendl; + return; + } + + RWLock::RLocker l(m_ictx->owner_lock); + if (m_ictx->image_watcher->is_lock_supported() && + !m_ictx->image_watcher->is_lock_owner()) { + r = -EROFS; + return; + } + + // remove parent from this (base) image + r = cls_client::remove_parent(&m_ictx->md_ctx, m_ictx->header_oid); + if (r < 0) { + lderr(cct) << "error removing parent" << dendl; + return; + } + + // and if there are no snaps, remove from the children object as well + // (if snapshots remain, they have their own parent info, and the child + // will be removed when the last snap goes away) + m_ictx->snap_lock.get_read(); + if (m_ictx->snaps.empty()) { + ldout(cct, 2) << "removing child from children list..." << dendl; + int r = cls_client::remove_child(&m_ictx->md_ctx, RBD_CHILDREN, + m_ictx->parent_md.spec, m_ictx->id); + if (r < 0) { + lderr(cct) << "error removing child from children list" << dendl; + m_ictx->snap_lock.put_read(); + return; + } + } + m_ictx->snap_lock.put_read(); + + ldout(cct, 20) << "finished flattening" << dendl; + return; + } + + private: + ImageCtx *m_ictx; + Context *m_ctx; + uint64_t m_overlap_objects; + }; + // 'flatten' child image by copying all parent's blocks int flatten(ImageCtx *ictx, ProgressContext &prog_ctx) { CephContext *cct = ictx->cct; ldout(cct, 20) << "flatten" << dendl; - if (ictx->read_only) + Mutex my_lock("librbd::flatten:my_lock"); + Cond cond; + bool done; + int ret; + + Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &ret); + ret = async_flatten(ictx, ctx, prog_ctx); + if (ret < 0) { + delete ctx; + return ret; + } + + my_lock.Lock(); + while (!done) { + cond.Wait(my_lock); + } + my_lock.Unlock(); + + notify_change(ictx->md_ctx, ictx->header_oid, ictx); + ldout(cct, 20) << "flatten finished" << dendl; + return ret; + } + + int async_flatten(ImageCtx *ictx, Context *ctx, ProgressContext &prog_ctx) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "flatten" << dendl; + + if (ictx->read_only || ictx->snap_id != CEPH_NOSNAP) { return -EROFS; + } int r; // ictx_check also updates parent data @@ -2364,90 +2749,34 @@ reprotect_and_return_err: overlap_objects = Striper::get_num_objects(ictx->layout, overlap); } - RWLock::RLocker l(ictx->owner_lock); - r = prepare_image_update(ictx); - if (r < 0) { - return -EROFS; - } - if (ictx->image_watcher->is_lock_supported() && - !ictx->image_watcher->is_lock_owner()) { - // TODO: temporary until request proxied to lock owner - return -EROFS; - } - - SimpleThrottle throttle(cct->_conf->rbd_concurrent_management_ops, false); - - for (uint64_t ono = 0; ono < overlap_objects; ono++) { - { - RWLock::RLocker l(ictx->parent_lock); - // stop early if the parent went away - it just means - // another flatten finished first, so this one is useless. - if (!ictx->parent) { - r = 0; - goto err; - } - } - - // map child object onto the parent - vector > objectx; - Striper::extent_to_file(cct, &ictx->layout, - ono, 0, object_size, - objectx); - uint64_t object_overlap = ictx->prune_parent_extents(objectx, overlap); - assert(object_overlap <= object_size); - - bufferlist bl; - string oid = ictx->get_object_name(ono); - Context *comp = new C_SimpleThrottle(&throttle); - AioWrite *req = new AioWrite(ictx, oid, ono, 0, objectx, object_overlap, - bl, snapc, CEPH_NOSNAP, comp); - r = req->send(); + { + RWLock::RLocker l(ictx->owner_lock); + r = prepare_image_update(ictx); if (r < 0) { - lderr(cct) << "failed to flatten object " << oid << dendl; - goto err; + return -EROFS; + } + if (ictx->image_watcher->is_lock_supported() && + !ictx->image_watcher->is_lock_owner()) { + // TODO: temporary until request proxied to lock owner + return -EROFS; } - - prog_ctx.update_progress(ono, overlap_objects); - } - - r = throttle.wait_for_ret(); - if (r < 0) { - lderr(cct) << "failed to flatten at least one object: " - << cpp_strerror(r) << dendl; - goto err; } - // remove parent from this (base) image - r = cls_client::remove_parent(&ictx->md_ctx, ictx->header_oid); + AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr(), + boost::lambda::_1, ictx, object_size, snapc, + boost::lambda::_2)); + AsyncFlattenFinishContext *finish_ctx = + new AsyncFlattenFinishContext(ictx, ctx, overlap_objects); + AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + context_factory, finish_ctx, prog_ctx, 0, overlap_objects); + r = throttle->start_ops(cct->_conf->rbd_concurrent_management_ops); if (r < 0) { - lderr(cct) << "error removing parent" << dendl; + delete throttle; return r; } - // and if there are no snaps, remove from the children object as well - // (if snapshots remain, they have their own parent info, and the child - // will be removed when the last snap goes away) - ictx->snap_lock.get_read(); - if (ictx->snaps.empty()) { - ldout(cct, 2) << "removing child from children list..." << dendl; - int r = cls_client::remove_child(&ictx->md_ctx, RBD_CHILDREN, - ictx->parent_md.spec, ictx->id); - if (r < 0) { - lderr(cct) << "error removing child from children list" << dendl; - ictx->snap_lock.put_read(); - return r; - } - } - ictx->snap_lock.put_read(); - notify_change(ictx->md_ctx, ictx->header_oid, ictx); - - ldout(cct, 20) << "finished flattening" << dendl; - return 0; - - err: - throttle.wait_for_ret(); - return r; } int list_lockers(ImageCtx *ictx, diff --git a/src/librbd/internal.h b/src/librbd/internal.h index 20c55af92f648..3b777e948284a 100644 --- a/src/librbd/internal.h +++ b/src/librbd/internal.h @@ -53,6 +53,8 @@ enum { l_librbd_last, }; +class Context; + namespace librbd { struct AioCompletion; @@ -105,7 +107,6 @@ namespace librbd { int remove(librados::IoCtx& io_ctx, const char *imgname, ProgressContext& prog_ctx); int resize(ImageCtx *ictx, uint64_t size, ProgressContext& prog_ctx); - int resize_helper(ImageCtx *ictx, uint64_t size, ProgressContext& prog_ctx); int snap_create(ImageCtx *ictx, const char *snap_name); int snap_list(ImageCtx *ictx, std::vector& snaps); bool snap_exists(ImageCtx *ictx, const char *snap_name); @@ -184,6 +185,15 @@ namespace librbd { char *buf, bufferlist *pbl, int op_flags); ssize_t write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf, int op_flags); int discard(ImageCtx *ictx, uint64_t off, uint64_t len); + + int async_flatten(ImageCtx *ictx, Context *ctx, ProgressContext &prog_ctx); + int async_resize(ImageCtx *ictx, Context *ctx, uint64_t size, + ProgressContext &prog_ctx); + int async_resize_helper(ImageCtx *ictx, Context *ctx, uint64_t original_size, + uint64_t new_size, ProgressContext& prog_ctx); + int async_trim_image(ImageCtx *ictx, Context *ctx, uint64_t original_size, + uint64_t new_size, ProgressContext& prog_ctx); + int aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf, AioCompletion *c, int op_flags); int aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c);