From f13bdaeb862b2fdc3c5d8833d405716f3bec184a Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Thu, 21 May 2015 14:43:20 -0400 Subject: [PATCH] librbd: execute multiple object diffs in parallel Issue up to 'concurrent_management_ops' list_snap ops to the OSDs concurrently. The iterate callbacks will continue to be serialized -- clients that wish for higher throughput should avoid blocking in their callbacks to maximize performance. Fixes: #11625 Signed-off-by: Jason Dillaman --- src/librbd/DiffIterate.cc | 379 ++++++++++++++++++++++++++++---------- 1 file changed, 279 insertions(+), 100 deletions(-) diff --git a/src/librbd/DiffIterate.cc b/src/librbd/DiffIterate.cc index 905e4ae3f9d99..35ebfecb0479d 100644 --- a/src/librbd/DiffIterate.cc +++ b/src/librbd/DiffIterate.cc @@ -3,9 +3,17 @@ #include "librbd/DiffIterate.h" #include "librbd/ImageCtx.h" +#include "librbd/internal.h" #include "include/rados/librados.hpp" #include "include/interval_set.h" +#include "common/errno.h" +#include "common/Mutex.h" +#include "common/Throttle.h" #include "librados/snap_set_diff.h" +#include +#include +#include +#include #define dout_subsys ceph_subsys_rbd #undef dout_prefix @@ -21,24 +29,254 @@ enum ObjectDiffState { OBJECT_DIFF_STATE_HOLE = 2 }; +class DiffContext { +public: + typedef boost::tuple Diff; + typedef std::list Diffs; + + bool whole_object; + uint64_t from_snap_id; + uint64_t end_snap_id; + interval_set parent_diff; + + DiffContext(ImageCtx &image_ctx, DiffIterate::Callback callback, + void *callback_arg, bool _whole_object, uint64_t _from_snap_id, + uint64_t _end_snap_id) + : whole_object(_whole_object), from_snap_id(_from_snap_id), + end_snap_id(_end_snap_id), m_lock("librbd::DiffContext::m_lock"), + m_image_ctx(image_ctx), m_callback(callback), + m_callback_arg(callback_arg), m_pending_ops(0), m_return_value(0), + m_next_request(0), m_waiting_request(0) + { + } + + int invoke_callback() { + Mutex::Locker locker(m_lock); + if (m_return_value < 0) { + return m_return_value; + } + + 
std::map::iterator it; + while ((it = m_request_diffs.begin()) != m_request_diffs.end() && + it->first == m_waiting_request) { + Diffs diffs = it->second; + m_request_diffs.erase(it); + + for (Diffs::const_iterator d = diffs.begin(); d != diffs.end(); ++d) { + m_lock.Unlock(); + m_callback(d->get<0>(), d->get<1>(), d->get<2>(), m_callback_arg); + m_lock.Lock(); + } + ++m_waiting_request; + } + return 0; + } + + int wait_for_ret() { + Mutex::Locker locker(m_lock); + while (m_pending_ops > 0) { + m_cond.Wait(m_lock); + } + return m_return_value; + } + + uint64_t start_op() { + Mutex::Locker locker(m_lock); + while (m_pending_ops >= m_image_ctx.concurrent_management_ops) { + m_cond.Wait(m_lock); + } + ++m_pending_ops; + return m_next_request++; + } + + void finish_op(uint64_t request_num, int r, const Diffs &diffs) { + Mutex::Locker locker(m_lock); + m_request_diffs[request_num] = diffs; + + if (m_return_value == 0 && r < 0) { + m_return_value = r; + } + + --m_pending_ops; + m_cond.Signal(); + } + +private: + Mutex m_lock; + Cond m_cond; + + ImageCtx &m_image_ctx; + DiffIterate::Callback m_callback; + void *m_callback_arg; + + uint32_t m_pending_ops; + int m_return_value; + + uint64_t m_next_request; + uint64_t m_waiting_request; + + std::map m_request_diffs; +}; + +class C_DiffObject : public Context { +public: + C_DiffObject(ImageCtx &image_ctx, librados::IoCtx &head_ctx, + DiffContext &diff_context, const std::string &oid, + uint64_t offset, const std::vector &object_extents) + : m_image_ctx(image_ctx), m_head_ctx(head_ctx), + m_diff_context(diff_context), m_oid(oid), m_offset(offset), + m_object_extents(object_extents), m_snap_ret(0) + { + m_request_num = m_diff_context.start_op(); + } + + void send() { + librados::ObjectReadOperation op; + op.list_snaps(&m_snap_set, &m_snap_ret); + + librados::AioCompletion *rados_completion = + librados::Rados::aio_create_completion(this, NULL, rados_ctx_cb); + int r = m_head_ctx.aio_operate(m_oid, rados_completion, &op, NULL); 
+ assert(r == 0); + rados_completion->release(); + } + +protected: + virtual void finish(int r) { + CephContext *cct = m_image_ctx.cct; + if (r == 0 && m_snap_ret < 0) { + r = m_snap_ret; + } + + DiffContext::Diffs diffs; + if (r == 0) { + ldout(cct, 20) << "object " << m_oid << ": list_snaps complete" << dendl; + compute_diffs(&diffs); + } else if (r == -ENOENT) { + ldout(cct, 20) << "object " << m_oid << ": list_snaps (not found)" + << dendl; + r = 0; + compute_parent_overlap(&diffs); + } else { + ldout(cct, 20) << "object " << m_oid << ": list_snaps failed: " + << cpp_strerror(r) << dendl; + } + + m_diff_context.finish_op(m_request_num, r, diffs); + } + +private: + ImageCtx &m_image_ctx; + librados::IoCtx &m_head_ctx; + DiffContext &m_diff_context; + uint64_t m_request_num; + std::string m_oid; + uint64_t m_offset; + std::vector m_object_extents; + + librados::snap_set_t m_snap_set; + int m_snap_ret; + + void compute_diffs(DiffContext::Diffs *diffs) { + CephContext *cct = m_image_ctx.cct; + + // calc diff from from_snap_id -> to_snap_id + interval_set diff; + bool end_exists; + calc_snap_set_diff(cct, m_snap_set, m_diff_context.from_snap_id, + m_diff_context.end_snap_id, &diff, &end_exists); + ldout(cct, 20) << " diff " << diff << " end_exists=" << end_exists + << dendl; + if (diff.empty()) { + return; + } else if (m_diff_context.whole_object) { + // provide the full object extents to the callback + for (vector::iterator q = m_object_extents.begin(); + q != m_object_extents.end(); ++q) { + diffs->push_back(boost::make_tuple(m_offset + q->offset, q->length, + end_exists)); + } + return; + } + + for (vector::iterator q = m_object_extents.begin(); + q != m_object_extents.end(); ++q) { + ldout(cct, 20) << "diff_iterate object " << m_oid << " extent " + << q->offset << "~" << q->length << " from " + << q->buffer_extents << dendl; + uint64_t opos = q->offset; + for (vector >::iterator r = + q->buffer_extents.begin(); + r != q->buffer_extents.end(); ++r) { + 
interval_set overlap; // object extents + overlap.insert(opos, r->second); + overlap.intersection_of(diff); + ldout(m_image_ctx.cct, 20) << " opos " << opos + << " buf " << r->first << "~" << r->second + << " overlap " << overlap << dendl; + for (interval_set::iterator s = overlap.begin(); + s != overlap.end(); ++s) { + uint64_t su_off = s.get_start() - opos; + uint64_t logical_off = m_offset + r->first + su_off; + ldout(cct, 20) << " overlap extent " << s.get_start() << "~" + << s.get_len() << " logical " << logical_off << "~" + << s.get_len() << dendl; + diffs->push_back(boost::make_tuple(logical_off, s.get_len(), + end_exists)); + } + opos += r->second; + } + assert(opos == q->offset + q->length); + } + } + + void compute_parent_overlap(DiffContext::Diffs *diffs) { + if (m_diff_context.from_snap_id == 0 && + !m_diff_context.parent_diff.empty()) { + // report parent diff instead + for (vector::iterator q = m_object_extents.begin(); + q != m_object_extents.end(); ++q) { + for (vector >::iterator r = + q->buffer_extents.begin(); + r != q->buffer_extents.end(); ++r) { + interval_set o; + o.insert(m_offset + r->first, r->second); + o.intersection_of(m_diff_context.parent_diff); + ldout(m_image_ctx.cct, 20) << " reporting parent overlap " << o + << dendl; + for (interval_set::iterator s = o.begin(); s != o.end(); + ++s) { + diffs->push_back(boost::make_tuple(s.get_start(), s.get_len(), + true)); + } + } + } + } + } +}; + } // anonymous namespace int DiffIterate::execute() { - librados::IoCtx head_ctx; + CephContext* cct = m_image_ctx.cct; - m_image_ctx.md_lock.get_read(); - m_image_ctx.snap_lock.get_read(); - head_ctx.dup(m_image_ctx.data_ctx); + librados::IoCtx head_ctx; librados::snap_t from_snap_id = 0; + librados::snap_t end_snap_id; uint64_t from_size = 0; - if (m_from_snap_name) { - from_snap_id = m_image_ctx.get_snap_id(m_from_snap_name); - from_size = m_image_ctx.get_image_size(from_snap_id); + uint64_t end_size; + { + RWLock::RLocker 
md_locker(m_image_ctx.md_lock); + RWLock::RLocker snap_locker(m_image_ctx.snap_lock); + head_ctx.dup(m_image_ctx.data_ctx); + if (m_from_snap_name) { + from_snap_id = m_image_ctx.get_snap_id(m_from_snap_name); + from_size = m_image_ctx.get_image_size(from_snap_id); + } + end_snap_id = m_image_ctx.snap_id; + end_size = m_image_ctx.get_image_size(end_snap_id); } - librados::snap_t end_snap_id = m_image_ctx.snap_id; - uint64_t end_size = m_image_ctx.get_image_size(end_snap_id); - m_image_ctx.snap_lock.put_read(); - m_image_ctx.md_lock.put_read(); + if (from_snap_id == CEPH_NOSNAP) { return -ENOENT; } @@ -58,9 +296,9 @@ int DiffIterate::execute() { if (m_whole_object && (m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) { r = diff_object_map(from_snap_id, end_snap_id, &object_diff_state); if (r < 0) { - ldout(m_image_ctx.cct, 5) << "diff_iterate fast diff disabled" << dendl; + ldout(cct, 5) << "fast diff disabled" << dendl; } else { - ldout(m_image_ctx.cct, 5) << "diff_iterate fast diff enabled" << dendl; + ldout(cct, 5) << "fast diff enabled" << dendl; fast_diff_enabled = true; } } @@ -69,15 +307,13 @@ int DiffIterate::execute() { // we must list snaps via the head, not end snap head_ctx.snap_set_read(CEPH_SNAPDIR); - ldout(m_image_ctx.cct, 5) << "diff_iterate from " << from_snap_id << " to " - << end_snap_id << " size from " << from_size - << " to " << end_size << dendl; - - // FIXME: if end_size > from_size, we could read_iterate for the - // final part, and skip the listsnaps op. 
+ ldout(cct, 5) << "diff_iterate from " << from_snap_id << " to " + << end_snap_id << " size from " << from_size + << " to " << end_size << dendl; // check parent overlap only if we are comparing to the beginning of time - interval_set parent_diff; + DiffContext diff_context(m_image_ctx, m_callback, m_callback_arg, + m_whole_object, from_snap_id, end_snap_id); if (m_include_parent && from_snap_id == 0) { RWLock::RLocker l(m_image_ctx.snap_lock); RWLock::RLocker l2(m_image_ctx.parent_lock); @@ -85,15 +321,16 @@ int DiffIterate::execute() { m_image_ctx.get_parent_overlap(from_snap_id, &overlap); r = 0; if (m_image_ctx.parent && overlap > 0) { - ldout(m_image_ctx.cct, 10) << " first getting parent diff" << dendl; + ldout(cct, 10) << " first getting parent diff" << dendl; DiffIterate diff_parent(*m_image_ctx.parent, NULL, 0, overlap, m_include_parent, m_whole_object, &DiffIterate::simple_diff_cb, &diff_context.parent_diff); r = diff_parent.execute(); } - if (r < 0) + if (r < 0) { return r; + } } uint64_t period = m_image_ctx.get_stripe_period(); @@ -106,7 +343,7 @@ int DiffIterate::execute() { // map to extents map > object_extents; - Striper::file_to_extents(m_image_ctx.cct, m_image_ctx.format_string, + Striper::file_to_extents(cct, m_image_ctx.format_string, &m_image_ctx.layout, off, read_len, 0, object_extents, 0); @@ -114,7 +351,7 @@ int DiffIterate::execute() { for (map >::iterator p = object_extents.begin(); p != object_extents.end(); ++p) { - ldout(m_image_ctx.cct, 20) << "diff_iterate object " << p->first << dendl; + ldout(cct, 20) << "object " << p->first << dendl; if (fast_diff_enabled) { const uint64_t object_no = p->second.front().objectno; @@ -126,82 +363,18 @@ int DiffIterate::execute() { m_callback(off + q->offset, q->length, updated, m_callback_arg); } } - continue; - } - - librados::snap_set_t snap_set; - r = head_ctx.list_snaps(p->first.name, &snap_set); - if (r == -ENOENT) { - if (from_snap_id == 0 && !parent_diff.empty()) { - // report parent diff 
instead - for (vector::iterator q = p->second.begin(); - q != p->second.end(); ++q) { - for (vector >::iterator r = - q->buffer_extents.begin(); - r != q->buffer_extents.end(); ++r) { - interval_set o; - o.insert(off + r->first, r->second); - o.intersection_of(parent_diff); - ldout(m_image_ctx.cct, 20) << " reporting parent overlap " << o - << dendl; - for (interval_set::iterator s = o.begin(); s != o.end(); - ++s) { - m_callback(s.get_start(), s.get_len(), true, m_callback_arg); - } - } - } - } - continue; - } - if (r < 0) - return r; - - // calc diff from from_snap_id -> to_snap_id - interval_set diff; - bool end_exists; - calc_snap_set_diff(m_image_ctx.cct, snap_set, from_snap_id, end_snap_id, - &diff, &end_exists); - ldout(m_image_ctx.cct, 20) << " diff " << diff << " end_exists=" - << end_exists << dendl; - if (diff.empty()) { - continue; - } else if (m_whole_object) { - // provide the full object extents to the callback - for (vector::iterator q = p->second.begin(); - q != p->second.end(); ++q) { - m_callback(off + q->offset, q->length, end_exists, m_callback_arg); - } - continue; - } - - for (vector::iterator q = p->second.begin(); - q != p->second.end(); ++q) { - ldout(m_image_ctx.cct, 20) << "diff_iterate object " << p->first - << " extent " << q->offset << "~" - << q->length << " from " << q->buffer_extents - << dendl; - uint64_t opos = q->offset; - for (vector >::iterator r = - q->buffer_extents.begin(); - r != q->buffer_extents.end(); ++r) { - interval_set overlap; // object extents - overlap.insert(opos, r->second); - overlap.intersection_of(diff); - ldout(m_image_ctx.cct, 20) << " opos " << opos - << " buf " << r->first << "~" << r->second - << " overlap " << overlap << dendl; - for (interval_set::iterator s = overlap.begin(); - s != overlap.end(); ++s) { - uint64_t su_off = s.get_start() - opos; - uint64_t logical_off = off + r->first + su_off; - ldout(m_image_ctx.cct, 20) << " overlap extent " << s.get_start() - << "~" << s.get_len() << " logical " 
- << logical_off << "~" << s.get_len() << dendl; - m_callback(logical_off, s.get_len(), end_exists, m_callback_arg); - } - opos += r->second; + } else { + C_DiffObject *diff_object = new C_DiffObject(m_image_ctx, head_ctx, + diff_context, + p->first.name, off, + p->second); + diff_object->send(); + + r = diff_context.invoke_callback(); + if (r < 0) { + diff_context.wait_for_ret(); + return r; } - assert(opos == q->offset + q->length); } } @@ -209,7 +382,13 @@ int DiffIterate::execute() { off += read_len; } - return 0; + r = diff_context.wait_for_ret(); + if (r < 0) { + return r; + } + + r = diff_context.invoke_callback(); + return r; } int DiffIterate::diff_object_map(uint64_t from_snap_id, uint64_t to_snap_id, -- 2.39.5