git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd: execute multiple object diffs in parallel
author Jason Dillaman <dillaman@redhat.com>
Thu, 21 May 2015 18:43:20 +0000 (14:43 -0400)
committer Jason Dillaman <dillaman@redhat.com>
Mon, 10 Aug 2015 15:24:08 +0000 (11:24 -0400)
Issue up to 'concurrent_management_ops' list_snaps ops to the
OSDs concurrently.  The iterate callbacks will continue to be
serialized -- clients that want higher throughput should avoid
blocking in the callback to maximize performance.

Fixes: #11625
Signed-off-by: Jason Dillaman <dillaman@redhat.com>
src/librbd/DiffIterate.cc

index 905e4ae3f9d998ca62746e4f04600cdb5ce9e9b4..35ebfecb0479dfb4eb253c3ca7bedde774052c17 100644 (file)
@@ -3,9 +3,17 @@
 
 #include "librbd/DiffIterate.h"
 #include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
 #include "include/rados/librados.hpp"
 #include "include/interval_set.h"
+#include "common/errno.h"
+#include "common/Mutex.h"
+#include "common/Throttle.h"
 #include "librados/snap_set_diff.h"
+#include <boost/tuple/tuple.hpp>
+#include <list>
+#include <map>
+#include <vector>
 
 #define dout_subsys ceph_subsys_rbd
 #undef dout_prefix
@@ -21,24 +29,254 @@ enum ObjectDiffState {
   OBJECT_DIFF_STATE_HOLE    = 2
 };
 
+class DiffContext {  // shared state for one diff run: throttles in-flight ops and replays their diffs in request order
+public:
+  typedef boost::tuple<uint64_t, size_t, bool> Diff;  // the three values forwarded, in order, to m_callback
+  typedef std::list<Diff> Diffs;
+
+  bool whole_object;  // when set, C_DiffObject reports full object extents for any non-empty diff
+  uint64_t from_snap_id;
+  uint64_t end_snap_id;
+  interval_set<uint64_t> parent_diff;  // handed to the parent DiffIterate run as its callback arg in execute()
+
+  DiffContext(ImageCtx &image_ctx, DiffIterate::Callback callback,
+              void *callback_arg, bool _whole_object, uint64_t _from_snap_id,
+              uint64_t _end_snap_id)
+    : whole_object(_whole_object), from_snap_id(_from_snap_id),
+      end_snap_id(_end_snap_id), m_lock("librbd::DiffContext::m_lock"),
+      m_image_ctx(image_ctx), m_callback(callback),
+      m_callback_arg(callback_arg), m_pending_ops(0), m_return_value(0),
+      m_next_request(0), m_waiting_request(0)  // all counters start at zero; nothing is issued here
+  {
+  }
+
+  int invoke_callback() {  // deliver buffered diffs for consecutive finished requests; returns 0 or the recorded error
+    Mutex::Locker locker(m_lock);
+    if (m_return_value < 0) {  // an op already failed: report it instead of delivering more diffs
+      return m_return_value;
+    }
+
+    std::map<uint64_t, Diffs>::iterator it;
+    while ((it = m_request_diffs.begin()) != m_request_diffs.end() &&
+           it->first == m_waiting_request) {  // stop at the first gap so delivery follows request numbering
+      Diffs diffs = it->second;  // copy out before the map entry is erased
+      m_request_diffs.erase(it);
+
+      for (Diffs::const_iterator d = diffs.begin(); d != diffs.end(); ++d) {
+        m_lock.Unlock();  // the user callback runs without m_lock held
+        m_callback(d->get<0>(), d->get<1>(), d->get<2>(), m_callback_arg);  // NOTE(review): any result of the callback is discarded -- confirm intended
+        m_lock.Lock();  // re-acquire before touching shared state again
+      }
+      ++m_waiting_request;  // next request number expected for delivery
+    }
+    return 0;
+  }
+
+  int wait_for_ret() {  // block until no ops are pending, then return the first recorded error (or 0)
+    Mutex::Locker locker(m_lock);
+    while (m_pending_ops > 0) {
+      m_cond.Wait(m_lock);
+    }
+    return m_return_value;
+  }
+
+  uint64_t start_op() {  // reserve a slot and return this op's request number
+    Mutex::Locker locker(m_lock);
+    while (m_pending_ops >= m_image_ctx.concurrent_management_ops) {  // throttle: wait until finish_op frees a slot
+        m_cond.Wait(m_lock);
+    }
+    ++m_pending_ops;
+    return m_next_request++;  // numbers are handed out sequentially starting at 0
+  }
+
+  void finish_op(uint64_t request_num, int r, const Diffs &diffs) {  // record one op's result and release its slot
+    Mutex::Locker locker(m_lock);
+    m_request_diffs[request_num] = diffs;  // stored even when r < 0, so request numbering has no holes
+
+    if (m_return_value == 0 && r < 0) {  // only the first failure is kept
+      m_return_value = r;
+    }
+
+    --m_pending_ops;
+    m_cond.Signal();  // notifies a thread blocked in start_op() or wait_for_ret()
+  }
+
+private:
+  Mutex m_lock;  // guards every member declared below it
+  Cond m_cond;
+
+  ImageCtx &m_image_ctx;
+  DiffIterate::Callback m_callback;
+  void *m_callback_arg;
+
+  uint32_t m_pending_ops;  // ops started via start_op() and not yet finished
+  int m_return_value;  // 0, or the first negative result passed to finish_op()
+
+  uint64_t m_next_request;  // next request number start_op() will hand out
+  uint64_t m_waiting_request;  // next request number invoke_callback() will deliver
+
+  std::map<uint64_t, Diffs> m_request_diffs;  // finished results keyed by request number, awaiting delivery
+};
+
+class C_DiffObject : public Context {  // issues one async list_snaps for an object and hands the resulting diffs to DiffContext
+public:
+  C_DiffObject(ImageCtx &image_ctx, librados::IoCtx &head_ctx,
+               DiffContext &diff_context, const std::string &oid,
+               uint64_t offset, const std::vector<ObjectExtent> &object_extents)
+    : m_image_ctx(image_ctx), m_head_ctx(head_ctx),
+      m_diff_context(diff_context), m_oid(oid), m_offset(offset),
+      m_object_extents(object_extents), m_snap_ret(0)
+  {
+    m_request_num = m_diff_context.start_op();  // may block here until DiffContext has a free slot
+  }
+
+  void send() {  // queue the list_snaps read op against m_oid
+    librados::ObjectReadOperation op;
+    op.list_snaps(&m_snap_set, &m_snap_ret);  // results land in members that outlive this call
+
+    librados::AioCompletion *rados_completion =
+      librados::Rados::aio_create_completion(this, NULL, rados_ctx_cb);  // presumably rados_ctx_cb completes this Context -- confirm
+    int r = m_head_ctx.aio_operate(m_oid, rados_completion, &op, NULL);
+    assert(r == 0);
+    rados_completion->release();
+  }
+
+protected:
+  virtual void finish(int r) {  // completion path: turn the list_snaps result into diffs and report to DiffContext
+    CephContext *cct = m_image_ctx.cct;
+    if (r == 0 && m_snap_ret < 0) {  // the op succeeded overall but list_snaps itself reported an error
+      r = m_snap_ret;
+    }
+
+    DiffContext::Diffs diffs;
+    if (r == 0) {
+      ldout(cct, 20) << "object " << m_oid << ": list_snaps complete" << dendl;
+      compute_diffs(&diffs);
+    } else if (r == -ENOENT) {  // missing object is not an error; report parent overlap instead
+      ldout(cct, 20) << "object " << m_oid << ": list_snaps (not found)"
+                     << dendl;
+      r = 0;
+      compute_parent_overlap(&diffs);
+    } else {  // any other error is passed through with an empty diff list
+      ldout(cct, 20) << "object " << m_oid << ": list_snaps failed: "
+                     << cpp_strerror(r) << dendl;
+    }
+
+    m_diff_context.finish_op(m_request_num, r, diffs);  // always called, so the pending-op count is released on every path
+  }
+
+private:
+  ImageCtx &m_image_ctx;
+  librados::IoCtx &m_head_ctx;
+  DiffContext &m_diff_context;
+  uint64_t m_request_num;  // ordering ticket obtained from DiffContext::start_op()
+  std::string m_oid;
+  uint64_t m_offset;  // added to buffer/object offsets to form the offsets reported to the callback
+  std::vector<ObjectExtent> m_object_extents;  // copied from the caller, so it stays valid until finish()
+
+  librados::snap_set_t m_snap_set;  // output of list_snaps
+  int m_snap_ret;  // per-op return code of list_snaps
+
+  void compute_diffs(DiffContext::Diffs *diffs) {  // append this object's changed extents to *diffs
+    CephContext *cct = m_image_ctx.cct;
+
+    // calc diff from from_snap_id -> to_snap_id
+    interval_set<uint64_t> diff;
+    bool end_exists;
+    calc_snap_set_diff(cct, m_snap_set, m_diff_context.from_snap_id,
+                       m_diff_context.end_snap_id, &diff, &end_exists);
+    ldout(cct, 20) << "  diff " << diff << " end_exists=" << end_exists
+                   << dendl;
+    if (diff.empty()) {  // nothing changed in this object
+      return;
+    } else if (m_diff_context.whole_object) {
+      // provide the full object extents to the callback
+      for (vector<ObjectExtent>::iterator q = m_object_extents.begin();
+           q != m_object_extents.end(); ++q) {
+        diffs->push_back(boost::make_tuple(m_offset + q->offset, q->length,
+                                           end_exists));
+      }
+      return;
+    }
+
+    for (vector<ObjectExtent>::iterator q = m_object_extents.begin();
+         q != m_object_extents.end(); ++q) {
+      ldout(cct, 20) << "diff_iterate object " << m_oid << " extent "
+                     << q->offset << "~" << q->length << " from "
+                     << q->buffer_extents << dendl;
+      uint64_t opos = q->offset;  // running position inside the object as buffer extents are consumed
+      for (vector<pair<uint64_t,uint64_t> >::iterator r =
+             q->buffer_extents.begin();
+           r != q->buffer_extents.end(); ++r) {
+        interval_set<uint64_t> overlap;  // object extents
+        overlap.insert(opos, r->second);
+        overlap.intersection_of(diff);  // keep only the changed part of this buffer extent
+        ldout(m_image_ctx.cct, 20) << " opos " << opos
+                                    << " buf " << r->first << "~" << r->second
+                                    << " overlap " << overlap << dendl;
+        for (interval_set<uint64_t>::iterator s = overlap.begin();
+              s != overlap.end(); ++s) {
+          uint64_t su_off = s.get_start() - opos;  // offset of the changed range within this buffer extent
+          uint64_t logical_off = m_offset + r->first + su_off;  // offset reported to the callback
+          ldout(cct, 20) << "   overlap extent " << s.get_start() << "~"
+                         << s.get_len() << " logical " << logical_off << "~"
+                         << s.get_len() << dendl;
+          diffs->push_back(boost::make_tuple(logical_off, s.get_len(),
+                           end_exists));
+        }
+        opos += r->second;
+      }
+      assert(opos == q->offset + q->length);  // buffer extents must cover the object extent exactly
+    }
+  }
+
+  void compute_parent_overlap(DiffContext::Diffs *diffs) {  // for a missing object, report the ranges covered by parent_diff
+    if (m_diff_context.from_snap_id == 0 &&
+        !m_diff_context.parent_diff.empty()) {  // only when diffing from snap id 0 and a parent diff was collected
+      // report parent diff instead
+      for (vector<ObjectExtent>::iterator q = m_object_extents.begin();
+           q != m_object_extents.end(); ++q) {
+        for (vector<pair<uint64_t,uint64_t> >::iterator r =
+               q->buffer_extents.begin();
+             r != q->buffer_extents.end(); ++r) {
+          interval_set<uint64_t> o;
+          o.insert(m_offset + r->first, r->second);
+          o.intersection_of(m_diff_context.parent_diff);
+          ldout(m_image_ctx.cct, 20) << " reporting parent overlap " << o
+                                     << dendl;
+          for (interval_set<uint64_t>::iterator s = o.begin(); s != o.end();
+               ++s) {
+            diffs->push_back(boost::make_tuple(s.get_start(), s.get_len(),
+                             true));  // third tuple field is always true for parent-backed ranges
+          }
+        }
+      }
+    }
+  }
+};
+
 } // anonymous namespace
 
 int DiffIterate::execute() {
-  librados::IoCtx head_ctx;
+  CephContext* cct = m_image_ctx.cct;
 
-  m_image_ctx.md_lock.get_read();
-  m_image_ctx.snap_lock.get_read();
-  head_ctx.dup(m_image_ctx.data_ctx);
+  librados::IoCtx head_ctx;
   librados::snap_t from_snap_id = 0;
+  librados::snap_t end_snap_id;
   uint64_t from_size = 0;
-  if (m_from_snap_name) {
-    from_snap_id = m_image_ctx.get_snap_id(m_from_snap_name);
-    from_size = m_image_ctx.get_image_size(from_snap_id);
+  uint64_t end_size;
+  {
+    RWLock::RLocker md_locker(m_image_ctx.md_lock);
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    head_ctx.dup(m_image_ctx.data_ctx);
+    if (m_from_snap_name) {
+      from_snap_id = m_image_ctx.get_snap_id(m_from_snap_name);
+      from_size = m_image_ctx.get_image_size(from_snap_id);
+    }
+    end_snap_id = m_image_ctx.snap_id;
+    end_size = m_image_ctx.get_image_size(end_snap_id);
   }
-  librados::snap_t end_snap_id = m_image_ctx.snap_id;
-  uint64_t end_size = m_image_ctx.get_image_size(end_snap_id);
-  m_image_ctx.snap_lock.put_read();
-  m_image_ctx.md_lock.put_read();
+
   if (from_snap_id == CEPH_NOSNAP) {
     return -ENOENT;
   }
@@ -58,9 +296,9 @@ int DiffIterate::execute() {
     if (m_whole_object && (m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
       r = diff_object_map(from_snap_id, end_snap_id, &object_diff_state);
       if (r < 0) {
-        ldout(m_image_ctx.cct, 5) << "diff_iterate fast diff disabled" << dendl;
+        ldout(cct, 5) << "fast diff disabled" << dendl;
       } else {
-        ldout(m_image_ctx.cct, 5) << "diff_iterate fast diff enabled" << dendl;
+        ldout(cct, 5) << "fast diff enabled" << dendl;
         fast_diff_enabled = true;
       }
     }
@@ -69,15 +307,13 @@ int DiffIterate::execute() {
   // we must list snaps via the head, not end snap
   head_ctx.snap_set_read(CEPH_SNAPDIR);
 
-  ldout(m_image_ctx.cct, 5) << "diff_iterate from " << from_snap_id << " to "
-                            << end_snap_id << " size from " << from_size
-                            << " to " << end_size << dendl;
-
-  // FIXME: if end_size > from_size, we could read_iterate for the
-  // final part, and skip the listsnaps op.
+  ldout(cct, 5) << "diff_iterate from " << from_snap_id << " to "
+                << end_snap_id << " size from " << from_size
+                << " to " << end_size << dendl;
 
   // check parent overlap only if we are comparing to the beginning of time
-  interval_set<uint64_t> parent_diff;
+  DiffContext diff_context(m_image_ctx, m_callback, m_callback_arg,
+                           m_whole_object, from_snap_id, end_snap_id);
   if (m_include_parent && from_snap_id == 0) {
     RWLock::RLocker l(m_image_ctx.snap_lock);
     RWLock::RLocker l2(m_image_ctx.parent_lock);
@@ -85,15 +321,16 @@ int DiffIterate::execute() {
     m_image_ctx.get_parent_overlap(from_snap_id, &overlap);
     r = 0;
     if (m_image_ctx.parent && overlap > 0) {
-      ldout(m_image_ctx.cct, 10) << " first getting parent diff" << dendl;
+      ldout(cct, 10) << " first getting parent diff" << dendl;
       DiffIterate diff_parent(*m_image_ctx.parent, NULL, 0, overlap,
                               m_include_parent, m_whole_object,
                               &DiffIterate::simple_diff_cb,
                               &diff_context.parent_diff);
       r = diff_parent.execute();
     }
-    if (r < 0)
+    if (r < 0) {
       return r;
+    }
   }
 
   uint64_t period = m_image_ctx.get_stripe_period();
@@ -106,7 +343,7 @@ int DiffIterate::execute() {
 
     // map to extents
     map<object_t,vector<ObjectExtent> > object_extents;
-    Striper::file_to_extents(m_image_ctx.cct, m_image_ctx.format_string,
+    Striper::file_to_extents(cct, m_image_ctx.format_string,
                              &m_image_ctx.layout, off, read_len, 0,
                              object_extents, 0);
 
@@ -114,7 +351,7 @@ int DiffIterate::execute() {
     for (map<object_t,vector<ObjectExtent> >::iterator p =
            object_extents.begin();
          p != object_extents.end(); ++p) {
-      ldout(m_image_ctx.cct, 20) << "diff_iterate object " << p->first << dendl;
+      ldout(cct, 20) << "object " << p->first << dendl;
 
       if (fast_diff_enabled) {
         const uint64_t object_no = p->second.front().objectno;
@@ -126,82 +363,18 @@ int DiffIterate::execute() {
             m_callback(off + q->offset, q->length, updated, m_callback_arg);
           }
         }
-        continue;
-      }
-
-      librados::snap_set_t snap_set;
-      r = head_ctx.list_snaps(p->first.name, &snap_set);
-      if (r == -ENOENT) {
-        if (from_snap_id == 0 && !parent_diff.empty()) {
-          // report parent diff instead
-          for (vector<ObjectExtent>::iterator q = p->second.begin();
-               q != p->second.end(); ++q) {
-            for (vector<pair<uint64_t,uint64_t> >::iterator r =
-                   q->buffer_extents.begin();
-                r != q->buffer_extents.end(); ++r) {
-             interval_set<uint64_t> o;
-             o.insert(off + r->first, r->second);
-             o.intersection_of(parent_diff);
-             ldout(m_image_ctx.cct, 20) << " reporting parent overlap " << o
-                                         << dendl;
-             for (interval_set<uint64_t>::iterator s = o.begin(); s != o.end();
-                   ++s) {
-               m_callback(s.get_start(), s.get_len(), true, m_callback_arg);
-             }
-            }
-          }
-        }
-        continue;
-      }
-      if (r < 0)
-        return r;
-
-      // calc diff from from_snap_id -> to_snap_id
-      interval_set<uint64_t> diff;
-      bool end_exists;
-      calc_snap_set_diff(m_image_ctx.cct, snap_set, from_snap_id, end_snap_id,
-                        &diff, &end_exists);
-      ldout(m_image_ctx.cct, 20) << "  diff " << diff << " end_exists="
-                                 << end_exists << dendl;
-      if (diff.empty()) {
-        continue;
-      } else if (m_whole_object) {
-        // provide the full object extents to the callback
-        for (vector<ObjectExtent>::iterator q = p->second.begin();
-             q != p->second.end(); ++q) {
-          m_callback(off + q->offset, q->length, end_exists, m_callback_arg);
-        }
-        continue;
-      }
-
-      for (vector<ObjectExtent>::iterator q = p->second.begin();
-           q != p->second.end(); ++q) {
-        ldout(m_image_ctx.cct, 20) << "diff_iterate object " << p->first
-                                  << " extent " << q->offset << "~"
-                                   << q->length << " from " << q->buffer_extents
-                                  << dendl;
-        uint64_t opos = q->offset;
-        for (vector<pair<uint64_t,uint64_t> >::iterator r =
-               q->buffer_extents.begin();
-             r != q->buffer_extents.end(); ++r) {
-          interval_set<uint64_t> overlap;  // object extents
-          overlap.insert(opos, r->second);
-          overlap.intersection_of(diff);
-          ldout(m_image_ctx.cct, 20) << " opos " << opos
-                                    << " buf " << r->first << "~" << r->second
-                                    << " overlap " << overlap << dendl;
-          for (interval_set<uint64_t>::iterator s = overlap.begin();
-              s != overlap.end(); ++s) {
-            uint64_t su_off = s.get_start() - opos;
-            uint64_t logical_off = off + r->first + su_off;
-            ldout(m_image_ctx.cct, 20) << "   overlap extent " << s.get_start()
-                           << "~" << s.get_len() << " logical "
-                          << logical_off << "~" << s.get_len() << dendl;
-            m_callback(logical_off, s.get_len(), end_exists, m_callback_arg);
-          }
-          opos += r->second;
+      } else {
+        C_DiffObject *diff_object = new C_DiffObject(m_image_ctx, head_ctx,
+                                                     diff_context,
+                                                     p->first.name, off,
+                                                     p->second);
+        diff_object->send();
+
+        r = diff_context.invoke_callback();
+        if (r < 0) {
+          diff_context.wait_for_ret();
+          return r;
         }
-        assert(opos == q->offset + q->length);
       }
     }
 
@@ -209,7 +382,13 @@ int DiffIterate::execute() {
     off += read_len;
   }
 
-  return 0;
+  r = diff_context.wait_for_ret();
+  if (r < 0) {
+    return r;
+  }
+
+  r = diff_context.invoke_callback();
+  return r;
 }
 
 int DiffIterate::diff_object_map(uint64_t from_snap_id, uint64_t to_snap_id,