]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd: Integrate librbd with new exclusive lock feature
authorJason Dillaman <dillaman@redhat.com>
Wed, 8 Oct 2014 12:41:53 +0000 (08:41 -0400)
committerJason Dillaman <dillaman@redhat.com>
Tue, 13 Jan 2015 01:01:19 +0000 (20:01 -0500)
Operations that update the image now require the exclusive lock
if the feature is enabled.  AIO write and discard operations will
automatically request the exclusive lock from the current leader
to support live-migration.

Signed-off-by: Jason Dillaman <dillaman@redhat.com>
src/librbd/AioCompletion.h
src/librbd/internal.cc
src/librbd/internal.h
src/librbd/librbd.cc
src/test/pybind/test_rbd.py

index 3aa5bd5422a87cd3b974f8e793f643701cb22252..e4b034f3c904de7460569155c3d4f33a85e33d58 100644 (file)
@@ -86,13 +86,14 @@ namespace librbd {
     void finish_adding_requests(CephContext *cct);
 
     void init_time(ImageCtx *i, aio_type_t t) {
-      ictx = i;
-      {
+      if (ictx == NULL) {
+        ictx = i;
+        aio_type = t;
+        start_time = ceph_clock_now(ictx->cct);
+
         Mutex::Locker l(ictx->aio_lock);
         ++ictx->pending_aio;
       }
-      aio_type = t;
-      start_time = ceph_clock_now(ictx->cct);
     }
 
     void complete();
index 3eb17601466e928a1958df00e9d0ee53e2c35d01..7794d986f261cf5094f6e8f21e80485bfb5bfb61 100644 (file)
@@ -25,6 +25,8 @@
 
 #include "librados/snap_set_diff.h"
 
+#include <boost/bind.hpp>
+
 #define dout_subsys ceph_subsys_rbd
 #undef dout_prefix
 #define dout_prefix *_dout << "librbd: "
@@ -463,6 +465,29 @@ namespace librbd {
     return 0;
   }
 
+  static int prepare_image_update(ImageCtx *ictx)
+  {
+    assert(ictx->owner_lock.is_locked() && !ictx->owner_lock.is_wlocked());
+    if (ictx->image_watcher == NULL) {
+      return -EROFS;;
+    } else if (!ictx->image_watcher->is_lock_supported() ||
+              ictx->image_watcher->is_lock_owner()) {
+      return 0;
+    }
+
+    // need to upgrade to a write lock
+    int r = 0;
+    ictx->owner_lock.put_read();
+    {
+      RWLock::WLocker l(ictx->owner_lock);
+      if (!ictx->image_watcher->is_lock_owner()) {
+       r = ictx->image_watcher->try_lock();
+      }
+    }
+    ictx->owner_lock.get_read();
+    return r;
+  }
+
   int snap_create(ImageCtx *ictx, const char *snap_name)
   {
     ldout(ictx->cct, 20) << "snap_create " << ictx << " " << snap_name << dendl;
@@ -474,7 +499,18 @@ namespace librbd {
     if (r < 0)
       return r;
 
-    RWLock::RLocker l(ictx->md_lock);
+    RWLock::RLocker l(ictx->owner_lock);
+    r = prepare_image_update(ictx);
+    if (r < 0) {
+      return -EROFS;
+    }
+    if (ictx->image_watcher->is_lock_supported() &&
+        !ictx->image_watcher->is_lock_owner()) {
+      // TODO: temporary until request proxied to lock owner
+      return -EROFS;
+    }
+
+    RWLock::RLocker l2(ictx->md_lock);
     do {
       r = add_snap(ictx, snap_name);
     } while (r == -ESTALE);
@@ -1404,6 +1440,14 @@ reprotect_and_return_err:
     return 0;
   }
 
+  int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner)
+  {
+    RWLock::RLocker l(ictx->owner_lock);
+    *is_owner = (ictx->image_watcher != NULL &&
+                ictx->image_watcher->is_lock_owner());
+    return 0;
+  }
+
   int remove(IoCtx& io_ctx, const char *imgname, ProgressContext& prog_ctx)
   {
     CephContext *cct((CephContext *)io_ctx.cct());
@@ -1566,7 +1610,18 @@ reprotect_and_return_err:
       return r;
     }
 
-    RWLock::WLocker l(ictx->md_lock);
+    RWLock::RLocker l(ictx->owner_lock);
+    r = prepare_image_update(ictx);
+    if (r < 0) {
+      return -EROFS;
+    }
+    if (ictx->image_watcher->is_lock_supported() &&
+        !ictx->image_watcher->is_lock_owner()) {
+      // TODO: temporary until request proxied to lock owner
+      return -EROFS;
+    }
+
+    RWLock::WLocker l2(ictx->md_lock);
     if (size < ictx->size && ictx->object_cacher) {
       // need to invalidate since we're deleting objects, and
       // ObjectCacher doesn't track non-existent objects
@@ -1672,10 +1727,21 @@ reprotect_and_return_err:
   {
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "ictx_check " << ictx << dendl;
+
     ictx->refresh_lock.Lock();
     bool needs_refresh = ictx->last_refresh != ictx->refresh_seq;
     ictx->refresh_lock.Unlock();
 
+    if (ictx->image_watcher != NULL) {
+      // might have encountered an error re-registering a watch
+      int r = ictx->image_watcher->get_watch_error();
+      if (r < 0) {
+        lderr(cct) << "rbd header watch invalid: " << cpp_strerror(r)
+                   << dendl;
+       return r;
+      }
+    }
+
     if (needs_refresh) {
       RWLock::WLocker l(ictx->md_lock);
 
@@ -1894,12 +1960,13 @@ reprotect_and_return_err:
     if (r < 0)
       return r;
 
-    RWLock::WLocker l(ictx->md_lock);
+    RWLock::RLocker l(ictx->owner_lock);
+    RWLock::WLocker l2(ictx->md_lock);
     snap_t snap_id;
     uint64_t new_size;
     {
       // need to drop snap_lock before invalidating cache
-      RWLock::RLocker l2(ictx->snap_lock);
+      RWLock::RLocker l3(ictx->snap_lock);
       if (!ictx->snap_exists)
        return -ENOENT;
 
@@ -1914,6 +1981,15 @@ reprotect_and_return_err:
       new_size = ictx->get_image_size(snap_id);
     }
 
+    r = prepare_image_update(ictx);
+    if (r < 0) {
+      return -EROFS;
+    }
+    if (ictx->image_watcher->is_lock_supported() &&
+        !ictx->image_watcher->is_lock_owner()) {
+      return -EROFS;
+    }
+
     // need to flush any pending writes before resizing and rolling back -
     // writes might create new snapshots. Rolling back will replace
     // the current version, so we have to invalidate that too.
@@ -2102,8 +2178,10 @@ reprotect_and_return_err:
 
   int _snap_set(ImageCtx *ictx, const char *snap_name)
   {
-    RWLock::WLocker l1(ictx->snap_lock);
-    RWLock::WLocker l2(ictx->parent_lock);
+    RWLock::WLocker l(ictx->owner_lock);
+    RWLock::RLocker l1(ictx->md_lock);
+    RWLock::WLocker l2(ictx->snap_lock);
+    RWLock::WLocker l3(ictx->parent_lock);
     int r;
     if ((snap_name != NULL) && (strlen(snap_name) != 0)) {
       r = ictx->snap_set(snap_name);
@@ -2114,6 +2192,7 @@ reprotect_and_return_err:
     if (r < 0) {
       return r;
     }
+
     refresh_parent(ictx);
     return 0;
   }
@@ -2125,13 +2204,32 @@ reprotect_and_return_err:
     // ignore return value, since we may be set to a non-existent
     // snapshot and the user is trying to fix that
     ictx_check(ictx);
+    if (ictx->image_watcher != NULL) {
+      ictx->image_watcher->flush_aio_operations();
+    }
     if (ictx->object_cacher) {
       // complete pending writes before we're set to a snapshot and
       // get -EROFS for writes
       RWLock::WLocker l(ictx->md_lock);
       ictx->flush_cache();
     }
-    return _snap_set(ictx, snap_name);
+    int r = _snap_set(ictx, snap_name);
+    if (r < 0) {
+      return r;
+    }
+
+    RWLock::WLocker l(ictx->owner_lock);
+    if (ictx->image_watcher != NULL) {
+      if (!ictx->image_watcher->is_lock_supported() &&
+         ictx->image_watcher->is_lock_owner()) {
+       r = ictx->image_watcher->unlock();
+       if (r < 0) {
+         lderr(ictx->cct) << "error unlocking image: " << cpp_strerror(r)
+                           << dendl;
+       }
+      }
+    }
+    return r;
   }
 
   int open_image(ImageCtx *ictx)
@@ -2175,6 +2273,9 @@ reprotect_and_return_err:
     ldout(ictx->cct, 20) << "close_image " << ictx << dendl;
 
     ictx->readahead.wait_for_pending();
+    if (ictx->image_watcher != NULL) {
+      ictx->image_watcher->flush_aio_operations();
+    }
     if (ictx->object_cacher) {
       ictx->shutdown_cache(); // implicitly flushes
     } else {
@@ -2188,6 +2289,14 @@ reprotect_and_return_err:
     }
 
     if (ictx->image_watcher) {
+      RWLock::WLocker l(ictx->owner_lock);
+      if (ictx->image_watcher->is_lock_owner()) {
+        int r = ictx->image_watcher->unlock();
+        if (r < 0) {
+         lderr(ictx->cct) << "error unlocking object map: " << cpp_strerror(r)
+                          << dendl;
+        }
+      }
       ictx->unregister_watch();
     }
 
@@ -2239,6 +2348,17 @@ reprotect_and_return_err:
       overlap_objects = Striper::get_num_objects(ictx->layout, overlap); 
     }
 
+    RWLock::RLocker l(ictx->owner_lock);
+    r = prepare_image_update(ictx);
+    if (r < 0) {
+      return -EROFS;
+    }
+    if (ictx->image_watcher->is_lock_supported() &&
+        !ictx->image_watcher->is_lock_owner()) {
+      // TODO: temporary until request proxied to lock owner
+      return -EROFS;
+    }
+
     SimpleThrottle throttle(cct->_conf->rbd_concurrent_management_ops, false);
 
     for (uint64_t ono = 0; ono < overlap_objects; ono++) {
@@ -2902,6 +3022,9 @@ reprotect_and_return_err:
       return r;
     }
 
+    if (ictx->image_watcher != NULL) {
+      ictx->image_watcher->flush_aio_operations();
+    }
     ictx->user_flushed();
 
     c->get();
@@ -2933,6 +3056,9 @@ reprotect_and_return_err:
       return r;
     }
 
+    if (ictx->image_watcher != NULL) {
+      ictx->image_watcher->flush_aio_operations();
+    }
     ictx->user_flushed();
     r = _flush(ictx);
     ictx->perfcounter->inc(l_librbd_flush);
@@ -2966,6 +3092,10 @@ reprotect_and_return_err:
       return r;
     }
 
+    if (ictx->image_watcher != NULL) {
+      ictx->image_watcher->flush_aio_operations();
+    }
+
     RWLock::WLocker l(ictx->md_lock);
     r = ictx->invalidate_cache();
     return r;
@@ -3004,6 +3134,17 @@ reprotect_and_return_err:
 
     ldout(cct, 20) << "  parent overlap " << overlap << dendl;
 
+    c->get();
+    c->init_time(ictx, AIO_TYPE_WRITE);
+
+    RWLock::RLocker l(ictx->owner_lock);
+    if (ictx->image_watcher->is_lock_supported() &&
+       !ictx->image_watcher->is_lock_owner()) {
+      c->put();
+      return ictx->image_watcher->request_lock(
+       boost::bind(&librbd::aio_write, ictx, off, len, buf, _1, op_flags), c);
+    }
+
     // map
     vector<ObjectExtent> extents;
     if (len > 0) {
@@ -3011,8 +3152,6 @@ reprotect_and_return_err:
                               &ictx->layout, off, mylen, 0, extents);
     }
 
-    c->get();
-    c->init_time(ictx, AIO_TYPE_WRITE);
     for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
       ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
                     << " from " << p->buffer_extents << dendl;
@@ -3088,6 +3227,17 @@ reprotect_and_return_err:
       return -EROFS;
     }
 
+    c->get();
+    c->init_time(ictx, AIO_TYPE_DISCARD);
+
+    RWLock::RLocker l(ictx->owner_lock);
+    if (ictx->image_watcher->is_lock_supported() &&
+       !ictx->image_watcher->is_lock_owner()) {
+      c->put();
+      return ictx->image_watcher->request_lock(
+       boost::bind(&librbd::aio_discard, ictx, off, len, _1), c);
+    }
+
     // map
     vector<ObjectExtent> extents;
     if (len > 0) {
@@ -3095,8 +3245,6 @@ reprotect_and_return_err:
                               &ictx->layout, off, len, 0, extents);
     }
 
-    c->get();
-    c->init_time(ictx, AIO_TYPE_DISCARD);
     for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
       ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
                     << " from " << p->buffer_extents << dendl;
index 417efc0ab6a018725f87078807a914b5879b530a..249a33d19a761cb0b860b1ea22c28c6d08910b1b 100644 (file)
@@ -100,6 +100,7 @@ namespace librbd {
   int get_overlap(ImageCtx *ictx, uint64_t *overlap);
   int get_parent_info(ImageCtx *ictx, string *parent_pool_name,
                      string *parent_name, string *parent_snap_name);
+  int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner);
 
   int remove(librados::IoCtx& io_ctx, const char *imgname,
             ProgressContext& prog_ctx);
index dc60ba94c0b1f1af2ed76bf3b31c58abc1e8dce7..de8211eae34b9f5b2cac395b430831d7c16bc144 100644 (file)
@@ -362,9 +362,7 @@ namespace librbd {
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
     tracepoint(librbd, is_exclusive_lock_owner_enter, ictx);
-    // TODO: implement
-    int r = 0;
-    *is_owner = false;
+    int r = librbd::is_exclusive_lock_owner(ictx, is_owner);
     tracepoint(librbd, is_exclusive_lock_owner_exit, ictx, r, *is_owner);
     return r;
   }
@@ -1178,9 +1176,9 @@ extern "C" int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner)
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   tracepoint(librbd, is_exclusive_lock_owner_enter, ictx);
-  // TODO implement
-  int r = 0;
-  *is_owner = 0;
+  bool owner;
+  int r = librbd::is_exclusive_lock_owner(ictx, &owner);
+  *is_owner = owner ? 1 : 0;
   tracepoint(librbd, is_exclusive_lock_owner_exit, ictx, r, *is_owner);
   return r;
 }
index e9bccfffb3ebcd1ae0134a96943d07665ece09d9..f640f41adc309c7e62d7f6ffd68d9a3e32c2bd71 100644 (file)
@@ -913,5 +913,84 @@ class TestExclusiveLock(object):
     def test_ownership(self):
         with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as (
                 image1, image2):
-            eq(image1.is_exclusive_lock_owner(), False)
+            image1.write('0'*256, 0)
+            eq(image1.is_exclusive_lock_owner(), True)
             eq(image2.is_exclusive_lock_owner(), False)
+
+    def test_snapshot_leadership(self):
+        with Image(ioctx, image_name) as image:
+            image.create_snap('snap')
+            eq(image.is_exclusive_lock_owner(), True)
+        try:
+            with Image(ioctx, image_name) as image:
+                image.write('0'*256, 0)
+                eq(image.is_exclusive_lock_owner(), True)
+                image.set_snap('snap')
+                eq(image.is_exclusive_lock_owner(), False)
+            with Image(ioctx, image_name, snapshot='snap') as image:
+                eq(image.is_exclusive_lock_owner(), False)
+        finally:
+            with Image(ioctx, image_name) as image:
+                image.remove_snap('snap')
+
+    def test_read_only_leadership(self):
+        with Image(ioctx, image_name, read_only=True) as image:
+            eq(image.is_exclusive_lock_owner(), False)
+
+    def test_follower_flatten(self):
+        with Image(ioctx, image_name) as image:
+            image.create_snap('snap')
+            image.protect_snap('snap')
+        try:
+            RBD().clone(ioctx, image_name, 'snap', ioctx, 'clone', features)
+            with nested(Image(ioctx, 'clone'), Image(ioctx2, 'clone')) as (
+                    image1, image2):
+                image1.write('0'*256, 0)
+                assert_raises(ReadOnlyImage, image2.flatten)
+                image1.flatten()
+        finally:
+            RBD().remove(ioctx, 'clone')
+            with Image(ioctx, image_name) as image:
+                image.unprotect_snap('snap')
+                image.remove_snap('snap')
+
+    def test_follower_resize(self):
+        with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as (
+                image1, image2):
+            image1.write('0'*256, 0)
+            for new_size in [IMG_SIZE * 2, IMG_SIZE / 2]:
+                assert_raises(ReadOnlyImage, image2.resize, new_size)
+                image1.resize(new_size);
+
+    def test_follower_snap_rollback(self):
+        with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as (
+                image1, image2):
+            image1.create_snap('snap')
+            try:
+                assert_raises(ReadOnlyImage, image2.rollback_to_snap, 'snap')
+                image1.rollback_to_snap('snap')
+            finally:
+                image1.remove_snap('snap')
+
+    def test_follower_discard(self):
+        with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as (
+                image1, image2):
+            data = rand_data(256)
+            image1.write(data, 0)
+            image2.discard(0, 256)
+            eq(image1.is_exclusive_lock_owner(), False)
+            eq(image2.is_exclusive_lock_owner(), True)
+            read = image2.read(0, 256)
+            eq(256*'\0', read)
+
+    def test_follower_write(self):
+        with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as (
+                image1, image2):
+            data = rand_data(256)
+            image1.write(data, 0)
+            image2.write(data, IMG_SIZE / 2)
+            eq(image1.is_exclusive_lock_owner(), False)
+            eq(image2.is_exclusive_lock_owner(), True)
+            for offset in [0, IMG_SIZE / 2]:
+                read = image2.read(0, 256)
+                eq(data, read)