From ccadff1445ce8525e4c2419dca63683f0bc1c113 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Wed, 8 Oct 2014 08:41:53 -0400 Subject: [PATCH] librbd: Integrate librbd with new exclusive lock feature Operations that update the image now require the exclusive lock if the feature is enabled. AIO write and discard operations will automatically request the exclusive lock from the current leader to support live-migration. Signed-off-by: Jason Dillaman --- src/librbd/AioCompletion.h | 9 +- src/librbd/internal.cc | 170 +++++++++++++++++++++++++++++++++--- src/librbd/internal.h | 1 + src/librbd/librbd.cc | 10 +-- src/test/pybind/test_rbd.py | 81 ++++++++++++++++- 5 files changed, 249 insertions(+), 22 deletions(-) diff --git a/src/librbd/AioCompletion.h b/src/librbd/AioCompletion.h index 3aa5bd5422a87..e4b034f3c904d 100644 --- a/src/librbd/AioCompletion.h +++ b/src/librbd/AioCompletion.h @@ -86,13 +86,14 @@ namespace librbd { void finish_adding_requests(CephContext *cct); void init_time(ImageCtx *i, aio_type_t t) { - ictx = i; - { + if (ictx == NULL) { + ictx = i; + aio_type = t; + start_time = ceph_clock_now(ictx->cct); + Mutex::Locker l(ictx->aio_lock); ++ictx->pending_aio; } - aio_type = t; - start_time = ceph_clock_now(ictx->cct); } void complete(); diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc index 3eb17601466e9..7794d986f261c 100644 --- a/src/librbd/internal.cc +++ b/src/librbd/internal.cc @@ -25,6 +25,8 @@ #include "librados/snap_set_diff.h" +#include + #define dout_subsys ceph_subsys_rbd #undef dout_prefix #define dout_prefix *_dout << "librbd: " @@ -463,6 +465,29 @@ namespace librbd { return 0; } + static int prepare_image_update(ImageCtx *ictx) + { + assert(ictx->owner_lock.is_locked() && !ictx->owner_lock.is_wlocked()); + if (ictx->image_watcher == NULL) { + return -EROFS;; + } else if (!ictx->image_watcher->is_lock_supported() || + ictx->image_watcher->is_lock_owner()) { + return 0; + } + + // need to upgrade to a write lock + int r = 0; + ictx->owner_lock.put_read(); + { + RWLock::WLocker l(ictx->owner_lock); + if (!ictx->image_watcher->is_lock_owner()) { + r = ictx->image_watcher->try_lock(); + } + } + ictx->owner_lock.get_read(); + return r; + } + int snap_create(ImageCtx *ictx, const char *snap_name) { ldout(ictx->cct, 20) << "snap_create " << ictx << " " << snap_name << dendl; @@ -474,7 +499,18 @@ namespace librbd { if (r < 0) return r; - RWLock::RLocker l(ictx->md_lock); + RWLock::RLocker l(ictx->owner_lock); + r = prepare_image_update(ictx); + if (r < 0) { + return -EROFS; + } + if (ictx->image_watcher->is_lock_supported() && + !ictx->image_watcher->is_lock_owner()) { + // TODO: temporary until request proxied to lock owner + return -EROFS; + } + + RWLock::RLocker l2(ictx->md_lock); do { r = add_snap(ictx, snap_name); } while (r == -ESTALE); @@ -1404,6 +1440,14 @@ reprotect_and_return_err: return 0; } + int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner) + { + RWLock::RLocker l(ictx->owner_lock); + *is_owner = (ictx->image_watcher != NULL && + ictx->image_watcher->is_lock_owner()); + return 0; + } + int remove(IoCtx& io_ctx, const char *imgname, ProgressContext& prog_ctx) { CephContext *cct((CephContext *)io_ctx.cct()); @@ -1566,7 +1610,18 @@ reprotect_and_return_err: return r; } - RWLock::WLocker l(ictx->md_lock); + RWLock::RLocker l(ictx->owner_lock); + r = prepare_image_update(ictx); + if (r < 0) { + return -EROFS; + } + if (ictx->image_watcher->is_lock_supported() && + !ictx->image_watcher->is_lock_owner()) { + // TODO: temporary until request proxied to lock owner + return -EROFS; + } + + RWLock::WLocker l2(ictx->md_lock); if (size < ictx->size && ictx->object_cacher) { // need to invalidate since we're deleting objects, and // ObjectCacher doesn't track non-existent objects @@ -1672,10 +1727,21 @@ reprotect_and_return_err: { CephContext *cct = ictx->cct; ldout(cct, 20) << "ictx_check " << ictx << dendl; + ictx->refresh_lock.Lock(); bool needs_refresh = ictx->last_refresh != ictx->refresh_seq; ictx->refresh_lock.Unlock(); + if (ictx->image_watcher != NULL) { + // might have encountered an error re-registering a watch + int r = ictx->image_watcher->get_watch_error(); + if (r < 0) { + lderr(cct) << "rbd header watch invalid: " << cpp_strerror(r) + << dendl; + return r; + } + } + if (needs_refresh) { RWLock::WLocker l(ictx->md_lock); @@ -1894,12 +1960,13 @@ reprotect_and_return_err: if (r < 0) return r; - RWLock::WLocker l(ictx->md_lock); + RWLock::RLocker l(ictx->owner_lock); + RWLock::WLocker l2(ictx->md_lock); snap_t snap_id; uint64_t new_size; { // need to drop snap_lock before invalidating cache - RWLock::RLocker l2(ictx->snap_lock); + RWLock::RLocker l3(ictx->snap_lock); if (!ictx->snap_exists) return -ENOENT; @@ -1914,6 +1981,15 @@ reprotect_and_return_err: new_size = ictx->get_image_size(snap_id); } + r = prepare_image_update(ictx); + if (r < 0) { + return -EROFS; + } + if (ictx->image_watcher->is_lock_supported() && + !ictx->image_watcher->is_lock_owner()) { + return -EROFS; + } + // need to flush any pending writes before resizing and rolling back - // writes might create new snapshots. Rolling back will replace // the current version, so we have to invalidate that too. @@ -2102,8 +2178,10 @@ reprotect_and_return_err: int _snap_set(ImageCtx *ictx, const char *snap_name) { - RWLock::WLocker l1(ictx->snap_lock); - RWLock::WLocker l2(ictx->parent_lock); + RWLock::WLocker l(ictx->owner_lock); + RWLock::RLocker l1(ictx->md_lock); + RWLock::WLocker l2(ictx->snap_lock); + RWLock::WLocker l3(ictx->parent_lock); int r; if ((snap_name != NULL) && (strlen(snap_name) != 0)) { r = ictx->snap_set(snap_name); @@ -2114,6 +2192,7 @@ reprotect_and_return_err: if (r < 0) { return r; } + refresh_parent(ictx); return 0; } @@ -2125,13 +2204,32 @@ reprotect_and_return_err: // ignore return value, since we may be set to a non-existent // snapshot and the user is trying to fix that ictx_check(ictx); + if (ictx->image_watcher != NULL) { + ictx->image_watcher->flush_aio_operations(); + } if (ictx->object_cacher) { // complete pending writes before we're set to a snapshot and // get -EROFS for writes RWLock::WLocker l(ictx->md_lock); ictx->flush_cache(); } - return _snap_set(ictx, snap_name); + int r = _snap_set(ictx, snap_name); + if (r < 0) { + return r; + } + + RWLock::WLocker l(ictx->owner_lock); + if (ictx->image_watcher != NULL) { + if (!ictx->image_watcher->is_lock_supported() && + ictx->image_watcher->is_lock_owner()) { + r = ictx->image_watcher->unlock(); + if (r < 0) { + lderr(ictx->cct) << "error unlocking image: " << cpp_strerror(r) + << dendl; + } + } + } + return r; } int open_image(ImageCtx *ictx) @@ -2175,6 +2273,9 @@ reprotect_and_return_err: ldout(ictx->cct, 20) << "close_image " << ictx << dendl; ictx->readahead.wait_for_pending(); + if (ictx->image_watcher != NULL) { + ictx->image_watcher->flush_aio_operations(); + } if (ictx->object_cacher) { ictx->shutdown_cache(); // implicitly flushes } else { @@ -2188,6 +2289,14 @@ reprotect_and_return_err: } if (ictx->image_watcher) { + RWLock::WLocker l(ictx->owner_lock); + if (ictx->image_watcher->is_lock_owner()) { + int r = ictx->image_watcher->unlock(); + if (r < 0) { + lderr(ictx->cct) << "error unlocking object map: " << cpp_strerror(r) + << dendl; + } + } ictx->unregister_watch(); } @@ -2239,6 +2348,17 @@ reprotect_and_return_err: overlap_objects = Striper::get_num_objects(ictx->layout, overlap); } + RWLock::RLocker l(ictx->owner_lock); + r = prepare_image_update(ictx); + if (r < 0) { + return -EROFS; + } + if (ictx->image_watcher->is_lock_supported() && + !ictx->image_watcher->is_lock_owner()) { + // TODO: temporary until request proxied to lock owner + return -EROFS; + } + SimpleThrottle throttle(cct->_conf->rbd_concurrent_management_ops, false); for (uint64_t ono = 0; ono < overlap_objects; ono++) { @@ -2902,6 +3022,9 @@ reprotect_and_return_err: return r; } + if (ictx->image_watcher != NULL) { + ictx->image_watcher->flush_aio_operations(); + } ictx->user_flushed(); c->get(); @@ -2933,6 +3056,9 @@ reprotect_and_return_err: return r; } + if (ictx->image_watcher != NULL) { + ictx->image_watcher->flush_aio_operations(); + } ictx->user_flushed(); r = _flush(ictx); ictx->perfcounter->inc(l_librbd_flush); @@ -2966,6 +3092,10 @@ reprotect_and_return_err: return r; } + if (ictx->image_watcher != NULL) { + ictx->image_watcher->flush_aio_operations(); + } + RWLock::WLocker l(ictx->md_lock); r = ictx->invalidate_cache(); return r; @@ -3004,6 +3134,17 @@ reprotect_and_return_err: ldout(cct, 20) << " parent overlap " << overlap << dendl; + c->get(); + c->init_time(ictx, AIO_TYPE_WRITE); + + RWLock::RLocker l(ictx->owner_lock); + if (ictx->image_watcher->is_lock_supported() && + !ictx->image_watcher->is_lock_owner()) { + c->put(); + return ictx->image_watcher->request_lock( + boost::bind(&librbd::aio_write, ictx, off, len, buf, _1, op_flags), c); + } + // map vector extents; if (len > 0) { @@ -3011,8 +3152,6 @@ reprotect_and_return_err: &ictx->layout, off, mylen, 0, extents); } - c->get(); - c->init_time(ictx, AIO_TYPE_WRITE); for (vector::iterator p = extents.begin(); p != extents.end(); ++p) { ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length << " from " << p->buffer_extents << dendl; @@ -3088,6 +3227,17 @@ reprotect_and_return_err: return -EROFS; } + c->get(); + c->init_time(ictx, AIO_TYPE_DISCARD); + + RWLock::RLocker l(ictx->owner_lock); + if (ictx->image_watcher->is_lock_supported() && + !ictx->image_watcher->is_lock_owner()) { + c->put(); + return ictx->image_watcher->request_lock( + boost::bind(&librbd::aio_discard, ictx, off, len, _1), c); + } + // map vector extents; if (len > 0) { @@ -3095,8 +3245,6 @@ reprotect_and_return_err: &ictx->layout, off, len, 0, extents); } - c->get(); - c->init_time(ictx, AIO_TYPE_DISCARD); for (vector::iterator p = extents.begin(); p != extents.end(); ++p) { ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length << " from " << p->buffer_extents << dendl; diff --git a/src/librbd/internal.h b/src/librbd/internal.h index 417efc0ab6a01..249a33d19a761 100644 --- a/src/librbd/internal.h +++ b/src/librbd/internal.h @@ -100,6 +100,7 @@ namespace librbd { int get_overlap(ImageCtx *ictx, uint64_t *overlap); int get_parent_info(ImageCtx *ictx, string *parent_pool_name, string *parent_name, string *parent_snap_name); + int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner); int remove(librados::IoCtx& io_ctx, const char *imgname, ProgressContext& prog_ctx); diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc index dc60ba94c0b1f..de8211eae34b9 100644 --- a/src/librbd/librbd.cc +++ b/src/librbd/librbd.cc @@ -362,9 +362,7 @@ namespace librbd { { ImageCtx *ictx = (ImageCtx *)ctx; tracepoint(librbd, is_exclusive_lock_owner_enter, ictx); - // TODO: implement - int r = 0; - *is_owner = false; + int r = librbd::is_exclusive_lock_owner(ictx, is_owner); tracepoint(librbd, is_exclusive_lock_owner_exit, ictx, r, *is_owner); return r; } @@ -1178,9 +1176,9 @@ extern "C" int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner) { librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; tracepoint(librbd, is_exclusive_lock_owner_enter, ictx); - // TODO implement - int r = 0; - *is_owner = 0; + bool owner; + int r = librbd::is_exclusive_lock_owner(ictx, &owner); + *is_owner = owner ? 1 : 0; tracepoint(librbd, is_exclusive_lock_owner_exit, ictx, r, *is_owner); return r; } diff --git a/src/test/pybind/test_rbd.py b/src/test/pybind/test_rbd.py index e9bccfffb3ebc..f640f41adc309 100644 --- a/src/test/pybind/test_rbd.py +++ b/src/test/pybind/test_rbd.py @@ -913,5 +913,84 @@ class TestExclusiveLock(object): def test_ownership(self): with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as ( image1, image2): - eq(image1.is_exclusive_lock_owner(), False) + image1.write('0'*256, 0) + eq(image1.is_exclusive_lock_owner(), True) eq(image2.is_exclusive_lock_owner(), False) + + def test_snapshot_leadership(self): + with Image(ioctx, image_name) as image: + image.create_snap('snap') + eq(image.is_exclusive_lock_owner(), True) + try: + with Image(ioctx, image_name) as image: + image.write('0'*256, 0) + eq(image.is_exclusive_lock_owner(), True) + image.set_snap('snap') + eq(image.is_exclusive_lock_owner(), False) + with Image(ioctx, image_name, snapshot='snap') as image: + eq(image.is_exclusive_lock_owner(), False) + finally: + with Image(ioctx, image_name) as image: + image.remove_snap('snap') + + def test_read_only_leadership(self): + with Image(ioctx, image_name, read_only=True) as image: + eq(image.is_exclusive_lock_owner(), False) + + def test_follower_flatten(self): + with Image(ioctx, image_name) as image: + image.create_snap('snap') + image.protect_snap('snap') + try: + RBD().clone(ioctx, image_name, 'snap', ioctx, 'clone', features) + with nested(Image(ioctx, 'clone'), Image(ioctx2, 'clone')) as ( + image1, image2): + image1.write('0'*256, 0) + assert_raises(ReadOnlyImage, image2.flatten) + image1.flatten() + finally: + RBD().remove(ioctx, 'clone') + with Image(ioctx, image_name) as image: + image.unprotect_snap('snap') + image.remove_snap('snap') + + def test_follower_resize(self): + with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as ( + image1, image2): + image1.write('0'*256, 0) + for new_size in [IMG_SIZE * 2, IMG_SIZE / 2]: + assert_raises(ReadOnlyImage, image2.resize, new_size) + image1.resize(new_size); + + def test_follower_snap_rollback(self): + with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as ( + image1, image2): + image1.create_snap('snap') + try: + assert_raises(ReadOnlyImage, image2.rollback_to_snap, 'snap') + image1.rollback_to_snap('snap') + finally: + image1.remove_snap('snap') + + def test_follower_discard(self): + with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as ( + image1, image2): + data = rand_data(256) + image1.write(data, 0) + image2.discard(0, 256) + eq(image1.is_exclusive_lock_owner(), False) + eq(image2.is_exclusive_lock_owner(), True) + read = image2.read(0, 256) + eq(256*'\0', read) + + def test_follower_write(self): + with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as ( + image1, image2): + data = rand_data(256) + image1.write(data, 0) + image2.write(data, IMG_SIZE / 2) + eq(image1.is_exclusive_lock_owner(), False) + eq(image2.is_exclusive_lock_owner(), True) + for offset in [0, IMG_SIZE / 2]: + read = image2.read(0, 256) + eq(data, read) -- 2.39.5