#include "librados/snap_set_diff.h"
+#include <boost/bind.hpp>
+
#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
#define dout_prefix *_dout << "librbd: "
return 0;
}
+ /**
+  * Ensure this client is allowed to modify the image before an update.
+  *
+  * The caller must hold ictx->owner_lock for read (not for write); this
+  * is asserted.  If the exclusive lock has to be acquired, the read lock
+  * is released, owner_lock is taken for write around the try_lock()
+  * attempt, and the read lock is re-acquired before returning -- so
+  * state guarded by owner_lock may have changed across this call.
+  *
+  * @param ictx image context to prepare
+  * @return -EROFS if the image has no watcher; 0 if locking is not
+  *         supported or the lock is already owned; otherwise the result
+  *         of image_watcher->try_lock() (or 0 if the lock turned out to
+  *         be owned once owner_lock was re-taken for write)
+  */
+ static int prepare_image_update(ImageCtx *ictx)
+ {
+   assert(ictx->owner_lock.is_locked() && !ictx->owner_lock.is_wlocked());
+   if (ictx->image_watcher == NULL) {
+     return -EROFS;
+   } else if (!ictx->image_watcher->is_lock_supported() ||
+              ictx->image_watcher->is_lock_owner()) {
+     return 0;
+   }
+
+   // need to upgrade to a write lock
+   int r = 0;
+   ictx->owner_lock.put_read();
+   {
+     RWLock::WLocker l(ictx->owner_lock);
+     // re-check: ownership may have changed while the lock was dropped
+     if (!ictx->image_watcher->is_lock_owner()) {
+       r = ictx->image_watcher->try_lock();
+     }
+   }
+   ictx->owner_lock.get_read();
+   return r;
+ }
+
int snap_create(ImageCtx *ictx, const char *snap_name)
{
ldout(ictx->cct, 20) << "snap_create " << ictx << " " << snap_name << dendl;
if (r < 0)
return r;
- RWLock::RLocker l(ictx->md_lock);
+ RWLock::RLocker l(ictx->owner_lock);
+ r = prepare_image_update(ictx);
+ if (r < 0) {
+ return -EROFS;
+ }
+ if (ictx->image_watcher->is_lock_supported() &&
+ !ictx->image_watcher->is_lock_owner()) {
+ // TODO: temporary until request proxied to lock owner
+ return -EROFS;
+ }
+
+ RWLock::RLocker l2(ictx->md_lock);
do {
r = add_snap(ictx, snap_name);
} while (r == -ESTALE);
return 0;
}
+ /**
+  * Report whether this client currently owns the image's exclusive lock.
+  *
+  * @param ictx image context to query
+  * @param is_owner [out] false when the image has no watcher, otherwise
+  *                 the value of image_watcher->is_lock_owner()
+  * @return always 0
+  */
+ int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner)
+ {
+   // ownership is sampled while holding owner_lock for read
+   RWLock::RLocker owner_locker(ictx->owner_lock);
+   if (ictx->image_watcher == NULL) {
+     *is_owner = false;
+   } else {
+     *is_owner = ictx->image_watcher->is_lock_owner();
+   }
+   return 0;
+ }
+
int remove(IoCtx& io_ctx, const char *imgname, ProgressContext& prog_ctx)
{
CephContext *cct((CephContext *)io_ctx.cct());
return r;
}
- RWLock::WLocker l(ictx->md_lock);
+ RWLock::RLocker l(ictx->owner_lock);
+ r = prepare_image_update(ictx);
+ if (r < 0) {
+ return -EROFS;
+ }
+ if (ictx->image_watcher->is_lock_supported() &&
+ !ictx->image_watcher->is_lock_owner()) {
+ // TODO: temporary until request proxied to lock owner
+ return -EROFS;
+ }
+
+ RWLock::WLocker l2(ictx->md_lock);
if (size < ictx->size && ictx->object_cacher) {
// need to invalidate since we're deleting objects, and
// ObjectCacher doesn't track non-existent objects
{
CephContext *cct = ictx->cct;
ldout(cct, 20) << "ictx_check " << ictx << dendl;
+
ictx->refresh_lock.Lock();
bool needs_refresh = ictx->last_refresh != ictx->refresh_seq;
ictx->refresh_lock.Unlock();
+ if (ictx->image_watcher != NULL) {
+ // might have encountered an error re-registering a watch
+ int r = ictx->image_watcher->get_watch_error();
+ if (r < 0) {
+ lderr(cct) << "rbd header watch invalid: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
if (needs_refresh) {
RWLock::WLocker l(ictx->md_lock);
if (r < 0)
return r;
- RWLock::WLocker l(ictx->md_lock);
+ RWLock::RLocker l(ictx->owner_lock);
+ RWLock::WLocker l2(ictx->md_lock);
snap_t snap_id;
uint64_t new_size;
{
// need to drop snap_lock before invalidating cache
- RWLock::RLocker l2(ictx->snap_lock);
+ RWLock::RLocker l3(ictx->snap_lock);
if (!ictx->snap_exists)
return -ENOENT;
new_size = ictx->get_image_size(snap_id);
}
+ r = prepare_image_update(ictx);
+ if (r < 0) {
+ return -EROFS;
+ }
+ if (ictx->image_watcher->is_lock_supported() &&
+ !ictx->image_watcher->is_lock_owner()) {
+ return -EROFS;
+ }
+
// need to flush any pending writes before resizing and rolling back -
// writes might create new snapshots. Rolling back will replace
// the current version, so we have to invalidate that too.
int _snap_set(ImageCtx *ictx, const char *snap_name)
{
- RWLock::WLocker l1(ictx->snap_lock);
- RWLock::WLocker l2(ictx->parent_lock);
+ RWLock::WLocker l(ictx->owner_lock);
+ RWLock::RLocker l1(ictx->md_lock);
+ RWLock::WLocker l2(ictx->snap_lock);
+ RWLock::WLocker l3(ictx->parent_lock);
int r;
if ((snap_name != NULL) && (strlen(snap_name) != 0)) {
r = ictx->snap_set(snap_name);
if (r < 0) {
return r;
}
+
refresh_parent(ictx);
return 0;
}
// ignore return value, since we may be set to a non-existent
// snapshot and the user is trying to fix that
ictx_check(ictx);
+ if (ictx->image_watcher != NULL) {
+ ictx->image_watcher->flush_aio_operations();
+ }
if (ictx->object_cacher) {
// complete pending writes before we're set to a snapshot and
// get -EROFS for writes
RWLock::WLocker l(ictx->md_lock);
ictx->flush_cache();
}
- return _snap_set(ictx, snap_name);
+ int r = _snap_set(ictx, snap_name);
+ if (r < 0) {
+ return r;
+ }
+
+ RWLock::WLocker l(ictx->owner_lock);
+ if (ictx->image_watcher != NULL) {
+ if (!ictx->image_watcher->is_lock_supported() &&
+ ictx->image_watcher->is_lock_owner()) {
+ r = ictx->image_watcher->unlock();
+ if (r < 0) {
+ lderr(ictx->cct) << "error unlocking image: " << cpp_strerror(r)
+ << dendl;
+ }
+ }
+ }
+ return r;
}
int open_image(ImageCtx *ictx)
ldout(ictx->cct, 20) << "close_image " << ictx << dendl;
ictx->readahead.wait_for_pending();
+ if (ictx->image_watcher != NULL) {
+ ictx->image_watcher->flush_aio_operations();
+ }
if (ictx->object_cacher) {
ictx->shutdown_cache(); // implicitly flushes
} else {
}
if (ictx->image_watcher) {
+   // give up the exclusive lock (if held) before the watch is removed
+   RWLock::WLocker l(ictx->owner_lock);
+   if (ictx->image_watcher->is_lock_owner()) {
+     int r = ictx->image_watcher->unlock();
+     if (r < 0) {
+       // the failure is only logged; the watch is unregistered regardless
+       lderr(ictx->cct) << "error unlocking image: " << cpp_strerror(r)
+                        << dendl;
+     }
+   }
    ictx->unregister_watch();
}
overlap_objects = Striper::get_num_objects(ictx->layout, overlap);
}
+ RWLock::RLocker l(ictx->owner_lock);
+ r = prepare_image_update(ictx);
+ if (r < 0) {
+ return -EROFS;
+ }
+ if (ictx->image_watcher->is_lock_supported() &&
+ !ictx->image_watcher->is_lock_owner()) {
+ // TODO: temporary until request proxied to lock owner
+ return -EROFS;
+ }
+
SimpleThrottle throttle(cct->_conf->rbd_concurrent_management_ops, false);
for (uint64_t ono = 0; ono < overlap_objects; ono++) {
return r;
}
+ if (ictx->image_watcher != NULL) {
+ ictx->image_watcher->flush_aio_operations();
+ }
ictx->user_flushed();
c->get();
return r;
}
+ if (ictx->image_watcher != NULL) {
+ ictx->image_watcher->flush_aio_operations();
+ }
ictx->user_flushed();
r = _flush(ictx);
ictx->perfcounter->inc(l_librbd_flush);
return r;
}
+ if (ictx->image_watcher != NULL) {
+ ictx->image_watcher->flush_aio_operations();
+ }
+
RWLock::WLocker l(ictx->md_lock);
r = ictx->invalidate_cache();
return r;
ldout(cct, 20) << " parent overlap " << overlap << dendl;
+ c->get();
+ c->init_time(ictx, AIO_TYPE_WRITE);
+
+ RWLock::RLocker l(ictx->owner_lock);
+ if (ictx->image_watcher->is_lock_supported() &&
+ !ictx->image_watcher->is_lock_owner()) {
+ c->put();
+ return ictx->image_watcher->request_lock(
+ boost::bind(&librbd::aio_write, ictx, off, len, buf, _1, op_flags), c);
+ }
+
// map
vector<ObjectExtent> extents;
if (len > 0) {
&ictx->layout, off, mylen, 0, extents);
}
- c->get();
- c->init_time(ictx, AIO_TYPE_WRITE);
for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
<< " from " << p->buffer_extents << dendl;
return -EROFS;
}
+ c->get();
+ c->init_time(ictx, AIO_TYPE_DISCARD);
+
+ RWLock::RLocker l(ictx->owner_lock);
+ if (ictx->image_watcher->is_lock_supported() &&
+ !ictx->image_watcher->is_lock_owner()) {
+ c->put();
+ return ictx->image_watcher->request_lock(
+ boost::bind(&librbd::aio_discard, ictx, off, len, _1), c);
+ }
+
// map
vector<ObjectExtent> extents;
if (len > 0) {
&ictx->layout, off, len, 0, extents);
}
- c->get();
- c->init_time(ictx, AIO_TYPE_DISCARD);
for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
<< " from " << p->buffer_extents << dendl;
def test_ownership(self):
with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as (
image1, image2):
- eq(image1.is_exclusive_lock_owner(), False)
+ image1.write('0'*256, 0)
+ eq(image1.is_exclusive_lock_owner(), True)
eq(image2.is_exclusive_lock_owner(), False)
+
+ def test_snapshot_leadership(self):  # exclusive-lock ownership across snapshot create / set / open
+ with Image(ioctx, image_name) as image:
+ image.create_snap('snap')
+ eq(image.is_exclusive_lock_owner(), True)  # the handle that created the snapshot must own the lock
+ try:
+ with Image(ioctx, image_name) as image:
+ image.write('0'*256, 0)
+ eq(image.is_exclusive_lock_owner(), True)  # the handle that wrote must own the lock
+ image.set_snap('snap')
+ eq(image.is_exclusive_lock_owner(), False)  # after switching to the snapshot the lock must be released
+ with Image(ioctx, image_name, snapshot='snap') as image:
+ eq(image.is_exclusive_lock_owner(), False)  # a handle opened on a snapshot must not own the lock
+ finally:
+ with Image(ioctx, image_name) as image:
+ image.remove_snap('snap')  # cleanup: drop the snapshot created above
+
+ def test_read_only_leadership(self):
+ with Image(ioctx, image_name, read_only=True) as image:
+ eq(image.is_exclusive_lock_owner(), False)
+
+ def test_follower_flatten(self):  # flatten of a clone: follower handle is rejected, writer handle succeeds
+ with Image(ioctx, image_name) as image:
+ image.create_snap('snap')
+ image.protect_snap('snap')  # protected before the snapshot is cloned below
+ try:
+ RBD().clone(ioctx, image_name, 'snap', ioctx, 'clone', features)
+ with nested(Image(ioctx, 'clone'), Image(ioctx2, 'clone')) as (
+ image1, image2):
+ image1.write('0'*256, 0)  # image1 writes first, ahead of the flatten attempts
+ assert_raises(ReadOnlyImage, image2.flatten)  # the other handle's flatten must raise ReadOnlyImage
+ image1.flatten()
+ finally:
+ RBD().remove(ioctx, 'clone')  # cleanup: clone first, then the parent snapshot
+ with Image(ioctx, image_name) as image:
+ image.unprotect_snap('snap')
+ image.remove_snap('snap')
+
+ def test_follower_resize(self):
+ with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as (
+ image1, image2):
+ image1.write('0'*256, 0)
+ for new_size in [IMG_SIZE * 2, IMG_SIZE / 2]:
+ assert_raises(ReadOnlyImage, image2.resize, new_size)
+ image1.resize(new_size);
+
+ def test_follower_snap_rollback(self):
+ with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as (
+ image1, image2):
+ image1.create_snap('snap')
+ try:
+ assert_raises(ReadOnlyImage, image2.rollback_to_snap, 'snap')
+ image1.rollback_to_snap('snap')
+ finally:
+ image1.remove_snap('snap')
+
+ def test_follower_discard(self):
+ with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as (
+ image1, image2):
+ data = rand_data(256)
+ image1.write(data, 0)
+ image2.discard(0, 256)
+ eq(image1.is_exclusive_lock_owner(), False)
+ eq(image2.is_exclusive_lock_owner(), True)
+ read = image2.read(0, 256)
+ eq(256*'\0', read)
+
+ def test_follower_write(self):
+ with nested(Image(ioctx, image_name), Image(ioctx2, image_name)) as (
+ image1, image2):
+ data = rand_data(256)
+ image1.write(data, 0)
+ image2.write(data, IMG_SIZE / 2)
+ eq(image1.is_exclusive_lock_owner(), False)
+ eq(image2.is_exclusive_lock_owner(), True)
+ for offset in [0, IMG_SIZE / 2]:
+ read = image2.read(0, 256)
+ eq(data, read)