From 4e8259db4fb5ed572de100b4bc27d3bcbd38cdb2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 27 Dec 2013 16:11:27 -0800 Subject: [PATCH] osd/ReplicatedPG: allow cache-evict on snaps We do three things here: - make cache-evict a CACHE instead of WR op, allowing us to submit it on snaps (not just head) - allow eviction of a snap - verify that all snaps are missing before evicting a head Signed-off-by: Sage Weil --- src/include/rados.h | 2 +- src/include/rados/librados.hpp | 16 +- src/librados/librados.cc | 2 +- src/osd/ReplicatedPG.cc | 42 ++++- src/test/librados/tier.cc | 285 +++++++++++++++++++++++++++++++-- src/test/osd/RadosModel.h | 5 +- src/tools/rados/rados.cc | 5 +- 7 files changed, 324 insertions(+), 33 deletions(-) diff --git a/src/include/rados.h b/src/include/rados.h index c4aa5035fb368..894856649c464 100644 --- a/src/include/rados.h +++ b/src/include/rados.h @@ -229,7 +229,7 @@ enum { CEPH_OSD_OP_ISDIRTY = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 29, CEPH_OSD_OP_COPY_GET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 30, CEPH_OSD_OP_CACHE_FLUSH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 31, - CEPH_OSD_OP_CACHE_EVICT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 32, + CEPH_OSD_OP_CACHE_EVICT = CEPH_OSD_OP_MODE_CACHE | CEPH_OSD_OP_TYPE_DATA | 32, CEPH_OSD_OP_CACHE_TRY_FLUSH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 33, /* convert tmap to omap */ diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp index 5e9eb312fd6aa..411c90ff2fc98 100644 --- a/src/include/rados/librados.hpp +++ b/src/include/rados/librados.hpp @@ -328,14 +328,6 @@ namespace librados */ void cache_try_flush(); - /** - * evict a clean cache tier object - * - * This should be used in concert with OPERATION_IGNORE_CACHE to avoid - * triggering a promote on the OSD (that is then evicted). - */ - void cache_evict(); - friend class IoCtx; }; @@ -461,6 +453,14 @@ namespace librados * @param prval [out] place error code in prval upon completion */ void is_dirty(bool *isdirty, int *prval); + + /** + * evict a clean cache tier object + * + * This should be used in concert with OPERATION_IGNORE_CACHE to avoid + * triggering a promote on the OSD (that is then evicted). + */ + void cache_evict(); }; /* IoCtx : This is a context in which we can perform I/O. diff --git a/src/librados/librados.cc b/src/librados/librados.cc index ec3422c3f036f..70686be7a930a 100644 --- a/src/librados/librados.cc +++ b/src/librados/librados.cc @@ -415,7 +415,7 @@ void librados::ObjectWriteOperation::cache_try_flush() o->cache_try_flush(); } -void librados::ObjectWriteOperation::cache_evict() +void librados::ObjectReadOperation::cache_evict() { ::ObjectOperation *o = (::ObjectOperation *)impl; o->cache_evict(); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 1061ee44d5ac9..1e5d368697fbe 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3053,6 +3053,27 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) result = -EBUSY; break; } + if (soid.snap == CEPH_NOSNAP) { + // verify that all clones have been evicted + dout(20) << __func__ << " verifying clones are absent " + << ctx->new_snapset << dendl; + result = 0; + for (vector::iterator p = ctx->new_snapset.clones.begin(); + p != ctx->new_snapset.clones.end(); + ++p) { + hobject_t clone_oid = soid; + clone_oid.snap = *p; + ObjectContextRef clone_obc = get_object_context(clone_oid, false); + if (clone_obc && clone_obc->obs.exists) { + dout(10) << __func__ << " cannot evict head before clone " + << clone_oid << dendl; + result = -EBUSY; + break; + } + } + if (result < 0) + break; + } result = _delete_head(ctx, true); } break; @@ -4543,7 +4564,8 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx) } // clone, if necessary - make_writeable(ctx); + if (soid.snap == CEPH_NOSNAP) + make_writeable(ctx); finish_ctx(ctx, ctx->new_obs.exists ? pg_log_entry_t::MODIFY : @@ -4657,13 +4679,19 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type) ctx->user_at_version, ctx->reqid, ctx->mtime)); if (soid.snap < CEPH_NOSNAP) { - dout(20) << __func__ << " encoding snaps " << ctx->new_obs.oi.snaps << dendl; - ::encode(ctx->new_obs.oi.snaps, ctx->log.back().snaps); - OSDriver::OSTransaction _t(osdriver.get_transaction(&(ctx->local_t))); - set _snaps(ctx->new_obs.oi.snaps.begin(), - ctx->new_obs.oi.snaps.end()); - snap_mapper.add_oid(soid, _snaps, &_t); + if (log_op_type == pg_log_entry_t::MODIFY || + log_op_type == pg_log_entry_t::PROMOTE) { + dout(20) << __func__ << " encoding snaps " << ctx->new_obs.oi.snaps + << dendl; + ::encode(ctx->new_obs.oi.snaps, ctx->log.back().snaps); + + set _snaps(ctx->new_obs.oi.snaps.begin(), + ctx->new_obs.oi.snaps.end()); + snap_mapper.add_oid(soid, _snaps, &_t); + } else { + snap_mapper.remove_oid(soid, &_t); + } } // apply new object state. diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc index d2e162c89f3ee..de8323f6855f9 100644 --- a/src/test/librados/tier.cc +++ b/src/test/librados/tier.cc @@ -576,38 +576,299 @@ TEST(LibRadosTier, Evict) { // evict { - ObjectWriteOperation op; + ObjectReadOperation op; op.cache_evict(); librados::AioCompletion *completion = cluster.aio_create_completion(); - ASSERT_EQ(0, cache_ioctx.aio_operate( - "foo", completion, &op, librados::OPERATION_IGNORE_CACHE)); + ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op, + librados::OPERATION_IGNORE_CACHE, + NULL)); completion->wait_for_safe(); ASSERT_EQ(0, completion->get_return_value()); completion->release(); } { - ObjectWriteOperation op; + ObjectReadOperation op; op.cache_evict(); librados::AioCompletion *completion = cluster.aio_create_completion(); ASSERT_EQ(0, cache_ioctx.aio_operate( "fooberdoodle", completion, &op, - librados::OPERATION_IGNORE_CACHE)); + librados::OPERATION_IGNORE_CACHE, NULL)); completion->wait_for_safe(); ASSERT_EQ(-ENOENT, completion->get_return_value()); completion->release(); } { + ObjectReadOperation op; + op.cache_evict(); + librados::AioCompletion *completion = cluster.aio_create_completion(); + ASSERT_EQ(0, cache_ioctx.aio_operate( + "bar", completion, &op, + librados::OPERATION_IGNORE_CACHE, NULL)); + completion->wait_for_safe(); + ASSERT_EQ(-EBUSY, completion->get_return_value()); + completion->release(); + } + + // tear down tiers + ASSERT_EQ(0, cluster.mon_command( + "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + base_pool_name + + "\"}", + inbl, NULL, NULL)); + ASSERT_EQ(0, cluster.mon_command( + "{\"prefix\": \"osd tier remove\", \"pool\": \"" + base_pool_name + + "\", \"tierpool\": \"" + cache_pool_name + "\"}", + inbl, NULL, NULL)); + + base_ioctx.close(); + cache_ioctx.close(); + + cluster.pool_delete(cache_pool_name.c_str()); + ASSERT_EQ(0, destroy_one_pool_pp(base_pool_name, cluster)); +} + +TEST(LibRadosTier, EvictSnap) { + Rados cluster; + std::string base_pool_name = get_temp_pool_name(); + std::string cache_pool_name = base_pool_name + "-cache"; + ASSERT_EQ("", create_one_pool_pp(base_pool_name, cluster)); + ASSERT_EQ(0, cluster.pool_create(cache_pool_name.c_str())); + IoCtx cache_ioctx; + ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx)); + IoCtx base_ioctx; + ASSERT_EQ(0, cluster.ioctx_create(base_pool_name.c_str(), base_ioctx)); + + // create object + { + bufferlist bl; + bl.append("hi there"); + ObjectWriteOperation op; + op.write_full(bl); + ASSERT_EQ(0, base_ioctx.operate("foo", &op)); + } + { + bufferlist bl; + bl.append("hi there"); + ObjectWriteOperation op; + op.write_full(bl); + ASSERT_EQ(0, base_ioctx.operate("bar", &op)); + } + { + bufferlist bl; + bl.append("hi there"); + ObjectWriteOperation op; + op.write_full(bl); + ASSERT_EQ(0, base_ioctx.operate("baz", &op)); + } + { + bufferlist bl; + bl.append("hi there"); + ObjectWriteOperation op; + op.write_full(bl); + ASSERT_EQ(0, base_ioctx.operate("bam", &op)); + } + + // create a snapshot, clone + vector my_snaps(1); + ASSERT_EQ(0, base_ioctx.selfmanaged_snap_create(&my_snaps[0])); + ASSERT_EQ(0, base_ioctx.selfmanaged_snap_set_write_ctx(my_snaps[0], + my_snaps)); + { + bufferlist bl; + bl.append("ciao!"); + ObjectWriteOperation op; + op.write_full(bl); + ASSERT_EQ(0, base_ioctx.operate("foo", &op)); + } + { + bufferlist bl; + bl.append("ciao!"); + ObjectWriteOperation op; + op.write_full(bl); + ASSERT_EQ(0, base_ioctx.operate("bar", &op)); + } + { + ObjectWriteOperation op; + op.remove(); + ASSERT_EQ(0, base_ioctx.operate("baz", &op)); + } + { + bufferlist bl; + bl.append("ciao!"); ObjectWriteOperation op; + op.write_full(bl); + ASSERT_EQ(0, base_ioctx.operate("bam", &op)); + } + + // configure cache + bufferlist inbl; + ASSERT_EQ(0, cluster.mon_command( + "{\"prefix\": \"osd tier add\", \"pool\": \"" + base_pool_name + + "\", \"tierpool\": \"" + cache_pool_name + "\"}", + inbl, NULL, NULL)); + ASSERT_EQ(0, cluster.mon_command( + "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + base_pool_name + + "\", \"overlaypool\": \"" + cache_pool_name + "\"}", + inbl, NULL, NULL)); + ASSERT_EQ(0, cluster.mon_command( + "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name + + "\", \"mode\": \"writeback\"}", + inbl, NULL, NULL)); + + // wait for maps to settle + cluster.wait_for_latest_osdmap(); + + // read, trigger a promote on the head + { + bufferlist bl; + ASSERT_EQ(1, base_ioctx.read("foo", bl, 1, 0)); + ASSERT_EQ('c', bl[0]); + } + { + bufferlist bl; + ASSERT_EQ(1, base_ioctx.read("bam", bl, 1, 0)); + ASSERT_EQ('c', bl[0]); + } + + // evict bam + { + ObjectReadOperation op; op.cache_evict(); librados::AioCompletion *completion = cluster.aio_create_completion(); ASSERT_EQ(0, cache_ioctx.aio_operate( + "bam", completion, &op, + librados::OPERATION_IGNORE_CACHE, NULL)); + completion->wait_for_safe(); + ASSERT_EQ(0, completion->get_return_value()); + completion->release(); + } + { + bufferlist bl; + ObjectReadOperation op; + op.read(1, 0, &bl, NULL); + librados::AioCompletion *completion = cluster.aio_create_completion(); + ASSERT_EQ(0, cache_ioctx.aio_operate( + "bam", completion, &op, + librados::OPERATION_IGNORE_CACHE, NULL)); + completion->wait_for_safe(); + ASSERT_EQ(-ENOENT, completion->get_return_value()); + completion->release(); + } + + // read foo snap + base_ioctx.snap_set_read(my_snaps[0]); + { + bufferlist bl; + ASSERT_EQ(1, base_ioctx.read("foo", bl, 1, 0)); + ASSERT_EQ('h', bl[0]); + } + + // evict foo snap + { + ObjectReadOperation op; + op.cache_evict(); + librados::AioCompletion *completion = cluster.aio_create_completion(); + ASSERT_EQ(0, base_ioctx.aio_operate( + "foo", completion, &op, + librados::OPERATION_IGNORE_CACHE, NULL)); + completion->wait_for_safe(); + ASSERT_EQ(0, completion->get_return_value()); + completion->release(); + } + // snap is gone... + { + bufferlist bl; + ObjectReadOperation op; + op.read(1, 0, &bl, NULL); + librados::AioCompletion *completion = cluster.aio_create_completion(); + ASSERT_EQ(0, base_ioctx.aio_operate( + "foo", completion, &op, + librados::OPERATION_IGNORE_CACHE, NULL)); + completion->wait_for_safe(); + ASSERT_EQ(-ENOENT, completion->get_return_value()); + completion->release(); + } + // head is still there... + base_ioctx.snap_set_read(librados::SNAP_HEAD); + { + bufferlist bl; + ObjectReadOperation op; + op.read(1, 0, &bl, NULL); + librados::AioCompletion *completion = cluster.aio_create_completion(); + ASSERT_EQ(0, base_ioctx.aio_operate( + "foo", completion, &op, + librados::OPERATION_IGNORE_CACHE, NULL)); + completion->wait_for_safe(); + ASSERT_EQ(0, completion->get_return_value()); + completion->release(); + } + + // promote head + snap of bar + base_ioctx.snap_set_read(librados::SNAP_HEAD); + { + bufferlist bl; + ASSERT_EQ(1, base_ioctx.read("bar", bl, 1, 0)); + ASSERT_EQ('c', bl[0]); + } + base_ioctx.snap_set_read(my_snaps[0]); + { + bufferlist bl; + ASSERT_EQ(1, base_ioctx.read("bar", bl, 1, 0)); + ASSERT_EQ('h', bl[0]); + } + + // evict bar head (fail) + base_ioctx.snap_set_read(librados::SNAP_HEAD); + { + ObjectReadOperation op; + op.cache_evict(); + librados::AioCompletion *completion = cluster.aio_create_completion(); + ASSERT_EQ(0, base_ioctx.aio_operate( "bar", completion, &op, - librados::OPERATION_IGNORE_CACHE)); + librados::OPERATION_IGNORE_CACHE, NULL)); completion->wait_for_safe(); ASSERT_EQ(-EBUSY, completion->get_return_value()); completion->release(); } + // evict bar snap + base_ioctx.snap_set_read(my_snaps[0]); + { + ObjectReadOperation op; + op.cache_evict(); + librados::AioCompletion *completion = cluster.aio_create_completion(); + ASSERT_EQ(0, base_ioctx.aio_operate( + "bar", completion, &op, + librados::OPERATION_IGNORE_CACHE, NULL)); + completion->wait_for_safe(); + ASSERT_EQ(0, completion->get_return_value()); + completion->release(); + } + // ...and then head + base_ioctx.snap_set_read(librados::SNAP_HEAD); + { + bufferlist bl; + ObjectReadOperation op; + op.read(1, 0, &bl, NULL); + librados::AioCompletion *completion = cluster.aio_create_completion(); + ASSERT_EQ(0, base_ioctx.aio_operate( + "bar", completion, &op, + librados::OPERATION_IGNORE_CACHE, NULL)); + completion->wait_for_safe(); + ASSERT_EQ(0, completion->get_return_value()); + completion->release(); + } + { + ObjectReadOperation op; + op.cache_evict(); + librados::AioCompletion *completion = cluster.aio_create_completion(); + ASSERT_EQ(0, base_ioctx.aio_operate( + "bar", completion, &op, + librados::OPERATION_IGNORE_CACHE, NULL)); + completion->wait_for_safe(); + ASSERT_EQ(0, completion->get_return_value()); + completion->release(); + } + // tear down tiers ASSERT_EQ(0, cluster.mon_command( "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + base_pool_name + @@ -725,11 +986,11 @@ TEST(LibRadosTier, TryFlush) { // evict it { - ObjectWriteOperation op; + ObjectReadOperation op; op.cache_evict(); librados::AioCompletion *completion = cluster.aio_create_completion(); ASSERT_EQ(0, cache_ioctx.aio_operate( - "foo", completion, &op, librados::OPERATION_IGNORE_CACHE)); + "foo", completion, &op, librados::OPERATION_IGNORE_CACHE, NULL)); completion->wait_for_safe(); ASSERT_EQ(0, completion->get_return_value()); completion->release(); @@ -860,11 +1121,11 @@ TEST(LibRadosTier, Flush) { // evict it { - ObjectWriteOperation op; + ObjectReadOperation op; op.cache_evict(); librados::AioCompletion *completion = cluster.aio_create_completion(); ASSERT_EQ(0, cache_ioctx.aio_operate( - "foo", completion, &op, librados::OPERATION_IGNORE_CACHE)); + "foo", completion, &op, librados::OPERATION_IGNORE_CACHE, NULL)); completion->wait_for_safe(); ASSERT_EQ(0, completion->get_return_value()); completion->release(); @@ -905,11 +1166,11 @@ TEST(LibRadosTier, Flush) { // evict { - ObjectWriteOperation op; + ObjectReadOperation op; op.cache_evict(); librados::AioCompletion *completion = cluster.aio_create_completion(); ASSERT_EQ(0, cache_ioctx.aio_operate( - "foo", completion, &op, librados::OPERATION_IGNORE_CACHE)); + "foo", completion, &op, librados::OPERATION_IGNORE_CACHE, NULL)); completion->wait_for_safe(); ASSERT_EQ(0, completion->get_return_value()); completion->release(); diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h index 62aadc07eb22b..22090a8cbbf9f 100644 --- a/src/test/osd/RadosModel.h +++ b/src/test/osd/RadosModel.h @@ -1840,7 +1840,7 @@ public: class CacheEvictOp : public TestOp { public: librados::AioCompletion *completion; - librados::ObjectWriteOperation op; + librados::ObjectReadOperation op; string oid; CacheEvictOp(int n, @@ -1867,7 +1867,8 @@ public: op.cache_evict(); int r = context->io_ctx.aio_operate(context->prefix+oid, completion, - &op, librados::OPERATION_IGNORE_CACHE); + &op, librados::OPERATION_IGNORE_CACHE, + NULL); assert(!r); } diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc index 07bd94951f169..2b132faa0cabd 100644 --- a/src/tools/rados/rados.cc +++ b/src/tools/rados/rados.cc @@ -1082,14 +1082,15 @@ static int do_cache_try_flush(IoCtx& io_ctx, string oid) static int do_cache_evict(IoCtx& io_ctx, string oid) { - ObjectWriteOperation op; + ObjectReadOperation op; op.cache_evict(); librados::AioCompletion *completion = librados::Rados::aio_create_completion(); io_ctx.aio_operate(oid.c_str(), completion, &op, librados::OPERATION_IGNORE_CACHE | librados::OPERATION_IGNORE_OVERLAY | - librados::OPERATION_SKIPRWLOCKS); + librados::OPERATION_SKIPRWLOCKS, + NULL); completion->wait_for_safe(); int r = completion->get_return_value(); completion->release(); -- 2.39.5