git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/ReplicatedPG: allow cache-evict on snaps
author: Sage Weil <sage@inktank.com>
Sat, 28 Dec 2013 00:11:27 +0000 (16:11 -0800)
committer: Sage Weil <sage@inktank.com>
Tue, 14 Jan 2014 00:19:42 +0000 (16:19 -0800)
We do three things here:

 - make cache-evict a CACHE instead of WR op, allowing us to submit it
   on snaps (not just head)
 - allow eviction of a snap
 - verify that all clones are absent (already evicted) before evicting a head

Signed-off-by: Sage Weil <sage@inktank.com>
src/include/rados.h
src/include/rados/librados.hpp
src/librados/librados.cc
src/osd/ReplicatedPG.cc
src/test/librados/tier.cc
src/test/osd/RadosModel.h
src/tools/rados/rados.cc

index c4aa5035fb368c78e6e9e0eaf449129054cea0f6..894856649c464a1283b774eff6bbc770700c1f89 100644 (file)
@@ -229,7 +229,7 @@ enum {
        CEPH_OSD_OP_ISDIRTY   = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 29,
        CEPH_OSD_OP_COPY_GET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 30,
        CEPH_OSD_OP_CACHE_FLUSH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 31,
-       CEPH_OSD_OP_CACHE_EVICT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 32,
+       CEPH_OSD_OP_CACHE_EVICT = CEPH_OSD_OP_MODE_CACHE | CEPH_OSD_OP_TYPE_DATA | 32,
        CEPH_OSD_OP_CACHE_TRY_FLUSH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 33,
 
        /* convert tmap to omap */
index 5e9eb312fd6aa67849a71e925659f23289b22c85..411c90ff2fc9866ed1d461fda792654145ffcca5 100644 (file)
@@ -328,14 +328,6 @@ namespace librados
      */
     void cache_try_flush();
 
-    /**
-     * evict a clean cache tier object
-     *
-     * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
-     * triggering a promote on the OSD (that is then evicted).
-     */
-    void cache_evict();
-
     friend class IoCtx;
   };
 
@@ -461,6 +453,14 @@ namespace librados
      * @param prval [out] place error code in prval upon completion
      */
     void is_dirty(bool *isdirty, int *prval);
+
+    /**
+     * evict a clean cache tier object
+     *
+     * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+     * triggering a promote on the OSD (that is then evicted).
+     */
+    void cache_evict();
   };
 
   /* IoCtx : This is a context in which we can perform I/O.
index ec3422c3f036fd1b5c7f77e6b57b1f190016aa29..70686be7a930a0a79cfd031b939f2b96988ece5a 100644 (file)
@@ -415,7 +415,7 @@ void librados::ObjectWriteOperation::cache_try_flush()
   o->cache_try_flush();
 }
 
-void librados::ObjectWriteOperation::cache_evict()
+void librados::ObjectReadOperation::cache_evict()
 {
   ::ObjectOperation *o = (::ObjectOperation *)impl;
   o->cache_evict();
index 1061ee44d5ac931bddff67b3b22c1a697e0c2a09..1e5d368697fbe661eaad7710408b8a0d7f0d493d 100644 (file)
@@ -3053,6 +3053,27 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
          result = -EBUSY;
          break;
        }
+       if (soid.snap == CEPH_NOSNAP) {
+         // verify that all clones have been evicted
+         dout(20) << __func__ << " verifying clones are absent "
+                  << ctx->new_snapset << dendl;
+         result = 0;
+         for (vector<snapid_t>::iterator p = ctx->new_snapset.clones.begin();
+              p != ctx->new_snapset.clones.end();
+              ++p) {
+           hobject_t clone_oid = soid;
+           clone_oid.snap = *p;
+           ObjectContextRef clone_obc = get_object_context(clone_oid, false);
+           if (clone_obc && clone_obc->obs.exists) {
+             dout(10) << __func__ << " cannot evict head before clone "
+                      << clone_oid << dendl;
+             result = -EBUSY;
+             break;
+           }
+         }
+         if (result < 0)
+           break;
+       }
        result = _delete_head(ctx, true);
       }
       break;
@@ -4543,7 +4564,8 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
   }
 
   // clone, if necessary
-  make_writeable(ctx);
+  if (soid.snap == CEPH_NOSNAP)
+    make_writeable(ctx);
 
   finish_ctx(ctx,
             ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
@@ -4657,13 +4679,19 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type)
                                    ctx->user_at_version, ctx->reqid,
                                    ctx->mtime));
   if (soid.snap < CEPH_NOSNAP) {
-    dout(20) << __func__ << " encoding snaps " << ctx->new_obs.oi.snaps << dendl;
-    ::encode(ctx->new_obs.oi.snaps, ctx->log.back().snaps);
-
     OSDriver::OSTransaction _t(osdriver.get_transaction(&(ctx->local_t)));
-    set<snapid_t> _snaps(ctx->new_obs.oi.snaps.begin(),
-                        ctx->new_obs.oi.snaps.end());
-    snap_mapper.add_oid(soid, _snaps, &_t);
+    if (log_op_type == pg_log_entry_t::MODIFY ||
+       log_op_type == pg_log_entry_t::PROMOTE) {
+      dout(20) << __func__ << " encoding snaps " << ctx->new_obs.oi.snaps
+              << dendl;
+      ::encode(ctx->new_obs.oi.snaps, ctx->log.back().snaps);
+
+      set<snapid_t> _snaps(ctx->new_obs.oi.snaps.begin(),
+                          ctx->new_obs.oi.snaps.end());
+      snap_mapper.add_oid(soid, _snaps, &_t);
+    } else {
+      snap_mapper.remove_oid(soid, &_t);
+    }
   }
 
   // apply new object state.
index d2e162c89f3ee972f705b7bb4f7048e10addc583..de8323f6855f9a0a33b32bec0a6832c658e02aa2 100644 (file)
@@ -576,38 +576,299 @@ TEST(LibRadosTier, Evict) {
 
   // evict
   {
-    ObjectWriteOperation op;
+    ObjectReadOperation op;
     op.cache_evict();
     librados::AioCompletion *completion = cluster.aio_create_completion();
-    ASSERT_EQ(0, cache_ioctx.aio_operate(
-      "foo", completion, &op, librados::OPERATION_IGNORE_CACHE));
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op,
+                                        librados::OPERATION_IGNORE_CACHE,
+                                        NULL));
     completion->wait_for_safe();
     ASSERT_EQ(0, completion->get_return_value());
     completion->release();
   }
   {
-    ObjectWriteOperation op;
+    ObjectReadOperation op;
     op.cache_evict();
     librados::AioCompletion *completion = cluster.aio_create_completion();
     ASSERT_EQ(0, cache_ioctx.aio_operate(
       "fooberdoodle", completion, &op,
-      librados::OPERATION_IGNORE_CACHE));
+      librados::OPERATION_IGNORE_CACHE, NULL));
     completion->wait_for_safe();
     ASSERT_EQ(-ENOENT, completion->get_return_value());
     completion->release();
   }
   {
+    ObjectReadOperation op;
+    op.cache_evict();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate(
+      "bar", completion, &op,
+      librados::OPERATION_IGNORE_CACHE, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(-EBUSY, completion->get_return_value());
+    completion->release();
+  }
+
+  // tear down tiers
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + base_pool_name +
+    "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + base_pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+
+  base_ioctx.close();
+  cache_ioctx.close();
+
+  cluster.pool_delete(cache_pool_name.c_str());
+  ASSERT_EQ(0, destroy_one_pool_pp(base_pool_name, cluster));
+}
+
+TEST(LibRadosTier, EvictSnap) {
+  Rados cluster;
+  std::string base_pool_name = get_temp_pool_name();
+  std::string cache_pool_name = base_pool_name + "-cache";
+  ASSERT_EQ("", create_one_pool_pp(base_pool_name, cluster));
+  ASSERT_EQ(0, cluster.pool_create(cache_pool_name.c_str()));
+  IoCtx cache_ioctx;
+  ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
+  IoCtx base_ioctx;
+  ASSERT_EQ(0, cluster.ioctx_create(base_pool_name.c_str(), base_ioctx));
+
+  // create object
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, base_ioctx.operate("foo", &op));
+  }
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, base_ioctx.operate("bar", &op));
+  }
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, base_ioctx.operate("baz", &op));
+  }
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, base_ioctx.operate("bam", &op));
+  }
+
+  // create a snapshot, clone
+  vector<uint64_t> my_snaps(1);
+  ASSERT_EQ(0, base_ioctx.selfmanaged_snap_create(&my_snaps[0]));
+  ASSERT_EQ(0, base_ioctx.selfmanaged_snap_set_write_ctx(my_snaps[0],
+                                                        my_snaps));
+  {
+    bufferlist bl;
+    bl.append("ciao!");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, base_ioctx.operate("foo", &op));
+  }
+  {
+    bufferlist bl;
+    bl.append("ciao!");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, base_ioctx.operate("bar", &op));
+  }
+  {
+    ObjectWriteOperation op;
+    op.remove();
+    ASSERT_EQ(0, base_ioctx.operate("baz", &op));
+  }
+  {
+    bufferlist bl;
+    bl.append("ciao!");
     ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, base_ioctx.operate("bam", &op));
+  }
+
+  // configure cache
+  bufferlist inbl;
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + base_pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + base_pool_name +
+    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name +
+    "\", \"mode\": \"writeback\"}",
+    inbl, NULL, NULL));
+
+  // wait for maps to settle
+  cluster.wait_for_latest_osdmap();
+
+  // read, trigger a promote on the head
+  {
+    bufferlist bl;
+    ASSERT_EQ(1, base_ioctx.read("foo", bl, 1, 0));
+    ASSERT_EQ('c', bl[0]);
+  }
+  {
+    bufferlist bl;
+    ASSERT_EQ(1, base_ioctx.read("bam", bl, 1, 0));
+    ASSERT_EQ('c', bl[0]);
+  }
+
+  // evict bam
+  {
+    ObjectReadOperation op;
     op.cache_evict();
     librados::AioCompletion *completion = cluster.aio_create_completion();
     ASSERT_EQ(0, cache_ioctx.aio_operate(
+      "bam", completion, &op,
+      librados::OPERATION_IGNORE_CACHE, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+  {
+    bufferlist bl;
+    ObjectReadOperation op;
+    op.read(1, 0, &bl, NULL);
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate(
+      "bam", completion, &op,
+      librados::OPERATION_IGNORE_CACHE, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(-ENOENT, completion->get_return_value());
+    completion->release();
+  }
+
+  // read foo snap
+  base_ioctx.snap_set_read(my_snaps[0]);
+  {
+    bufferlist bl;
+    ASSERT_EQ(1, base_ioctx.read("foo", bl, 1, 0));
+    ASSERT_EQ('h', bl[0]);
+  }
+
+  // evict foo snap
+  {
+    ObjectReadOperation op;
+    op.cache_evict();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, base_ioctx.aio_operate(
+      "foo", completion, &op,
+      librados::OPERATION_IGNORE_CACHE, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+  // snap is gone...
+  {
+    bufferlist bl;
+    ObjectReadOperation op;
+    op.read(1, 0, &bl, NULL);
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, base_ioctx.aio_operate(
+      "foo", completion, &op,
+      librados::OPERATION_IGNORE_CACHE, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(-ENOENT, completion->get_return_value());
+    completion->release();
+  }
+  // head is still there...
+  base_ioctx.snap_set_read(librados::SNAP_HEAD);
+  {
+    bufferlist bl;
+    ObjectReadOperation op;
+    op.read(1, 0, &bl, NULL);
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, base_ioctx.aio_operate(
+      "foo", completion, &op,
+      librados::OPERATION_IGNORE_CACHE, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // promote head + snap of bar
+  base_ioctx.snap_set_read(librados::SNAP_HEAD);
+  {
+    bufferlist bl;
+    ASSERT_EQ(1, base_ioctx.read("bar", bl, 1, 0));
+    ASSERT_EQ('c', bl[0]);
+  }
+  base_ioctx.snap_set_read(my_snaps[0]);
+  {
+    bufferlist bl;
+    ASSERT_EQ(1, base_ioctx.read("bar", bl, 1, 0));
+    ASSERT_EQ('h', bl[0]);
+  }
+
+  // evict bar head (fail)
+  base_ioctx.snap_set_read(librados::SNAP_HEAD);
+  {
+    ObjectReadOperation op;
+    op.cache_evict();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, base_ioctx.aio_operate(
       "bar", completion, &op,
-      librados::OPERATION_IGNORE_CACHE));
+      librados::OPERATION_IGNORE_CACHE, NULL));
     completion->wait_for_safe();
     ASSERT_EQ(-EBUSY, completion->get_return_value());
     completion->release();
   }
 
+  // evict bar snap
+  base_ioctx.snap_set_read(my_snaps[0]);
+  {
+    ObjectReadOperation op;
+    op.cache_evict();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, base_ioctx.aio_operate(
+      "bar", completion, &op,
+      librados::OPERATION_IGNORE_CACHE, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+  // ...and then head
+  base_ioctx.snap_set_read(librados::SNAP_HEAD);
+  {
+    bufferlist bl;
+    ObjectReadOperation op;
+    op.read(1, 0, &bl, NULL);
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, base_ioctx.aio_operate(
+      "bar", completion, &op,
+      librados::OPERATION_IGNORE_CACHE, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+  {
+    ObjectReadOperation op;
+    op.cache_evict();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, base_ioctx.aio_operate(
+      "bar", completion, &op,
+      librados::OPERATION_IGNORE_CACHE, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
   // tear down tiers
   ASSERT_EQ(0, cluster.mon_command(
     "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + base_pool_name +
@@ -725,11 +986,11 @@ TEST(LibRadosTier, TryFlush) {
 
   // evict it
   {
-    ObjectWriteOperation op;
+    ObjectReadOperation op;
     op.cache_evict();
     librados::AioCompletion *completion = cluster.aio_create_completion();
     ASSERT_EQ(0, cache_ioctx.aio_operate(
-      "foo", completion, &op, librados::OPERATION_IGNORE_CACHE));
+        "foo", completion, &op, librados::OPERATION_IGNORE_CACHE, NULL));
     completion->wait_for_safe();
     ASSERT_EQ(0, completion->get_return_value());
     completion->release();
@@ -860,11 +1121,11 @@ TEST(LibRadosTier, Flush) {
 
   // evict it
   {
-    ObjectWriteOperation op;
+    ObjectReadOperation op;
     op.cache_evict();
     librados::AioCompletion *completion = cluster.aio_create_completion();
     ASSERT_EQ(0, cache_ioctx.aio_operate(
-      "foo", completion, &op, librados::OPERATION_IGNORE_CACHE));
+        "foo", completion, &op, librados::OPERATION_IGNORE_CACHE, NULL));
     completion->wait_for_safe();
     ASSERT_EQ(0, completion->get_return_value());
     completion->release();
@@ -905,11 +1166,11 @@ TEST(LibRadosTier, Flush) {
 
   // evict
   {
-    ObjectWriteOperation op;
+    ObjectReadOperation op;
     op.cache_evict();
     librados::AioCompletion *completion = cluster.aio_create_completion();
     ASSERT_EQ(0, cache_ioctx.aio_operate(
-      "foo", completion, &op, librados::OPERATION_IGNORE_CACHE));
+        "foo", completion, &op, librados::OPERATION_IGNORE_CACHE, NULL));
     completion->wait_for_safe();
     ASSERT_EQ(0, completion->get_return_value());
     completion->release();
index 62aadc07eb22bb2746e617f7c917c5e9b56b51b3..22090a8cbbf9f7428671acfa50480d409e4d9b9d 100644 (file)
@@ -1840,7 +1840,7 @@ public:
 class CacheEvictOp : public TestOp {
 public:
   librados::AioCompletion *completion;
-  librados::ObjectWriteOperation op;
+  librados::ObjectReadOperation op;
   string oid;
 
   CacheEvictOp(int n,
@@ -1867,7 +1867,8 @@ public:
 
     op.cache_evict();
     int r = context->io_ctx.aio_operate(context->prefix+oid, completion,
-                                       &op, librados::OPERATION_IGNORE_CACHE);
+                                       &op, librados::OPERATION_IGNORE_CACHE,
+                                       NULL);
     assert(!r);
   }
 
index 07bd94951f16950a06b4b3e10cf1c36dea17bea7..2b132faa0cabd33a0f8af241244d86aa0dc32f09 100644 (file)
@@ -1082,14 +1082,15 @@ static int do_cache_try_flush(IoCtx& io_ctx, string oid)
 
 static int do_cache_evict(IoCtx& io_ctx, string oid)
 {
-  ObjectWriteOperation op;
+  ObjectReadOperation op;
   op.cache_evict();
   librados::AioCompletion *completion =
     librados::Rados::aio_create_completion();
   io_ctx.aio_operate(oid.c_str(), completion, &op,
                     librados::OPERATION_IGNORE_CACHE |
                     librados::OPERATION_IGNORE_OVERLAY |
-                    librados::OPERATION_SKIPRWLOCKS);
+                    librados::OPERATION_SKIPRWLOCKS,
+                    NULL);
   completion->wait_for_safe();
   int r = completion->get_return_value();
   completion->release();