git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd: add flag to force thick-provisioning during write_zeroes API
author Jason Dillaman <dillaman@redhat.com>
Tue, 30 Jun 2020 20:59:47 +0000 (16:59 -0400)
committer Jason Dillaman <dillaman@redhat.com>
Sun, 5 Jul 2020 22:55:17 +0000 (18:55 -0400)
Signed-off-by: Jason Dillaman <dillaman@redhat.com>
src/include/rbd/librbd.h
src/librbd/api/Io.cc
src/pybind/rbd/rbd.pyx
src/test/librbd/test_librbd.cc
src/test/pybind/test_rbd.py

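diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
index 7113a0e2970648d9fea083fb2165020ddf993fef..fce85e6e5a0982d7617811e2748e1e4220cffb88 100644
--- a/src/include/rbd/librbd.h
+++ b/src/include/rbd/librbd.h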
@@ -366,6 +366,11 @@ typedef enum {
   RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS
 } rbd_pool_stat_option_t;
 
+/* flags for rbd_write_zeroes / rbd_aio_write_zeroes (bitmask) */
+enum {
+  RBD_WRITE_ZEROES_FLAG_THICK_PROVISION = (1U<<0), /* write a fully allocated zeroed extent */
+};
+
 CEPH_RBD_API void rbd_image_options_create(rbd_image_options_t* opts);
 CEPH_RBD_API void rbd_image_options_destroy(rbd_image_options_t opts);
 CEPH_RBD_API int rbd_image_options_set_string(rbd_image_options_t opts,
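diff --git a/src/librbd/api/Io.cc b/src/librbd/api/Io.cc
index 4d97b9d8a8cfb10a7ae3a0050e100f553c236f3b..76d43c4a344f46d08b79b3852d2b0fc995776f7d 100644
--- a/src/librbd/api/Io.cc
+++ b/src/librbd/api/Io.cc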
@@ -2,6 +2,7 @@
 // vim: ts=8 sw=2 smarttab
 
 #include "librbd/api/Io.h"
+#include "include/intarith.h"
 #include "common/dout.h"
 #include "common/errno.h"
 #include "common/Cond.h"
@@ -341,7 +342,13 @@ void Io<I>::aio_write_zeroes(I& image_ctx, io::AioCompletion *aio_comp,
     trace.event("init");
   }
 
-  aio_comp->init_time(util::get_image_ctx(&image_ctx), io::AIO_TYPE_DISCARD);
+  auto io_type = io::AIO_TYPE_DISCARD;
+  if ((zero_flags & RBD_WRITE_ZEROES_FLAG_THICK_PROVISION) != 0) {
+    zero_flags &= ~RBD_WRITE_ZEROES_FLAG_THICK_PROVISION;
+    io_type = io::AIO_TYPE_WRITESAME;
+  }
+
+  aio_comp->init_time(util::get_image_ctx(&image_ctx), io_type);
   ldout(cct, 20) << "ictx=" << &image_ctx << ", "
                  << "completion=" << aio_comp << ", off=" << off << ", "
                  << "len=" << len << dendl;
@@ -360,6 +367,108 @@ void Io<I>::aio_write_zeroes(I& image_ctx, io::AioCompletion *aio_comp,
     return;
   }
 
+  if (io_type == io::AIO_TYPE_WRITESAME) {
+    // write-same needs to be aligned to its buffer but librbd has never forced
+    // block alignment. Hide that requirement from the user by adding optional
+    // writes.
+    const uint64_t data_length = 512;
+    uint64_t write_same_offset = p2roundup(off, data_length);
+    uint64_t write_same_offset_end = p2align(off + len, data_length);
+    uint64_t write_same_length = 0;
+    if (write_same_offset_end > write_same_offset) {
+      write_same_length = write_same_offset_end - write_same_offset;
+    }
+
+    uint64_t prepend_offset = off;
+    uint64_t prepend_length = write_same_offset - off;
+    uint64_t append_offset = write_same_offset + write_same_length;
+    uint64_t append_length = len - prepend_length - write_same_length;
+    ldout(cct, 20) << "prepend_offset=" << prepend_offset << ", "
+                   << "prepend_length=" << prepend_length << ", "
+                   << "write_same_offset=" << write_same_offset << ", "
+                   << "write_same_length=" << write_same_length << ", "
+                   << "append_offset=" << append_offset << ", "
+                   << "append_length=" << append_length << dendl;
+    ceph_assert(prepend_length + write_same_length + append_length == len);
+
+    if (write_same_length <= data_length) {
+      // unaligned or small write-zeroes request -- use single write
+      bufferlist bl;
+      bl.append_zero(len);
+
+      aio_comp->aio_type = io::AIO_TYPE_WRITE;
+      auto req = io::ImageDispatchSpec<I>::create_write(
+        image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, {{off, len}},
+        std::move(bl), op_flags, trace, 0);
+      req->send();
+      return;
+    } else if (prepend_length == 0 && append_length == 0) {
+      // fully aligned -- use a single write-same image request
+      bufferlist bl;
+      bl.append_zero(data_length);
+
+      auto req = io::ImageDispatchSpec<I>::create_write_same(
+        image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, off, len,
+        std::move(bl), op_flags, trace, 0);
+      req->send();
+      return;
+    }
+
+    // to reach this point, we need at least one prepend/append write along with
+    // a write-same -- therefore we will need to wrap the provided AioCompletion
+    auto request_count = 1;
+    if (prepend_length > 0) {
+      ++request_count;
+    }
+    if (append_length > 0) {
+      ++request_count;
+    }
+
+    ceph_assert(request_count > 1);
+    aio_comp->start_op();
+    aio_comp->set_request_count(request_count);
+
+    if (prepend_length > 0) {
+      bufferlist bl;
+      bl.append_zero(prepend_length);
+
+      Context* prepend_ctx = new io::C_AioRequest(aio_comp);
+      auto prepend_aio_comp = io::AioCompletion::create_and_start(
+        prepend_ctx, &image_ctx, io::AIO_TYPE_WRITE);
+      auto prepend_req = io::ImageDispatchSpec<I>::create_write(
+        image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, prepend_aio_comp,
+        {{prepend_offset, prepend_length}}, std::move(bl), op_flags, trace,
+        0);
+      prepend_req->send();
+    }
+
+    if (append_length > 0) {
+      bufferlist bl;
+      bl.append_zero(append_length);
+
+      Context* append_ctx = new io::C_AioRequest(aio_comp);
+      auto append_aio_comp = io::AioCompletion::create_and_start(
+        append_ctx, &image_ctx, io::AIO_TYPE_WRITE);
+      auto append_req = io::ImageDispatchSpec<I>::create_write(
+        image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, append_aio_comp,
+        {{append_offset, append_length}}, std::move(bl), op_flags, trace, 0);
+      append_req->send();
+    }
+
+    bufferlist bl;
+    bl.append_zero(data_length);
+
+    Context* write_same_ctx = new io::C_AioRequest(aio_comp);
+    auto write_same_aio_comp = io::AioCompletion::create_and_start(
+      write_same_ctx, &image_ctx, io::AIO_TYPE_WRITESAME);
+    auto req = io::ImageDispatchSpec<I>::create_write_same(
+      image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, write_same_aio_comp,
+      write_same_offset, write_same_length, std::move(bl), op_flags, trace,
+      0);
+    req->send();
+    return;
+  }
+
   // enable partial discard (zeroing) of objects
   uint32_t discard_granularity_bytes = 0;
 
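diff --git a/src/pybind/rbd/rbd.pyx b/src/pybind/rbd/rbd.pyx
index 8e081e3a4b5dca6159084ec8e788f10d54aa93eb..fe1ec95e99c7260b31f9675e9d9983a99023c1a2 100644
--- a/src/pybind/rbd/rbd.pyx
+++ b/src/pybind/rbd/rbd.pyx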
@@ -105,6 +105,8 @@ cdef extern from "rbd/librbd.h" nogil:
         _RBD_SNAP_REMOVE_FLATTEN "RBD_SNAP_REMOVE_FLATTEN"
         _RBD_SNAP_REMOVE_FORCE "RBD_SNAP_REMOVE_FORCE"
 
+        _RBD_WRITE_ZEROES_FLAG_THICK_PROVISION "RBD_WRITE_ZEROES_FLAG_THICK_PROVISION"
+
     ctypedef void* rados_t
     ctypedef void* rados_ioctx_t
     ctypedef void* rbd_image_t
@@ -816,6 +818,8 @@ RBD_SNAP_REMOVE_UNPROTECT = _RBD_SNAP_REMOVE_UNPROTECT
 RBD_SNAP_REMOVE_FLATTEN = _RBD_SNAP_REMOVE_FLATTEN
 RBD_SNAP_REMOVE_FORCE = _RBD_SNAP_REMOVE_FORCE
 
+RBD_WRITE_ZEROES_FLAG_THICK_PROVISION = _RBD_WRITE_ZEROES_FLAG_THICK_PROVISION
+
 class Error(Exception):
     pass
 
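diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index 190016ef3ced08e2389c35e77693d7bfeaaa77ae..a69662efa974b5656edb8f12d929d3992177684b 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc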
@@ -8572,6 +8572,98 @@ TEST_F(TestLibRBD, WriteZeroes) {
   ASSERT_EQ(0, image.close());
 }
 
+// Verifies write_zeroes() with RBD_WRITE_ZEROES_FLAG_THICK_PROVISION: after
+// each request the interval set filled in through diff_iterate2() must match
+// the expected extent, and the whole image must still read back as zeroes.
+TEST_F(TestLibRBD, WriteZeroesThickProvision) {
+  librbd::RBD rbd;
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+  std::string name = get_temp_image_name();
+  int order = 0;
+  uint64_t size = 2 << 20;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  librbd::Image image;
+  ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+  // nothing has been written yet, so no extents are expected
+  interval_set<uint64_t> diff;
+  ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+                                   iterate_cb, (void *)&diff));
+  auto expected_diff = interval_set<uint64_t>{{}};
+  ASSERT_EQ(expected_diff, diff);
+
+  // NOTE(review): per the alignment logic in Io<I>::aio_write_zeroes
+  // (data_length = 512; a single plain write whenever write_same_length <=
+  // data_length), only the final 1024-byte request below reaches a write-same
+  // request. The 640- and 768-byte requests are issued as single writes, so
+  // the split prepend/append + write-same path is not exercised here.
+
+  // writes unaligned zeroes as a prepend
+  ASSERT_EQ(128, image.write_zeroes(
+              0, 128, RBD_WRITE_ZEROES_FLAG_THICK_PROVISION, 0));
+  diff.clear();
+  ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+                                   iterate_cb, (void *)&diff));
+  expected_diff = interval_set<uint64_t>{{{0, 128}}};
+  ASSERT_EQ(expected_diff, diff);
+
+  ASSERT_EQ(512, image.write_zeroes(
+              384, 512, RBD_WRITE_ZEROES_FLAG_THICK_PROVISION, 0));
+  diff.clear();
+  ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+                                   iterate_cb, (void *)&diff));
+  expected_diff = interval_set<uint64_t>{{{0, 896}}};
+  ASSERT_EQ(expected_diff, diff);
+
+  // prepend with write-same
+  ASSERT_EQ(640, image.write_zeroes(
+              896, 640, RBD_WRITE_ZEROES_FLAG_THICK_PROVISION, 0));
+  diff.clear();
+  ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+                                   iterate_cb, (void *)&diff));
+  expected_diff = interval_set<uint64_t>{{{0, 1536}}};
+  ASSERT_EQ(expected_diff, diff);
+
+  // write-same with append
+  ASSERT_EQ(640, image.write_zeroes(
+              1536, 640, RBD_WRITE_ZEROES_FLAG_THICK_PROVISION, 0));
+  diff.clear();
+  ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+                                   iterate_cb, (void *)&diff));
+  expected_diff = interval_set<uint64_t>{{{0, 2176}}};
+  ASSERT_EQ(expected_diff, diff);
+
+  // prepend + write-same + append
+  ASSERT_EQ(768, image.write_zeroes(
+              2176, 768, RBD_WRITE_ZEROES_FLAG_THICK_PROVISION, 0));
+  diff.clear();
+  ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+                                   iterate_cb, (void *)&diff));
+  expected_diff = interval_set<uint64_t>{{{0, 2944}}};
+  ASSERT_EQ(expected_diff, diff);
+
+  // write-same
+  ASSERT_EQ(1024, image.write_zeroes(
+              3072, 1024, RBD_WRITE_ZEROES_FLAG_THICK_PROVISION, 0));
+  diff.clear();
+  ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+                                   iterate_cb, (void *)&diff));
+  expected_diff = interval_set<uint64_t>{{{0, 4096}}};
+  ASSERT_EQ(expected_diff, diff);
+
+  // the entire image, written or not, must read back as zeroes
+  bufferlist expected_bl;
+  expected_bl.append_zero(size);
+
+  bufferlist read_bl;
+  EXPECT_EQ(size, image.read(0, size, read_bl));
+  EXPECT_EQ(expected_bl, read_bl);
+
+  ASSERT_EQ(0, image.close());
+}
+
 // poorman's ceph_assert()
 namespace ceph {
   void __ceph_assert_fail(const char *assertion, const char *file, int line,
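diff --git a/src/test/pybind/test_rbd.py b/src/test/pybind/test_rbd.py
index b272748173b1a6aefae0e6c04d7f7113cab274ad..09b47e5660f6edf47748a47f0bc845c3dd07243c 100644
--- a/src/test/pybind/test_rbd.py
+++ b/src/test/pybind/test_rbd.py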
@@ -40,7 +40,8 @@ from rbd import (RBD, Group, Image, ImageNotFound, InvalidArgument, ImageExists,
                  RBD_SNAP_REMOVE_UNPROTECT, RBD_SNAP_MIRROR_STATE_PRIMARY,
                  RBD_SNAP_MIRROR_STATE_PRIMARY_DEMOTED,
                  RBD_SNAP_CREATE_SKIP_QUIESCE,
-                 RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR)
+                 RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR,
+                 RBD_WRITE_ZEROES_FLAG_THICK_PROVISION)
 
 rados = None
 ioctx = None
@@ -608,6 +609,17 @@ class TestImage(object):
         self.image.write(data, 0)
         self.image.write_zeroes(0, 256)
         eq(self.image.read(256, 256), b'\0' * 256)
+        check_diff(self.image, 0, IMG_SIZE, None, [])
+
+    def test_write_zeroes_thick_provision(self):
+        # Overwrite random data with thick-provisioned zeroes, then check that
+        # the overwritten range itself reads back as zeroes and that
+        # check_diff() reports the single entry (0, 256, True).
+        data = rand_data(256)
+        self.image.write(data, 0)
+        self.image.write_zeroes(0, 256, RBD_WRITE_ZEROES_FLAG_THICK_PROVISION)
+        eq(self.image.read(0, 256), b'\0' * 256)
+        check_diff(self.image, 0, IMG_SIZE, None, [(0, 256, True)])
 
     def test_read(self):
         data = self.image.read(0, 20)