git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd: add 'write_zeroes' public C/C++ API methods
author: Jason Dillaman <dillaman@redhat.com>
Tue, 30 Jun 2020 12:44:38 +0000 (08:44 -0400)
committer: Jason Dillaman <dillaman@redhat.com>
Wed, 22 Jul 2020 18:19:56 +0000 (14:19 -0400)
Unlike the existing 'discard' option which is more of a hint to
attempt to release space, the new 'write_zeroes' APIs will ensure
that the entire provided extent is fully zeroed.

Signed-off-by: Jason Dillaman <dillaman@redhat.com>
(cherry picked from commit ae6dd86b22e928dc23a385faf41cf76b0d293576)

Conflicts:
src/librbd/api/Io.h/cc: logic exists in ImageRequestWQ files
src/librbd/librbd.cc: trivial resolution due to missing api::Io
src/test/librbd/test_librbd.cc: trivial resolution

src/include/rbd/librbd.h
src/include/rbd/librbd.hpp
src/librbd/io/ImageRequestWQ.cc
src/librbd/io/ImageRequestWQ.h
src/librbd/librbd.cc
src/test/librbd/test_librbd.cc

index d85cd36504b66def6c2599dbf1eae7f450f9ab6e..419d3232f07ed40e151049237d939334061c852b 100644 (file)
@@ -47,6 +47,7 @@ extern "C" {
 #define LIBRBD_SUPPORTS_IOVEC 1
 #define LIBRBD_SUPPORTS_WATCH 0
 #define LIBRBD_SUPPORTS_WRITESAME 1
+#define LIBRBD_SUPPORTS_WRITE_ZEROES 1
 
 #if __GNUC__ >= 4
   #define CEPH_RBD_API          __attribute__ ((visibility ("default")))
@@ -1094,10 +1095,15 @@ CEPH_RBD_API ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len,
                                 const char *buf, int op_flags);
 CEPH_RBD_API int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len);
 CEPH_RBD_API ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
-                                   const char *buf, size_t data_len, int op_flags);
+                                   const char *buf, size_t data_len,
+                                   int op_flags);
+CEPH_RBD_API ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs,
+                                      size_t len, int zero_flags, /* size_t: same type as the definition's len */
+                                      int op_flags);
 CEPH_RBD_API ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs,
                                            size_t len, const char *cmp_buf,
-                                           const char *buf, uint64_t *mismatch_off,
+                                           const char *buf,
+                                           uint64_t *mismatch_off,
                                            int op_flags);
 
 CEPH_RBD_API int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
@@ -1125,10 +1131,15 @@ CEPH_RBD_API int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
 CEPH_RBD_API int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
                                    const char *buf, size_t data_len,
                                    rbd_completion_t c, int op_flags);
+CEPH_RBD_API int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off,
+                                      size_t len, rbd_completion_t c,
+                                      int zero_flags, int op_flags);
 CEPH_RBD_API ssize_t rbd_aio_compare_and_write(rbd_image_t image,
                                                uint64_t off, size_t len,
-                                               const char *cmp_buf, const char *buf,
-                                               rbd_completion_t c, uint64_t *mismatch_off,
+                                               const char *cmp_buf,
+                                               const char *buf,
+                                               rbd_completion_t c,
+                                               uint64_t *mismatch_off,
                                                int op_flags);
 
 CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg,
index ca2d7449baf39fa3a0d34761967e9ac720a72391..1bb74ce72f2e9e13f3800306143fc47285de78db 100644 (file)
@@ -664,8 +664,11 @@ public:
   ssize_t write(uint64_t ofs, size_t len, ceph::bufferlist& bl);
   /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
   ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);
+
   int discard(uint64_t ofs, uint64_t len);
   ssize_t writesame(uint64_t ofs, size_t len, ceph::bufferlist &bl, int op_flags);
+  ssize_t write_zeroes(uint64_t ofs, size_t len, int zero_flags, int op_flags);
+
   ssize_t compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl,
                             ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags);
 
@@ -673,11 +676,17 @@ public:
   /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
   int aio_write2(uint64_t off, size_t len, ceph::bufferlist& bl,
                  RBD::AioCompletion *c, int op_flags);
+
+  int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c);
   int aio_writesame(uint64_t off, size_t len, ceph::bufferlist& bl,
                     RBD::AioCompletion *c, int op_flags);
+  int aio_write_zeroes(uint64_t ofs, size_t len, RBD::AioCompletion *c,
+                       int zero_flags, int op_flags);
+
   int aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl,
                             ceph::bufferlist& bl, RBD::AioCompletion *c,
                             uint64_t *mismatch_off, int op_flags);
+
   /**
    * read async from image
    *
@@ -699,7 +708,6 @@ public:
   /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
   int aio_read2(uint64_t off, size_t len, ceph::bufferlist& bl,
                  RBD::AioCompletion *c, int op_flags);
-  int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c);
 
   int flush();
   /**
index 2d21418428196da2d04046c8fa9996bfcd447b04..94dd982180f27489f887560edc35547db250b9eb 100644 (file)
@@ -212,6 +212,32 @@ ssize_t ImageRequestWQ<I>::writesame(uint64_t off, uint64_t len,
   return len;
 }
 
+template <typename I>  // blocking write_zeroes: clip the range, issue the aio variant, wait for its result
+ssize_t ImageRequestWQ<I>::write_zeroes(uint64_t off, uint64_t len,
+                                        int zero_flags, int op_flags) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
+                 << "len = " << len << dendl;
+  // clip_io runs with image_lock held shared; len is passed by pointer, presumably so it can be adjusted
+  m_image_ctx.image_lock.lock_shared();
+  int r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
+  m_image_ctx.image_lock.unlock_shared();
+  if (r < 0) {
+    lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
+    return r;  // negative result from clip_io is returned unchanged
+  }
+  // run the async implementation against a completion backed by a local C_SaferCond
+  C_SaferCond ctx;
+  auto aio_comp = io::AioCompletion::create(&ctx);
+  aio_write_zeroes(aio_comp, off, len, zero_flags, op_flags, false);  // native_async=false
+
+  r = ctx.wait();  // result code of the request
+  if (r < 0) {
+    return r;
+  }
+  return len;  // on success, the length as it stands after clip_io
+}
+
 template <typename I>
 ssize_t ImageRequestWQ<I>::compare_and_write(uint64_t off, uint64_t len,
                                              bufferlist &&cmp_bl,
@@ -486,6 +512,62 @@ void ImageRequestWQ<I>::aio_writesame(AioCompletion *c, uint64_t off,
   trace.event("finish");
 }
 
+// Async write_zeroes: builds a discard request with granularity 0 for [off, off+len) and queues or processes it.
+template <typename I>
+void ImageRequestWQ<I>::aio_write_zeroes(io::AioCompletion *aio_comp,
+                                         uint64_t off, uint64_t len,
+                                         int zero_flags, int op_flags,  // NOTE(review): op_flags is not referenced in this body
+                                         bool native_async) {
+  auto cct = m_image_ctx.cct;
+  FUNCTRACE(cct);
+  ZTracer::Trace trace;
+  if (m_image_ctx.blkin_trace_all) {  // tracing is only initialised when blkin_trace_all is set
+    trace.init("io: write_zeroes", &m_image_ctx.trace_endpoint);
+    trace.event("init");
+  }
+
+  aio_comp->init_time(util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_DISCARD);  // completion is typed as a discard
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
+                 << "completion=" << aio_comp << ", off=" << off << ", "
+                 << "len=" << len << dendl;
+
+  if (native_async && m_image_ctx.event_socket.is_valid()) {
+    aio_comp->set_event_notify(true);
+  }
+
+  // validate the supported flags: any non-zero zero_flags fails the completion with -EINVAL
+  if (zero_flags != 0U) {
+    aio_comp->fail(-EINVAL);
+    return;
+  }
+
+  if (!start_in_flight_io(aio_comp)) {
+    return;  // nothing more is done here when start_in_flight_io reports false
+  }
+
+  // enable partial discard (zeroing) of objects
+  uint32_t discard_granularity_bytes = 0;
+
+  auto tid = ++m_last_tid;
+
+  {
+    std::lock_guard locker{m_lock};  // the tid is recorded while holding m_lock
+    m_queued_or_blocked_io_tids.insert(tid);
+  }
+
+  auto req = ImageDispatchSpec<I>::create_discard_request(
+    m_image_ctx, aio_comp, off, len, discard_granularity_bytes, trace, tid);
+
+  std::shared_lock owner_locker{m_image_ctx.owner_lock};  // held until the end of the function
+  if (m_image_ctx.non_blocking_aio || writes_blocked()) {
+    queue(req);  // defer the request to the work queue
+  } else {
+    process_io(req, false);  // otherwise handle it inline from this call
+    finish_in_flight_io();
+  }
+  trace.event("finish");
+}
+
 template <typename I>
 void ImageRequestWQ<I>::aio_compare_and_write(AioCompletion *c,
                                               uint64_t off, uint64_t len,
index ecbf33f3d20a8794381cd397498134a7f7d555a2..cef18124a61a42a24851da5458a3f9e1edd16092 100644 (file)
@@ -38,6 +38,8 @@ public:
   ssize_t discard(uint64_t off, uint64_t len,
                   uint32_t discard_granularity_bytes);
   ssize_t writesame(uint64_t off, uint64_t len, bufferlist &&bl, int op_flags);
+  ssize_t write_zeroes(uint64_t off, uint64_t len, int zero_flags,
+                       int op_flags);
   ssize_t compare_and_write(uint64_t off, uint64_t len,
                             bufferlist &&cmp_bl, bufferlist &&bl,
                             uint64_t *mismatch_off, int op_flags);
@@ -52,6 +54,8 @@ public:
   void aio_flush(AioCompletion *c, bool native_async=true);
   void aio_writesame(AioCompletion *c, uint64_t off, uint64_t len,
                      bufferlist &&bl, int op_flags, bool native_async=true);
+  void aio_write_zeroes(AioCompletion *c, uint64_t off, uint64_t len,
+                        int zero_flags, int op_flags, bool native_async);
   void aio_compare_and_write(AioCompletion *c, uint64_t off,
                              uint64_t len, bufferlist &&cmp_bl,
                              bufferlist &&bl, uint64_t *mismatch_off,
index ca1009e0ee54db351be19b174041bb9f4b09f576..9a3cc3e67d1aaa7c3cc86dd57a0be5a1de1004e9 100644 (file)
@@ -2540,8 +2540,8 @@ namespace librbd {
     }
 
     bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
-    if (discard_zero && mem_is_zero(bl.c_str(), bl.length())) {
-      int r = ictx->io_work_queue->discard(ofs, len, 0);
+    if (discard_zero && bl.is_zero()) {
+      int r = ictx->io_work_queue->write_zeroes(ofs, len, 0U, op_flags);
       tracepoint(librbd, writesame_exit, r);
       return r;
     }
@@ -2551,6 +2551,13 @@ namespace librbd {
     return r;
   }
 
+  ssize_t Image::write_zeroes(uint64_t ofs, size_t len, int zero_flags,
+                              int op_flags)
+  {  // C++ API wrapper: forwards every argument to io_work_queue->write_zeroes and returns its result
+    ImageCtx *ictx = (ImageCtx *)ctx;  // ctx is the handle stored in this Image
+    return ictx->io_work_queue->write_zeroes(ofs, len, zero_flags, op_flags);
+  }
+
   ssize_t Image::compare_and_write(uint64_t ofs, size_t len,
                                    ceph::bufferlist &cmp_bl, ceph::bufferlist& bl,
                                    uint64_t *mismatch_off, int op_flags)
@@ -2678,8 +2685,9 @@ namespace librbd {
     }
 
     bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
-    if (discard_zero && mem_is_zero(bl.c_str(), bl.length())) {
-      ictx->io_work_queue->aio_discard(get_aio_completion(c), off, len, 0);
+    if (discard_zero && bl.is_zero()) {
+      ictx->io_work_queue->aio_write_zeroes(get_aio_completion(c), off, len, 0U,
+                                            op_flags, true);
       tracepoint(librbd, aio_writesame_exit, 0);
       return 0;
     }
@@ -2690,6 +2698,15 @@ namespace librbd {
     return 0;
   }
 
+  int Image::aio_write_zeroes(uint64_t off, size_t len, RBD::AioCompletion *c,
+                              int zero_flags, int op_flags)
+  {  // C++ API async wrapper: hands the request to io_work_queue with native_async=true
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    ictx->io_work_queue->aio_write_zeroes(
+      get_aio_completion(c), off, len, zero_flags, op_flags, true);
+    return 0;  // the queue call returns void, so this always returns 0
+  }
+
   int Image::aio_compare_and_write(uint64_t off, size_t len,
                                    ceph::bufferlist& cmp_bl, ceph::bufferlist& bl,
                                    RBD::AioCompletion *c, uint64_t *mismatch_off,
@@ -5861,7 +5878,7 @@ extern "C" ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
 
   bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
   if (discard_zero && mem_is_zero(buf, data_len)) {
-    int r = ictx->io_work_queue->discard(ofs, len, 0);
+    int r = ictx->io_work_queue->write_zeroes(ofs, len, 0, op_flags);
     tracepoint(librbd, writesame_exit, r);
     return r;
   }
@@ -5873,6 +5890,13 @@ extern "C" ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
   return r;
 }
 
+extern "C" ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs, size_t len,
+                                    int zero_flags, int op_flags)
+{  /* C API wrapper: forwards every argument to io_work_queue->write_zeroes and returns its result */
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;  /* image is cast to the ImageCtx it is used as */
+  return ictx->io_work_queue->write_zeroes(ofs, len, zero_flags, op_flags);
+}
+
 extern "C" ssize_t rbd_compare_and_write(rbd_image_t image,
                                          uint64_t ofs, size_t len,
                                          const char *cmp_buf,
@@ -6085,7 +6109,8 @@ extern "C" int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
 
   bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
   if (discard_zero && mem_is_zero(buf, data_len)) {
-    ictx->io_work_queue->aio_discard(get_aio_completion(comp), off, len, 0);
+    ictx->io_work_queue->aio_write_zeroes(get_aio_completion(comp), off, len, 0,
+                                          op_flags, true);
     tracepoint(librbd, aio_writesame_exit, 0);
     return 0;
   }
@@ -6099,6 +6124,18 @@ extern "C" int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
   return 0;
 }
 
+extern "C" int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off, size_t len,
+                                    rbd_completion_t c, int zero_flags,
+                                    int op_flags)
+{  /* C API async wrapper: hands the request to io_work_queue with native_async=true */
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;  /* c is cast to the C++ completion type */
+
+  ictx->io_work_queue->aio_write_zeroes(
+    get_aio_completion(comp), off, len, zero_flags, op_flags, true);
+  return 0;  /* the queue call returns void, so this always returns 0 */
+}
+
 extern "C" ssize_t rbd_aio_compare_and_write(rbd_image_t image, uint64_t off,
                                              size_t len, const char *cmp_buf,
                                              const char *buf, rbd_completion_t c,
index 2c83af9839b4cef897bdb3fde054af9ef9331502..d5943cc369f4af11d00410aba047deb31724c78d 100644 (file)
@@ -8175,6 +8175,65 @@ TEST_F(TestLibRBD, SnapRemoveWithChildMissing)
   rados_ioctx_destroy(ioctx1);
 }
 
+TEST_F(TestLibRBD, WriteZeroes) {  // checks Image::write_zeroes through diff_iterate2 and a full read-back
+  librbd::RBD rbd;
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+  std::string name = get_temp_image_name();
+  int order = 0;
+  uint64_t size = 2 << 20;  // 2 MiB image
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  librbd::Image image;
+  ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+  // 1s from [0, 256) / length 256
+  char data[256];
+  memset(data, 1, sizeof(data));
+  bufferlist bl;
+  bl.append(data, 256);
+  ASSERT_EQ(256, image.write(0, 256, bl));
+
+  interval_set<uint64_t> diff;
+  ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+                                   iterate_cb, (void *)&diff));
+  auto expected_diff = interval_set<uint64_t>{{{0, 256}}};
+  ASSERT_EQ(expected_diff, diff);
+
+  // write zeroes from offset 192 to the end of the image, past the end of the written extent.
+  // Now 1s from [0, 192) / length 192
+  ASSERT_EQ(size - 192,
+            image.write_zeroes(192, size - 192, 0U, 0));
+  diff.clear();
+  ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+                                   iterate_cb, (void *)&diff));
+  expected_diff = interval_set<uint64_t>{{{0, 192}}};
+  ASSERT_EQ(expected_diff, diff);
+
+  // zero the first 64 bytes of the existing extent
+  // Now 1s from [64, 192) / length 128
+  ASSERT_EQ(64, image.write_zeroes(0, 64, 0U, 0));
+
+  diff.clear();
+  ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+                                   iterate_cb, (void *)&diff));
+  expected_diff = interval_set<uint64_t>{{{0, 192}}};  // the expected diff is unchanged by zeroing the head
+  ASSERT_EQ(expected_diff, diff);
+  // expected content: 64 zero bytes, 128 bytes of 1s, then zeroes up to the image size
+  bufferlist expected_bl;
+  expected_bl.append_zero(64);
+  bufferlist sub_bl;
+  sub_bl.substr_of(bl, 0, 128);
+  expected_bl.claim_append(sub_bl);
+  expected_bl.append_zero(size - 192);
+
+  bufferlist read_bl;
+  EXPECT_EQ(size, image.read(0, size, read_bl));
+  EXPECT_EQ(expected_bl, read_bl);
+
+  ASSERT_EQ(0, image.close());
+}
+
 // poorman's ceph_assert()
 namespace ceph {
   void __ceph_assert_fail(const char *assertion, const char *file, int line,