From: Jason Dillaman Date: Tue, 30 Jun 2020 12:44:38 +0000 (-0400) Subject: librbd: add 'write_zeroes' public C/C++ API methods X-Git-Tag: v15.2.5~100^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0e3add97ff1f919d81a1c644b150672cbd691c84;p=ceph.git librbd: add 'write_zeroes' public C/C++ API methods Unlike the existing 'discard' option which is more of a hint to attempt to release space, the new 'write_zeroes' APIs will ensure that the entire provided extent is fully zeroed. Signed-off-by: Jason Dillaman (cherry picked from commit ae6dd86b22e928dc23a385faf41cf76b0d293576) Conflicts: src/librbd/api/Io.h/cc: logic exists in ImageRequestWQ files src/librbd/librbd.cc: trivial resolution due to missing api::Io src/test/librbd/test_librbd.cc: trivial resolution --- diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h index d85cd36504b..419d3232f07 100644 --- a/src/include/rbd/librbd.h +++ b/src/include/rbd/librbd.h @@ -47,6 +47,7 @@ extern "C" { #define LIBRBD_SUPPORTS_IOVEC 1 #define LIBRBD_SUPPORTS_WATCH 0 #define LIBRBD_SUPPORTS_WRITESAME 1 +#define LIBRBD_SUPPORTS_WRITE_ZEROES 1 #if __GNUC__ >= 4 #define CEPH_RBD_API __attribute__ ((visibility ("default"))) @@ -1094,10 +1095,15 @@ CEPH_RBD_API ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len, const char *buf, int op_flags); CEPH_RBD_API int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len); CEPH_RBD_API ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len, - const char *buf, size_t data_len, int op_flags); + const char *buf, size_t data_len, + int op_flags); +CEPH_RBD_API ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs, + uint64_t len, int zero_flags, + int op_flags); CEPH_RBD_API ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs, size_t len, const char *cmp_buf, - const char *buf, uint64_t *mismatch_off, + const char *buf, + uint64_t *mismatch_off, int op_flags); CEPH_RBD_API int rbd_aio_write(rbd_image_t image, 
uint64_t off, size_t len, @@ -1125,10 +1131,15 @@ CEPH_RBD_API int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, CEPH_RBD_API int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len, const char *buf, size_t data_len, rbd_completion_t c, int op_flags); +CEPH_RBD_API int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off, + size_t len, rbd_completion_t c, + int zero_flags, int op_flags); CEPH_RBD_API ssize_t rbd_aio_compare_and_write(rbd_image_t image, uint64_t off, size_t len, - const char *cmp_buf, const char *buf, - rbd_completion_t c, uint64_t *mismatch_off, + const char *cmp_buf, + const char *buf, + rbd_completion_t c, + uint64_t *mismatch_off, int op_flags); CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg, diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp index ca2d7449baf..1bb74ce72f2 100644 --- a/src/include/rbd/librbd.hpp +++ b/src/include/rbd/librbd.hpp @@ -664,8 +664,11 @@ public: ssize_t write(uint64_t ofs, size_t len, ceph::bufferlist& bl); /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags); + int discard(uint64_t ofs, uint64_t len); ssize_t writesame(uint64_t ofs, size_t len, ceph::bufferlist &bl, int op_flags); + ssize_t write_zeroes(uint64_t ofs, size_t len, int zero_flags, int op_flags); + ssize_t compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl, ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags); @@ -673,11 +676,17 @@ public: /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ int aio_write2(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c, int op_flags); + + int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c); int aio_writesame(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c, int op_flags); + int aio_write_zeroes(uint64_t ofs, size_t len, RBD::AioCompletion *c, + int 
zero_flags, int op_flags); + int aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl, ceph::bufferlist& bl, RBD::AioCompletion *c, uint64_t *mismatch_off, int op_flags); + /** * read async from image * @@ -699,7 +708,6 @@ public: /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ int aio_read2(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c, int op_flags); - int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c); int flush(); /** diff --git a/src/librbd/io/ImageRequestWQ.cc b/src/librbd/io/ImageRequestWQ.cc index 2d214184281..94dd982180f 100644 --- a/src/librbd/io/ImageRequestWQ.cc +++ b/src/librbd/io/ImageRequestWQ.cc @@ -212,6 +212,32 @@ ssize_t ImageRequestWQ::writesame(uint64_t off, uint64_t len, return len; } +template +ssize_t ImageRequestWQ::write_zeroes(uint64_t off, uint64_t len, + int zero_flags, int op_flags) { + auto cct = m_image_ctx.cct; + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", " + << "len = " << len << dendl; + + m_image_ctx.image_lock.lock_shared(); + int r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len); + m_image_ctx.image_lock.unlock_shared(); + if (r < 0) { + lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond ctx; + auto aio_comp = io::AioCompletion::create(&ctx); + aio_write_zeroes(aio_comp, off, len, zero_flags, op_flags, false); + + r = ctx.wait(); + if (r < 0) { + return r; + } + return len; +} + template ssize_t ImageRequestWQ::compare_and_write(uint64_t off, uint64_t len, bufferlist &&cmp_bl, @@ -486,6 +512,62 @@ void ImageRequestWQ::aio_writesame(AioCompletion *c, uint64_t off, trace.event("finish"); } + +template +void ImageRequestWQ::aio_write_zeroes(io::AioCompletion *aio_comp, + uint64_t off, uint64_t len, + int zero_flags, int op_flags, + bool native_async) { + auto cct = m_image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (m_image_ctx.blkin_trace_all) { + 
trace.init("io: write_zeroes", &m_image_ctx.trace_endpoint); + trace.event("init"); + } + + aio_comp->init_time(util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_DISCARD); + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", " + << "completion=" << aio_comp << ", off=" << off << ", " + << "len=" << len << dendl; + + if (native_async && m_image_ctx.event_socket.is_valid()) { + aio_comp->set_event_notify(true); + } + + // validate the supported flags + if (zero_flags != 0U) { + aio_comp->fail(-EINVAL); + return; + } + + if (!start_in_flight_io(aio_comp)) { + return; + } + + // enable partial discard (zeroing) of objects + uint32_t discard_granularity_bytes = 0; + + auto tid = ++m_last_tid; + + { + std::lock_guard locker{m_lock}; + m_queued_or_blocked_io_tids.insert(tid); + } + + auto req = ImageDispatchSpec::create_discard_request( + m_image_ctx, aio_comp, off, len, discard_granularity_bytes, trace, tid); + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + if (m_image_ctx.non_blocking_aio || writes_blocked()) { + queue(req); + } else { + process_io(req, false); + finish_in_flight_io(); + } + trace.event("finish"); +} + template void ImageRequestWQ::aio_compare_and_write(AioCompletion *c, uint64_t off, uint64_t len, diff --git a/src/librbd/io/ImageRequestWQ.h b/src/librbd/io/ImageRequestWQ.h index ecbf33f3d20..cef18124a61 100644 --- a/src/librbd/io/ImageRequestWQ.h +++ b/src/librbd/io/ImageRequestWQ.h @@ -38,6 +38,8 @@ public: ssize_t discard(uint64_t off, uint64_t len, uint32_t discard_granularity_bytes); ssize_t writesame(uint64_t off, uint64_t len, bufferlist &&bl, int op_flags); + ssize_t write_zeroes(uint64_t off, uint64_t len, int zero_flags, + int op_flags); ssize_t compare_and_write(uint64_t off, uint64_t len, bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_off, int op_flags); @@ -52,6 +54,8 @@ public: void aio_flush(AioCompletion *c, bool native_async=true); void aio_writesame(AioCompletion *c, uint64_t off, uint64_t len, bufferlist &&bl, int 
op_flags, bool native_async=true); + void aio_write_zeroes(AioCompletion *c, uint64_t off, uint64_t len, + int zero_flags, int op_flags, bool native_async); void aio_compare_and_write(AioCompletion *c, uint64_t off, uint64_t len, bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_off, diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc index ca1009e0ee5..9a3cc3e67d1 100644 --- a/src/librbd/librbd.cc +++ b/src/librbd/librbd.cc @@ -2540,8 +2540,8 @@ namespace librbd { } bool discard_zero = ictx->config.get_val("rbd_discard_on_zeroed_write_same"); - if (discard_zero && mem_is_zero(bl.c_str(), bl.length())) { - int r = ictx->io_work_queue->discard(ofs, len, 0); + if (discard_zero && bl.is_zero()) { + int r = ictx->io_work_queue->write_zeroes(ofs, len, 0U, op_flags); tracepoint(librbd, writesame_exit, r); return r; } @@ -2551,6 +2551,13 @@ namespace librbd { return r; } + ssize_t Image::write_zeroes(uint64_t ofs, size_t len, int zero_flags, + int op_flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + return ictx->io_work_queue->write_zeroes(ofs, len, zero_flags, op_flags); + } + ssize_t Image::compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl, ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags) @@ -2678,8 +2685,9 @@ namespace librbd { } bool discard_zero = ictx->config.get_val("rbd_discard_on_zeroed_write_same"); - if (discard_zero && mem_is_zero(bl.c_str(), bl.length())) { - ictx->io_work_queue->aio_discard(get_aio_completion(c), off, len, 0); + if (discard_zero && bl.is_zero()) { + ictx->io_work_queue->aio_write_zeroes(get_aio_completion(c), off, len, 0U, + op_flags, true); tracepoint(librbd, aio_writesame_exit, 0); return 0; } @@ -2690,6 +2698,15 @@ namespace librbd { return 0; } + int Image::aio_write_zeroes(uint64_t off, size_t len, RBD::AioCompletion *c, + int zero_flags, int op_flags) + { + ImageCtx *ictx = (ImageCtx *)ctx; + ictx->io_work_queue->aio_write_zeroes( + get_aio_completion(c), off, len, zero_flags, op_flags, true); + 
return 0; + } + int Image::aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl, ceph::bufferlist& bl, RBD::AioCompletion *c, uint64_t *mismatch_off, @@ -5861,7 +5878,7 @@ extern "C" ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len, bool discard_zero = ictx->config.get_val("rbd_discard_on_zeroed_write_same"); if (discard_zero && mem_is_zero(buf, data_len)) { - int r = ictx->io_work_queue->discard(ofs, len, 0); + int r = ictx->io_work_queue->write_zeroes(ofs, len, 0, op_flags); tracepoint(librbd, writesame_exit, r); return r; } @@ -5873,6 +5890,13 @@ extern "C" ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len, return r; } +extern "C" ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs, size_t len, + int zero_flags, int op_flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + return ictx->io_work_queue->write_zeroes(ofs, len, zero_flags, op_flags); +} + extern "C" ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs, size_t len, const char *cmp_buf, @@ -6085,7 +6109,8 @@ extern "C" int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len, bool discard_zero = ictx->config.get_val("rbd_discard_on_zeroed_write_same"); if (discard_zero && mem_is_zero(buf, data_len)) { - ictx->io_work_queue->aio_discard(get_aio_completion(comp), off, len, 0); + ictx->io_work_queue->aio_write_zeroes(get_aio_completion(comp), off, len, 0, + op_flags, true); tracepoint(librbd, aio_writesame_exit, 0); return 0; } @@ -6099,6 +6124,18 @@ extern "C" int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len, return 0; } +extern "C" int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off, size_t len, + rbd_completion_t c, int zero_flags, + int op_flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + + ictx->io_work_queue->aio_write_zeroes( + get_aio_completion(comp), off, len, zero_flags, op_flags, true); + return 0; +} 
+ extern "C" ssize_t rbd_aio_compare_and_write(rbd_image_t image, uint64_t off, size_t len, const char *cmp_buf, const char *buf, rbd_completion_t c, diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc index 2c83af9839b..d5943cc369f 100644 --- a/src/test/librbd/test_librbd.cc +++ b/src/test/librbd/test_librbd.cc @@ -8175,6 +8175,65 @@ TEST_F(TestLibRBD, SnapRemoveWithChildMissing) rados_ioctx_destroy(ioctx1); } +TEST_F(TestLibRBD, WriteZeroes) { + librbd::RBD rbd; + librados::IoCtx ioctx; + ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx)); + std::string name = get_temp_image_name(); + int order = 0; + uint64_t size = 2 << 20; + ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order)); + + librbd::Image image; + ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL)); + + // 1s from [0, 256) / length 256 + char data[256]; + memset(data, 1, sizeof(data)); + bufferlist bl; + bl.append(data, 256); + ASSERT_EQ(256, image.write(0, 256, bl)); + + interval_set diff; + ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false, + iterate_cb, (void *)&diff)); + auto expected_diff = interval_set{{{0, 256}}}; + ASSERT_EQ(expected_diff, diff); + + // writes zeroes past the end of the current extents. 
+ // Now 1s from [0, 192) / length 192 + ASSERT_EQ(size - 192, + image.write_zeroes(192, size - 192, 0U, 0)); + diff.clear(); + ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false, + iterate_cb, (void *)&diff)); + expected_diff = interval_set{{{0, 192}}}; + ASSERT_EQ(expected_diff, diff); + + // zero the first 64 bytes of the existing extent + // Now 1s from [64, 192) / length 128 + ASSERT_EQ(64, image.write_zeroes(0, 64, 0U, 0)); + + diff.clear(); + ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false, + iterate_cb, (void *)&diff)); + expected_diff = interval_set{{{0, 192}}}; + ASSERT_EQ(expected_diff, diff); + + bufferlist expected_bl; + expected_bl.append_zero(64); + bufferlist sub_bl; + sub_bl.substr_of(bl, 0, 128); + expected_bl.claim_append(sub_bl); + expected_bl.append_zero(size - 192); + + bufferlist read_bl; + EXPECT_EQ(size, image.read(0, size, read_bl)); + EXPECT_EQ(expected_bl, read_bl); + + ASSERT_EQ(0, image.close()); +} + // poorman's ceph_assert() namespace ceph { void __ceph_assert_fail(const char *assertion, const char *file, int line,