#define LIBRBD_SUPPORTS_IOVEC 1
#define LIBRBD_SUPPORTS_WATCH 0
#define LIBRBD_SUPPORTS_WRITESAME 1
+#define LIBRBD_SUPPORTS_WRITE_ZEROES 1
#if __GNUC__ >= 4
#define CEPH_RBD_API __attribute__ ((visibility ("default")))
const char *buf, int op_flags);
CEPH_RBD_API int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len);
CEPH_RBD_API ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
- const char *buf, size_t data_len, int op_flags);
+ const char *buf, size_t data_len,
+ int op_flags);
+/**
+ * Synchronously write zeroes to the image range [ofs, ofs + len).
+ *
+ * len is a size_t so that this prototype agrees with the extern "C"
+ * definition and with the sibling rbd_writesame / rbd_aio_write_zeroes
+ * calls (uint64_t and size_t are distinct types on 32-bit targets).
+ *
+ * @param zero_flags no flags are currently supported; any non-zero value
+ *                   makes the request fail with -EINVAL
+ * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ * @returns the number of bytes zeroed on success, negative error code on
+ *          failure
+ */
+CEPH_RBD_API ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs,
+                                      size_t len, int zero_flags,
+                                      int op_flags);
CEPH_RBD_API ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs,
size_t len, const char *cmp_buf,
- const char *buf, uint64_t *mismatch_off,
+ const char *buf,
+ uint64_t *mismatch_off,
int op_flags);
CEPH_RBD_API int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
CEPH_RBD_API int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
const char *buf, size_t data_len,
rbd_completion_t c, int op_flags);
+/*
+ * Asynchronously write zeroes to the image range [off, off + len).
+ * The call itself returns 0; failures (for example -EINVAL for a non-zero
+ * zero_flags value) are reported through the completion c.
+ * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off,
+ size_t len, rbd_completion_t c,
+ int zero_flags, int op_flags);
CEPH_RBD_API ssize_t rbd_aio_compare_and_write(rbd_image_t image,
uint64_t off, size_t len,
- const char *cmp_buf, const char *buf,
- rbd_completion_t c, uint64_t *mismatch_off,
+ const char *cmp_buf,
+ const char *buf,
+ rbd_completion_t c,
+ uint64_t *mismatch_off,
int op_flags);
CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg,
ssize_t write(uint64_t ofs, size_t len, ceph::bufferlist& bl);
/* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);
+
int discard(uint64_t ofs, uint64_t len);
ssize_t writesame(uint64_t ofs, size_t len, ceph::bufferlist &bl, int op_flags);
+ /* synchronously write zeroes to [ofs, ofs + len); zero_flags must be 0
+ * (any other value fails with -EINVAL).
+ * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ ssize_t write_zeroes(uint64_t ofs, size_t len, int zero_flags, int op_flags);
+
ssize_t compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl,
ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags);
/* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
int aio_write2(uint64_t off, size_t len, ceph::bufferlist& bl,
RBD::AioCompletion *c, int op_flags);
+
+ int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c);
int aio_writesame(uint64_t off, size_t len, ceph::bufferlist& bl,
RBD::AioCompletion *c, int op_flags);
+ /* asynchronously write zeroes to [ofs, ofs + len); always returns 0, errors
+ * (e.g. -EINVAL for a non-zero zero_flags) are delivered through c.
+ * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ int aio_write_zeroes(uint64_t ofs, size_t len, RBD::AioCompletion *c,
+ int zero_flags, int op_flags);
+
int aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl,
ceph::bufferlist& bl, RBD::AioCompletion *c,
uint64_t *mismatch_off, int op_flags);
+
/**
* read async from image
*
/* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
int aio_read2(uint64_t off, size_t len, ceph::bufferlist& bl,
RBD::AioCompletion *c, int op_flags);
- int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c);
int flush();
/**
return len;
}
+/**
+ * Blocking write-zeroes on the image extent [off, off + len).
+ *
+ * The extent is first passed through clip_io() while image_lock is held
+ * shared (len is handed over by pointer, so it may be adjusted), then the
+ * request is issued through aio_write_zeroes() with native_async disabled
+ * and waited on.
+ *
+ * @return a negative error code from clip_io() or from the completed
+ *         request, otherwise len as left by clip_io()
+ */
+template <typename I>
+ssize_t ImageRequestWQ<I>::write_zeroes(uint64_t off, uint64_t len,
+                                        int zero_flags, int op_flags) {
+  auto cct = m_image_ctx.cct;
+  ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
+                 << "len = " << len << dendl;
+
+  int clip_result;
+  {
+    // image_lock is only needed while the extent is validated
+    std::shared_lock image_locker{m_image_ctx.image_lock};
+    clip_result = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
+  }
+  if (clip_result < 0) {
+    lderr(cct) << "invalid IO request: " << cpp_strerror(clip_result) << dendl;
+    return clip_result;
+  }
+
+  // issue the request asynchronously and block until it completes
+  C_SaferCond cond;
+  aio_write_zeroes(io::AioCompletion::create(&cond), off, len, zero_flags,
+                   op_flags, false);
+
+  int wait_result = cond.wait();
+  if (wait_result < 0) {
+    return wait_result;
+  }
+  return len;
+}
+
template <typename I>
ssize_t ImageRequestWQ<I>::compare_and_write(uint64_t off, uint64_t len,
bufferlist &&cmp_bl,
trace.event("finish");
}
+
+/**
+ * Asynchronously zero the image extent [off, off + len).
+ *
+ * The operation is carried out as a discard request created with a discard
+ * granularity of 0, and the completion is initialised as AIO_TYPE_DISCARD.
+ *
+ * @param aio_comp     completion that receives the result
+ * @param zero_flags   no flags are supported: any non-zero value fails the
+ *                     completion with -EINVAL
+ * @param op_flags     NOTE(review): not referenced anywhere in this body, so
+ *                     it is not forwarded to the discard request
+ * @param native_async when true and the image's event socket is valid,
+ *                     event notification is enabled on the completion
+ */
+template <typename I>
+void ImageRequestWQ<I>::aio_write_zeroes(io::AioCompletion *aio_comp,
+ uint64_t off, uint64_t len,
+ int zero_flags, int op_flags,
+ bool native_async) {
+ auto cct = m_image_ctx.cct;
+ FUNCTRACE(cct);
+ ZTracer::Trace trace;
+ if (m_image_ctx.blkin_trace_all) {
+ trace.init("io: write_zeroes", &m_image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ aio_comp->init_time(util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_DISCARD);
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
+ << "completion=" << aio_comp << ", off=" << off << ", "
+ << "len=" << len << dendl;
+
+ if (native_async && m_image_ctx.event_socket.is_valid()) {
+ aio_comp->set_event_notify(true);
+ }
+
+ // validate the supported flags
+ if (zero_flags != 0U) {
+ aio_comp->fail(-EINVAL);
+ return;
+ }
+
+ // presumably start_in_flight_io() has already dealt with the completion
+ // when it returns false -- TODO confirm
+ if (!start_in_flight_io(aio_comp)) {
+ return;
+ }
+
+ // enable partial discard (zeroing) of objects
+ uint32_t discard_granularity_bytes = 0;
+
+ auto tid = ++m_last_tid;
+
+ // record the new tid in m_queued_or_blocked_io_tids under m_lock
+ {
+ std::lock_guard locker{m_lock};
+ m_queued_or_blocked_io_tids.insert(tid);
+ }
+
+ auto req = ImageDispatchSpec<I>::create_discard_request(
+ m_image_ctx, aio_comp, off, len, discard_granularity_bytes, trace, tid);
+
+ // queue the request when non-blocking AIO is configured or writes are
+ // blocked; otherwise process it inline under the shared owner_lock
+ std::shared_lock owner_locker{m_image_ctx.owner_lock};
+ if (m_image_ctx.non_blocking_aio || writes_blocked()) {
+ queue(req);
+ } else {
+ process_io(req, false);
+ finish_in_flight_io();
+ }
+ trace.event("finish");
+}
+
template <typename I>
void ImageRequestWQ<I>::aio_compare_and_write(AioCompletion *c,
uint64_t off, uint64_t len,
ssize_t discard(uint64_t off, uint64_t len,
uint32_t discard_granularity_bytes);
ssize_t writesame(uint64_t off, uint64_t len, bufferlist &&bl, int op_flags);
+ // blocking write-zeroes of [off, off + len); a non-zero zero_flags fails
+ // with -EINVAL. Returns a negative error code, or the (clipped) length.
+ ssize_t write_zeroes(uint64_t off, uint64_t len, int zero_flags,
+ int op_flags);
ssize_t compare_and_write(uint64_t off, uint64_t len,
bufferlist &&cmp_bl, bufferlist &&bl,
uint64_t *mismatch_off, int op_flags);
void aio_flush(AioCompletion *c, bool native_async=true);
void aio_writesame(AioCompletion *c, uint64_t off, uint64_t len,
bufferlist &&bl, int op_flags, bool native_async=true);
+ // async write-zeroes of [off, off + len); the result is reported through c.
+ // Unlike the neighbouring aio_* declarations, native_async has no default
+ // here and must be passed explicitly.
+ void aio_write_zeroes(AioCompletion *c, uint64_t off, uint64_t len,
+ int zero_flags, int op_flags, bool native_async);
void aio_compare_and_write(AioCompletion *c, uint64_t off,
uint64_t len, bufferlist &&cmp_bl,
bufferlist &&bl, uint64_t *mismatch_off,
}
bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
- if (discard_zero && mem_is_zero(bl.c_str(), bl.length())) {
- int r = ictx->io_work_queue->discard(ofs, len, 0);
+ if (discard_zero && bl.is_zero()) {
+ int r = ictx->io_work_queue->write_zeroes(ofs, len, 0U, op_flags);
tracepoint(librbd, writesame_exit, r);
return r;
}
return r;
}
+  /**
+   * Synchronously zero the image range [ofs, ofs + len).
+   *
+   * Thin wrapper: the arguments are handed unchanged to the image's IO work
+   * queue and its result is returned as-is.
+   */
+  ssize_t Image::write_zeroes(uint64_t ofs, size_t len, int zero_flags,
+                              int op_flags)
+  {
+    auto *image_ctx = (ImageCtx *)ctx;
+    const ssize_t result = image_ctx->io_work_queue->write_zeroes(
+      ofs, len, zero_flags, op_flags);
+    return result;
+  }
+
ssize_t Image::compare_and_write(uint64_t ofs, size_t len,
ceph::bufferlist &cmp_bl, ceph::bufferlist& bl,
uint64_t *mismatch_off, int op_flags)
}
bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
- if (discard_zero && mem_is_zero(bl.c_str(), bl.length())) {
- ictx->io_work_queue->aio_discard(get_aio_completion(c), off, len, 0);
+ if (discard_zero && bl.is_zero()) {
+ ictx->io_work_queue->aio_write_zeroes(get_aio_completion(c), off, len, 0U,
+ op_flags, true);
tracepoint(librbd, aio_writesame_exit, 0);
return 0;
}
return 0;
}
+  /**
+   * Asynchronously zero the image range [off, off + len).
+   *
+   * The request is submitted to the image's IO work queue with native async
+   * handling enabled. This call always returns 0.
+   */
+  int Image::aio_write_zeroes(uint64_t off, size_t len, RBD::AioCompletion *c,
+                              int zero_flags, int op_flags)
+  {
+    auto *image_ctx = (ImageCtx *)ctx;
+    auto aio_comp = get_aio_completion(c);
+    image_ctx->io_work_queue->aio_write_zeroes(aio_comp, off, len, zero_flags,
+                                               op_flags, true);
+    return 0;
+  }
+
int Image::aio_compare_and_write(uint64_t off, size_t len,
ceph::bufferlist& cmp_bl, ceph::bufferlist& bl,
RBD::AioCompletion *c, uint64_t *mismatch_off,
bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
if (discard_zero && mem_is_zero(buf, data_len)) {
- int r = ictx->io_work_queue->discard(ofs, len, 0);
+ int r = ictx->io_work_queue->write_zeroes(ofs, len, 0, op_flags);
tracepoint(librbd, writesame_exit, r);
return r;
}
return r;
}
+/**
+ * C API entry point for a synchronous write-zeroes of [ofs, ofs + len).
+ *
+ * Forwards the arguments to the image's IO work queue and returns its
+ * result unchanged.
+ */
+extern "C" ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs, size_t len,
+                                    int zero_flags, int op_flags)
+{
+  auto *image_ctx = (librbd::ImageCtx *)image;
+  const ssize_t result = image_ctx->io_work_queue->write_zeroes(
+    ofs, len, zero_flags, op_flags);
+  return result;
+}
+
extern "C" ssize_t rbd_compare_and_write(rbd_image_t image,
uint64_t ofs, size_t len,
const char *cmp_buf,
bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
if (discard_zero && mem_is_zero(buf, data_len)) {
- ictx->io_work_queue->aio_discard(get_aio_completion(comp), off, len, 0);
+ ictx->io_work_queue->aio_write_zeroes(get_aio_completion(comp), off, len, 0,
+ op_flags, true);
tracepoint(librbd, aio_writesame_exit, 0);
return 0;
}
return 0;
}
+/**
+ * C API entry point for an asynchronous write-zeroes of [off, off + len).
+ *
+ * Submits the request to the image's IO work queue with native async
+ * handling enabled. This call always returns 0.
+ */
+extern "C" int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off, size_t len,
+                                    rbd_completion_t c, int zero_flags,
+                                    int op_flags)
+{
+  auto *image_ctx = (librbd::ImageCtx *)image;
+  auto *rbd_comp = (librbd::RBD::AioCompletion *)c;
+
+  auto aio_comp = get_aio_completion(rbd_comp);
+  image_ctx->io_work_queue->aio_write_zeroes(aio_comp, off, len, zero_flags,
+                                             op_flags, true);
+  return 0;
+}
+
extern "C" ssize_t rbd_aio_compare_and_write(rbd_image_t image, uint64_t off,
size_t len, const char *cmp_buf,
const char *buf, rbd_completion_t c,
rados_ioctx_destroy(ioctx1);
}
+// Exercises Image::write_zeroes(): zeroes from inside the written data to the
+// end of the image, then zeroes the head of the remaining data, checking the
+// extents reported by diff_iterate2() and the bytes read back.
+TEST_F(TestLibRBD, WriteZeroes) {
+ librbd::RBD rbd;
+ librados::IoCtx ioctx;
+ ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+ std::string name = get_temp_image_name();
+ int order = 0;
+ uint64_t size = 2 << 20;
+ ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+ librbd::Image image;
+ ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+ // 1s from [0, 256) / length 256
+ char data[256];
+ memset(data, 1, sizeof(data));
+ bufferlist bl;
+ bl.append(data, 256);
+ ASSERT_EQ(256, image.write(0, 256, bl));
+
+ interval_set<uint64_t> diff;
+ ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+ iterate_cb, (void *)&diff));
+ auto expected_diff = interval_set<uint64_t>{{{0, 256}}};
+ ASSERT_EQ(expected_diff, diff);
+
+ // write zeroes from offset 192 past the end of the written data, up to the
+ // end of the image.
+ // Now 1s from [0, 192) / length 192
+ ASSERT_EQ(size - 192,
+ image.write_zeroes(192, size - 192, 0U, 0));
+ diff.clear();
+ ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+ iterate_cb, (void *)&diff));
+ expected_diff = interval_set<uint64_t>{{{0, 192}}};
+ ASSERT_EQ(expected_diff, diff);
+
+ // zero the first 64 bytes of the existing extent; the diff is still
+ // expected to report [0, 192)
+ // Now 1s from [64, 192) / length 192
+ ASSERT_EQ(64, image.write_zeroes(0, 64, 0U, 0));
+
+ diff.clear();
+ ASSERT_EQ(0, image.diff_iterate2(nullptr, 0, size, false, false,
+ iterate_cb, (void *)&diff));
+ expected_diff = interval_set<uint64_t>{{{0, 192}}};
+ ASSERT_EQ(expected_diff, diff);
+
+ // expected image contents: 64 zero bytes, 128 bytes of 1s, then zeroes up
+ // to the end of the image
+ bufferlist expected_bl;
+ expected_bl.append_zero(64);
+ bufferlist sub_bl;
+ sub_bl.substr_of(bl, 0, 128);
+ expected_bl.claim_append(sub_bl);
+ expected_bl.append_zero(size - 192);
+
+ bufferlist read_bl;
+ EXPECT_EQ(size, image.read(0, size, read_bl));
+ EXPECT_EQ(expected_bl, read_bl);
+
+ ASSERT_EQ(0, image.close());
+}
+
// poorman's ceph_assert()
namespace ceph {
void __ceph_assert_fail(const char *assertion, const char *file, int line,