From d6bf5ce6f5227c346253ecb40b04f832e8dd7821 Mon Sep 17 00:00:00 2001 From: Mykola Golub Date: Fri, 29 Mar 2019 09:04:57 +0000 Subject: [PATCH] librbd: support EC data pool images sparsify (object truncate or remove) Fixes: https://tracker.ceph.com/issues/38364 Signed-off-by: Mykola Golub --- src/librbd/operation/SparsifyRequest.cc | 254 ++++++++++++++++++++---- src/test/librbd/test_internal.cc | 74 +++---- 2 files changed, 252 insertions(+), 76 deletions(-) diff --git a/src/librbd/operation/SparsifyRequest.cc b/src/librbd/operation/SparsifyRequest.cc index 94ee129665a..4dc6bde7e32 100644 --- a/src/librbd/operation/SparsifyRequest.cc +++ b/src/librbd/operation/SparsifyRequest.cc @@ -5,6 +5,7 @@ #include "cls/rbd/cls_rbd_client.h" #include "common/dout.h" #include "common/errno.h" +#include "include/err.h" #include "librbd/AsyncObjectThrottle.h" #include "librbd/ExclusiveLock.h" #include "librbd/ImageCtx.h" @@ -19,6 +20,58 @@ namespace librbd { namespace operation { +namespace { + +bool may_be_trimmed(const std::map &extent_map, + const bufferlist &bl, size_t sparse_size, + uint64_t *new_end_ptr) { + if (extent_map.empty()) { + *new_end_ptr = 0; + return true; + } + + uint64_t end = extent_map.rbegin()->first + extent_map.rbegin()->second; + uint64_t new_end = end; + uint64_t bl_off = bl.length(); + + for (auto it = extent_map.rbegin(); it != extent_map.rend(); it++) { + auto off = it->first; + auto len = it->second; + + new_end = p2roundup(off + len, sparse_size); + + uint64_t extent_left = len; + uint64_t sub_len = len % sparse_size; + if (sub_len == 0) { + sub_len = sparse_size; + } + while (extent_left > 0) { + ceph_assert(bl_off >= sub_len); + bl_off -= sub_len; + bufferlist sub_bl; + sub_bl.substr_of(bl, bl_off, sub_len); + if (!sub_bl.is_zero()) { + break; + } + new_end -= sparse_size; + extent_left -= sub_len; + sub_len = sparse_size; + } + if (extent_left > 0) { + break; + } + } + + if (new_end < end) { + *new_end_ptr = new_end; + return true; + } + + return false; +} + +} // anonymous namespace + using util::create_context_callback; using util::create_rados_callback; @@ -35,21 +88,24 @@ public: * * * | - * v (object map disabled) - * SPARSIFY -----------------------\ - * | | - * | (object map enabled) | - * v | - * PRE UPDATE OBJECT MAP | - * | | - * v | - * CHECK EXISTS | - * | | - * v | - * POST UPDATE OBJECT MAP | - * | | - * v | - * <----------------------/ + * v (not supported) + * SPARSIFY * * * * * * * * * * * * > READ < * * * * * * * * * * (concurrent + * | | * update is + * | (object map disabled) | (can trim) * detected) + * |------------------------\ V * + * | | PRE UPDATE OBJECT MAP * + * | (object map enabled) | | (if needed) * + * v | V * + * PRE UPDATE OBJECT MAP | TRIM * * * * * * * * * * * + * | | | + * v | V + * CHECK EXISTS | POST UPDATE OBJECT MAP + * | | | (if needed) + * v | | + * POST UPDATE OBJECT MAP | | + * | | | + * v | | + * <------------------/<-------/ * * @endverbatim * @@ -112,12 +168,20 @@ public: void handle_sparsify(int r) { ldout(m_cct, 20) << "r=" << r << dendl; - if (r < 0 && r != -ENOENT) { - lderr(m_cct) << "failed to sparsify: " << cpp_strerror(r) << dendl; + if (r == -EOPNOTSUPP) { + m_trying_trim = true; + send_read(); + return; } if (r == -ENOENT) { - this->complete(0); + finish_op(0); + return; + } + + if (r < 0) { + lderr(m_cct) << "failed to sparsify: " << cpp_strerror(r) << dendl; + finish_op(r); return; } @@ -125,34 +189,41 @@ public: } void send_pre_update_object_map() { - I *image_ctx = &this->m_image_ctx; + I &image_ctx = this->m_image_ctx; - if (!m_remove_empty || !image_ctx->test_features(RBD_FEATURE_OBJECT_MAP)) { - this->complete(0); + if (m_trying_trim) { + if (!m_remove_empty || m_new_end != 0 || + !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + send_trim(); + return; + } + } else if (!m_remove_empty || + !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + finish_op(0); return; } ldout(m_cct, 20) << dendl; - image_ctx->owner_lock.get_read(); - image_ctx->snap_lock.get_read(); - if (image_ctx->object_map == nullptr) { + image_ctx.owner_lock.get_read(); + image_ctx.snap_lock.get_read(); + if (image_ctx.object_map == nullptr) { // possible that exclusive lock was lost in background lderr(m_cct) << "object map is not initialized" << dendl; - image_ctx->snap_lock.put_read(); - image_ctx->owner_lock.put_read(); - this->complete(-EINVAL); + image_ctx.snap_lock.put_read(); + image_ctx.owner_lock.put_read(); + finish_op(-EINVAL); return; } int r; - m_finish_op_ctx = image_ctx->exclusive_lock->start_op(&r); + m_finish_op_ctx = image_ctx.exclusive_lock->start_op(&r); if (m_finish_op_ctx == nullptr) { lderr(m_cct) << "lost exclusive lock" << dendl; - image_ctx->snap_lock.put_read(); - image_ctx->owner_lock.put_read(); - this->complete(r); + image_ctx.snap_lock.put_read(); + image_ctx.owner_lock.put_read(); + finish_op(r); return; } @@ -160,17 +231,17 @@ public: C_SparsifyObject, &C_SparsifyObject::handle_pre_update_object_map>(this); - image_ctx->object_map_lock.get_write(); - bool sent = image_ctx->object_map->template aio_update< + image_ctx.object_map_lock.get_write(); + bool sent = image_ctx.object_map->template aio_update< Context, &Context::complete>(CEPH_NOSNAP, m_object_no, OBJECT_PENDING, OBJECT_EXISTS, {}, false, ctx); // NOTE: state machine might complete before we reach here - image_ctx->object_map_lock.put_write(); - image_ctx->snap_lock.put_read(); - image_ctx->owner_lock.put_read(); + image_ctx.object_map_lock.put_write(); + image_ctx.snap_lock.put_read(); + image_ctx.owner_lock.put_read(); if (!sent) { - ctx->complete(0); + finish_op(0); } } @@ -184,7 +255,11 @@ public: return; } - send_check_exists(); + if (m_trying_trim) { + send_trim(); + } else { + send_check_exists(); + } } void send_check_exists() { @@ -194,10 +269,10 @@ public: librados::ObjectReadOperation op; op.stat(NULL, NULL, NULL); - m_out_bl.clear(); + m_bl.clear(); auto comp = create_rados_callback< C_SparsifyObject, &C_SparsifyObject::handle_check_exists>(this); - int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_out_bl); + int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_bl); ceph_assert(r == 0); comp->release(); } @@ -217,6 +292,8 @@ public: void send_post_update_object_map(bool exists) { I &image_ctx = this->m_image_ctx; + ldout(m_cct, 20) << dendl; + auto ctx = create_context_callback< C_SparsifyObject, &C_SparsifyObject::handle_post_update_object_map>(this); @@ -253,10 +330,100 @@ public: finish_op(0); } + void send_read() { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + m_bl.clear(); + op.sparse_read(0, image_ctx.layout.object_size, &m_extent_map, &m_bl, + nullptr); + auto comp = create_rados_callback< + C_SparsifyObject, &C_SparsifyObject::handle_read>(this); + int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_bl); + ceph_assert(r == 0); + comp->release(); + } + + void handle_read(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + if (r == -ENOENT) { + r = 0; + } else { + lderr(m_cct) << "failed to read object: " << cpp_strerror(r) << dendl; + } + finish_op(r); + return; + } + + if (!may_be_trimmed(m_extent_map, m_bl, m_sparse_size, &m_new_end)) { + finish_op(0); + return; + } + + send_pre_update_object_map(); + } + + void send_trim() { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << dendl; + + ceph_assert(m_new_end < image_ctx.layout.object_size); + + librados::ObjectWriteOperation op; + m_bl.clear(); + m_bl.append_zero(image_ctx.layout.object_size - m_new_end); + op.cmpext(m_new_end, m_bl, nullptr); + if (m_new_end == 0 && m_remove_empty) { + op.remove(); + } else { + op.truncate(m_new_end); + } + + auto comp = create_rados_callback< + C_SparsifyObject, &C_SparsifyObject::handle_trim>(this); + int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + } + + void handle_trim(int r) { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r <= -MAX_ERRNO) { + m_finish_op_ctx->complete(0); + m_finish_op_ctx = nullptr; + send_read(); + return; + } + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to trim: " << cpp_strerror(r) << dendl; + finish_op(r); + return; + } + + if (!m_remove_empty || m_new_end != 0 || + !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + finish_op(0); + return; + } + + send_post_update_object_map(false); + } + void finish_op(int r) { ldout(m_cct, 20) << "r=" << r << dendl; - m_finish_op_ctx->complete(0); + if (m_finish_op_ctx != nullptr) { + m_finish_op_ctx->complete(0); + } this->complete(r); } @@ -267,7 +434,10 @@ private: std::string m_oid; bool m_remove_empty = false; - bufferlist m_out_bl; + bool m_trying_trim = false; + bufferlist m_bl; + std::map m_extent_map; + uint64_t m_new_end = 0; Context *m_finish_op_ctx = nullptr; }; diff --git a/src/test/librbd/test_internal.cc b/src/test/librbd/test_internal.cc index 8d24a608db6..20146e075e0 100644 --- a/src/test/librbd/test_internal.cc +++ b/src/test/librbd/test_internal.cc @@ -1365,11 +1365,14 @@ TEST_F(TestInternal, Sparsify) { librbd::ImageCtx *ictx; ASSERT_EQ(0, open_image(m_image_name, &ictx)); - REQUIRE(is_sparsify_supported(ictx->data_ctx, ictx->get_object_name(10))); - + bool sparsify_supported = is_sparsify_supported(ictx->data_ctx, + ictx->get_object_name(10)); bool sparse_read_supported = is_sparse_read_supported( ictx->data_ctx, ictx->get_object_name(10)); + std::cout << "sparsify_supported=" << sparsify_supported << std::endl; + std::cout << "sparse_read_supported=" << sparse_read_supported << std::endl; + librbd::NoOpProgressContext no_op; ASSERT_EQ(0, ictx->operations->resize((1 << ictx->order) * 20, true, no_op)); @@ -1379,13 +1382,24 @@ TEST_F(TestInternal, Sparsify) { ASSERT_EQ((ssize_t)bl.length(), ictx->io_work_queue->write(0, bl.length(), bufferlist{bl}, 0)); + ASSERT_EQ((ssize_t)bl.length(), + ictx->io_work_queue->write((1 << ictx->order) * 1 + 512, + bl.length(), bufferlist{bl}, 0)); + bl.append(std::string(4096, '1')); bl.append(std::string(4096, '\0')); bl.append(std::string(4096, '2')); - bl.append(std::string(4096, '\0')); + bl.append(std::string(4096 - 1, '\0')); ASSERT_EQ((ssize_t)bl.length(), ictx->io_work_queue->write((1 << ictx->order) * 10, bl.length(), bufferlist{bl}, 0)); + + bufferlist bl2; + bl2.append(std::string(4096 - 1, '\0')); + ASSERT_EQ((ssize_t)bl2.length(), + ictx->io_work_queue->write((1 << ictx->order) * 10 + 4096 * 10, + bl2.length(), bufferlist{bl2}, 0)); + ASSERT_EQ(0, ictx->io_work_queue->flush()); ASSERT_EQ(0, ictx->operations->sparsify(4096, no_op)); @@ -1404,21 +1418,30 @@ TEST_F(TestInternal, Sparsify) { uint64_t size; ASSERT_EQ(-ENOENT, ictx->data_ctx.stat(oid, &size, NULL)); - if (!sparse_read_supported) { - return; - } + oid = ictx->get_object_name(1); + ASSERT_EQ(-ENOENT, ictx->data_ctx.stat(oid, &size, NULL)); oid = ictx->get_object_name(10); std::map m; - read_bl.clear(); - ASSERT_EQ(2, ictx->data_ctx.sparse_read(oid, m, read_bl, bl.length(), 0)); - std::map expected_m = - {{4096 * 1, 4096}, {4096 * 3, 4096}}; - ASSERT_EQ(m, expected_m); + std::map expected_m; + auto read_len = bl.length(); bl.clear(); - bl.append(std::string(4096, '1')); - bl.append(std::string(4096, '2')); - ASSERT_TRUE(bl.contents_equal(read_bl)); + if (sparsify_supported && sparse_read_supported) { + expected_m = {{4096 * 1, 4096}, {4096 * 3, 4096}}; + bl.append(std::string(4096, '1')); + bl.append(std::string(4096, '2')); + } else { + expected_m = {{0, 4096 * 4}}; + bl.append(std::string(4096, '\0')); + bl.append(std::string(4096, '1')); + bl.append(std::string(4096, '\0')); + bl.append(std::string(4096, '2')); + } + read_bl.clear(); + EXPECT_EQ(static_cast(expected_m.size()), + ictx->data_ctx.sparse_read(oid, m, read_bl, read_len, 0)); + EXPECT_EQ(m, expected_m); + EXPECT_TRUE(bl.contents_equal(read_bl)); } @@ -1428,10 +1451,9 @@ TEST_F(TestInternal, SparsifyClone) { librbd::ImageCtx *ictx; ASSERT_EQ(0, open_image(m_image_name, &ictx)); - REQUIRE(is_sparsify_supported(ictx->data_ctx, ictx->get_object_name(10))); - - bool sparse_read_supported = is_sparse_read_supported( - ictx->data_ctx, ictx->get_object_name(10)); + bool sparsify_supported = is_sparsify_supported(ictx->data_ctx, + ictx->get_object_name(10)); + std::cout << "sparsify_supported=" << sparsify_supported << std::endl; librbd::NoOpProgressContext no_op; ASSERT_EQ(0, ictx->operations->resize((1 << ictx->order) * 10, true, no_op)); @@ -1484,20 +1506,4 @@ TEST_F(TestInternal, SparsifyClone) { uint64_t size; ASSERT_EQ(0, ictx->data_ctx.stat(oid, &size, NULL)); ASSERT_EQ(0, ictx->data_ctx.read(oid, read_bl, 4096, 0)); - - if (!sparse_read_supported) { - return; - } - - oid = ictx->get_object_name(10); - std::map m; - read_bl.clear(); - ASSERT_EQ(2, ictx->data_ctx.sparse_read(oid, m, read_bl, bl.length(), 0)); - std::map expected_m = - {{4096 * 1, 4096}, {4096 * 3, 4096}}; - ASSERT_EQ(m, expected_m); - bl.clear(); - bl.append(std::string(4096, '1')); - bl.append(std::string(4096, '2')); - ASSERT_TRUE(bl.contents_equal(read_bl)); } -- 2.39.5