From c455711c2fb22dcfd09d3b77cc289c649b44e69b Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Thu, 1 Dec 2016 10:27:30 +0530 Subject: [PATCH] librbd: Create few empty objects during copyup This is based out of Doug's (@fullerdj) work (PR #9329) as an attempt to avoid creating empty objects when flattening an image and otherwise whenever unnecessary. This gives good optimization benefit when a parent image is sparsely populated. Moreover, this change is required for correct behavior when checking disk usage of a clone (which used to report fully allocated image due to all, including empty objects being created during flatten). Signed-off-by: Douglas Fuller dfuller@redhat.com Signed-off-by: Venky Shankar --- src/librbd/AioObjectRequest.cc | 54 +++++++------ src/librbd/AioObjectRequest.h | 134 ++++++++++++++++++++------------- src/librbd/CopyupRequest.cc | 78 ++++++++++++++++--- src/librbd/CopyupRequest.h | 34 +++++---- 4 files changed, 202 insertions(+), 98 deletions(-) diff --git a/src/librbd/AioObjectRequest.cc b/src/librbd/AioObjectRequest.cc index 222abd1edb570..3417eda863343 100644 --- a/src/librbd/AioObjectRequest.cc +++ b/src/librbd/AioObjectRequest.cc @@ -359,7 +359,7 @@ void AbstractAioObjectWrite::guard_write() bool AbstractAioObjectWrite::should_complete(int r) { - ldout(m_ictx->cct, 20) << get_write_type() << " " << this << " " << m_oid + ldout(m_ictx->cct, 20) << get_op_type() << " " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len << " should_complete: r = " << r << dendl; @@ -371,7 +371,7 @@ bool AbstractAioObjectWrite::should_complete(int r) return true; } - send_write(); + send_write_op(); finished = false; break; @@ -395,7 +395,7 @@ bool AbstractAioObjectWrite::should_complete(int r) break; } - finished = send_post(); + finished = send_post_object_map_update(); break; case LIBRBD_AIO_WRITE_COPYUP: @@ -405,14 +405,14 @@ bool AbstractAioObjectWrite::should_complete(int r) complete(r); finished = false; } else { - finished = send_post(); + finished = send_post_object_map_update(); } break; case LIBRBD_AIO_WRITE_FLAT: ldout(m_ictx->cct, 20) << "WRITE_FLAT" << dendl; - finished = send_post(); + finished = send_post_object_map_update(); break; case LIBRBD_AIO_WRITE_ERROR: @@ -429,13 +429,9 @@ bool AbstractAioObjectWrite::should_complete(int r) } void AbstractAioObjectWrite::send() { - ldout(m_ictx->cct, 20) << "send " << get_write_type() << " " << this <<" " + ldout(m_ictx->cct, 20) << "send " << get_op_type() << " " << this <<" " << m_oid << " " << m_object_off << "~" << m_object_len << dendl; - send_pre(); -} - -void AbstractAioObjectWrite::send_pre() { { RWLock::RLocker snap_lock(m_ictx->snap_lock); if (m_ictx->object_map == nullptr) { @@ -444,12 +440,22 @@ void AbstractAioObjectWrite::send_pre() { // should have been flushed prior to releasing lock assert(m_ictx->exclusive_lock->is_lock_owner()); m_object_exist = m_ictx->object_map->object_may_exist(m_object_no); + } + } + send_write(); +} + +void AbstractAioObjectWrite::send_pre_object_map_update() { + ldout(m_ictx->cct, 20) << __func__ << dendl; + + { + RWLock::RLocker snap_lock(m_ictx->snap_lock); + if (m_ictx->object_map != nullptr) { uint8_t new_state; pre_object_map_update(&new_state); - RWLock::WLocker object_map_locker(m_ictx->object_map_lock); - ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " " + ldout(m_ictx->cct, 20) << __func__ << this << " " << m_oid << " " << m_object_off << "~" << m_object_len << dendl; m_state = LIBRBD_AIO_WRITE_PRE; @@ -461,11 +467,10 @@ void AbstractAioObjectWrite::send_pre() { } } - // no object map update required - send_write(); + send_write_op(); } -bool AbstractAioObjectWrite::send_post() { +bool AbstractAioObjectWrite::send_post_object_map_update() { RWLock::RLocker snap_locker(m_ictx->snap_lock); if (m_ictx->object_map == nullptr || !post_object_map_update()) { return true; @@ -475,7 +480,7 @@ bool AbstractAioObjectWrite::send_post() { assert(m_ictx->exclusive_lock->is_lock_owner()); RWLock::WLocker object_map_locker(m_ictx->object_map_lock); - ldout(m_ictx->cct, 20) << "send_post " << this << " " << m_oid << " " + ldout(m_ictx->cct, 20) << __func__ << this << " " << m_oid << " " << m_object_off << "~" << m_object_len << dendl; m_state = LIBRBD_AIO_WRITE_POST; @@ -496,7 +501,7 @@ void AbstractAioObjectWrite::send_write() { m_state = LIBRBD_AIO_WRITE_GUARD; handle_write_guard(); } else { - send_write_op(true); + send_pre_object_map_update(); } } @@ -526,11 +531,13 @@ void AbstractAioObjectWrite::send_copyup() m_ictx->copyup_list_lock.Unlock(); } } -void AbstractAioObjectWrite::send_write_op(bool write_guard) +void AbstractAioObjectWrite::send_write_op() { m_state = LIBRBD_AIO_WRITE_FLAT; - if (write_guard) + if (m_guard) { guard_write(); + } + add_write_ops(&m_write); assert(m_write.size() != 0); @@ -582,10 +589,10 @@ void AioObjectWrite::send_write() { << " object exist " << m_object_exist << " write_full " << write_full << dendl; if (write_full && !has_parent()) { - send_write_op(false); - } else { - AbstractAioObjectWrite::send_write(); + m_guard = false; } + + AbstractAioObjectWrite::send_write(); } void AioObjectRemove::guard_write() { @@ -598,8 +605,9 @@ void AioObjectRemove::guard_write() { void AioObjectRemove::send_write() { ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len << dendl; - send_write_op(true); + send_pre_object_map_update(); } + void AioObjectTruncate::send_write() { ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " truncate " << m_object_off << dendl; diff --git a/src/librbd/AioObjectRequest.h b/src/librbd/AioObjectRequest.h index 8d6d98a19018f..08fa870620008 100644 --- a/src/librbd/AioObjectRequest.h +++ b/src/librbd/AioObjectRequest.h @@ -83,6 +83,13 @@ public: return m_has_parent; } + virtual bool is_op_payload_empty() const { + return false; + } + + virtual const char *get_op_type() const = 0; + virtual bool pre_object_map_update(uint8_t *new_state) = 0; + protected: bool compute_parent_extents(); @@ -137,6 +144,15 @@ public: ExtentMap &get_extent_map() { return m_ext_map; } + + const char *get_op_type() const { + return "read"; + } + + bool pre_object_map_update(uint8_t *new_state) { + return false; + } + private: Extents m_buffer_extents; bool m_tried_parent; @@ -193,17 +209,19 @@ public: * Writes go through the following state machine to deal with * layering and the object map: * - * - * . | - * . | - * . \---> LIBRBD_AIO_WRITE_PRE - * . | | - * . . . . . . | . . . . | . . . . . . . . . . . - * . | -or- | . - * . | | v - * . | \----------------> LIBRBD_AIO_WRITE_FLAT . . . - * . | | . - * v v need copyup | . + * + * | + * |\ + * | \ -or- + * | ---------------------------------> LIBRBD_AIO_WRITE_PRE + * | . | + * | . | + * | . v + * | . . . . > LIBRBD_AIO_WRITE_FLAT. . . + * | | . + * | | . + * | | . + * v need copyup (copyup performs pre) | . * LIBRBD_AIO_WRITE_GUARD -----------> LIBRBD_AIO_WRITE_COPYUP | . * . | | . | . * . | | . | . @@ -238,21 +256,21 @@ protected: uint64_t m_snap_seq; std::vector m_snaps; bool m_object_exist; + bool m_guard = true; virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0; - virtual const char* get_write_type() const = 0; virtual void guard_write(); - virtual void pre_object_map_update(uint8_t *new_state) = 0; virtual bool post_object_map_update() { return false; } virtual void send_write(); - virtual void send_write_op(bool write_guard); + virtual void send_write_op(); virtual void handle_write_guard(); + void send_pre_object_map_update(); + private: - void send_pre(); - bool send_post(); + bool send_post_object_map_update(); void send_copyup(); }; @@ -267,16 +285,22 @@ public: m_write_data(data), m_op_flags(op_flags) { } -protected: - virtual void add_write_ops(librados::ObjectWriteOperation *wr); + bool is_op_payload_empty() const { + return (m_write_data.length() == 0); + } - virtual const char* get_write_type() const { + virtual const char *get_op_type() const { return "write"; } - virtual void pre_object_map_update(uint8_t *new_state) { + virtual bool pre_object_map_update(uint8_t *new_state) { *new_state = OBJECT_EXISTS; + return true; } + +protected: + virtual void add_write_ops(librados::ObjectWriteOperation *wr); + virtual void send_write(); private: @@ -293,28 +317,21 @@ public: m_object_state(OBJECT_NONEXISTENT) { } -protected: - virtual void add_write_ops(librados::ObjectWriteOperation *wr) { - if (has_parent()) { - wr->truncate(0); - } else { - wr->remove(); - } - } - - virtual const char* get_write_type() const { + virtual const char* get_op_type() const { if (has_parent()) { return "remove (trunc)"; } return "remove"; } - virtual void pre_object_map_update(uint8_t *new_state) { + + virtual bool pre_object_map_update(uint8_t *new_state) { if (has_parent()) { m_object_state = OBJECT_EXISTS; } else { m_object_state = OBJECT_PENDING; } *new_state = m_object_state; + return true; } virtual bool post_object_map_update() { @@ -327,6 +344,15 @@ protected: virtual void guard_write(); virtual void send_write(); +protected: + virtual void add_write_ops(librados::ObjectWriteOperation *wr) { + if (has_parent()) { + wr->truncate(0); + } else { + wr->remove(); + } + } + private: uint8_t m_object_state; }; @@ -342,23 +368,24 @@ public: : AbstractAioObjectWrite(ictx, oid, object_no, 0, 0, snapc, completion, true), m_post_object_map_update(post_object_map_update) { } -protected: - virtual void add_write_ops(librados::ObjectWriteOperation *wr) { - wr->remove(); - } - - virtual const char* get_write_type() const { + virtual const char* get_op_type() const { return "remove (trim)"; } - virtual void pre_object_map_update(uint8_t *new_state) { + virtual bool pre_object_map_update(uint8_t *new_state) { *new_state = OBJECT_PENDING; + return true; } virtual bool post_object_map_update() { return m_post_object_map_update; } +protected: + virtual void add_write_ops(librados::ObjectWriteOperation *wr) { + wr->remove(); + } + private: bool m_post_object_map_update; }; @@ -372,22 +399,24 @@ public: completion, true) { } -protected: - virtual void add_write_ops(librados::ObjectWriteOperation *wr) { - wr->truncate(m_object_off); - } - - virtual const char* get_write_type() const { + virtual const char* get_op_type() const { return "truncate"; } - virtual void pre_object_map_update(uint8_t *new_state) { + virtual bool pre_object_map_update(uint8_t *new_state) { if (!m_object_exist && !has_parent()) *new_state = OBJECT_NONEXISTENT; else *new_state = OBJECT_EXISTS; + return true; } + virtual void send_write(); + +protected: + virtual void add_write_ops(librados::ObjectWriteOperation *wr) { + wr->truncate(m_object_off); + } }; class AioObjectZero : public AbstractAioObjectWrite { @@ -399,17 +428,18 @@ public: snapc, completion, true) { } -protected: - virtual void add_write_ops(librados::ObjectWriteOperation *wr) { - wr->zero(m_object_off, m_object_len); - } - - virtual const char* get_write_type() const { + virtual const char* get_op_type() const { return "zero"; } - virtual void pre_object_map_update(uint8_t *new_state) { + virtual bool pre_object_map_update(uint8_t *new_state) { *new_state = OBJECT_EXISTS; + return true; + } + +protected: + virtual void add_write_ops(librados::ObjectWriteOperation *wr) { + wr->zero(m_object_off, m_object_len); } }; diff --git a/src/librbd/CopyupRequest.cc b/src/librbd/CopyupRequest.cc index e31981db7cca4..e7dd8a45df065 100644 --- a/src/librbd/CopyupRequest.cc +++ b/src/librbd/CopyupRequest.cc @@ -181,6 +181,18 @@ bool CopyupRequest::send_copyup() { return false; } +bool CopyupRequest::is_copyup_required() { + bool noop = true; + for (const AioObjectRequest<> *req : m_pending_requests) { + if (!req->is_op_payload_empty()) { + noop = false; + break; + } + } + + return (m_copyup_data.is_zero() && noop); +} + void CopyupRequest::send() { m_state = STATE_READ_FROM_PARENT; @@ -217,10 +229,20 @@ bool CopyupRequest::should_complete(int r) ldout(cct, 20) << "READ_FROM_PARENT" << dendl; remove_from_list(); if (r >= 0 || r == -ENOENT) { - return send_object_map(); + if (is_copyup_required()) { + ldout(cct, 20) << __func__ << " " << this << " nop, skipping" << dendl; + return true; + } + + return send_object_map_head(); } break; + case STATE_OBJECT_MAP_HEAD: + ldout(cct, 20) << "OBJECT_MAP_HEAD" << dendl; + assert(r == 0); + return send_object_map(); + case STATE_OBJECT_MAP: ldout(cct, 20) << "OBJECT_MAP" << dendl; assert(r == 0); @@ -259,26 +281,65 @@ void CopyupRequest::remove_from_list() m_ictx->copyup_list.erase(it); } -bool CopyupRequest::send_object_map() { +bool CopyupRequest::send_object_map_head() { + CephContext *cct = m_ictx->cct; + ldout(cct, 20) << __func__ << " " << this << dendl; + + m_state = STATE_OBJECT_MAP_HEAD; + { + RWLock::RLocker owner_locker(m_ictx->owner_lock); RWLock::RLocker snap_locker(m_ictx->snap_lock); if (m_ictx->object_map != nullptr) { bool copy_on_read = m_pending_requests.empty(); assert(m_ictx->exclusive_lock->is_lock_owner()); RWLock::WLocker object_map_locker(m_ictx->object_map_lock); - if (copy_on_read && - (*m_ictx->object_map)[m_object_no] != OBJECT_EXISTS) { - // CoW already updates the HEAD object map - m_snap_ids.push_back(CEPH_NOSNAP); - } if (!m_ictx->snaps.empty()) { m_snap_ids.insert(m_snap_ids.end(), m_ictx->snaps.begin(), m_ictx->snaps.end()); } + if (copy_on_read && + (*m_ictx->object_map)[m_object_no] != OBJECT_EXISTS) { + m_snap_ids.insert(m_snap_ids.begin(), CEPH_NOSNAP); + object_map_locker.unlock(); + snap_locker.unlock(); + owner_locker.unlock(); + return send_object_map(); + } + + bool may_update = false; + uint8_t new_state, current_state; + + vector *>::reverse_iterator r_it = m_pending_requests.rbegin(); + for (; r_it != m_pending_requests.rend(); ++r_it) { + AioObjectRequest<> *req = *r_it; + if (!req->pre_object_map_update(&new_state)) { + continue; + } + + current_state = (*m_ictx->object_map)[m_object_no]; + ldout(cct, 20) << __func__ << " " << req->get_op_type() << " object no " + << m_object_no << " current state " + << stringify(static_cast(current_state)) + << " new state " << stringify(static_cast(new_state)) + << dendl; + may_update = true; + break; + } + + if (may_update && (new_state != current_state) && + m_ictx->object_map->aio_update( + CEPH_NOSNAP, m_object_no, new_state, current_state, this)) { + return false; + } } } + return send_object_map(); +} + +bool CopyupRequest::send_object_map() { // avoid possible recursive lock attempts if (m_snap_ids.empty()) { // no object map update required @@ -286,8 +347,7 @@ bool CopyupRequest::send_object_map() { } else { // update object maps for HEAD and all existing snapshots ldout(m_ictx->cct, 20) << __func__ << " " << this - << ": oid " << m_oid - << dendl; + << ": oid " << m_oid << dendl; m_state = STATE_OBJECT_MAP; RWLock::RLocker owner_locker(m_ictx->owner_lock); diff --git a/src/librbd/CopyupRequest.h b/src/librbd/CopyupRequest.h index a0edbf3accab9..d661aafb9da2f 100644 --- a/src/librbd/CopyupRequest.h +++ b/src/librbd/CopyupRequest.h @@ -36,19 +36,22 @@ private: * * @verbatim * - * - * | - * v - * STATE_READ_FROM_PARENT - * . . | - * . . v - * . . STATE_OBJECT_MAP . . - * . . | . - * . . v . - * . . . > STATE_COPYUP . - * . | . - * . v . - * . . . . > < . . . . . + * + * | + * v + * . . .STATE_READ_FROM_PARENT. . . + * . . | . + * . . v . + * . . STATE_OBJECT_MAP_HEAD v (copy on read / + * . . | . no HEAD rev. update) + * v v v . + * . . STATE_OBJECT_MAP. . . . . + * . . | + * . . v + * . . . . > STATE_COPYUP + * . | + * . v + * . . . . > * * @endverbatim * @@ -58,7 +61,8 @@ private: */ enum State { STATE_READ_FROM_PARENT, - STATE_OBJECT_MAP, + STATE_OBJECT_MAP_HEAD, // only update the HEAD revision + STATE_OBJECT_MAP, // update HEAD+snaps (if any) STATE_COPYUP }; @@ -82,8 +86,10 @@ private: void remove_from_list(); + bool send_object_map_head(); bool send_object_map(); bool send_copyup(); + bool is_copyup_required(); }; } // namespace librbd -- 2.39.5