#include "common/dout.h"
#include "common/errno.h"
-#include "librbd/AioRequest.h"
+#include "librbd/AioObjectRequest.h"
#include "librbd/internal.h"
#include "librbd/AioCompletion.h"
namespace librbd {
- class AioRead;
+ class AioObjectRead;
typedef enum {
AIO_TYPE_READ = 0,
/**
* AioCompletion is the overall completion for a single
- * rbd I/O request. It may be composed of many AioRequests,
+ * rbd I/O request. It may be composed of many AioObjectRequests,
* which each go to a single object.
*
* The retrying of individual requests is handled at a lower level,
}
virtual ~C_AioRead() {}
virtual void finish(int r);
- void set_req(AioRead *req) {
+ void set_req(AioObjectRead *req) {
m_req = req;
}
private:
- AioRead *m_req;
+ AioObjectRead *m_req;
};
class C_CacheRead : public Context {
public:
- explicit C_CacheRead(ImageCtx *ictx, AioRead *req)
+ explicit C_CacheRead(ImageCtx *ictx, AioObjectRead *req)
: m_image_ctx(*ictx), m_req(req), m_enqueued(false) {}
virtual void complete(int r);
protected:
virtual void finish(int r);
private:
ImageCtx &m_image_ctx;
- AioRead *m_req;
+ AioObjectRead *m_req;
bool m_enqueued;
};
}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Mutex.h"
+#include "common/RWLock.h"
+
+#include "librbd/AioCompletion.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/internal.h"
+
+#include "librbd/AioObjectRequest.h"
+#include "librbd/CopyupRequest.h"
+
+#include <boost/bind.hpp>
+#include <boost/optional.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::AioObjectRequest: "
+
+namespace librbd {
+
+ /**
+ * Construct a request against a single data object.
+ *
+ * @param ictx image context the request operates on
+ * @param oid name of the data object
+ * @param objectno object number, used for the reverse mapping below
+ * @param off offset within the object
+ * @param len length of the operation
+ * @param snap_id snapshot id used when looking up the parent overlap
+ * @param completion context completed by complete() when the request is done
+ * @param hide_enoent when true, complete() reports -ENOENT as 0
+ */
+ AioObjectRequest::AioObjectRequest(ImageCtx *ictx, const std::string &oid,
+ uint64_t objectno, uint64_t off,
+ uint64_t len, librados::snap_t snap_id,
+ Context *completion, bool hide_enoent)
+ : m_ictx(ictx), m_oid(oid), m_object_no(objectno), m_object_off(off),
+ m_object_len(len), m_snap_id(snap_id), m_completion(completion),
+ m_hide_enoent(hide_enoent) {
+
+ // seed m_parent_extents with the reverse mapping of the whole object
+ // (offset 0, fl_object_size bytes) onto the image
+ Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
+ 0, m_ictx->layout.fl_object_size, m_parent_extents);
+
+ // compute_parent_extents() asserts that both locks are held; it prunes
+ // the extents computed above against the current parent overlap
+ RWLock::RLocker snap_locker(m_ictx->snap_lock);
+ RWLock::RLocker parent_locker(m_ictx->parent_lock);
+ compute_parent_extents();
+ }
+
+ /**
+  * Feed a result code into the request. When the subclass reports that the
+  * request is finished, the user completion is invoked (with -ENOENT mapped
+  * to 0 if m_hide_enoent is set) and the request deletes itself.
+  *
+  * @param r result code of the step that just finished
+  */
+ void AioObjectRequest::complete(int r)
+ {
+   if (!should_complete(r)) {
+     // the subclass is not finished with this request yet
+     return;
+   }
+
+   ldout(m_ictx->cct, 20) << "complete " << this << dendl;
+   const int result = (m_hide_enoent && r == -ENOENT) ? 0 : r;
+   m_completion->complete(result);
+   delete this;
+ }
+
+ /**
+  * Prune m_parent_extents against the parent overlap for m_snap_id.
+  * The caller must hold snap_lock and parent_lock.
+  *
+  * @return true if a non-zero overlap with the parent remains; false if the
+  *         overlap is zero or could not be retrieved (in which case
+  *         m_parent_extents is cleared)
+  */
+ bool AioObjectRequest::compute_parent_extents() {
+   assert(m_ictx->snap_lock.is_locked());
+   assert(m_ictx->parent_lock.is_locked());
+
+   uint64_t overlap;
+   const int ret = m_ictx->get_parent_overlap(m_snap_id, &overlap);
+   if (ret < 0) {
+     // NOTE: it's possible for a snapshot to be deleted while we are
+     // still reading from it
+     lderr(m_ictx->cct) << this << " compute_parent_extents: failed to "
+                        << "retrieve parent overlap: " << cpp_strerror(ret)
+                        << dendl;
+     m_parent_extents.clear();
+     return false;
+   }
+
+   const uint64_t remaining =
+     m_ictx->prune_parent_extents(m_parent_extents, overlap);
+   if (remaining == 0) {
+     return false;
+   }
+
+   ldout(m_ictx->cct, 20) << this << " compute_parent_extents: "
+                          << "overlap " << overlap << " "
+                          << "extents " << m_parent_extents << dendl;
+   return true;
+ }
+
+ /**
+  * Copy-on-read applies only when the feature is enabled on the image, the
+  * image is writable and the request targets the head (CEPH_NOSNAP).
+  * The caller must hold snap_lock.
+  */
+ static inline bool is_copy_on_read(ImageCtx *ictx, librados::snap_t snap_id) {
+   assert(ictx->snap_lock.is_locked());
+   if (!ictx->clone_copy_on_read) {
+     return false;
+   }
+   if (ictx->read_only) {
+     return false;
+   }
+   return snap_id == CEPH_NOSNAP;
+ }
+
+ /** read **/
+
+ /**
+ * Construct a read of one extent of a single data object.
+ *
+ * @param be buffer extents; copied into m_buffer_extents
+ * @param sparse when true, send() issues sparse_read instead of read
+ * @param completion context completed when the read finishes
+ * @param op_flags flags handed to set_op_flags2() in send()
+ *
+ * The remaining parameters are forwarded to AioObjectRequest, always with
+ * hide_enoent == false.
+ */
+ AioObjectRead::AioObjectRead(ImageCtx *ictx, const std::string &oid,
+ uint64_t objectno, uint64_t offset, uint64_t len,
+ vector<pair<uint64_t,uint64_t> >& be,
+ librados::snap_t snap_id, bool sparse,
+ Context *completion, int op_flags)
+ : AioObjectRequest(ictx, oid, objectno, offset, len, snap_id, completion,
+ false),
+ m_buffer_extents(be), m_tried_parent(false), m_sparse(sparse),
+ m_op_flags(op_flags), m_parent_completion(NULL),
+ m_state(LIBRBD_AIO_READ_FLAT) {
+
+ // start out FLAT; guard_read() switches to READ_GUARD when the base
+ // class found parent extents for this object
+ guard_read();
+ }
+
+ /** Release the parent read completion, if a parent read was ever issued. */
+ AioObjectRead::~AioObjectRead()
+ {
+   if (m_parent_completion == NULL) {
+     return;
+   }
+   m_parent_completion->release();
+   m_parent_completion = NULL;
+ }
+
+ /**
+  * Switch the read into the READ_GUARD state when the object has parent
+  * extents; otherwise leave the state untouched.
+  */
+ void AioObjectRead::guard_read()
+ {
+   RWLock::RLocker snap_locker(m_ictx->snap_lock);
+   RWLock::RLocker parent_locker(m_ictx->parent_lock);
+
+   if (!has_parent()) {
+     return;
+   }
+
+   ldout(m_ictx->cct, 20) << __func__ << " guarding read" << dendl;
+   m_state = LIBRBD_AIO_READ_GUARD;
+ }
+
+ /**
+ * Advance the read state machine with the result of the last step.
+ *
+ * @param r result code of the step that just finished
+ * @return true when the request is finished and complete() should invoke
+ * the user completion; false when this request is not done yet
+ */
+ bool AioObjectRead::should_complete(int r)
+ {
+ ldout(m_ictx->cct, 20) << "should_complete " << this << " " << m_oid << " "
+ << m_object_off << "~" << m_object_len
+ << " r = " << r << dendl;
+
+ bool finished = true;
+
+ switch (m_state) {
+ case LIBRBD_AIO_READ_GUARD:
+ ldout(m_ictx->cct, 20) << "should_complete " << this
+ << " READ_CHECK_GUARD" << dendl;
+
+ // The object does not exist in the child and the parent has not been
+ // tried yet: attempt to read the data from the parent image.
+ if (!m_tried_parent && r == -ENOENT) {
+ {
+ RWLock::RLocker l(m_ictx->snap_lock);
+ RWLock::RLocker l2(m_ictx->parent_lock);
+ if (m_ictx->parent == NULL) {
+ // NOTE(review): this path returns false without issuing another
+ // request from this function; confirm that something else
+ // re-drives the request, otherwise the completion is never invoked.
+ ldout(m_ictx->cct, 20) << "parent is gone; do nothing" << dendl;
+ m_state = LIBRBD_AIO_READ_FLAT;
+ finished = false;
+ break;
+ }
+
+ // calculate the reverse mapping of the requested extent onto the image
+ vector<pair<uint64_t,uint64_t> > parent_extents;
+ Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
+ m_object_off, m_object_len, parent_extents);
+
+ uint64_t parent_overlap = 0;
+ uint64_t object_overlap = 0;
+ // r is reused as scratch here; it is a by-value parameter, so the
+ // caller's result code is not affected
+ r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
+ if (r == 0) {
+ object_overlap = m_ictx->prune_parent_extents(parent_extents,
+ parent_overlap);
+ }
+
+ if (object_overlap > 0) {
+ m_tried_parent = true;
+ if (is_copy_on_read(m_ictx, m_snap_id)) {
+ m_state = LIBRBD_AIO_READ_COPYUP;
+ }
+
+ read_from_parent(parent_extents);
+ finished = false;
+ }
+ }
+
+ if (m_tried_parent) {
+ // Release the reference to the parent read completion taken in
+ // read_from_parent(). The pointer is copied to a local first because
+ // this request might be completed once unblock is invoked.
+ AioCompletion *parent_completion = m_parent_completion;
+ parent_completion->unblock(m_ictx->cct);
+ parent_completion->put();
+ }
+ }
+ break;
+ case LIBRBD_AIO_READ_COPYUP:
+ ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_COPYUP"
+ << dendl;
+ // This is the extra step for copy-on-read: kick off an asynchronous copyup.
+ // Unlike copy-on-write, the asynchronous copyup finishes on its own, so
+ // the state does not go back to LIBRBD_AIO_READ_GUARD.
+
+ assert(m_tried_parent);
+ if (r > 0) {
+ // The read from the parent returned data and copy-on-read is possible:
+ // kick off an asynchronous copyup. Doing it asynchronously minimizes
+ // the latency impact on this read.
+ send_copyup();
+ }
+ break;
+ case LIBRBD_AIO_READ_FLAT:
+ ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_FLAT"
+ << dendl;
+ // The read content should have been deposited in m_read_data
+ break;
+ default:
+ lderr(m_ictx->cct) << "invalid request state: " << m_state << dendl;
+ assert(0);
+ }
+
+ return finished;
+ }
+
+ /**
+  * Issue the read against the data object. When the object map says the
+  * object cannot exist, the request is completed with -ENOENT right away
+  * without contacting the OSD.
+  */
+ void AioObjectRead::send() {
+   ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " "
+                          << m_object_off << "~" << m_object_len << dendl;
+
+   // send read request to parent if the object doesn't exist locally
+   const bool may_exist = m_ictx->object_map.object_may_exist(m_object_no);
+   if (!may_exist) {
+     complete(-ENOENT);
+     return;
+   }
+
+   librados::AioCompletion *comp =
+     librados::Rados::aio_create_completion(this, rados_req_cb, NULL);
+
+   librados::ObjectReadOperation read_op;
+   int flags = m_ictx->get_read_flags(m_snap_id);
+   if (!m_sparse) {
+     read_op.read(m_object_off, m_object_len, &m_read_data, NULL);
+   } else {
+     read_op.sparse_read(m_object_off, m_object_len, &m_ext_map, &m_read_data,
+                         NULL);
+   }
+   read_op.set_op_flags2(m_op_flags);
+
+   int r = m_ictx->data_ctx.aio_operate(m_oid, comp, &read_op, flags, NULL);
+   assert(r == 0);
+
+   comp->release();
+ }
+
+ /**
+  * Queue an asynchronous copyup for this object unless the parent overlap
+  * is gone or a copyup for the object is already registered.
+  */
+ void AioObjectRead::send_copyup()
+ {
+   bool overlaps;
+   {
+     RWLock::RLocker snap_locker(m_ictx->snap_lock);
+     RWLock::RLocker parent_locker(m_ictx->parent_lock);
+     overlaps = compute_parent_extents();
+   }
+   if (!overlaps) {
+     return;
+   }
+
+   Mutex::Locker copyup_locker(m_ictx->copyup_list_lock);
+   if (m_ictx->copyup_list.find(m_object_no) != m_ictx->copyup_list.end()) {
+     // a copyup for this object is already in the list
+     return;
+   }
+
+   // create and kick off a CopyupRequest
+   CopyupRequest *copyup = new CopyupRequest(m_ictx, m_oid, m_object_no,
+                                             m_parent_extents);
+   m_ictx->copyup_list[m_object_no] = copyup;
+   copyup->queue_send();
+ }
+
+ /**
+ * Issue an asynchronous read of the given image extents against the parent
+ * image; the data lands in m_read_data.
+ *
+ * @param parent_extents image extents to read from the parent
+ */
+ void AioObjectRead::read_from_parent(const vector<pair<uint64_t,uint64_t> >& parent_extents)
+ {
+ // a parent read may be issued at most once per request
+ assert(!m_parent_completion);
+ m_parent_completion = aio_create_completion_internal(this, rbd_req_cb);
+
+ // prevent the parent image from being deleted while this
+ // request is still in-progress; the matching unblock()/put() calls are
+ // made by should_complete() after this function returns
+ m_parent_completion->get();
+ m_parent_completion->block();
+
+ ldout(m_ictx->cct, 20) << "read_from_parent this = " << this
+ << " parent completion " << m_parent_completion
+ << " extents " << parent_extents
+ << dendl;
+ aio_read(m_ictx->parent, parent_extents, NULL, &m_read_data,
+ m_parent_completion, 0);
+ }
+
+ /** write **/
+
+ /**
+  * Construct a write-type request against a single data object. Writes
+  * always target the head, so the base class is given CEPH_NOSNAP.
+  *
+  * @param snapc snapshot context; its sequence and snapshot ids are copied
+  *              and later handed to aio_operate()
+  * @param hide_enoent when true, complete() reports -ENOENT as 0
+  */
+ AbstractAioObjectWrite::AbstractAioObjectWrite(ImageCtx *ictx,
+                                                const std::string &oid,
+                                                uint64_t object_no,
+                                                uint64_t object_off,
+                                                uint64_t len,
+                                                const ::SnapContext &snapc,
+                                                Context *completion,
+                                                bool hide_enoent)
+   : AioObjectRequest(ictx, oid, object_no, object_off, len, CEPH_NOSNAP,
+                      completion, hide_enoent),
+     m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val),
+     // send_pre() assigns the real value before it is read; initialize it
+     // anyway so the member never holds an indeterminate value
+     m_object_exist(false)
+ {
+   m_snaps.insert(m_snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
+ }
+
+ /**
+  * When the object has parent extents, guard the write with an
+  * assert_exists() op and move to the WRITE_GUARD state.
+  */
+ void AbstractAioObjectWrite::guard_write()
+ {
+   if (!has_parent()) {
+     return;
+   }
+
+   m_state = LIBRBD_AIO_WRITE_GUARD;
+   m_write.assert_exists();
+   ldout(m_ictx->cct, 20) << __func__ << " guarding write" << dendl;
+ }
+
+ /**
+ * Advance the write state machine with the result of the last step.
+ *
+ * @param r result code of the step that just finished
+ * @return true when the request is finished and complete() should invoke
+ * the user completion; false when further work was started or the
+ * request was already completed by a nested complete() call
+ */
+ bool AbstractAioObjectWrite::should_complete(int r)
+ {
+ ldout(m_ictx->cct, 20) << get_write_type() << " " << this << " " << m_oid
+ << " " << m_object_off << "~" << m_object_len
+ << " should_complete: r = " << r << dendl;
+
+ bool finished = true;
+ switch (m_state) {
+ case LIBRBD_AIO_WRITE_PRE:
+ // result of the object map update scheduled by send_pre()
+ ldout(m_ictx->cct, 20) << "WRITE_PRE" << dendl;
+ if (r < 0) {
+ return true;
+ }
+
+ send_write();
+ finished = false;
+ break;
+
+ case LIBRBD_AIO_WRITE_POST:
+ // result of the object map update scheduled by send_post()
+ ldout(m_ictx->cct, 20) << "WRITE_POST" << dendl;
+ finished = true;
+ break;
+
+ case LIBRBD_AIO_WRITE_GUARD:
+ ldout(m_ictx->cct, 20) << "WRITE_CHECK_GUARD" << dendl;
+
+ if (r == -ENOENT) {
+ // the guarded object does not exist: copy up from the parent or
+ // resend the write, as decided by handle_write_guard()
+ handle_write_guard();
+ finished = false;
+ break;
+ } else if (r < 0) {
+ // pass the error code to the finish context: the nested complete()
+ // lands in the WRITE_ERROR case, which finishes and deletes this
+ // request, so this outer call must report "not finished"
+ m_state = LIBRBD_AIO_WRITE_ERROR;
+ complete(r);
+ finished = false;
+ break;
+ }
+
+ finished = send_post();
+ break;
+
+ case LIBRBD_AIO_WRITE_COPYUP:
+ ldout(m_ictx->cct, 20) << "WRITE_COPYUP" << dendl;
+ if (r < 0) {
+ // same nested-completion pattern as the WRITE_GUARD error path
+ m_state = LIBRBD_AIO_WRITE_ERROR;
+ complete(r);
+ finished = false;
+ } else {
+ finished = send_post();
+ }
+ break;
+
+ case LIBRBD_AIO_WRITE_FLAT:
+ ldout(m_ictx->cct, 20) << "WRITE_FLAT" << dendl;
+
+ finished = send_post();
+ break;
+
+ case LIBRBD_AIO_WRITE_ERROR:
+ // only entered through the nested complete(r) calls above
+ assert(r < 0);
+ lderr(m_ictx->cct) << "WRITE_ERROR: " << cpp_strerror(r)
+ << dendl;
+ break;
+
+ default:
+ lderr(m_ictx->cct) << "invalid request state: " << m_state << dendl;
+ assert(0);
+ }
+
+ return finished;
+ }
+
+ /**
+  * Entry point for write-type requests; starts with the object map
+  * pre-update step. The caller must hold owner_lock.
+  */
+ void AbstractAioObjectWrite::send() {
+   assert(m_ictx->owner_lock.is_locked());
+
+   ldout(m_ictx->cct, 20) << "send " << get_write_type() << " " << this
+                          << " " << m_oid << " " << m_object_off << "~"
+                          << m_object_len << dendl;
+
+   send_pre();
+ }
+
+ /**
+ * Object map pre-update step. If the object map is disabled, or already
+ * holds the state requested by the subclass, the write is sent directly.
+ * Otherwise an asynchronous object map update is scheduled and the write is
+ * sent from should_complete() in the WRITE_PRE state.
+ * The caller must hold owner_lock.
+ */
+ void AbstractAioObjectWrite::send_pre() {
+ assert(m_ictx->owner_lock.is_locked());
+
+ // remembered for send_write(), which uses it to decide whether a copyup
+ // may be needed
+ m_object_exist = m_ictx->object_map.object_may_exist(m_object_no);
+ bool write = false;
+ {
+ RWLock::RLocker snap_lock(m_ictx->snap_lock);
+ if (!m_ictx->object_map.enabled()) {
+ write = true;
+ } else {
+ // should have been flushed prior to releasing lock
+ assert(m_ictx->image_watcher->is_lock_owner());
+
+ ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " "
+ << m_object_off << "~" << m_object_len << dendl;
+ m_state = LIBRBD_AIO_WRITE_PRE;
+
+ // the subclass chooses the object map state to record before the write;
+ // current_state is passed to aio_update() as an empty optional
+ uint8_t new_state;
+ boost::optional<uint8_t> current_state;
+ pre_object_map_update(&new_state);
+
+ RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
+ if (m_ictx->object_map[m_object_no] != new_state) {
+ // complete() is invoked with the result of the update
+ FunctionContext *ctx = new FunctionContext(
+ boost::bind(&AioObjectRequest::complete, this, _1));
+ bool updated = m_ictx->object_map.aio_update(m_object_no, new_state,
+ current_state, ctx);
+ assert(updated);
+ } else {
+ write = true;
+ }
+ }
+ }
+
+ // avoid possible recursive lock attempts: the write is only sent after
+ // the locks taken above have been released
+ if (write) {
+ // no object map update required
+ send_write();
+ }
+ }
+
+ /**
+  * Object map post-update step: once the write has succeeded, move an
+  * object recorded as OBJECT_PENDING to OBJECT_NONEXISTENT.
+  *
+  * @return true when no update is required and the request is finished;
+  *         false when an asynchronous update was scheduled, in which case
+  *         complete() is invoked again with its result (WRITE_POST state)
+  */
+ bool AbstractAioObjectWrite::send_post() {
+   RWLock::RLocker owner_locker(m_ictx->owner_lock);
+   RWLock::RLocker snap_locker(m_ictx->snap_lock);
+   if (!m_ictx->object_map.enabled() || !post_object_map_update()) {
+     return true;
+   }
+
+   // should have been flushed prior to releasing lock
+   assert(m_ictx->image_watcher->is_lock_owner());
+
+   ldout(m_ictx->cct, 20) << "send_post " << this << " " << m_oid << " "
+                          << m_object_off << "~" << m_object_len << dendl;
+   m_state = LIBRBD_AIO_WRITE_POST;
+
+   RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
+   uint8_t current_state = m_ictx->object_map[m_object_no];
+   // Only a pending object needs the transition. The former extra test for
+   // OBJECT_NONEXISTENT was already implied by "not pending".
+   if (current_state != OBJECT_PENDING) {
+     return true;
+   }
+
+   FunctionContext *ctx = new FunctionContext(
+     boost::bind(&AioObjectRequest::complete, this, _1));
+   bool updated = m_ictx->object_map.aio_update(m_object_no,
+                                                OBJECT_NONEXISTENT,
+                                                OBJECT_PENDING, ctx);
+   assert(updated);
+   return false;
+ }
+
+ /**
+  * Send the write. When the object is not known to exist and there are
+  * parent extents, go straight to the write-guard handling instead of
+  * issuing a guarded write first.
+  */
+ void AbstractAioObjectWrite::send_write() {
+   ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
+                          << m_object_off << "~" << m_object_len
+                          << " object exist " << m_object_exist << dendl;
+
+   const bool skip_guarded_write = !m_object_exist && has_parent();
+   if (!skip_guarded_write) {
+     send_write_op(true);
+     return;
+   }
+
+   m_state = LIBRBD_AIO_WRITE_GUARD;
+   handle_write_guard();
+ }
+
+ /**
+  * Attach this write to the copyup request for its object, creating and
+  * sending a new CopyupRequest if none is registered yet. The state moves
+  * to WRITE_COPYUP.
+  */
+ void AbstractAioObjectWrite::send_copyup()
+ {
+   ldout(m_ictx->cct, 20) << "send_copyup " << this << " " << m_oid << " "
+                          << m_object_off << "~" << m_object_len << dendl;
+   m_state = LIBRBD_AIO_WRITE_COPYUP;
+
+   CopyupRequest *new_req = NULL;
+   {
+     // scoped locker: the list lock is released on every path out of this
+     // block, replacing the manual Lock()/Unlock() pairs
+     Mutex::Locker copyup_locker(m_ictx->copyup_list_lock);
+     map<uint64_t, CopyupRequest*>::iterator it =
+       m_ictx->copyup_list.find(m_object_no);
+     if (it != m_ictx->copyup_list.end()) {
+       // a copyup is already registered for this object: wait on it
+       it->second->append_request(this);
+       return;
+     }
+
+     new_req = new CopyupRequest(m_ictx, m_oid, m_object_no,
+                                 m_parent_extents);
+
+     // make sure to wait on this CopyupRequest
+     new_req->append_request(this);
+     m_ictx->copyup_list[m_object_no] = new_req;
+   }
+
+   // as before, the new request is sent only after the list lock is dropped
+   new_req->send();
+ }
+ /**
+  * Assemble the write operation and submit it to the data object.
+  *
+  * @param write_guard when true, guard_write() is given the chance to add
+  *                    an existence guard before the subclass ops are added
+  */
+ void AbstractAioObjectWrite::send_write_op(bool write_guard)
+ {
+   m_state = LIBRBD_AIO_WRITE_FLAT;
+   if (write_guard) {
+     guard_write();
+   }
+   add_write_ops(&m_write);
+   assert(m_write.size() != 0);
+
+   librados::AioCompletion *comp =
+     librados::Rados::aio_create_completion(this, NULL, rados_req_cb);
+   const int r = m_ictx->data_ctx.aio_operate(m_oid, comp, &m_write,
+                                              m_snap_seq, m_snaps);
+   assert(r == 0);
+   comp->release();
+ }
+ /**
+  * Called when a guarded write found no object. Re-evaluates the parent
+  * overlap and either copies up from the parent or resends the write.
+  */
+ void AbstractAioObjectWrite::handle_write_guard()
+ {
+   bool parent_overlaps;
+   {
+     RWLock::RLocker snap_locker(m_ictx->snap_lock);
+     RWLock::RLocker parent_locker(m_ictx->parent_lock);
+     parent_overlaps = compute_parent_extents();
+   }
+
+   if (!parent_overlaps) {
+     // parent may have disappeared -- send original write again
+     ldout(m_ictx->cct, 20) << "should_complete(" << this
+                            << "): parent overlap now 0" << dendl;
+     send_write();
+     return;
+   }
+
+   // If parent still exists, overlap might also have changed.
+   send_copyup();
+ }
+
+ /**
+  * Append the ops for a data write to @p wr: an optional allocation hint,
+  * then either write_full (when the write covers the whole object) or a
+  * ranged write, and finally the op flags.
+  */
+ void AioObjectWrite::add_write_ops(librados::ObjectWriteOperation *wr) {
+   if (m_ictx->enable_alloc_hint &&
+       !m_ictx->object_map.object_may_exist(m_object_no)) {
+     wr->set_alloc_hint(m_ictx->get_object_size(), m_ictx->get_object_size());
+   }
+
+   const bool covers_object =
+     (m_object_off == 0 && m_object_len == m_ictx->get_object_size());
+   if (!covers_object) {
+     wr->write(m_object_off, m_write_data);
+   } else {
+     wr->write_full(m_write_data);
+   }
+   wr->set_op_flags2(m_op_flags);
+ }
+
+ /**
+  * A write that covers the whole object is sent without a write guard;
+  * any other write takes the generic path of the base class.
+  */
+ void AioObjectWrite::send_write() {
+   const bool write_full =
+     (m_object_off == 0 && m_object_len == m_ictx->get_object_size());
+   ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
+                          << m_object_off << "~" << m_object_len
+                          << " object exist " << m_object_exist
+                          << " write_full " << write_full << dendl;
+
+   if (!write_full) {
+     AbstractAioObjectWrite::send_write();
+     return;
+   }
+   send_write_op(false);
+ }
+
+ /**
+  * Install the write guard only when the image has snapshots. Without
+  * snapshots the guard is skipped; the original note ties this to a
+  * deep-copyup not being required in that case.
+  */
+ void AioObjectRemove::guard_write() {
+   RWLock::RLocker snap_locker(m_ictx->snap_lock);
+   if (m_ictx->snaps.empty()) {
+     return;
+   }
+   AbstractAioObjectWrite::guard_write();
+ }
+ /**
+  * Removes skip the base-class copyup shortcut and always submit the op
+  * with the write guard requested.
+  */
+ void AioObjectRemove::send_write() {
+   ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
+                          << m_object_off << "~" << m_object_len
+                          << dendl;
+
+   send_write_op(true);
+ }
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_AIOREQUEST_H
+#define CEPH_LIBRBD_AIOREQUEST_H
+
+#include "include/int_types.h"
+
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "common/snap_types.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "librbd/ObjectMap.h"
+
+namespace librbd {
+
+ struct AioCompletion;
+ struct ImageCtx;
+ class CopyupRequest;
+
+ /**
+ * This class represents an I/O operation to a single RBD data object.
+ * Its subclasses encapsulate logic for dealing with special cases
+ * for I/O due to layering.
+ */
+ class AioObjectRequest
+ {
+ public:
+ // See the definition for the parameters; hide_enoent makes complete()
+ // report -ENOENT as 0.
+ AioObjectRequest(ImageCtx *ictx, const std::string &oid,
+ uint64_t objectno, uint64_t off, uint64_t len,
+ librados::snap_t snap_id,
+ Context *completion, bool hide_enoent);
+ virtual ~AioObjectRequest() {}
+
+ // Hook for adding ops to a copyup write; no-op by default.
+ virtual void add_copyup_ops(librados::ObjectWriteOperation *wr) {};
+
+ // Feeds a result into should_complete(); when that returns true the
+ // user completion is invoked and the request deletes itself.
+ void complete(int r);
+
+ // Returns true when the request is finished for result code r.
+ virtual bool should_complete(int r) = 0;
+ // Starts the request.
+ virtual void send() = 0;
+
+ // True while parent extents remain for this object.
+ bool has_parent() const {
+ return !m_parent_extents.empty();
+ }
+
+ protected:
+ // Prunes m_parent_extents against the current parent overlap; requires
+ // snap_lock and parent_lock to be held. Returns true if overlap remains.
+ bool compute_parent_extents();
+
+ ImageCtx *m_ictx;
+ std::string m_oid;
+ uint64_t m_object_no, m_object_off, m_object_len;
+ librados::snap_t m_snap_id;
+ // user completion, invoked by complete()
+ Context *m_completion;
+ // image extents of this object that overlap the parent
+ std::vector<std::pair<uint64_t,uint64_t> > m_parent_extents;
+ bool m_hide_enoent;
+ };
+
+ /**
+  * Read of one extent of a single data object. If the object does not
+  * exist in the child image and parent extents are present, the read is
+  * retried against the parent image, optionally followed by an
+  * asynchronous copyup (copy-on-read).
+  */
+ class AioObjectRead : public AioObjectRequest {
+ public:
+   /**
+    * @param be buffer extents; copied into the request
+    * @param sparse when true, a sparse read is issued
+    * @param completion context completed when the read finishes
+    * @param op_flags op flags applied to the read operation
+    */
+   AioObjectRead(ImageCtx *ictx, const std::string &oid,
+                 uint64_t objectno, uint64_t offset, uint64_t len,
+                 std::vector<std::pair<uint64_t,uint64_t> >& be,
+                 librados::snap_t snap_id, bool sparse,
+                 Context *completion, int op_flags);
+   virtual ~AioObjectRead();
+
+   virtual bool should_complete(int r);
+   virtual void send();
+   // switches to the READ_GUARD state when parent extents are present
+   void guard_read();
+
+   // data returned by the read
+   ceph::bufferlist &data() {
+     return m_read_data;
+   }
+
+   // extent map filled in by sparse reads
+   std::map<uint64_t, uint64_t> m_ext_map;
+
+   friend class C_AioRead;
+
+ private:
+   std::vector<std::pair<uint64_t,uint64_t> > m_buffer_extents;
+   bool m_tried_parent;
+   bool m_sparse;
+   int m_op_flags;
+   ceph::bufferlist m_read_data;
+   AioCompletion *m_parent_completion;
+
+   /**
+    * Reads go through the following state machine to deal with
+    * layering:
+    *
+    *                          need copyup
+    * LIBRBD_AIO_READ_GUARD ---------------> LIBRBD_AIO_READ_COPYUP
+    *           |                                       |
+    *           v                                       |
+    *         done <------------------------------------/
+    *           ^
+    *           |
+    * LIBRBD_AIO_READ_FLAT
+    *
+    * Reads start in LIBRBD_AIO_READ_GUARD or _FLAT, depending on
+    * whether there is a parent or not.
+    */
+   enum read_state_d {
+     LIBRBD_AIO_READ_GUARD,
+     LIBRBD_AIO_READ_COPYUP,
+     LIBRBD_AIO_READ_FLAT
+   };
+
+   read_state_d m_state;
+
+   void send_copyup();
+   void read_from_parent(const std::vector<std::pair<uint64_t,uint64_t> >& image_extents);
+ };
+
+ // Common base for all write-type requests (write, remove, trim, truncate,
+ // zero). Subclasses supply the actual ops and the object map states.
+ class AbstractAioObjectWrite : public AioObjectRequest {
+ public:
+ AbstractAioObjectWrite(ImageCtx *ictx, const std::string &oid,
+ uint64_t object_no, uint64_t object_off,
+ uint64_t len, const ::SnapContext &snapc,
+ Context *completion, bool hide_enoent);
+
+ // a copyup on behalf of this request carries the same ops as the write
+ virtual void add_copyup_ops(librados::ObjectWriteOperation *wr)
+ {
+ add_write_ops(wr);
+ }
+
+ virtual bool should_complete(int r);
+ virtual void send();
+
+ /**
+  * Writes go through the following state machine to deal with
+  * layering and the object map:
+  *
+  * <start>
+  *  .  |
+  *  .  |
+  *  .  \---> LIBRBD_AIO_WRITE_PRE
+  *  .           |         |
+  *  . . . . . . | . . . . | . . . . . . . . . . .
+  *      .       |   -or-  |                     .
+  *      .       |         |                     v
+  *      .       |         \----------------> LIBRBD_AIO_WRITE_FLAT . . .
+  *      .       |                                               |      .
+  *      v       v         need copyup                           |      .
+  * LIBRBD_AIO_WRITE_GUARD -----------> LIBRBD_AIO_WRITE_COPYUP  |      .
+  *  .       |                               |        .          |      .
+  *  .       |                               |        .          |      .
+  *  .       |                         /-----/        .          |      .
+  *  .       |                         |              .          |      .
+  *  .       \-------------------\     |     /-------------------/      .
+  *  .                           |     |     |        .                 .
+  *  .                           v     v     v        .                 .
+  *  .                       LIBRBD_AIO_WRITE_POST    .                 .
+  *  .                               |                .                 .
+  *  .                               |  . . . . . . . .                 .
+  *  .                               |  .                               .
+  *  .                               v  v                               .
+  *  . . . . . . . . . . . . . . > <finish> < . . . . . . . . . . . . . .
+  *
+  * The _PRE/_POST states are skipped if the object map is disabled.
+  * The write starts in _WRITE_GUARD or _FLAT depending on whether or not
+  * there is a parent overlap.
+  */
+ protected:
+ enum write_state_d {
+ LIBRBD_AIO_WRITE_GUARD,
+ LIBRBD_AIO_WRITE_COPYUP,
+ LIBRBD_AIO_WRITE_FLAT,
+ LIBRBD_AIO_WRITE_PRE,
+ LIBRBD_AIO_WRITE_POST,
+ LIBRBD_AIO_WRITE_ERROR
+ };
+
+ write_state_d m_state;
+ // operation submitted by send_write_op()
+ librados::ObjectWriteOperation m_write;
+ // snapshot context (sequence + snapshot ids) copied at construction
+ uint64_t m_snap_seq;
+ std::vector<librados::snap_t> m_snaps;
+ // result of object_map.object_may_exist(), assigned in send_pre()
+ bool m_object_exist;
+
+ // appends the subclass-specific ops to wr
+ virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0;
+ // short name of the operation, used in log messages
+ virtual const char* get_write_type() const = 0;
+ virtual void guard_write();
+ // reports the object map state to record before the write is sent
+ virtual void pre_object_map_update(uint8_t *new_state) = 0;
+ // returns true if the object map needs an update after the write
+ virtual bool post_object_map_update() {
+ return false;
+ }
+ virtual void send_write();
+ virtual void send_write_op(bool write_guard);
+ virtual void handle_write_guard();
+
+ private:
+ void send_pre();
+ bool send_post();
+ void send_copyup();
+ };
+
+ // Writes a buffer to a data object; the request length is data.length().
+ class AioObjectWrite : public AbstractAioObjectWrite {
+ public:
+ AioObjectWrite(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+ uint64_t object_off, const ceph::bufferlist &data,
+ const ::SnapContext &snapc, Context *completion)
+ : AbstractAioObjectWrite(ictx, oid, object_no, object_off, data.length(),
+ snapc, completion, false),
+ m_write_data(data), m_op_flags(0) {
+ }
+
+ // op flags applied to the write op by add_write_ops(); defaults to 0
+ void set_op_flags(int op_flags) {
+ m_op_flags = op_flags;
+ }
+ protected:
+ virtual void add_write_ops(librados::ObjectWriteOperation *wr);
+
+ virtual const char* get_write_type() const {
+ return "write";
+ }
+
+ // a write marks the object as existing in the object map
+ virtual void pre_object_map_update(uint8_t *new_state) {
+ *new_state = OBJECT_EXISTS;
+ }
+ virtual void send_write();
+
+ private:
+ ceph::bufferlist m_write_data;
+ int m_op_flags;
+ };
+
+ /**
+  * Removes a data object. When parent extents are present the object is
+  * truncated to zero length instead of being removed, and it stays marked
+  * as existing in the object map.
+  */
+ class AioObjectRemove : public AbstractAioObjectWrite {
+ public:
+   AioObjectRemove(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+                   const ::SnapContext &snapc, Context *completion)
+     : AbstractAioObjectWrite(ictx, oid, object_no, 0, 0, snapc, completion,
+                              true),
+       m_object_state(OBJECT_NONEXISTENT) {
+   }
+
+ protected:
+   virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
+     if (!has_parent()) {
+       wr->remove();
+       return;
+     }
+     wr->truncate(0);
+   }
+
+   virtual const char* get_write_type() const {
+     return has_parent() ? "remove (trunc)" : "remove";
+   }
+
+   // remembers the chosen state so post_object_map_update() can tell
+   // whether a follow-up update is needed
+   virtual void pre_object_map_update(uint8_t *new_state) {
+     m_object_state = OBJECT_PENDING;
+     if (has_parent()) {
+       m_object_state = OBJECT_EXISTS;
+     }
+     *new_state = m_object_state;
+   }
+
+   // no post-update when the object was kept (truncate path)
+   virtual bool post_object_map_update() {
+     return m_object_state != OBJECT_EXISTS;
+   }
+
+   virtual void guard_write();
+   virtual void send_write();
+
+ private:
+   uint8_t m_object_state;
+ };
+
+ // Unconditionally removes a data object (offset and length are 0);
+ // -ENOENT is hidden from the completion.
+ class AioObjectTrim : public AbstractAioObjectWrite {
+ public:
+ AioObjectTrim(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+ const ::SnapContext &snapc, Context *completion)
+ : AbstractAioObjectWrite(ictx, oid, object_no, 0, 0, snapc, completion,
+ true) {
+ }
+
+ protected:
+ virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
+ wr->remove();
+ }
+
+ virtual const char* get_write_type() const {
+ return "remove (trim)";
+ }
+
+ // the object is marked pending before the removal is sent ...
+ virtual void pre_object_map_update(uint8_t *new_state) {
+ *new_state = OBJECT_PENDING;
+ }
+
+ // ... and always requests the post-write object map update
+ virtual bool post_object_map_update() {
+ return true;
+ }
+ };
+
+ // Truncates a data object at object_off; -ENOENT is hidden from the
+ // completion.
+ class AioObjectTruncate : public AbstractAioObjectWrite {
+ public:
+ AioObjectTruncate(ImageCtx *ictx, const std::string &oid,
+ uint64_t object_no, uint64_t object_off,
+ const ::SnapContext &snapc, Context *completion)
+ : AbstractAioObjectWrite(ictx, oid, object_no, object_off, 0, snapc,
+ completion, true) {
+ }
+
+ protected:
+ virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
+ wr->truncate(m_object_off);
+ }
+
+ virtual const char* get_write_type() const {
+ return "truncate";
+ }
+
+ // the object is recorded as existing before the truncate is sent
+ virtual void pre_object_map_update(uint8_t *new_state) {
+ *new_state = OBJECT_EXISTS;
+ }
+ };
+
+ // Zeroes the range [object_off, object_off + object_len) of a data object;
+ // -ENOENT is hidden from the completion.
+ class AioObjectZero : public AbstractAioObjectWrite {
+ public:
+ AioObjectZero(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+ uint64_t object_off, uint64_t object_len,
+ const ::SnapContext &snapc, Context *completion)
+ : AbstractAioObjectWrite(ictx, oid, object_no, object_off, object_len,
+ snapc, completion, true) {
+ }
+
+ protected:
+ virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
+ wr->zero(m_object_off, m_object_len);
+ }
+
+ virtual const char* get_write_type() const {
+ return "zero";
+ }
+
+ // the object is recorded as existing before the zero op is sent
+ virtual void pre_object_map_update(uint8_t *new_state) {
+ *new_state = OBJECT_EXISTS;
+ }
+ };
+
+}
+
+#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "common/ceph_context.h"
-#include "common/dout.h"
-#include "common/errno.h"
-#include "common/Mutex.h"
-#include "common/RWLock.h"
-
-#include "librbd/AioCompletion.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/ImageWatcher.h"
-#include "librbd/internal.h"
-
-#include "librbd/AioRequest.h"
-#include "librbd/CopyupRequest.h"
-
-#include <boost/bind.hpp>
-#include <boost/optional.hpp>
-
-#define dout_subsys ceph_subsys_rbd
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::AioRequest: "
-
-namespace librbd {
-
- AioRequest::AioRequest(ImageCtx *ictx, const std::string &oid,
- uint64_t objectno, uint64_t off, uint64_t len,
- librados::snap_t snap_id,
- Context *completion,
- bool hide_enoent)
- : m_ictx(ictx), m_oid(oid), m_object_no(objectno), m_object_off(off),
- m_object_len(len), m_snap_id(snap_id), m_completion(completion),
- m_hide_enoent(hide_enoent) {
-
- Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
- 0, m_ictx->layout.fl_object_size, m_parent_extents);
-
- RWLock::RLocker snap_locker(m_ictx->snap_lock);
- RWLock::RLocker parent_locker(m_ictx->parent_lock);
- compute_parent_extents();
- }
-
- void AioRequest::complete(int r)
- {
- if (should_complete(r)) {
- ldout(m_ictx->cct, 20) << "complete " << this << dendl;
- if (m_hide_enoent && r == -ENOENT) {
- r = 0;
- }
- m_completion->complete(r);
- delete this;
- }
- }
-
- bool AioRequest::compute_parent_extents() {
- assert(m_ictx->snap_lock.is_locked());
- assert(m_ictx->parent_lock.is_locked());
-
- uint64_t parent_overlap;
- int r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
- if (r < 0) {
- // NOTE: it's possible for a snapshot to be deleted while we are
- // still reading from it
- lderr(m_ictx->cct) << this << " compute_parent_extents: failed to "
- << "retrieve parent overlap: " << cpp_strerror(r)
- << dendl;
- m_parent_extents.clear();
- return false;
- }
-
- uint64_t object_overlap =
- m_ictx->prune_parent_extents(m_parent_extents, parent_overlap);
- if (object_overlap > 0) {
- ldout(m_ictx->cct, 20) << this << " compute_parent_extents: "
- << "overlap " << parent_overlap << " "
- << "extents " << m_parent_extents << dendl;
- return true;
- }
- return false;
- }
-
- static inline bool is_copy_on_read(ImageCtx *ictx, librados::snap_t snap_id) {
- assert(ictx->snap_lock.is_locked());
- return (ictx->clone_copy_on_read) &&
- (!ictx->read_only) && (snap_id == CEPH_NOSNAP);
- }
-
- /** read **/
-
- AioRead::AioRead(ImageCtx *ictx, const std::string &oid,
- uint64_t objectno, uint64_t offset, uint64_t len,
- vector<pair<uint64_t,uint64_t> >& be,
- librados::snap_t snap_id, bool sparse,
- Context *completion, int op_flags)
- : AioRequest(ictx, oid, objectno, offset, len, snap_id, completion, false),
- m_buffer_extents(be), m_tried_parent(false), m_sparse(sparse),
- m_op_flags(op_flags), m_parent_completion(NULL),
- m_state(LIBRBD_AIO_READ_FLAT) {
-
- guard_read();
- }
-
- AioRead::~AioRead()
- {
- if (m_parent_completion) {
- m_parent_completion->release();
- m_parent_completion = NULL;
- }
- }
-
- void AioRead::guard_read()
- {
- RWLock::RLocker snap_locker(m_ictx->snap_lock);
- RWLock::RLocker parent_locker(m_ictx->parent_lock);
-
- if (has_parent()) {
- ldout(m_ictx->cct, 20) << __func__ << " guarding read" << dendl;
- m_state = LIBRBD_AIO_READ_GUARD;
- }
- }
-
- bool AioRead::should_complete(int r)
- {
- ldout(m_ictx->cct, 20) << "should_complete " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len
- << " r = " << r << dendl;
-
- bool finished = true;
-
- switch (m_state) {
- case LIBRBD_AIO_READ_GUARD:
- ldout(m_ictx->cct, 20) << "should_complete " << this
- << " READ_CHECK_GUARD" << dendl;
-
- // This is the step to read from parent
- if (!m_tried_parent && r == -ENOENT) {
- {
- RWLock::RLocker l(m_ictx->snap_lock);
- RWLock::RLocker l2(m_ictx->parent_lock);
- if (m_ictx->parent == NULL) {
- ldout(m_ictx->cct, 20) << "parent is gone; do nothing" << dendl;
- m_state = LIBRBD_AIO_READ_FLAT;
- finished = false;
- break;
- }
-
- // calculate reverse mapping onto the image
- vector<pair<uint64_t,uint64_t> > parent_extents;
- Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
- m_object_off, m_object_len, parent_extents);
-
- uint64_t parent_overlap = 0;
- uint64_t object_overlap = 0;
- r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
- if (r == 0) {
- object_overlap = m_ictx->prune_parent_extents(parent_extents,
- parent_overlap);
- }
-
- if (object_overlap > 0) {
- m_tried_parent = true;
- if (is_copy_on_read(m_ictx, m_snap_id)) {
- m_state = LIBRBD_AIO_READ_COPYUP;
- }
-
- read_from_parent(parent_extents);
- finished = false;
- }
- }
-
- if (m_tried_parent) {
- // release reference to the parent read completion. this request
- // might be completed after unblock is invoked.
- AioCompletion *parent_completion = m_parent_completion;
- parent_completion->unblock(m_ictx->cct);
- parent_completion->put();
- }
- }
- break;
- case LIBRBD_AIO_READ_COPYUP:
- ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_COPYUP"
- << dendl;
- // This is the extra step for copy-on-read: kick off an asynchronous copyup.
- // It is different from copy-on-write as asynchronous copyup will finish
- // by itself so state won't go back to LIBRBD_AIO_READ_GUARD.
-
- assert(m_tried_parent);
- if (r > 0) {
- // If read entire object from parent success and CoR is possible, kick
- // off a asynchronous copyup. This approach minimizes the latency
- // impact.
- send_copyup();
- }
- break;
- case LIBRBD_AIO_READ_FLAT:
- ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_FLAT"
- << dendl;
- // The read content should be deposit in m_read_data
- break;
- default:
- lderr(m_ictx->cct) << "invalid request state: " << m_state << dendl;
- assert(0);
- }
-
- return finished;
- }
-
- void AioRead::send() {
- ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len << dendl;
-
- // send read request to parent if the object doesn't exist locally
- if (!m_ictx->object_map.object_may_exist(m_object_no)) {
- complete(-ENOENT);
- return;
- }
-
- librados::AioCompletion *rados_completion =
- librados::Rados::aio_create_completion(this, rados_req_cb, NULL);
- int r;
- librados::ObjectReadOperation op;
- int flags = m_ictx->get_read_flags(m_snap_id);
- if (m_sparse) {
- op.sparse_read(m_object_off, m_object_len, &m_ext_map, &m_read_data,
- NULL);
- } else {
- op.read(m_object_off, m_object_len, &m_read_data, NULL);
- }
- op.set_op_flags2(m_op_flags);
-
- r = m_ictx->data_ctx.aio_operate(m_oid, rados_completion, &op, flags, NULL);
- assert(r == 0);
-
- rados_completion->release();
- }
-
- void AioRead::send_copyup()
- {
- {
- RWLock::RLocker snap_locker(m_ictx->snap_lock);
- RWLock::RLocker parent_locker(m_ictx->parent_lock);
- if (!compute_parent_extents()) {
- return;
- }
- }
-
- Mutex::Locker copyup_locker(m_ictx->copyup_list_lock);
- map<uint64_t, CopyupRequest*>::iterator it =
- m_ictx->copyup_list.find(m_object_no);
- if (it == m_ictx->copyup_list.end()) {
- // create and kick off a CopyupRequest
- CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid, m_object_no,
- m_parent_extents);
- m_ictx->copyup_list[m_object_no] = new_req;
- new_req->queue_send();
- }
- }
-
- void AioRead::read_from_parent(const vector<pair<uint64_t,uint64_t> >& parent_extents)
- {
- assert(!m_parent_completion);
- m_parent_completion = aio_create_completion_internal(this, rbd_req_cb);
-
- // prevent the parent image from being deleted while this
- // request is still in-progress
- m_parent_completion->get();
- m_parent_completion->block();
-
- ldout(m_ictx->cct, 20) << "read_from_parent this = " << this
- << " parent completion " << m_parent_completion
- << " extents " << parent_extents
- << dendl;
- aio_read(m_ictx->parent, parent_extents, NULL, &m_read_data,
- m_parent_completion, 0);
- }
-
- /** write **/
-
- AbstractWrite::AbstractWrite(ImageCtx *ictx, const std::string &oid,
- uint64_t object_no, uint64_t object_off,
- uint64_t len, const ::SnapContext &snapc,
- Context *completion, bool hide_enoent)
- : AioRequest(ictx, oid, object_no, object_off, len, CEPH_NOSNAP, completion,
- hide_enoent),
- m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val)
- {
- m_snaps.insert(m_snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
- }
-
- void AbstractWrite::guard_write()
- {
- if (has_parent()) {
- m_state = LIBRBD_AIO_WRITE_GUARD;
- m_write.assert_exists();
- ldout(m_ictx->cct, 20) << __func__ << " guarding write" << dendl;
- }
- }
-
- bool AbstractWrite::should_complete(int r)
- {
- ldout(m_ictx->cct, 20) << get_write_type() << " " << this << " " << m_oid
- << " " << m_object_off << "~" << m_object_len
- << " should_complete: r = " << r << dendl;
-
- bool finished = true;
- switch (m_state) {
- case LIBRBD_AIO_WRITE_PRE:
- ldout(m_ictx->cct, 20) << "WRITE_PRE" << dendl;
- if (r < 0) {
- return true;
- }
-
- send_write();
- finished = false;
- break;
-
- case LIBRBD_AIO_WRITE_POST:
- ldout(m_ictx->cct, 20) << "WRITE_POST" << dendl;
- finished = true;
- break;
-
- case LIBRBD_AIO_WRITE_GUARD:
- ldout(m_ictx->cct, 20) << "WRITE_CHECK_GUARD" << dendl;
-
- if (r == -ENOENT) {
- handle_write_guard();
- finished = false;
- break;
- } else if (r < 0) {
- // pass the error code to the finish context
- m_state = LIBRBD_AIO_WRITE_ERROR;
- complete(r);
- finished = false;
- break;
- }
-
- finished = send_post();
- break;
-
- case LIBRBD_AIO_WRITE_COPYUP:
- ldout(m_ictx->cct, 20) << "WRITE_COPYUP" << dendl;
- if (r < 0) {
- m_state = LIBRBD_AIO_WRITE_ERROR;
- complete(r);
- finished = false;
- } else {
- finished = send_post();
- }
- break;
-
- case LIBRBD_AIO_WRITE_FLAT:
- ldout(m_ictx->cct, 20) << "WRITE_FLAT" << dendl;
-
- finished = send_post();
- break;
-
- case LIBRBD_AIO_WRITE_ERROR:
- assert(r < 0);
- lderr(m_ictx->cct) << "WRITE_ERROR: " << cpp_strerror(r)
- << dendl;
- break;
-
- default:
- lderr(m_ictx->cct) << "invalid request state: " << m_state << dendl;
- assert(0);
- }
-
- return finished;
- }
-
- void AbstractWrite::send() {
- assert(m_ictx->owner_lock.is_locked());
- ldout(m_ictx->cct, 20) << "send " << get_write_type() << " " << this <<" "
- << m_oid << " " << m_object_off << "~"
- << m_object_len << dendl;
- send_pre();
- }
-
- void AbstractWrite::send_pre() {
- assert(m_ictx->owner_lock.is_locked());
-
- m_object_exist = m_ictx->object_map.object_may_exist(m_object_no);
- bool write = false;
- {
- RWLock::RLocker snap_lock(m_ictx->snap_lock);
- if (!m_ictx->object_map.enabled()) {
- write = true;
- } else {
- // should have been flushed prior to releasing lock
- assert(m_ictx->image_watcher->is_lock_owner());
-
- ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len << dendl;
- m_state = LIBRBD_AIO_WRITE_PRE;
-
- uint8_t new_state;
- boost::optional<uint8_t> current_state;
- pre_object_map_update(&new_state);
-
- RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
- if (m_ictx->object_map[m_object_no] != new_state) {
- FunctionContext *ctx = new FunctionContext(
- boost::bind(&AioRequest::complete, this, _1));
- bool updated = m_ictx->object_map.aio_update(m_object_no, new_state,
- current_state, ctx);
- assert(updated);
- } else {
- write = true;
- }
- }
- }
-
- // avoid possible recursive lock attempts
- if (write) {
- // no object map update required
- send_write();
- }
- }
-
- bool AbstractWrite::send_post() {
- RWLock::RLocker owner_locker(m_ictx->owner_lock);
- RWLock::RLocker snap_locker(m_ictx->snap_lock);
- if (!m_ictx->object_map.enabled() || !post_object_map_update()) {
- return true;
- }
-
- // should have been flushed prior to releasing lock
- assert(m_ictx->image_watcher->is_lock_owner());
-
- ldout(m_ictx->cct, 20) << "send_post " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len << dendl;
- m_state = LIBRBD_AIO_WRITE_POST;
-
- RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
- uint8_t current_state = m_ictx->object_map[m_object_no];
- if (current_state != OBJECT_PENDING ||
- current_state == OBJECT_NONEXISTENT) {
- return true;
- }
-
- FunctionContext *ctx = new FunctionContext(
- boost::bind(&AioRequest::complete, this, _1));
- bool updated = m_ictx->object_map.aio_update(m_object_no,
- OBJECT_NONEXISTENT,
- OBJECT_PENDING, ctx);
- assert(updated);
- return false;
- }
-
- void AbstractWrite::send_write() {
- ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len
- << " object exist " << m_object_exist << dendl;
-
- if (!m_object_exist && has_parent()) {
- m_state = LIBRBD_AIO_WRITE_GUARD;
- handle_write_guard();
- } else {
- send_write_op(true);
- }
- }
-
- void AbstractWrite::send_copyup()
- {
- ldout(m_ictx->cct, 20) << "send_copyup " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len << dendl;
- m_state = LIBRBD_AIO_WRITE_COPYUP;
-
- m_ictx->copyup_list_lock.Lock();
- map<uint64_t, CopyupRequest*>::iterator it =
- m_ictx->copyup_list.find(m_object_no);
- if (it == m_ictx->copyup_list.end()) {
- CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
- m_object_no,
- m_parent_extents);
-
- // make sure to wait on this CopyupRequest
- new_req->append_request(this);
- m_ictx->copyup_list[m_object_no] = new_req;
-
- m_ictx->copyup_list_lock.Unlock();
- new_req->send();
- } else {
- it->second->append_request(this);
- m_ictx->copyup_list_lock.Unlock();
- }
- }
- void AbstractWrite::send_write_op(bool write_guard)
- {
- m_state = LIBRBD_AIO_WRITE_FLAT;
- if (write_guard)
- guard_write();
- add_write_ops(&m_write);
- assert(m_write.size() != 0);
-
- librados::AioCompletion *rados_completion =
- librados::Rados::aio_create_completion(this, NULL, rados_req_cb);
- int r = m_ictx->data_ctx.aio_operate(m_oid, rados_completion, &m_write,
- m_snap_seq, m_snaps);
- assert(r == 0);
- rados_completion->release();
- }
- void AbstractWrite::handle_write_guard()
- {
- bool has_parent;
- {
- RWLock::RLocker snap_locker(m_ictx->snap_lock);
- RWLock::RLocker parent_locker(m_ictx->parent_lock);
- has_parent = compute_parent_extents();
- }
- // If parent still exists, overlap might also have changed.
- if (has_parent) {
- send_copyup();
- } else {
- // parent may have disappeared -- send original write again
- ldout(m_ictx->cct, 20) << "should_complete(" << this
- << "): parent overlap now 0" << dendl;
- send_write();
- }
- }
-
- void AioWrite::add_write_ops(librados::ObjectWriteOperation *wr) {
- if (m_ictx->enable_alloc_hint && !m_ictx->object_map.object_may_exist(m_object_no))
- wr->set_alloc_hint(m_ictx->get_object_size(), m_ictx->get_object_size());
- if (m_object_off == 0 && m_object_len == m_ictx->get_object_size()) {
- wr->write_full(m_write_data);
- } else {
- wr->write(m_object_off, m_write_data);
- }
- wr->set_op_flags2(m_op_flags);
- }
- void AioWrite::send_write() {
- bool write_full = (m_object_off == 0 && m_object_len == m_ictx->get_object_size());
- ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len
- << " object exist " << m_object_exist
- << " write_full " << write_full << dendl;
- if (write_full) {
- send_write_op(false);
- } else {
- AbstractWrite::send_write();
- }
- }
-
- void AioRemove::guard_write() {
- // do nothing to disable write guard only if deep-copyup not required
- RWLock::RLocker snap_locker(m_ictx->snap_lock);
- if (!m_ictx->snaps.empty()) {
- AbstractWrite::guard_write();
- }
- }
- void AioRemove::send_write() {
- ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len << dendl;
- send_write_op(true);
- }
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#ifndef CEPH_LIBRBD_AIOREQUEST_H
-#define CEPH_LIBRBD_AIOREQUEST_H
-
-#include "include/int_types.h"
-
-#include <map>
-
-#include "common/snap_types.h"
-#include "include/buffer.h"
-#include "include/Context.h"
-#include "include/rados/librados.hpp"
-#include "librbd/ObjectMap.h"
-
-namespace librbd {
-
- struct AioCompletion;
- struct ImageCtx;
- class CopyupRequest;
-
- /**
- * This class represents an I/O operation to a single RBD data object.
- * Its subclasses encapsulate logic for dealing with special cases
- * for I/O due to layering.
- */
- class AioRequest
- {
- public:
- AioRequest(ImageCtx *ictx, const std::string &oid,
- uint64_t objectno, uint64_t off, uint64_t len,
- librados::snap_t snap_id,
- Context *completion, bool hide_enoent);
- virtual ~AioRequest() {}
-
- virtual void add_copyup_ops(librados::ObjectWriteOperation *wr) {};
-
- void complete(int r);
-
- virtual bool should_complete(int r) = 0;
- virtual void send() = 0;
-
- bool has_parent() const {
- return !m_parent_extents.empty();
- }
-
- protected:
- bool compute_parent_extents();
-
- ImageCtx *m_ictx;
- std::string m_oid;
- uint64_t m_object_no, m_object_off, m_object_len;
- librados::snap_t m_snap_id;
- Context *m_completion;
- std::vector<std::pair<uint64_t,uint64_t> > m_parent_extents;
- bool m_hide_enoent;
- };
-
- class AioRead : public AioRequest {
- public:
- AioRead(ImageCtx *ictx, const std::string &oid,
- uint64_t objectno, uint64_t offset, uint64_t len,
- vector<pair<uint64_t,uint64_t> >& be,
- librados::snap_t snap_id, bool sparse,
- Context *completion, int op_flags);
- virtual ~AioRead();
-
- virtual bool should_complete(int r);
- virtual void send();
- void guard_read();
-
- ceph::bufferlist &data() {
- return m_read_data;
- }
-
- std::map<uint64_t, uint64_t> m_ext_map;
-
- friend class C_AioRead;
-
- private:
- vector<pair<uint64_t,uint64_t> > m_buffer_extents;
- bool m_tried_parent;
- bool m_sparse;
- int m_op_flags;
- ceph::bufferlist m_read_data;
- AioCompletion *m_parent_completion;
-
- /**
- * Reads go through the following state machine to deal with
- * layering:
- *
- * need copyup
- * LIBRBD_AIO_READ_GUARD ---------------> LIBRBD_AIO_READ_COPYUP
- * | |
- * v |
- * done <------------------------------------/
- * ^
- * |
- * LIBRBD_AIO_READ_FLAT
- *
- * Reads start in LIBRBD_AIO_READ_GUARD or _FLAT, depending on
- * whether there is a parent or not.
- */
- enum read_state_d {
- LIBRBD_AIO_READ_GUARD,
- LIBRBD_AIO_READ_COPYUP,
- LIBRBD_AIO_READ_FLAT
- };
-
- read_state_d m_state;
-
- void send_copyup();
- void read_from_parent(const vector<pair<uint64_t,uint64_t> >& image_extents);
- };
-
- class AbstractWrite : public AioRequest {
- public:
- AbstractWrite(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
- uint64_t object_off, uint64_t len, const ::SnapContext &snapc,
- Context *completion, bool hide_enoent);
- virtual ~AbstractWrite() {}
-
- virtual void add_copyup_ops(librados::ObjectWriteOperation *wr)
- {
- add_write_ops(wr);
- }
-
- virtual bool should_complete(int r);
- virtual void send();
-
- /**
- * Writes go through the following state machine to deal with
- * layering and the object map:
- *
- * <start>
- * . |
- * . |
- * . \---> LIBRBD_AIO_WRITE_PRE
- * . | |
- * . . . . . . | . . . . | . . . . . . . . . . .
- * . | -or- | .
- * . | | v
- * . | \----------------> LIBRBD_AIO_WRITE_FLAT . . .
- * . | | .
- * v v need copyup | .
- * LIBRBD_AIO_WRITE_GUARD -----------> LIBRBD_AIO_WRITE_COPYUP | .
- * . | | . | .
- * . | | . | .
- * . | /-----/ . | .
- * . | | . | .
- * . \-------------------\ | /-------------------/ .
- * . | | | . .
- * . v v v . .
- * . LIBRBD_AIO_WRITE_POST . .
- * . | . .
- * . | . . . . . . . . .
- * . | . .
- * . v v .
- * . . . . . . . . . . . . . . > <finish> < . . . . . . . . . . . . . .
- *
- * The _PRE/_POST states are skipped if the object map is disabled.
- * The write starts in _WRITE_GUARD or _FLAT depending on whether or not
- * there is a parent overlap.
- */
- protected:
- enum write_state_d {
- LIBRBD_AIO_WRITE_GUARD,
- LIBRBD_AIO_WRITE_COPYUP,
- LIBRBD_AIO_WRITE_FLAT,
- LIBRBD_AIO_WRITE_PRE,
- LIBRBD_AIO_WRITE_POST,
- LIBRBD_AIO_WRITE_ERROR
- };
-
- write_state_d m_state;
- librados::ObjectWriteOperation m_write;
- uint64_t m_snap_seq;
- std::vector<librados::snap_t> m_snaps;
- bool m_object_exist;
-
- virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0;
- virtual const char* get_write_type() const = 0;
- virtual void guard_write();
- virtual void pre_object_map_update(uint8_t *new_state) = 0;
- virtual bool post_object_map_update() {
- return false;
- }
- virtual void send_write();
- virtual void send_write_op(bool write_guard);
- virtual void handle_write_guard();
-
- private:
- void send_pre();
- bool send_post();
- void send_copyup();
- };
-
- class AioWrite : public AbstractWrite {
- public:
- AioWrite(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
- uint64_t object_off, const ceph::bufferlist &data,
- const ::SnapContext &snapc, Context *completion)
- : AbstractWrite(ictx, oid, object_no, object_off, data.length(), snapc,
- completion, false),
- m_write_data(data), m_op_flags(0) {
- }
- virtual ~AioWrite() {}
-
- void set_op_flags(int op_flags) {
- m_op_flags = op_flags;
- }
- protected:
- virtual void add_write_ops(librados::ObjectWriteOperation *wr);
-
- virtual const char* get_write_type() const {
- return "write";
- }
-
- virtual void pre_object_map_update(uint8_t *new_state) {
- *new_state = OBJECT_EXISTS;
- }
- virtual void send_write();
-
- private:
- ceph::bufferlist m_write_data;
- int m_op_flags;
- };
-
- class AioRemove : public AbstractWrite {
- public:
- AioRemove(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
- const ::SnapContext &snapc, Context *completion)
- : AbstractWrite(ictx, oid, object_no, 0, 0, snapc, completion, true),
- m_object_state(OBJECT_NONEXISTENT) {
- }
- virtual ~AioRemove() {}
-
- protected:
- virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
- if (has_parent()) {
- wr->truncate(0);
- } else {
- wr->remove();
- }
- }
-
- virtual const char* get_write_type() const {
- if (has_parent()) {
- return "remove (trunc)";
- }
- return "remove";
- }
- virtual void pre_object_map_update(uint8_t *new_state) {
- if (has_parent()) {
- m_object_state = OBJECT_EXISTS;
- } else {
- m_object_state = OBJECT_PENDING;
- }
- *new_state = m_object_state;
- }
-
- virtual bool post_object_map_update() {
- if (m_object_state == OBJECT_EXISTS) {
- return false;
- }
- return true;
- }
-
- virtual void guard_write();
- virtual void send_write();
-
- private:
- uint8_t m_object_state;
- };
-
- class AioTrim : public AbstractWrite {
- public:
- AioTrim(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
- const ::SnapContext &snapc, Context *completion)
- : AbstractWrite(ictx, oid, object_no, 0, 0, snapc, completion, true) {
- }
-
- protected:
- virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
- wr->remove();
- }
-
- virtual const char* get_write_type() const {
- return "remove (trim)";
- }
-
- virtual void pre_object_map_update(uint8_t *new_state) {
- *new_state = OBJECT_PENDING;
- }
-
- virtual bool post_object_map_update() {
- return true;
- }
- };
-
- class AioTruncate : public AbstractWrite {
- public:
- AioTruncate(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
- uint64_t object_off, const ::SnapContext &snapc,
- Context *completion)
- : AbstractWrite(ictx, oid, object_no, object_off, 0, snapc, completion,
- true) {
- }
- virtual ~AioTruncate() {}
-
- protected:
- virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
- wr->truncate(m_object_off);
- }
-
- virtual const char* get_write_type() const {
- return "truncate";
- }
-
- virtual void pre_object_map_update(uint8_t *new_state) {
- *new_state = OBJECT_EXISTS;
- }
- };
-
- class AioZero : public AbstractWrite {
- public:
- AioZero(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
- uint64_t object_off, uint64_t object_len,
- const ::SnapContext &snapc, Context *completion)
- : AbstractWrite(ictx, oid, object_no, object_off, object_len, snapc,
- completion, true) {
- }
- virtual ~AioZero() {}
-
- protected:
- virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
- wr->zero(m_object_off, m_object_len);
- }
-
- virtual const char* get_write_type() const {
- return "zero";
- }
-
- virtual void pre_object_map_update(uint8_t *new_state) {
- *new_state = OBJECT_EXISTS;
- }
- };
-
-}
-
-#endif
// vim: ts=8 sw=2 smarttab
#include "librbd/AsyncFlattenRequest.h"
-#include "librbd/AioRequest.h"
+#include "librbd/AioObjectRequest.h"
#include "librbd/AsyncObjectThrottle.h"
#include "librbd/ImageCtx.h"
#include "librbd/ImageWatcher.h"
bufferlist bl;
string oid = m_image_ctx.get_object_name(m_object_no);
- AioWrite *req = new AioWrite(&m_image_ctx, oid, m_object_no, 0, bl, m_snapc,
- this);
+ AioObjectWrite *req = new AioObjectWrite(&m_image_ctx, oid, m_object_no, 0,
+ bl, m_snapc, this);
if (!req->has_parent()) {
// stop early if the parent went away - it just means
// another flatten finished first or the image was resized
// vim: ts=8 sw=2 smarttab
#include "librbd/AsyncTrimRequest.h"
#include "librbd/AsyncObjectThrottle.h"
-#include "librbd/AioRequest.h"
+#include "librbd/AioObjectRequest.h"
#include "librbd/ImageCtx.h"
#include "librbd/ImageWatcher.h"
#include "librbd/internal.h"
string oid = m_image_ctx.get_object_name(m_object_no);
ldout(m_image_ctx.cct, 10) << "removing (with copyup) " << oid << dendl;
- AbstractWrite *req = new AioTrim(&m_image_ctx, oid, m_object_no, m_snapc,
- this);
+ AioObjectRequest *req = new AioObjectTrim(&m_image_ctx, oid, m_object_no,
+ m_snapc, this);
req->send();
return 0;
}
ldout(cct, 20) << " ex " << *p << dendl;
Context *req_comp = new C_ContextCompletion(*completion);
- AbstractWrite *req;
+ AioObjectRequest *req;
if (p->offset == 0) {
- req = new AioTrim(&m_image_ctx, p->oid.name, p->objectno, snapc,
- req_comp);
+ req = new AioObjectTrim(&m_image_ctx, p->oid.name, p->objectno, snapc,
+ req_comp);
} else {
- req = new AioTruncate(&m_image_ctx, p->oid.name, p->objectno,
- p->offset, snapc, req_comp);
+ req = new AioObjectTruncate(&m_image_ctx, p->oid.name, p->objectno,
+ p->offset, snapc, req_comp);
}
req->send();
}
#include "common/Mutex.h"
#include "librbd/AioCompletion.h"
-#include "librbd/AioRequest.h"
+#include "librbd/AioObjectRequest.h"
#include "librbd/AsyncObjectThrottle.h"
#include "librbd/CopyupRequest.h"
#include "librbd/ImageCtx.h"
m_async_op.finish_op();
}
- void CopyupRequest::append_request(AioRequest *req) {
+ void CopyupRequest::append_request(AioObjectRequest *req) {
ldout(m_ictx->cct, 20) << __func__ << " " << this << ": " << req << dendl;
m_pending_requests.push_back(req);
}
void CopyupRequest::complete_requests(int r) {
while (!m_pending_requests.empty()) {
- vector<AioRequest *>::iterator it = m_pending_requests.begin();
- AioRequest *req = *it;
+ vector<AioObjectRequest *>::iterator it = m_pending_requests.begin();
+ AioObjectRequest *req = *it;
ldout(m_ictx->cct, 20) << __func__ << " completing request " << req
<< dendl;
req->complete(r);
// merge all pending write ops into this single RADOS op
for (size_t i=0; i<m_pending_requests.size(); ++i) {
- AioRequest *req = m_pending_requests[i];
+ AioObjectRequest *req = m_pending_requests[i];
ldout(m_ictx->cct, 20) << __func__ << " add_copyup_ops " << req
<< dendl;
req->add_copyup_ops(&write_op);
vector<pair<uint64_t,uint64_t> >& image_extents);
~CopyupRequest();
- void append_request(AioRequest *req);
+ void append_request(AioObjectRequest *req);
void send();
void queue_send();
vector<pair<uint64_t,uint64_t> > m_image_extents;
State m_state;
ceph::bufferlist m_copyup_data;
- vector<AioRequest *> m_pending_requests;
+ vector<AioObjectRequest *> m_pending_requests;
atomic_t m_pending_copyups;
AsyncOperation m_async_op;
#include "include/rados/librados.hpp"
#include "include/rbd/librbd.hpp"
-#include "librbd/AioRequest.h"
+#include "librbd/AioObjectRequest.h"
#include "librbd/ImageCtx.h"
#include "librbd/internal.h"
#include "librbd/LibrbdWriteback.h"
{
assert(m_ictx->owner_lock.is_locked());
uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
-
+
write_result_d *result = new write_result_d(oid.name, oncommit);
m_writes[oid.name].push(result);
ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl;
C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, this);
- AioWrite *req = new AioWrite(m_ictx, oid.name, object_no, off, bl, snapc,
- req_comp);
+ AioObjectWrite *req = new AioObjectWrite(m_ictx, oid.name, object_no, off,
+ bl, snapc, req_comp);
req->send();
return ++m_tid;
}
librbd_internal_la_SOURCES = \
librbd/AioCompletion.cc \
- librbd/AioRequest.cc \
+ librbd/AioObjectRequest.cc \
librbd/AsyncFlattenRequest.cc \
librbd/AsyncObjectThrottle.cc \
librbd/AsyncOperation.cc \
noinst_HEADERS += \
librbd/AioCompletion.h \
- librbd/AioRequest.h \
+ librbd/AioObjectRequest.h \
librbd/AsyncFlattenRequest.h \
librbd/AsyncObjectThrottle.h \
librbd/AsyncOperation.h \
#include "cls/rbd/cls_rbd_client.h"
#include "librbd/AioCompletion.h"
-#include "librbd/AioRequest.h"
+#include "librbd/AioObjectRequest.h"
#include "librbd/AsyncFlattenRequest.h"
#include "librbd/AsyncResizeRequest.h"
#include "librbd/AsyncTrimRequest.h"
void rados_req_cb(rados_completion_t c, void *arg)
{
- AioRequest *req = reinterpret_cast<AioRequest *>(arg);
+ AioObjectRequest *req = reinterpret_cast<AioObjectRequest *>(arg);
req->complete(rados_aio_get_return_value(c));
}
if (ictx->object_cacher) {
ictx->write_to_cache(p->oid, bl, p->length, p->offset, req_comp, op_flags);
} else {
- AioWrite *req = new AioWrite(ictx, p->oid.name, p->objectno, p->offset,
- bl, snapc, req_comp);
+ AioObjectWrite *req = new AioObjectWrite(ictx, p->oid.name, p->objectno,
+ p->offset, bl, snapc,
+ req_comp);
req->set_op_flags(op_flags);
req->send();
ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
<< " from " << p->buffer_extents << dendl;
C_AioRequest *req_comp = new C_AioRequest(cct, c);
- AbstractWrite *req;
+ AioObjectRequest *req;
if (p->length == ictx->layout.fl_object_size) {
- req = new AioRemove(ictx, p->oid.name, p->objectno, snapc, req_comp);
+ req = new AioObjectRemove(ictx, p->oid.name, p->objectno, snapc,
+ req_comp);
} else if (p->offset + p->length == ictx->layout.fl_object_size) {
- req = new AioTruncate(ictx, p->oid.name, p->objectno, p->offset, snapc,
- req_comp);
+ req = new AioObjectTruncate(ictx, p->oid.name, p->objectno, p->offset,
+ snapc, req_comp);
} else {
if(ictx->cct->_conf->rbd_skip_partial_discard) {
delete req_comp;
continue;
}
- req = new AioZero(ictx, p->oid.name, p->objectno, p->offset, p->length,
- snapc, req_comp);
+ req = new AioObjectZero(ictx, p->oid.name, p->objectno, p->offset,
+ p->length, snapc, req_comp);
}
req->send();
void rbd_req_cb(completion_t cb, void *arg)
{
- AioRequest *req = reinterpret_cast<AioRequest *>(arg);
+ AioObjectRequest *req = reinterpret_cast<AioObjectRequest *>(arg);
AioCompletion *comp = reinterpret_cast<AioCompletion *>(cb);
req->complete(comp->get_return_value());
}
<< dendl;
C_AioRead *req_comp = new C_AioRead(ictx->cct, c);
- AioRead *req = new AioRead(ictx, q->oid.name, q->objectno, q->offset,
- q->length, q->buffer_extents, snap_id, true,
- req_comp, op_flags);
+ AioObjectRead *req = new AioObjectRead(ictx, q->oid.name, q->objectno,
+ q->offset, q->length,
+ q->buffer_extents, snap_id, true,
+ req_comp, op_flags);
req_comp->set_req(req);
if (ictx->object_cacher) {