bool hide_enoent)
: m_ictx(ictx), m_oid(oid), m_object_no(objectno), m_object_off(off),
m_object_len(len), m_snap_id(snap_id), m_completion(completion),
- m_parent_completion(NULL), m_hide_enoent(hide_enoent) {
+ m_hide_enoent(hide_enoent) {
Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
0, m_ictx->layout.fl_object_size, m_parent_extents);
compute_parent_extents();
}
- AioRequest::~AioRequest() {
- if (m_parent_completion) {
- m_parent_completion->release();
- m_parent_completion = NULL;
- }
- }
-
void AioRequest::complete(int r)
{
if (should_complete(r)) {
return false;
}
- void AioRequest::read_from_parent(const vector<pair<uint64_t,uint64_t> >& parent_extents,
- bool block_completion)
- {
- assert(!m_parent_completion);
- m_parent_completion = aio_create_completion_internal(this, rbd_req_cb);
- if (block_completion) {
- // prevent the parent image from being deleted while this
- // request is still in-progress
- m_parent_completion->get();
- m_parent_completion->block();
- }
-
- ldout(m_ictx->cct, 20) << "read_from_parent this = " << this
- << " parent completion " << m_parent_completion
- << " extents " << parent_extents
- << dendl;
- aio_read(m_ictx->parent, parent_extents, NULL, &m_read_data,
- m_parent_completion, 0);
- }
-
static inline bool is_copy_on_read(ImageCtx *ictx, librados::snap_t snap_id) {
assert(ictx->snap_lock.is_locked());
return (ictx->cct->_conf->rbd_clone_copy_on_read) &&
Context *completion, int op_flags)
: AioRequest(ictx, oid, objectno, offset, len, snap_id, completion, false),
m_buffer_extents(be), m_tried_parent(false), m_sparse(sparse),
- m_op_flags(op_flags), m_state(LIBRBD_AIO_READ_FLAT) {
+ m_op_flags(op_flags), m_parent_completion(NULL),
+ m_state(LIBRBD_AIO_READ_FLAT) {
guard_read();
}
+ AioRead::~AioRead()
+ {
+ if (m_parent_completion) {
+ m_parent_completion->release();
+ m_parent_completion = NULL;
+ }
+ }
+
void AioRead::guard_read()
{
RWLock::RLocker snap_locker(m_ictx->snap_lock);
m_state = LIBRBD_AIO_READ_COPYUP;
}
- read_from_parent(parent_extents, true);
+ read_from_parent(parent_extents);
finished = false;
}
}
// If read entire object from parent success and CoR is possible, kick
// off a asynchronous copyup. This approach minimizes the latency
// impact.
- Mutex::Locker copyup_locker(m_ictx->copyup_list_lock);
- map<uint64_t, CopyupRequest*>::iterator it =
- m_ictx->copyup_list.find(m_object_no);
- if (it == m_ictx->copyup_list.end()) {
- RWLock::RLocker snap_locker(m_ictx->snap_lock);
- RWLock::RLocker parent_locker(m_ictx->parent_lock);
- if (compute_parent_extents()) {
- // create and kick off a CopyupRequest
- CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
- m_object_no,
- m_parent_extents);
- m_ictx->copyup_list[m_object_no] = new_req;
- new_req->queue_send();
- }
- }
+ send_copyup();
}
break;
case LIBRBD_AIO_READ_FLAT:
rados_completion->release();
}
+ void AioRead::send_copyup()
+ {
+ RWLock::RLocker snap_locker(m_ictx->snap_lock);
+ RWLock::RLocker parent_locker(m_ictx->parent_lock);
+ Mutex::Locker copyup_locker(m_ictx->copyup_list_lock);
+ map<uint64_t, CopyupRequest*>::iterator it =
+ m_ictx->copyup_list.find(m_object_no);
+ if (it == m_ictx->copyup_list.end()) {
+ if (compute_parent_extents()) {
+ // create and kick off a CopyupRequest
+ CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
+ m_object_no,
+ m_parent_extents);
+ m_ictx->copyup_list[m_object_no] = new_req;
+ new_req->queue_send();
+ }
+ }
+ }
+
+ void AioRead::read_from_parent(const vector<pair<uint64_t,uint64_t> >& parent_extents)
+ {
+ assert(!m_parent_completion);
+ m_parent_completion = aio_create_completion_internal(this, rbd_req_cb);
+
+ // prevent the parent image from being deleted while this
+ // request is still in-progress
+ m_parent_completion->get();
+ m_parent_completion->block();
+
+ ldout(m_ictx->cct, 20) << "read_from_parent this = " << this
+ << " parent completion " << m_parent_completion
+ << " extents " << parent_extents
+ << dendl;
+ aio_read(m_ictx->parent, parent_extents, NULL, &m_read_data,
+ m_parent_completion, 0);
+ }
+
/** write **/
AbstractWrite::AbstractWrite(ImageCtx *ictx, const std::string &oid,
Context *completion, bool hide_enoent)
: AioRequest(ictx, oid, object_no, object_off, len, CEPH_NOSNAP, completion,
hide_enoent),
- m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val),
- m_entire_object(NULL)
+ m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val)
{
m_snaps.insert(m_snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
}
<< m_object_off << "~" << m_object_len
<< " should_complete: r = " << r << dendl;
- map<uint64_t, CopyupRequest*>::iterator it;
bool finished = true;
switch (m_state) {
case LIBRBD_AIO_WRITE_PRE:
// If parent still exists, overlap might also have changed.
if (compute_parent_extents()) {
- m_state = LIBRBD_AIO_WRITE_COPYUP;
-
- if (is_copy_on_read(m_ictx, CEPH_NOSNAP)) {
- m_ictx->copyup_list_lock.Lock();
- it = m_ictx->copyup_list.find(m_object_no);
- if (it == m_ictx->copyup_list.end()) {
- // If it is not in the list, create a CopyupRequest and wait
- CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
- m_object_no,
- m_parent_extents);
- // make sure to wait on this CopyupRequest
- new_req->append_request(this);
- m_ictx->copyup_list[m_object_no] = new_req;
-
- m_entire_object = &(new_req->get_copyup_data());
- m_ictx->copyup_list_lock.Unlock();
- new_req->send();
- } else {
- it->second->append_request(this);
- m_entire_object = &it->second->get_copyup_data();
- m_ictx->copyup_list_lock.Unlock();
- }
- } else {
- read_from_parent(m_parent_extents, false);
- }
+ send_copyup();
} else {
- /*
- * Parent may have disappeared; if so, recover by using
- * send_copyup() to send the original write req (the copyup
- * operation itself will be a no-op, since someone must have
- * populated the child object while we weren't looking).
- * Move to WRITE_FLAT state as we'll be done with the
- * operation once the null copyup completes.
- */
+ // parent may have disappeared -- send original write again
ldout(m_ictx->cct, 20) << "should_complete(" << this
<< "): parent overlap now 0" << dendl;
- m_state = LIBRBD_AIO_WRITE_FLAT;
- send_copyup();
+ send_write();
}
finished = false;
break;
return should_complete(r);
}
- // Read data from waiting list safely. If this AioWrite created a
- // CopyupRequest, m_read_data should be empty.
- if (m_entire_object != NULL) {
- assert(m_read_data.length() == 0);
- m_read_data.append(*m_entire_object);
- }
-
- send_copyup();
- finished = false;
+ finished = send_post();
break;
case LIBRBD_AIO_WRITE_FLAT:
rados_completion->release();
}
- void AbstractWrite::send_copyup() {
+ void AbstractWrite::send_copyup()
+ {
ldout(m_ictx->cct, 20) << "send_copyup " << this << " " << m_oid << " "
<< m_object_off << "~" << m_object_len << dendl;
- librados::ObjectWriteOperation op;
- if (!m_read_data.is_zero()) {
- op.exec("rbd", "copyup", m_read_data);
+ m_state = LIBRBD_AIO_WRITE_COPYUP;
+
+ m_ictx->copyup_list_lock.Lock();
+ map<uint64_t, CopyupRequest*>::iterator it =
+ m_ictx->copyup_list.find(m_object_no);
+ if (it == m_ictx->copyup_list.end()) {
+ CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
+ m_object_no,
+ m_parent_extents);
+
+ // make sure to wait on this CopyupRequest
+ new_req->append_request(this);
+ m_ictx->copyup_list[m_object_no] = new_req;
+
+ m_ictx->copyup_list_lock.Unlock();
+ new_req->send();
+ } else {
+ it->second->append_request(this);
+ m_ictx->copyup_list_lock.Unlock();
}
- add_write_ops(&op);
- assert(op.size() != 0);
-
- librados::AioCompletion *rados_completion =
- librados::Rados::aio_create_completion(this, NULL, rados_req_cb);
- m_ictx->md_ctx.aio_operate(m_oid, rados_completion, &op,
- m_snap_seq, m_snaps);
- rados_completion->release();
}
void AioWrite::add_write_ops(librados::ObjectWriteOperation *wr) {
uint64_t objectno, uint64_t off, uint64_t len,
librados::snap_t snap_id,
Context *completion, bool hide_enoent);
- virtual ~AioRequest();
+ virtual ~AioRequest() {}
+
+ virtual void add_copyup_ops(librados::ObjectWriteOperation *wr) {};
void complete(int r);
protected:
bool compute_parent_extents();
- void read_from_parent(const vector<pair<uint64_t,uint64_t> >& image_extents,
- bool block_completion);
ImageCtx *m_ictx;
std::string m_oid;
librados::snap_t m_snap_id;
Context *m_completion;
std::vector<std::pair<uint64_t,uint64_t> > m_parent_extents;
- AioCompletion *m_parent_completion;
- ceph::bufferlist m_read_data;
bool m_hide_enoent;
};
vector<pair<uint64_t,uint64_t> >& be,
librados::snap_t snap_id, bool sparse,
Context *completion, int op_flags);
- virtual ~AioRead() {}
+ virtual ~AioRead();
+
virtual bool should_complete(int r);
virtual void send();
void guard_read();
bool m_tried_parent;
bool m_sparse;
int m_op_flags;
+ ceph::bufferlist m_read_data;
+ AioCompletion *m_parent_completion;
/**
* Reads go through the following state machine to deal with
};
read_state_d m_state;
+
+ void send_copyup();
+ void read_from_parent(const vector<pair<uint64_t,uint64_t> >& image_extents);
};
class AbstractWrite : public AioRequest {
Context *completion, bool hide_enoent);
virtual ~AbstractWrite() {}
+ virtual void add_copyup_ops(librados::ObjectWriteOperation *wr)
+ {
+ add_write_ops(wr);
+ }
+
virtual bool should_complete(int r);
virtual void send();
* . | | .
* v v need copyup | .
* LIBRBD_AIO_WRITE_GUARD -----------> LIBRBD_AIO_WRITE_COPYUP | .
- * . | ^ | | .
- * . | | | | .
- * . | \---------------------------/ | .
- * . | | .
- * . \-------------------\ /-------------------/ .
- * . | | .
- * . LIBRBD_AIO_WRITE_POST .
- * . | .
- * . v .
+ * . | | . | .
+ * . | | . | .
+ * . | /-----/ . | .
+ * . | | . | .
+ * . \-------------------\ | /-------------------/ .
+ * . | | | . .
+ * . v v v . .
+ * . LIBRBD_AIO_WRITE_POST . .
+ * . | . .
+ * . | . . . . . . . . .
+ * . | . .
+ * . v v .
* . . . . . . . . . . . . . . > <finish> < . . . . . . . . . . . . . .
*
- * The _PRE_REMOVE/_POST_REMOVE states are skipped if the object map
- * is disabled. The write starts in _WRITE_GUARD or _FLAT depending on
- * whether or not there is a parent overlap.
+ * The _PRE/_POST states are skipped if the object map is disabled.
+ * The write starts in _WRITE_GUARD or _FLAT depending on whether or not
+ * there is a parent overlap.
*/
enum write_state_d {
LIBRBD_AIO_WRITE_GUARD,
librados::ObjectWriteOperation m_write;
uint64_t m_snap_seq;
std::vector<librados::snap_t> m_snaps;
- ceph::bufferlist *m_entire_object;
virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0;
virtual void guard_write();
m_async_op.finish_op();
}
- ceph::bufferlist& CopyupRequest::get_copyup_data() {
- return m_copyup_data;
- }
-
void CopyupRequest::append_request(AioRequest *req) {
ldout(m_ictx->cct, 20) << __func__ << " " << this << ": " << req << dendl;
m_pending_requests.push_back(req);
}
- bool CopyupRequest::complete_requests(int r) {
- if (m_pending_requests.empty()) {
- return false;
- }
-
+ void CopyupRequest::complete_requests(int r) {
while (!m_pending_requests.empty()) {
vector<AioRequest *>::iterator it = m_pending_requests.begin();
AioRequest *req = *it;
req->complete(r);
m_pending_requests.erase(it);
}
- return true;
}
- void CopyupRequest::send_copyup() {
- ldout(m_ictx->cct, 20) << __func__ << " " << this
- << ": oid " << m_oid << dendl;
-
+ bool CopyupRequest::send_copyup() {
m_ictx->snap_lock.get_read();
::SnapContext snapc = m_ictx->snapc;
m_ictx->snap_lock.put_read();
snaps.insert(snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
librados::ObjectWriteOperation copyup_op;
- copyup_op.exec("rbd", "copyup", m_copyup_data);
+ if (!m_copyup_data.is_zero()) {
+ copyup_op.exec("rbd", "copyup", m_copyup_data);
+ }
+
+ // merge all pending write ops into this single RADOS op
+ for (size_t i=0; i<m_pending_requests.size(); ++i) {
+ AioRequest *req = m_pending_requests[i];
+ ldout(m_ictx->cct, 20) << __func__ << " add_copyup_ops " << req << dendl;
+ req->add_copyup_ops(©up_op);
+ }
+
+ if (copyup_op.size() == 0) {
+ return true;
+ }
+
+ ldout(m_ictx->cct, 20) << __func__ << " " << this
+ << ": oid " << m_oid << dendl;
+ m_state = STATE_COPYUP;
librados::AioCompletion *comp =
- librados::Rados::aio_create_completion(NULL, NULL, NULL);
- m_ictx->md_ctx.aio_operate(m_oid, comp, ©up_op, snapc.seq.val, snaps);
+ librados::Rados::aio_create_completion(create_callback_context(), NULL,
+ rados_ctx_cb);
+ int r = m_ictx->md_ctx.aio_operate(m_oid, comp, ©up_op, snapc.seq.val,
+ snaps);
+ assert(r == 0);
comp->release();
+ return false;
}
void CopyupRequest::send()
bool CopyupRequest::should_complete(int r)
{
CephContext *cct = m_ictx->cct;
- ldout(cct, 20) << __func__ << " "
+ ldout(cct, 20) << __func__ << " " << this
<< ": oid " << m_oid
<< ", extents " << m_image_extents
<< ", r " << r << dendl;
case STATE_READ_FROM_PARENT:
ldout(cct, 20) << "READ_FROM_PARENT" << dendl;
remove_from_list();
- if (complete_requests(r)) {
- // pending write operation: it will handle object map / copyup
- return true;
- } else if (r < 0) {
- // nothing to copyup
- return true;
- } else if (send_object_map()) {
- return true;
+ if (r >= 0) {
+ return send_object_map();
+ } else if (r == -ENOENT) {
+ return send_copyup();
}
break;
case STATE_OBJECT_MAP:
ldout(cct, 20) << "OBJECT_MAP" << dendl;
if (r == 0) {
- send_copyup();
+ return send_copyup();
}
+ break;
+
+ case STATE_COPYUP:
+ ldout(cct, 20) << "COPYUP" << dendl;
+ complete_requests(r);
return true;
default:
assert(false);
break;
}
+
+ if (r < 0) {
+ complete_requests(r);
+ return true;
+ }
return false;
}
}
bool CopyupRequest::send_object_map() {
- ldout(m_ictx->cct, 20) << __func__ << " " << this
- << ": oid " << m_oid
- << ", extents " << m_image_extents
- << dendl;
-
bool copyup = false;
{
RWLock::RLocker l(m_ictx->owner_lock);
if (!m_ictx->object_map.enabled()) {
copyup = true;
} else if (!m_ictx->image_watcher->is_lock_owner()) {
- ldout(m_ictx->cct, 20) << "exclusive lock not held for copy-on-read"
+ ldout(m_ictx->cct, 20) << "exclusive lock not held for copyup request"
<< dendl;
- return true;
+ assert(m_pending_requests.empty());
+ return true;
} else {
- m_state = STATE_OBJECT_MAP;
- Context *ctx = create_callback_context();
RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
- if (!m_ictx->object_map.aio_update(m_object_no, OBJECT_EXISTS,
- boost::optional<uint8_t>(), ctx)) {
- delete ctx;
- copyup = true;
- }
+ if (m_ictx->object_map[m_object_no] != OBJECT_EXISTS) {
+ ldout(m_ictx->cct, 20) << __func__ << " " << this
+ << ": oid " << m_oid
+ << ", extents " << m_image_extents
+ << dendl;
+ m_state = STATE_OBJECT_MAP;
+
+ Context *ctx = create_callback_context();
+ bool sent = m_ictx->object_map.aio_update(m_object_no, OBJECT_EXISTS,
+ boost::optional<uint8_t>(),
+ ctx);
+ assert(sent);
+ } else {
+ copyup = true;
+ }
}
}
// avoid possible recursive lock attempts
if (copyup) {
// no object map update required
- send_copyup();
- return true;
+ return send_copyup();
}
return false;
}