From c29548519e6f7b8e11099ffae896c84c2f842bc9 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Wed, 25 Feb 2015 12:00:26 -0500 Subject: [PATCH] librbd: restart async requests if lock owner doesn't report progress Detect the case of a crashed lock owner by waiting for up to 30 seconds for an async request progress message from the leader. If a progress message isn't received, restart the request (and possibly take ownership of the lock). Signed-off-by: Jason Dillaman --- src/common/config_opts.h | 1 + src/librbd/ImageWatcher.cc | 68 +++++++++++++++++++++----------------- src/librbd/ImageWatcher.h | 4 ++- 3 files changed, 42 insertions(+), 31 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 00a23b6b661f2..f914ca93fd0ac 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -870,6 +870,7 @@ OPTION(rbd_readahead_disable_after_bytes, OPT_LONGLONG, 50 * 1024 * 1024) // how OPTION(rbd_clone_copy_on_read, OPT_BOOL, false) OPTION(rbd_blacklist_on_break_lock, OPT_BOOL, true) // whether to blacklist clients whose lock was broken OPTION(rbd_blacklist_expire_seconds, OPT_INT, 0) // number of seconds to blacklist - set to 0 for OSD default +OPTION(rbd_request_timed_out_seconds, OPT_INT, 30) // number of seconds before maint request times out /* * The following options change the behavior for librbd's image creation methods that diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc index de751123681f8..e9e9200a9e310 100644 --- a/src/librbd/ImageWatcher.cc +++ b/src/librbd/ImageWatcher.cc @@ -653,43 +653,56 @@ int ImageWatcher::notify_lock_owner(bufferlist &bl) { return r; } +void ImageWatcher::schedule_async_request_timed_out(const AsyncRequestId &id) { + Context *ctx = new FunctionContext(boost::bind( + &ImageWatcher::async_request_timed_out, this, id)); + + Task task(TASK_CODE_ASYNC_REQUEST, id); + m_task_finisher->cancel(task); + + md_config_t *conf = m_image_ctx.cct->_conf; + 
m_task_finisher->add_event_after(task, conf->rbd_request_timed_out_seconds, + ctx); +} + +void ImageWatcher::async_request_timed_out(const AsyncRequestId &id) { + RWLock::RLocker l(m_async_request_lock); + std::map::iterator it = + m_async_requests.find(id); + if (it != m_async_requests.end()) { + ldout(m_image_ctx.cct, 10) << "request timed-out: " << id << dendl; + it->second.first->complete(-ERESTART); + } +} + int ImageWatcher::notify_async_request(const AsyncRequestId &async_request_id, bufferlist &in, ProgressContext& prog_ctx) { assert(m_image_ctx.owner_lock.is_locked()); ldout(m_image_ctx.cct, 10) << "async request: " << async_request_id << dendl; - Mutex my_lock("librbd::ImageWatcher::notify_async_request::my_lock"); - Cond cond; - bool done = false; - int r; - Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &r); + + C_SaferCond ctx; { RWLock::WLocker l(m_async_request_lock); - m_async_requests[async_request_id] = AsyncRequest(ctx, &prog_ctx); + m_async_requests[async_request_id] = AsyncRequest(&ctx, &prog_ctx); } - BOOST_SCOPE_EXIT( (ctx)(async_request_id)(&m_async_requests) - (&m_async_request_lock)(&done) ) { + BOOST_SCOPE_EXIT( (&ctx)(async_request_id)(&m_task_finisher) + (&m_async_requests)(&m_async_request_lock) ) { + m_task_finisher->cancel(Task(TASK_CODE_ASYNC_REQUEST, async_request_id)); + RWLock::WLocker l(m_async_request_lock); m_async_requests.erase(async_request_id); - if (!done) { - delete ctx; - } } BOOST_SCOPE_EXIT_END - r = notify_lock_owner(in); + schedule_async_request_timed_out(async_request_id); + int r = notify_lock_owner(in); if (r < 0) { return r; } - - my_lock.Lock(); - while (!done) { - cond.Wait(my_lock); - } - my_lock.Unlock(); - return r; + return ctx.wait(); } void ImageWatcher::handle_payload(const HeaderUpdatePayload &payload, @@ -775,26 +788,21 @@ void ImageWatcher::handle_payload(const AsyncProgressPayload &payload, << payload.async_request_id << " @ " << payload.offset << "/" << payload.total << dendl; + 
schedule_async_request_timed_out(payload.async_request_id); req_it->second.second->update_progress(payload.offset, payload.total); } } void ImageWatcher::handle_payload(const AsyncCompletePayload &payload, bufferlist *out) { - Context *ctx = NULL; - { - RWLock::RLocker l(m_async_request_lock); - std::map::iterator req_it = - m_async_requests.find(payload.async_request_id); - if (req_it != m_async_requests.end()) { - ctx = req_it->second.first; - } - } - if (ctx != NULL) { + RWLock::RLocker l(m_async_request_lock); + std::map::iterator req_it = + m_async_requests.find(payload.async_request_id); + if (req_it != m_async_requests.end()) { ldout(m_image_ctx.cct, 10) << "request finished: " << payload.async_request_id << "=" << payload.result << dendl; - ctx->complete(payload.result); + req_it->second.first->complete(payload.result); } } diff --git a/src/librbd/ImageWatcher.h b/src/librbd/ImageWatcher.h index d159d59a2af4e..2ffe96569723a 100644 --- a/src/librbd/ImageWatcher.h +++ b/src/librbd/ImageWatcher.h @@ -161,7 +161,7 @@ namespace librbd { struct HandlePayloadVisitor : public boost::static_visitor { ImageWatcher *image_watcher; uint64_t notify_id; - uint64_t handle; + uint64_t handle; HandlePayloadVisitor(ImageWatcher *image_watcher_, uint64_t notify_id_, uint64_t handle_) @@ -226,6 +226,8 @@ namespace librbd { void notify_request_lock(); int notify_lock_owner(bufferlist &bl); + void schedule_async_request_timed_out(const WatchNotify::AsyncRequestId &id); + void async_request_timed_out(const WatchNotify::AsyncRequestId &id); int notify_async_request(const WatchNotify::AsyncRequestId &id, bufferlist &in, ProgressContext& prog_ctx); void notify_request_leadership(); -- 2.39.5