From: Mykola Golub Date: Thu, 8 Oct 2020 17:12:11 +0000 (+0100) Subject: librbd: fix race on watcher unregister X-Git-Tag: v16.1.0~850^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F37594%2Fhead;p=ceph.git librbd: fix race on watcher unregister It was possible that "unregister_watch" would get stuck forever in "m_async_op_tracker.wait_for_ops", waiting for unqueiesce notifications to complete, which had been already canceled when "unregister" called "TaskFinisher::cancel_all". Signed-off-by: Mykola Golub --- diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc index 653b64720b34..44e748fb974b 100644 --- a/src/librbd/ImageWatcher.cc +++ b/src/librbd/ImageWatcher.cc @@ -90,12 +90,11 @@ void ImageWatcher::unregister_watch(Context *on_finish) { cancel_async_requests(); on_finish = new LambdaContext([this, on_finish](int r) { + cancel_quiesce_requests(); + m_task_finisher->cancel_all(); m_async_op_tracker.wait_for_ops(on_finish); }); - auto ctx = new LambdaContext([this, on_finish](int r) { - m_task_finisher->cancel_all(on_finish); - }); - Watcher::unregister_watch(ctx); + Watcher::unregister_watch(on_finish); } template @@ -537,8 +536,12 @@ void ImageWatcher::schedule_request_lock(bool use_timer, int timer_delay) { if (this->is_registered(this->m_watch_lock)) { ldout(m_image_ctx.cct, 15) << this << " requesting exclusive lock" << dendl; - auto ctx = new LambdaContext( - boost::bind(&ImageWatcher::notify_request_lock, this)); + auto ctx = new LambdaContext([this](int r) { + if (r != -ECANCELED) { + notify_request_lock(); + } + }); + if (use_timer) { if (timer_delay < 0) { timer_delay = RETRY_DELAY_SECONDS; @@ -681,8 +684,11 @@ void ImageWatcher::schedule_async_request_timed_out(const AsyncRequestId &id) ldout(m_image_ctx.cct, 20) << "scheduling async request time out: " << id << dendl; - Context *ctx = new LambdaContext(boost::bind( - &ImageWatcher::async_request_timed_out, this, id)); + auto ctx = new LambdaContext([this, id](int r) { + if (r != -ECANCELED) { + async_request_timed_out(id); + } + }); Task task(TASK_CODE_ASYNC_REQUEST, id); m_task_finisher->cancel(task); @@ -816,22 +822,25 @@ Context *ImageWatcher::prepare_quiesce_request( std::unique_lock async_request_locker{m_async_request_lock}; mark_async_request_complete(request, r); auto ctx = remove_async_request(request, m_async_request_lock); - ceph_assert(ctx != nullptr); - ctx = new C_ResponseMessage(static_cast(ctx)); async_request_locker.unlock(); - ctx->complete(r); + if (ctx != nullptr) { + ctx = new C_ResponseMessage(static_cast(ctx)); + ctx->complete(r); + } else { + m_task_finisher->cancel(Task(TASK_CODE_QUIESCE, request)); + } }); } template -Context *ImageWatcher::prepare_unquiesce_request(const AsyncRequestId &request) { +void ImageWatcher::prepare_unquiesce_request(const AsyncRequestId &request) { { std::unique_lock async_request_locker{m_async_request_lock}; auto it = m_async_complete.find(request); if (it == m_async_complete.end()) { ldout(m_image_ctx.cct, 20) << this << " " << request << ": not found in complete" << dendl; - return nullptr; + return; } // reset complete request expiration time mark_async_request_complete(request, it->second); @@ -841,13 +850,22 @@ Context *ImageWatcher::prepare_unquiesce_request(const AsyncRequestId &reques if (!canceled) { ldout(m_image_ctx.cct, 20) << this << " " << request << ": timer task not found" << dendl; - return nullptr; } +} - return new LambdaContext( - [this](int r) { - m_async_op_tracker.finish_op(); - }); +template +void ImageWatcher::cancel_quiesce_requests() { + std::unique_lock l{m_async_request_lock}; + for (auto it = m_async_requests.begin(); it != m_async_requests.end(); ) { + if (it->second.second == nullptr) { + // Quiesce notify request. + mark_async_request_complete(it->first, 0); + delete it->second.first; + it = m_async_requests.erase(it); + } else { + it++; + } + } } template @@ -1341,17 +1359,10 @@ bool ImageWatcher::handle_payload(const QuiescePayload &payload, template bool ImageWatcher::handle_payload(const UnquiescePayload &payload, C_NotifyAck *ack_ctx) { - auto on_finish = prepare_unquiesce_request(payload.async_request_id); - if (on_finish == nullptr) { - ldout(m_image_ctx.cct, 10) << this - << " duplicate or unknown unquiesce request: " - << payload.async_request_id << dendl; - return true; - } - ldout(m_image_ctx.cct, 10) << this << " unquiesce request: " << payload.async_request_id << dendl; - m_image_ctx.state->notify_unquiesce(on_finish); + + prepare_unquiesce_request(payload.async_request_id); return true; } diff --git a/src/librbd/ImageWatcher.h b/src/librbd/ImageWatcher.h index 0f70743b2751..c445974c79ae 100644 --- a/src/librbd/ImageWatcher.h +++ b/src/librbd/ImageWatcher.h @@ -223,7 +223,8 @@ private: Context *prepare_quiesce_request(const watch_notify::AsyncRequestId &request, C_NotifyAck *ack_ctx); - Context *prepare_unquiesce_request(const watch_notify::AsyncRequestId &request); + void prepare_unquiesce_request(const watch_notify::AsyncRequestId &request); + void cancel_quiesce_requests(); void notify_quiesce(const watch_notify::AsyncRequestId &async_request_id, size_t attempts, ProgressContext &prog_ctx, diff --git a/src/librbd/TaskFinisher.h b/src/librbd/TaskFinisher.h index bf76f633f7b4..268c7e7a917d 100644 --- a/src/librbd/TaskFinisher.h +++ b/src/librbd/TaskFinisher.h @@ -64,24 +64,21 @@ public: if (it == m_task_contexts.end()) { return false; } - delete it->second.first; + it->second.first->complete(-ECANCELED); bool canceled = m_safe_timer->cancel_event(it->second.second); ceph_assert(canceled); m_task_contexts.erase(it); return true; } - void cancel_all(Context *comp) { - { - std::lock_guard l{*m_lock}; - for (typename TaskContexts::iterator it = m_task_contexts.begin(); - it != m_task_contexts.end(); ++it) { - delete it->second.first; - m_safe_timer->cancel_event(it->second.second); - } - m_task_contexts.clear(); + void cancel_all() { + std::lock_guard l{*m_lock}; + for (auto &[task, pair] : m_task_contexts) { + pair.first->complete(-ECANCELED); + bool canceled = m_safe_timer->cancel_event(pair.second); + ceph_assert(canceled); } - m_finisher->queue(comp); + m_task_contexts.clear(); } bool add_event_after(const Task& task, double seconds, Context *ctx) {