From 42846fdb1085a1a5bc0789bf477237fbbb86c9fd Mon Sep 17 00:00:00 2001 From: Prasanna Kumar Kalever Date: Thu, 6 Nov 2025 15:59:42 +0530 Subject: [PATCH] rbd-mirror: fix potential deadlock in finish_shut_down() Problem: A race condition could lead to a deadlock in finish_shut_down(). The issue occurs when the routine attempts to acquire m_lock while it is already held by one of the Replayer callbacks, such as validate_local_group_snapshots() or create_group_snapshot(). Solution: Refactored finish_shut_down() to run asynchronously by splitting it into two routines, wait_for_in_flight_ops() and handle_wait_for_in_flight_ops(). The handle_wait_for_in_flight_ops() function acquires the lock, but now executes in a separate thread, avoiding lock contention and eliminating the deadlock risk. Credits to Ilya Dryomov Signed-off-by: Prasanna Kumar Kalever Resolves: rhbz#2411963 --- .../rbd_mirror/group_replayer/Replayer.cc | 48 ++++++++----------- .../rbd_mirror/group_replayer/Replayer.h | 4 +- 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/src/tools/rbd_mirror/group_replayer/Replayer.cc b/src/tools/rbd_mirror/group_replayer/Replayer.cc index fdd2826af81..a2933f9ac3d 100644 --- a/src/tools/rbd_mirror/group_replayer/Replayer.cc +++ b/src/tools/rbd_mirror/group_replayer/Replayer.cc @@ -160,7 +160,7 @@ Replayer::~Replayer() { template bool Replayer::is_replay_interrupted(std::unique_lock* locker) { - if (m_state == STATE_COMPLETE || m_stop_requested) { + if (m_state == STATE_COMPLETE) { locker->unlock(); return true; } @@ -240,7 +240,6 @@ void Replayer::handle_replay_complete(std::unique_lock* locker, return; } - m_stop_requested = true; m_state = STATE_COMPLETE; notify_group_listener(); } @@ -383,13 +382,10 @@ void Replayer::validate_local_group_snapshots() { dout(10) << "m_local_group_id=" << m_local_group_id << dendl; std::unique_lock locker{m_lock}; - if (is_replay_interrupted(&locker) || m_stop_requested) { + if (is_replay_interrupted(&locker)) { return; } - - if (m_state != STATE_COMPLETE) { - m_state = STATE_REPLAYING; - } + m_state = STATE_REPLAYING; // early exit if no snapshots to process if (m_local_group_snaps.empty()) { @@ -725,13 +721,8 @@ void Replayer::scan_for_unsynced_group_snapshots( } else { // empty local cluster, started mirroring freshly try_create_group_snapshot("", locker); } - - if (m_stop_requested) { - // stop group replayer - handle_replay_complete(locker, 0, ""); - return; - } locker->unlock(); + schedule_load_group_snapshots(); } @@ -1794,41 +1785,42 @@ void Replayer::shut_down(Context* on_finish) { { std::unique_lock locker{m_lock}; - m_stop_requested = false; ceph_assert(m_on_shutdown == nullptr); std::swap(m_on_shutdown, on_finish); + m_error_code = 0; + m_error_description = ""; + ceph_assert(m_state != STATE_INIT); auto state = STATE_COMPLETE; std::swap(m_state, state); } cancel_load_group_snapshots(); - if (!m_in_flight_op_tracker.empty()) { - m_in_flight_op_tracker.wait_for_ops(new LambdaContext([this](int) { - finish_shut_down(); - })); - return; - } - - finish_shut_down(); - return; + wait_for_in_flight_ops(); } template -void Replayer::finish_shut_down() { +void Replayer::wait_for_in_flight_ops() { dout(10) << dendl; - Context *on_finish = nullptr; + auto ctx = create_async_context_callback( + m_threads->work_queue, create_context_callback< + Replayer, &Replayer::handle_wait_for_in_flight_ops>(this)); + m_in_flight_op_tracker.wait_for_ops(ctx); +} + +template +void Replayer::handle_wait_for_in_flight_ops(int r) { + dout(10) << "r=" << r << dendl; + Context *on_finish = nullptr; { std::unique_lock locker{m_lock}; ceph_assert(m_on_shutdown != nullptr); std::swap(m_on_shutdown, on_finish); } - if (on_finish) { - on_finish->complete(0); - } + on_finish->complete(m_error_code); } template diff --git a/src/tools/rbd_mirror/group_replayer/Replayer.h b/src/tools/rbd_mirror/group_replayer/Replayer.h index 4731caf4772..45bd78735ec 100644 --- a/src/tools/rbd_mirror/group_replayer/Replayer.h +++ b/src/tools/rbd_mirror/group_replayer/Replayer.h @@ -62,7 +62,6 @@ public: } void init(Context* on_finish); void shut_down(Context* on_finish); - void finish_shut_down(); bool is_replaying() const { std::unique_lock locker{m_lock}; @@ -121,7 +120,6 @@ private: int m_error_code = 0; std::string m_error_description; - bool m_stop_requested = false; bool m_retry_validate_snap = false; utime_t m_snapshot_start; @@ -235,6 +233,8 @@ private: void set_image_replayer_limits(const std::string &image_id, const cls::rbd::GroupSnapshot *remote_snap, std::unique_lock* locker); + void wait_for_in_flight_ops(); + void handle_wait_for_in_flight_ops(int r); }; } // namespace group_replayer -- 2.47.3