]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
rbd-mirror: fix potential deadlock in finish_shut_down()
authorPrasanna Kumar Kalever <prasanna.kalever@redhat.com>
Thu, 6 Nov 2025 10:29:42 +0000 (15:59 +0530)
committerIlya Dryomov <idryomov@redhat.com>
Sat, 22 Nov 2025 16:46:48 +0000 (17:46 +0100)
Problem:
A race condition could lead to a deadlock in finish_shut_down(). The issue
occurs when the routine attempts to acquire m_lock while it is already held
by one of the Replayer callbacks, such as validate_local_group_snapshots() or
create_group_snapshot().

Solution:
Refactored finish_shut_down() to run asynchronously by splitting it into two
routines, wait_for_in_flight_ops() and handle_wait_for_in_flight_ops(). The
handle_wait_for_in_flight_ops() function acquires the lock, but now executes in
a separate thread, avoiding lock contention and eliminating the deadlock risk.

Credits to Ilya Dryomov <idryomov@gmail.com>

Signed-off-by: Prasanna Kumar Kalever <prasanna.kalever@redhat.com>
Resolves: rhbz#2411963

src/tools/rbd_mirror/group_replayer/Replayer.cc
src/tools/rbd_mirror/group_replayer/Replayer.h

index fdd2826af81f2e2a433c7733b736e00887eec4cd..a2933f9ac3df31117853d99d2eb352993efcb179 100644 (file)
@@ -160,7 +160,7 @@ Replayer<I>::~Replayer() {
 template <typename I>
 bool Replayer<I>::is_replay_interrupted(std::unique_lock<ceph::mutex>* locker) {
 
-  if (m_state == STATE_COMPLETE || m_stop_requested) {
+  if (m_state == STATE_COMPLETE) {
     locker->unlock();
     return true;
   }
@@ -240,7 +240,6 @@ void Replayer<I>::handle_replay_complete(std::unique_lock<ceph::mutex>* locker,
     return;
   }
 
-  m_stop_requested = true;
   m_state = STATE_COMPLETE;
   notify_group_listener();
 }
@@ -383,13 +382,10 @@ void Replayer<I>::validate_local_group_snapshots() {
   dout(10) << "m_local_group_id=" << m_local_group_id << dendl;
 
   std::unique_lock locker{m_lock};
-  if (is_replay_interrupted(&locker) || m_stop_requested) {
+  if (is_replay_interrupted(&locker)) {
     return;
   }
-
-  if (m_state != STATE_COMPLETE) {
-    m_state = STATE_REPLAYING;
-  }
+  m_state = STATE_REPLAYING;
 
   // early exit if no snapshots to process
   if (m_local_group_snaps.empty()) {
@@ -725,13 +721,8 @@ void Replayer<I>::scan_for_unsynced_group_snapshots(
   } else { // empty local cluster, started mirroring freshly
     try_create_group_snapshot("", locker);
   }
-
-  if (m_stop_requested) {
-    // stop group replayer
-    handle_replay_complete(locker, 0, "");
-    return;
-  }
   locker->unlock();
+
   schedule_load_group_snapshots();
 }
 
@@ -1794,41 +1785,42 @@ void Replayer<I>::shut_down(Context* on_finish) {
 
   {
     std::unique_lock locker{m_lock};
-    m_stop_requested = false;
     ceph_assert(m_on_shutdown == nullptr);
     std::swap(m_on_shutdown, on_finish);
+    m_error_code = 0;
+    m_error_description = "";
 
+    ceph_assert(m_state != STATE_INIT);
     auto state = STATE_COMPLETE;
     std::swap(m_state, state);
   }
 
   cancel_load_group_snapshots();
 
-  if (!m_in_flight_op_tracker.empty()) {
-    m_in_flight_op_tracker.wait_for_ops(new LambdaContext([this](int) {
-        finish_shut_down();
-      }));
-    return;
-  }
-
-  finish_shut_down();
-  return;
+  wait_for_in_flight_ops();
 }
 
 template <typename I>
-void Replayer<I>::finish_shut_down() {
+void Replayer<I>::wait_for_in_flight_ops() {
   dout(10) << dendl;
 
-  Context *on_finish = nullptr;
+  auto ctx = create_async_context_callback(
+    m_threads->work_queue, create_context_callback<
+      Replayer<I>, &Replayer<I>::handle_wait_for_in_flight_ops>(this));
+  m_in_flight_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_wait_for_in_flight_ops(int r) {
+  dout(10) << "r=" << r << dendl;
 
+  Context *on_finish = nullptr;
   {
     std::unique_lock locker{m_lock};
     ceph_assert(m_on_shutdown != nullptr);
     std::swap(m_on_shutdown, on_finish);
   }
-  if (on_finish) {
-    on_finish->complete(0);
-  }
+  on_finish->complete(m_error_code);
 }
 
 template <typename I>
index 4731caf47723fd5bb3b00b2671ad89f662499ff6..45bd78735ec9b78b5c1d2cc515484782daeec999 100644 (file)
@@ -62,7 +62,6 @@ public:
   }
   void init(Context* on_finish);
   void shut_down(Context* on_finish);
-  void finish_shut_down();
 
   bool is_replaying() const {
     std::unique_lock locker{m_lock};
@@ -121,7 +120,6 @@ private:
   int m_error_code = 0;
   std::string m_error_description;
 
-  bool m_stop_requested = false;
   bool m_retry_validate_snap = false;
 
   utime_t m_snapshot_start;
@@ -235,6 +233,8 @@ private:
   void set_image_replayer_limits(const std::string &image_id,
                                  const cls::rbd::GroupSnapshot *remote_snap,
                                  std::unique_lock<ceph::mutex>* locker);
+  void wait_for_in_flight_ops();
+  void handle_wait_for_in_flight_ops(int r);
 };
 
 } // namespace group_replayer