From 5c9635ebfa5c133c88d714542a0e47e767b2c514 Mon Sep 17 00:00:00 2001 From: N Balachandran Date: Tue, 15 Apr 2025 17:04:41 +0530 Subject: [PATCH] rbd-mirror: fix crash in group Replayer The rbd-mirror daemon crashed if the Replayer was destroyed while an on-going remote group snap listing operation was in progress. The callback would attempt to access members of the Replayer instance which no longer existed. Signed-off-by: N Balachandran --- .../rbd_mirror/group_replayer/Replayer.cc | 30 +++++++++++++------ .../rbd_mirror/group_replayer/Replayer.h | 2 +- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/tools/rbd_mirror/group_replayer/Replayer.cc b/src/tools/rbd_mirror/group_replayer/Replayer.cc index c47195624b7d0..cb7a6df06b4d2 100644 --- a/src/tools/rbd_mirror/group_replayer/Replayer.cc +++ b/src/tools/rbd_mirror/group_replayer/Replayer.cc @@ -164,7 +164,9 @@ void Replayer::handle_schedule_load_group_snapshots(int r) { auto ctx = new LambdaContext( [this](int r) { load_local_group_snapshots(); + m_in_flight_op_tracker.finish_op(); }); + m_in_flight_op_tracker.start_op(); m_threads->work_queue->queue(ctx, 0); } @@ -293,6 +295,7 @@ void Replayer::init(Context* on_finish) { << ", remote_mirror_peer_uuid=" << m_remote_mirror_peer_uuid << dendl; on_finish->complete(0); + load_local_group_snapshots(); } @@ -321,6 +324,7 @@ void Replayer::load_local_group_snapshots() { } } + //FIXME: This is not accessed under a lock if (!m_local_group_snaps.empty()) { for (auto &local_snap : m_local_group_snaps) { if (local_snap.state == cls::rbd::GROUP_SNAPSHOT_STATE_COMPLETE) { @@ -342,6 +346,7 @@ void Replayer::load_local_group_snapshots() { std::unique_lock locker{m_lock}; m_in_flight_op_tracker.start_op(); m_local_group_snaps.clear(); + auto ctx = create_context_callback< Replayer, &Replayer::handle_load_local_group_snapshots>(this); @@ -355,10 +360,12 @@ template void Replayer::handle_load_local_group_snapshots(int r) { dout(10) << "r=" << r << dendl; - if (is_replay_interrupted()) { + std::unique_lock locker{m_lock}; + if (is_replay_interrupted(&locker)) { m_in_flight_op_tracker.finish_op(); return; } + m_in_flight_op_tracker.finish_op(); if (r < 0) { @@ -383,21 +390,18 @@ void Replayer::handle_load_local_group_snapshots(int r) { return; } - load_remote_group_snapshots(); + load_remote_group_snapshots(&locker); } template -void Replayer::load_remote_group_snapshots() { +void Replayer::load_remote_group_snapshots(std::unique_lock* locker) { dout(10) << "m_remote_group_id=" << m_remote_group_id << dendl; - std::unique_lock locker{m_lock}; - if (is_replay_interrupted(&locker)) { - return; - } m_remote_group_snaps.clear(); auto ctx = new LambdaContext( [this] (int r) { handle_load_remote_group_snapshots(r); + m_in_flight_op_tracker.finish_op(); }); m_in_flight_op_tracker.start_op(); @@ -410,11 +414,13 @@ template void Replayer::handle_load_remote_group_snapshots(int r) { dout(10) << "r=" << r << dendl; + // FIXME: This should take a lock to access variables + // For now the m_in_flight_op_tracker should prevent the Replayer + // from being deleted under the callback. + if (is_replay_interrupted()) { - m_in_flight_op_tracker.finish_op(); return; } - m_in_flight_op_tracker.finish_op(); auto last_remote_snap = m_remote_group_snaps.rbegin(); if (r < 0) { // may be remote group is deleted? @@ -630,6 +636,11 @@ void Replayer::scan_for_unsynced_group_snapshots() { local_snap->snapshot_namespace); auto local_snap_ns = std::get_if( &local_snap->snapshot_namespace); + + if (local_snap_ns) { + dout(10) << "local mirror snapshot: id=" << local_snap->id + << ", mirror_ns=" << *local_snap_ns << dendl; + } auto next_remote_snap = m_remote_group_snaps.end(); if (snap_type == cls::rbd::GROUP_SNAPSHOT_NAMESPACE_TYPE_USER || (local_snap_ns && (local_snap_ns->is_non_primary() || @@ -644,6 +655,7 @@ void Replayer::scan_for_unsynced_group_snapshots() { } } if (found && next_remote_snap == m_remote_group_snaps.end()) { + dout(10) << "all remote snaps synced" << dendl; syncs_upto_date = true; break; } else if (next_remote_snap != m_remote_group_snaps.end()) { diff --git a/src/tools/rbd_mirror/group_replayer/Replayer.h b/src/tools/rbd_mirror/group_replayer/Replayer.h index 9c4215166fd41..79b08392bd11f 100644 --- a/src/tools/rbd_mirror/group_replayer/Replayer.h +++ b/src/tools/rbd_mirror/group_replayer/Replayer.h @@ -128,7 +128,7 @@ private: void load_local_group_snapshots(); void handle_load_local_group_snapshots(int r); - void load_remote_group_snapshots(); + void load_remote_group_snapshots(std::unique_lock* locker); void handle_load_remote_group_snapshots(int r); void validate_image_snaps_sync_complete(const std::string &group_snap_id); -- 2.39.5