From 66c167641aaa839aaee2cfe6e5c7138fdfccf030 Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Thu, 10 Dec 2020 19:31:45 -0500 Subject: [PATCH] rbd-mirror: validate that remote start snapshot still exists Perform a basic sanity check to verify that the remote start snapshot still exists. This was previosly being deleted as part of the unlink process due to a race condition between the remote side completing a sync between snapshots 1 and 2 and snapshot 2 being unlinked due to reaching max snapshots. Signed-off-by: Jason Dillaman (cherry picked from commit fb69efc6c19e6ee2bd8947129fb7f35442acb907) --- .../snapshot/test_mock_Replayer.cc | 85 ++++++++++++++++++- .../image_replayer/snapshot/Replayer.cc | 71 +++++++++------- 2 files changed, 125 insertions(+), 31 deletions(-) diff --git a/src/test/rbd_mirror/image_replayer/snapshot/test_mock_Replayer.cc b/src/test/rbd_mirror/image_replayer/snapshot/test_mock_Replayer.cc index 4ce139d21f4e0..fe1a0e3f92e63 100644 --- a/src/test/rbd_mirror/image_replayer/snapshot/test_mock_Replayer.cc +++ b/src/test/rbd_mirror/image_replayer/snapshot/test_mock_Replayer.cc @@ -786,6 +786,10 @@ TEST_F(TestMockImageReplayerSnapshotReplayer, SyncSnapshot) { expect_is_refresh_required(mock_remote_image_ctx, true); expect_refresh( mock_remote_image_ctx, { + {1U, librbd::SnapInfo{"snap1", cls::rbd::MirrorSnapshotNamespace{ + cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY, {"remote mirror peer uuid"}, + "", CEPH_NOSNAP, true, 0, {}}, + 0, {}, 0, 0, {}}}, {2U, librbd::SnapInfo{"snap2", cls::rbd::UserSnapshotNamespace{}, 0, {}, 0, 0, {}}}, {3U, librbd::SnapInfo{"snap3", cls::rbd::MirrorSnapshotNamespace{ @@ -839,6 +843,10 @@ TEST_F(TestMockImageReplayerSnapshotReplayer, SyncSnapshot) { expect_is_refresh_required(mock_remote_image_ctx, true); expect_refresh( mock_remote_image_ctx, { + {1U, librbd::SnapInfo{"snap1", cls::rbd::MirrorSnapshotNamespace{ + cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY, {"remote mirror peer uuid"}, + "", CEPH_NOSNAP, true, 0, {}}, + 0, {}, 0, 0, {}}}, {2U, librbd::SnapInfo{"snap2", cls::rbd::UserSnapshotNamespace{}, 0, {}, 0, 0, {}}}, {3U, librbd::SnapInfo{"snap3", cls::rbd::MirrorSnapshotNamespace{ @@ -862,7 +870,14 @@ TEST_F(TestMockImageReplayerSnapshotReplayer, SyncSnapshot) { 4, true, 0, {}}, 0, {}, 0, 0, {}}}, }, 0); - expect_is_refresh_required(mock_remote_image_ctx, false); + expect_is_refresh_required(mock_remote_image_ctx, true); + expect_refresh( + mock_remote_image_ctx, { + {4U, librbd::SnapInfo{"snap4", cls::rbd::MirrorSnapshotNamespace{ + cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY, {"remote mirror peer uuid"}, + "", CEPH_NOSNAP, true, 0, {}}, + 0, {}, 0, 0, {}}} + }, 0); // fire init C_SaferCond init_ctx; @@ -2010,6 +2025,74 @@ TEST_F(TestMockImageReplayerSnapshotReplayer, SplitBrain) { mock_remote_image_ctx)); } +TEST_F(TestMockImageReplayerSnapshotReplayer, RemoteSnapshotMissingSplitBrain) { + librbd::MockTestImageCtx mock_local_image_ctx{*m_local_image_ctx}; + librbd::MockTestImageCtx mock_remote_image_ctx{*m_remote_image_ctx}; + + MockThreads mock_threads(m_threads); + expect_work_queue_repeatedly(mock_threads); + + MockReplayerListener mock_replayer_listener; + expect_notification(mock_threads, mock_replayer_listener); + + InSequence seq; + + MockInstanceWatcher mock_instance_watcher; + MockImageMeta mock_image_meta; + MockStateBuilder mock_state_builder(mock_local_image_ctx, + mock_remote_image_ctx, + mock_image_meta); + MockReplayer mock_replayer{&mock_threads, &mock_instance_watcher, + "local mirror uuid", &m_pool_meta_cache, + &mock_state_builder, &mock_replayer_listener}; + m_pool_meta_cache.set_remote_pool_meta( + m_remote_io_ctx.get_id(), + {"remote mirror uuid", "remote mirror peer uuid"}); + + librbd::UpdateWatchCtx* update_watch_ctx = nullptr; + ASSERT_EQ(0, init_entry_replayer(mock_replayer, mock_threads, + mock_local_image_ctx, + mock_remote_image_ctx, + mock_replayer_listener, + mock_image_meta, + &update_watch_ctx)); + + // inject a missing remote start snap (deleted) + mock_local_image_ctx.snap_info = { + {11U, librbd::SnapInfo{"snap3", cls::rbd::MirrorSnapshotNamespace{ + cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY, {}, + "remote mirror uuid", 1, true, 0, + {{1, CEPH_NOSNAP}}}, + 0, {}, 0, 0, {}}}}; + mock_remote_image_ctx.snap_info = { + {2U, librbd::SnapInfo{"snap2", cls::rbd::MirrorSnapshotNamespace{ + cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY, {"remote mirror peer uuid"}, + "", CEPH_NOSNAP, true, 0, {}}, + 0, {}, 0, 0, {}}}, + {3U, librbd::SnapInfo{"snap3", cls::rbd::MirrorSnapshotNamespace{ + cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY, {"remote mirror peer uuid"}, + "", CEPH_NOSNAP, true, 0, {}}, + 0, {}, 0, 0, {}}}}; + + // split-brain due to missing snapshot 1 + expect_load_image_meta(mock_image_meta, false, 0); + expect_is_refresh_required(mock_local_image_ctx, false); + expect_is_refresh_required(mock_remote_image_ctx, false); + + // wake-up replayer + update_watch_ctx->handle_notify(); + + // wait for sync to complete and expect replay complete + ASSERT_EQ(0, wait_for_notification(1)); + ASSERT_FALSE(mock_replayer.is_replaying()); + ASSERT_EQ(-EEXIST, mock_replayer.get_error_code()); + ASSERT_EQ(std::string{"split-brain"}, mock_replayer.get_error_description()); + + ASSERT_EQ(0, shut_down_entry_replayer(mock_replayer, mock_threads, + mock_local_image_ctx, + mock_remote_image_ctx)); +} + TEST_F(TestMockImageReplayerSnapshotReplayer, RemoteFailover) { librbd::MockTestImageCtx mock_local_image_ctx{*m_local_image_ctx}; librbd::MockTestImageCtx mock_remote_image_ctx{*m_remote_image_ctx}; diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc b/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc index f46270c927fa5..b2a1c150cb080 100644 --- a/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc +++ b/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc @@ -616,43 +616,54 @@ void Replayer::scan_remote_mirror_snapshots( m_remote_snap_id_end = remote_snap_id; m_remote_mirror_snap_ns = *mirror_ns; } - image_locker.unlock(); - - unlink_snap_ids.erase(m_remote_snap_id_start); - unlink_snap_ids.erase(m_remote_snap_id_end); - if (!unlink_snap_ids.empty()) { - locker->unlock(); - // retry the unlinking process for a remote snapshot that we do not - // need anymore - auto remote_snap_id = *unlink_snap_ids.begin(); - dout(10) << "unlinking from remote snapshot " << remote_snap_id << dendl; - unlink_peer(remote_snap_id); - return; + if (m_remote_snap_id_start != 0 && + remote_image_ctx->snap_info.count(m_remote_snap_id_start) == 0) { + // the remote start snapshot was deleted out from under us + derr << "failed to locate remote start snapshot: " + << "snap_id=" << m_remote_snap_id_start << dendl; + split_brain = true; } - if (m_remote_snap_id_end != CEPH_NOSNAP) { - dout(10) << "found remote mirror snapshot: " - << "remote_snap_id_start=" << m_remote_snap_id_start << ", " - << "remote_snap_id_end=" << m_remote_snap_id_end << ", " - << "remote_snap_ns=" << m_remote_mirror_snap_ns << dendl; - if (m_remote_mirror_snap_ns.complete) { + image_locker.unlock(); + + if (!split_brain) { + unlink_snap_ids.erase(m_remote_snap_id_start); + unlink_snap_ids.erase(m_remote_snap_id_end); + if (!unlink_snap_ids.empty()) { locker->unlock(); - if (m_local_snap_id_end != CEPH_NOSNAP && - !m_local_mirror_snap_ns.complete) { - // attempt to resume image-sync - dout(10) << "local image contains in-progress mirror snapshot" - << dendl; - get_local_image_state(); + // retry the unlinking process for a remote snapshot that we do not + // need anymore + auto remote_snap_id = *unlink_snap_ids.begin(); + dout(10) << "unlinking from remote snapshot " << remote_snap_id << dendl; + unlink_peer(remote_snap_id); + return; + } + + if (m_remote_snap_id_end != CEPH_NOSNAP) { + dout(10) << "found remote mirror snapshot: " + << "remote_snap_id_start=" << m_remote_snap_id_start << ", " + << "remote_snap_id_end=" << m_remote_snap_id_end << ", " + << "remote_snap_ns=" << m_remote_mirror_snap_ns << dendl; + if (m_remote_mirror_snap_ns.complete) { + locker->unlock(); + + if (m_local_snap_id_end != CEPH_NOSNAP && + !m_local_mirror_snap_ns.complete) { + // attempt to resume image-sync + dout(10) << "local image contains in-progress mirror snapshot" + << dendl; + get_local_image_state(); + } else { + copy_snapshots(); + } + return; } else { - copy_snapshots(); + // might have raced with the creation of a remote mirror snapshot + // so we will need to refresh and rescan once it completes + dout(15) << "remote mirror snapshot not complete" << dendl; } - return; - } else { - // might have raced with the creation of a remote mirror snapshot - // so we will need to refresh and rescan once it completes - dout(15) << "remote mirror snapshot not complete" << dendl; } } -- 2.39.5