From 9a4030e36cca053d874aca0c3354241375476999 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 24 Aug 2022 12:56:31 +0200 Subject: [PATCH] rbd-mirror: resume pending shutdown on error in snapshot replayer If a shutdown is requested, e.g. by update_pool_replayers() because remote RADOS instance got blocklisted, and Replayer::shut_down() pends it on completion of current snapshot sync, it gets stuck if replayer encounters an error in the interim. This is particularly likely in the blocklist case: a higher layer may detect that client got blocklisted and request a shutdown first, and then when replayer sees EBLOCKLISTED in turn, it calls handle_replay_complete() -- which does not resume a pending shutdown. Because update_pool_replayers() blocks on shutdown with Mirror::m_lock held, eventually the entire daemon hangs in perpetuity. Fixes: https://tracker.ceph.com/issues/56154 Signed-off-by: Ilya Dryomov (cherry picked from commit fc4cc575bc53f62f88ee3faf0daba8906bc1c6c1) --- .../snapshot/test_mock_Replayer.cc | 172 ++++++++++++++++++ .../image_replayer/snapshot/Replayer.cc | 16 +- 2 files changed, 183 insertions(+), 5 deletions(-) diff --git a/src/test/rbd_mirror/image_replayer/snapshot/test_mock_Replayer.cc b/src/test/rbd_mirror/image_replayer/snapshot/test_mock_Replayer.cc index 4cf2dbb142bf7..e1ee953e7f62d 100644 --- a/src/test/rbd_mirror/image_replayer/snapshot/test_mock_Replayer.cc +++ b/src/test/rbd_mirror/image_replayer/snapshot/test_mock_Replayer.cc @@ -571,6 +571,11 @@ public: EXPECT_CALL(mock_instance_watcher, notify_sync_complete(image_id)); } + void expect_cancel_sync_request(MockInstanceWatcher& mock_instance_watcher, + const std::string& image_id) { + EXPECT_CALL(mock_instance_watcher, cancel_sync_request(image_id)); + } + void expect_image_copy(MockImageCopyRequest& mock_image_copy_request, uint64_t src_snap_id_start, uint64_t src_snap_id_end, uint64_t dst_snap_id_start, @@ -3144,6 +3149,173 @@ TEST_F(TestMockImageReplayerSnapshotReplayer, ImageNameUpdated) { mock_remote_image_ctx)); } +TEST_F(TestMockImageReplayerSnapshotReplayer, ApplyImageStatePendingShutdown) { + librbd::MockTestImageCtx mock_local_image_ctx{*m_local_image_ctx}; + librbd::MockTestImageCtx mock_remote_image_ctx{*m_remote_image_ctx}; + + MockThreads mock_threads(m_threads); + expect_work_queue_repeatedly(mock_threads); + + MockReplayerListener mock_replayer_listener; + expect_notification(mock_threads, mock_replayer_listener); + + InSequence seq; + + MockInstanceWatcher mock_instance_watcher; + MockImageMeta mock_image_meta; + MockStateBuilder mock_state_builder(mock_local_image_ctx, + mock_remote_image_ctx, + mock_image_meta); + MockReplayer mock_replayer{&mock_threads, &mock_instance_watcher, + "local mirror uuid", &m_pool_meta_cache, + &mock_state_builder, &mock_replayer_listener}; + C_SaferCond shutdown_ctx; + m_pool_meta_cache.set_remote_pool_meta( + m_remote_io_ctx.get_id(), + {"remote mirror uuid", "remote mirror peer uuid"}); + + librbd::UpdateWatchCtx* update_watch_ctx = nullptr; + ASSERT_EQ(0, init_entry_replayer(mock_replayer, mock_threads, + mock_local_image_ctx, + mock_remote_image_ctx, + mock_replayer_listener, + mock_image_meta, + &update_watch_ctx)); + + // inject snapshot + mock_remote_image_ctx.snap_info = { + {1U, librbd::SnapInfo{"snap1", cls::rbd::MirrorSnapshotNamespace{ + cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY, {"remote mirror peer uuid"}, "", + CEPH_NOSNAP, true, 0, {}}, + 0, {}, 0, 0, {}}}}; + + // sync snap1 + expect_load_image_meta(mock_image_meta, false, 0); + expect_is_refresh_required(mock_local_image_ctx, false); + expect_is_refresh_required(mock_remote_image_ctx, false); + MockSnapshotCopyRequest mock_snapshot_copy_request; + expect_snapshot_copy(mock_snapshot_copy_request, 0, 1, 0, {{1, CEPH_NOSNAP}}, + 0); + MockGetImageStateRequest mock_get_image_state_request; + expect_get_image_state(mock_get_image_state_request, 1, 0); + MockCreateNonPrimaryRequest mock_create_non_primary_request; + expect_create_non_primary_request(mock_create_non_primary_request, + false, "remote mirror uuid", 1, + {{1, CEPH_NOSNAP}}, 11, 0); + MockImageStateUpdateRequest mock_image_state_update_request; + expect_update_mirror_image_state(mock_image_state_update_request, 0); + expect_notify_sync_request(mock_instance_watcher, mock_local_image_ctx.id, 0); + MockImageCopyRequest mock_image_copy_request; + expect_image_copy(mock_image_copy_request, 0, 1, 0, {}, + {{1, CEPH_NOSNAP}}, 0); + MockApplyImageStateRequest mock_apply_state_request; + EXPECT_CALL(mock_apply_state_request, send()) + .WillOnce(Invoke([this, &req=mock_apply_state_request, + &replayer=mock_replayer, &ctx=shutdown_ctx]() { + // inject a shutdown, to be pended due to STATE_REPLAYING + replayer.shut_down(&ctx); + m_threads->work_queue->queue(req.on_finish, 0); + })); + expect_cancel_sync_request(mock_instance_watcher, mock_local_image_ctx.id); + expect_mirror_image_snapshot_set_copy_progress( + mock_local_image_ctx, 11, true, 0, 0); + expect_notify_update(mock_local_image_ctx); + expect_notify_sync_complete(mock_instance_watcher, mock_local_image_ctx.id); + + // shutdown should be resumed + expect_unregister_update_watcher(mock_remote_image_ctx, 234, 0); + expect_unregister_update_watcher(mock_local_image_ctx, 123, 0); + + // wake-up replayer + update_watch_ctx->handle_notify(); + + ASSERT_EQ(0, wait_for_notification(1)); + ASSERT_FALSE(mock_replayer.is_replaying()); + ASSERT_EQ(0, mock_replayer.get_error_code()); + + ASSERT_EQ(0, shutdown_ctx.wait()); +} + +TEST_F(TestMockImageReplayerSnapshotReplayer, ApplyImageStateErrorPendingShutdown) { + librbd::MockTestImageCtx mock_local_image_ctx{*m_local_image_ctx}; + librbd::MockTestImageCtx mock_remote_image_ctx{*m_remote_image_ctx}; + + MockThreads mock_threads(m_threads); + expect_work_queue_repeatedly(mock_threads); + + MockReplayerListener mock_replayer_listener; + expect_notification(mock_threads, mock_replayer_listener); + + InSequence seq; + + MockInstanceWatcher mock_instance_watcher; + MockImageMeta mock_image_meta; + MockStateBuilder mock_state_builder(mock_local_image_ctx, + mock_remote_image_ctx, + mock_image_meta); + MockReplayer mock_replayer{&mock_threads, &mock_instance_watcher, + "local mirror uuid", &m_pool_meta_cache, + &mock_state_builder, &mock_replayer_listener}; + C_SaferCond shutdown_ctx; + m_pool_meta_cache.set_remote_pool_meta( + m_remote_io_ctx.get_id(), + {"remote mirror uuid", "remote mirror peer uuid"}); + + librbd::UpdateWatchCtx* update_watch_ctx = nullptr; + ASSERT_EQ(0, init_entry_replayer(mock_replayer, mock_threads, + mock_local_image_ctx, + mock_remote_image_ctx, + mock_replayer_listener, + mock_image_meta, + &update_watch_ctx)); + + // inject snapshot + mock_remote_image_ctx.snap_info = { + {1U, librbd::SnapInfo{"snap1", cls::rbd::MirrorSnapshotNamespace{ + cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY, {"remote mirror peer uuid"}, "", + CEPH_NOSNAP, true, 0, {}}, + 0, {}, 0, 0, {}}}}; + + // sync snap1 + expect_load_image_meta(mock_image_meta, false, 0); + expect_is_refresh_required(mock_local_image_ctx, false); + expect_is_refresh_required(mock_remote_image_ctx, false); + MockSnapshotCopyRequest mock_snapshot_copy_request; + expect_snapshot_copy(mock_snapshot_copy_request, 0, 1, 0, {{1, CEPH_NOSNAP}}, + 0); + MockGetImageStateRequest mock_get_image_state_request; + expect_get_image_state(mock_get_image_state_request, 1, 0); + MockCreateNonPrimaryRequest mock_create_non_primary_request; + expect_create_non_primary_request(mock_create_non_primary_request, + false, "remote mirror uuid", 1, + {{1, CEPH_NOSNAP}}, 11, 0); + MockImageStateUpdateRequest mock_image_state_update_request; + expect_update_mirror_image_state(mock_image_state_update_request, 0); + expect_notify_sync_request(mock_instance_watcher, mock_local_image_ctx.id, 0); + MockImageCopyRequest mock_image_copy_request; + expect_image_copy(mock_image_copy_request, 0, 1, 0, {}, + {{1, CEPH_NOSNAP}}, 0); + MockApplyImageStateRequest mock_apply_state_request; + EXPECT_CALL(mock_apply_state_request, send()) + .WillOnce(Invoke([this, &req=mock_apply_state_request, + &replayer=mock_replayer, &ctx=shutdown_ctx]() { + // inject a shutdown, to be pended due to STATE_REPLAYING + replayer.shut_down(&ctx); + m_threads->work_queue->queue(req.on_finish, -EINVAL); + })); + expect_cancel_sync_request(mock_instance_watcher, mock_local_image_ctx.id); + expect_notify_sync_complete(mock_instance_watcher, mock_local_image_ctx.id); + + // shutdown should be resumed + expect_unregister_update_watcher(mock_remote_image_ctx, 234, 0); + expect_unregister_update_watcher(mock_local_image_ctx, 123, 0); + + // wake-up replayer + update_watch_ctx->handle_notify(); + + ASSERT_EQ(0, shutdown_ctx.wait()); +} + } // namespace snapshot } // namespace image_replayer } // namespace mirror diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc b/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc index b79c652eb548b..71983b5b31c67 100644 --- a/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc +++ b/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc @@ -1490,17 +1490,23 @@ void Replayer::handle_replay_complete(std::unique_lock* locker, const std::string& description) { ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); - if (m_error_code == 0) { - m_error_code = r; - m_error_description = description; - } - if (m_sync_in_progress) { m_sync_in_progress = false; m_instance_watcher->notify_sync_complete( m_state_builder->local_image_ctx->id); } + // don't set error code and description if resuming a pending + // shutdown + if (is_replay_interrupted(locker)) { + return; + } + + if (m_error_code == 0) { + m_error_code = r; + m_error_description = description; + } + if (m_state != STATE_REPLAYING && m_state != STATE_IDLE) { return; } -- 2.39.5