From facd729fbca38a70fb0ff772fa94d0b9e74fc174 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Wed, 24 Apr 2024 12:45:27 +0200
Subject: [PATCH] rbd-mirror: remove callout when destroying pool replayer

If a pool replayer is removed in an error state (e.g. after failing to
connect to the remote cluster), its callout should be removed as well.
Otherwise, the error would persist causing "daemon health: ERROR"
status to be reported even after a new pool replayer is created and
started successfully.

Fixes: https://tracker.ceph.com/issues/65487
Signed-off-by: Ilya Dryomov
(cherry picked from commit b7e79642d53c1802ae1202c3f013677fcfd26b1b)
---
 qa/workunits/rbd/rbd_mirror_bootstrap.sh      |  35 ++++
 qa/workunits/rbd/rbd_mirror_helpers.sh        |  33 ++++
 src/test/rbd_mirror/test_mock_PoolReplayer.cc | 154 ++++++++++++++++++
 src/tools/rbd_mirror/PoolReplayer.cc          |   7 +
 4 files changed, 229 insertions(+)

diff --git a/qa/workunits/rbd/rbd_mirror_bootstrap.sh b/qa/workunits/rbd/rbd_mirror_bootstrap.sh
index f4c1070bc95ba..b1aeeaac9272f 100755
--- a/qa/workunits/rbd/rbd_mirror_bootstrap.sh
+++ b/qa/workunits/rbd/rbd_mirror_bootstrap.sh
@@ -58,3 +58,38 @@ wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${PARENT_POOL} image1
 wait_for_image_replay_started ${CLUSTER1} ${PARENT_POOL} image2
 write_image ${CLUSTER2} ${PARENT_POOL} image2 100
 wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} image2
+
+testlog "TEST: pool replayer and callout cleanup when peer is updated"
+test_health_state ${CLUSTER1} ${PARENT_POOL} 'OK'
+test_health_state ${CLUSTER2} ${PARENT_POOL} 'OK'
+POOL_STATUS=$(get_pool_status_json ${CLUSTER2} ${PARENT_POOL})
+jq -e '.summary.health == "OK"' <<< ${POOL_STATUS}
+jq -e '.summary.daemon_health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0] | has("callouts") | not' <<< ${POOL_STATUS}
+OLD_SERVICE_ID=$(jq -r '.daemons[0].service_id' <<< ${POOL_STATUS})
+OLD_INSTANCE_ID=$(jq -r '.daemons[0].instance_id' <<< ${POOL_STATUS})
+# mess up the peer on one of the clusters by setting a bogus user name
+PEER_UUID=$(rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool info --format json | jq -r '.peers[0].uuid')
+rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool peer set ${PEER_UUID} client client.invalid
+wait_for_health_state ${CLUSTER2} ${PARENT_POOL} 'ERROR'
+test_health_state ${CLUSTER1} ${PARENT_POOL} 'WARNING'
+POOL_STATUS=$(get_pool_status_json ${CLUSTER2} ${PARENT_POOL})
+jq -e '.summary.health == "ERROR"' <<< ${POOL_STATUS}
+jq -e '.summary.daemon_health == "ERROR"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].health == "ERROR"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].callouts == ["unable to connect to remote cluster"]' <<< ${POOL_STATUS}
+# restore the correct user name
+rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool peer set ${PEER_UUID} client client.rbd-mirror-peer
+wait_for_health_state ${CLUSTER2} ${PARENT_POOL} 'OK'
+test_health_state ${CLUSTER1} ${PARENT_POOL} 'OK'
+POOL_STATUS=$(get_pool_status_json ${CLUSTER2} ${PARENT_POOL})
+jq -e '.summary.health == "OK"' <<< ${POOL_STATUS}
+jq -e '.summary.daemon_health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0] | has("callouts") | not' <<< ${POOL_STATUS}
+NEW_SERVICE_ID=$(jq -r '.daemons[0].service_id' <<< ${POOL_STATUS})
+NEW_INSTANCE_ID=$(jq -r '.daemons[0].instance_id' <<< ${POOL_STATUS})
+# check that we are running the same service (daemon) but a newer pool replayer
+((OLD_SERVICE_ID == NEW_SERVICE_ID))
+((OLD_INSTANCE_ID < NEW_INSTANCE_ID))
diff --git a/qa/workunits/rbd/rbd_mirror_helpers.sh b/qa/workunits/rbd/rbd_mirror_helpers.sh
index b6abff96d6dda..2dfee5181d528 100755
--- a/qa/workunits/rbd/rbd_mirror_helpers.sh
+++ b/qa/workunits/rbd/rbd_mirror_helpers.sh
@@ -623,6 +623,39 @@ flush()
     admin_daemons "${cluster}" ${cmd}
 }

+get_pool_status_json()
+{
+    local cluster="$1"
+    local pool="$2"
+
+    CEPH_ARGS='' rbd --cluster "${cluster}" mirror pool status "${pool}" --verbose --format json
+}
+
+test_health_state()
+{
+    local cluster="$1"
+    local pool="$2"
+    local state="$3"
+
+    local status
+    status="$(get_pool_status_json "${cluster}" "${pool}")"
+    jq -e '.summary.health == "'"${state}"'"' <<< "${status}"
+}
+
+wait_for_health_state()
+{
+    local cluster="$1"
+    local pool="$2"
+    local state="$3"
+    local s
+
+    for s in 1 2 4 8 8 8 8 8 8 8 8 16 16; do
+        sleep "${s}"
+        test_health_state "${cluster}" "${pool}" "${state}" && return 0
+    done
+    return 1
+}
+
 test_image_replay_state()
 {
     local cluster=$1
diff --git a/src/test/rbd_mirror/test_mock_PoolReplayer.cc b/src/test/rbd_mirror/test_mock_PoolReplayer.cc
index ebd27d7e1cba0..5a7d2348a1779 100644
--- a/src/test/rbd_mirror/test_mock_PoolReplayer.cc
+++ b/src/test/rbd_mirror/test_mock_PoolReplayer.cc
@@ -533,6 +533,22 @@ public:
       mock_service_daemon, "instance_id", {instance_id});
   }

+  void expect_service_daemon_add_or_update_callout(
+      MockServiceDaemon& mock_service_daemon,
+      uint64_t callout_id, uint64_t out_callout_id,
+      service_daemon::CalloutLevel callout_level, const std::string& text) {
+    EXPECT_CALL(mock_service_daemon,
+                add_or_update_callout(m_local_io_ctx.get_id(), callout_id,
+                                      callout_level, text))
+      .WillOnce(Return(out_callout_id));
+  }
+
+  void expect_service_daemon_remove_callout(
+      MockServiceDaemon& mock_service_daemon, uint64_t callout_id) {
+    EXPECT_CALL(mock_service_daemon,
+                remove_callout(m_local_io_ctx.get_id(), callout_id));
+  }
+
   PoolMetaCache m_pool_meta_cache{g_ceph_context};
 };

@@ -930,5 +946,143 @@ TEST_F(TestMockPoolReplayer, NamespacesError) {
   pool_replayer.shut_down();
 }

+TEST_F(TestMockPoolReplayer, RemoveCalloutOnInit) {
+  PeerSpec peer_spec{"uuid", "cluster name", "client.name"};
+
+  auto mock_default_namespace_replayer = new MockNamespaceReplayer();
+  expect_namespace_replayer_is_blocklisted(*mock_default_namespace_replayer,
+                                           false);
+
+  auto mock_leader_watcher = new MockLeaderWatcher();
+  expect_leader_watcher_get_leader_instance_id(*mock_leader_watcher);
+  expect_leader_watcher_is_blocklisted(*mock_leader_watcher, false);
+
+  MockThreads mock_threads(m_threads);
+  expect_work_queue(mock_threads);
+
+  InSequence seq;
+
+  auto& mock_cluster = get_mock_cluster();
+  auto mock_local_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  auto mock_remote_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  auto mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+    m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "", -EPERM);
+
+  MockServiceDaemon mock_service_daemon;
+  expect_service_daemon_add_or_update_callout(
+    mock_service_daemon, service_daemon::CALLOUT_ID_NONE, 123,
+    service_daemon::CALLOUT_LEVEL_ERROR, "unable to query local mirror uuid");
+
+  MockPoolReplayer pool_replayer(&mock_threads, &mock_service_daemon, nullptr,
+                                 &m_pool_meta_cache,
+                                 m_local_io_ctx.get_id(), peer_spec, {});
+  pool_replayer.init("siteA");
+  pool_replayer.shut_down();
+
+  mock_local_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  mock_remote_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+    m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "uuid", 0);
+
+  auto mock_remote_pool_poller = new MockRemotePoolPoller();
+  expect_remote_pool_poller_init(*mock_remote_pool_poller,
+                                 {"remote mirror uuid", ""}, 0);
+
+  expect_namespace_replayer_init(*mock_default_namespace_replayer, 0);
+  expect_leader_watcher_init(*mock_leader_watcher, 0);
+
+  expect_service_daemon_remove_callout(mock_service_daemon, 123);
+  std::string instance_id = stringify(mock_local_io_ctx->get_instance_id());
+  expect_service_daemon_add_or_update_instance_id_attribute(
+    mock_service_daemon, instance_id);
+
+  pool_replayer.init("siteA");
+
+  expect_leader_watcher_shut_down(*mock_leader_watcher);
+  expect_namespace_replayer_shut_down(*mock_default_namespace_replayer);
+  expect_remote_pool_poller_shut_down(*mock_remote_pool_poller, 0);
+
+  pool_replayer.shut_down();
+}
+
+TEST_F(TestMockPoolReplayer, RemoveCalloutOnDestruction) {
+  PeerSpec peer_spec{"uuid", "cluster name", "client.name"};
+
+  MockThreads mock_threads(m_threads);
+  expect_work_queue(mock_threads);
+
+  InSequence seq;
+
+  auto& mock_cluster = get_mock_cluster();
+  auto mock_local_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  auto mock_remote_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  auto mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+    m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "", -EPERM);
+
+  MockServiceDaemon mock_service_daemon;
+  expect_service_daemon_add_or_update_callout(
+    mock_service_daemon, service_daemon::CALLOUT_ID_NONE, 123,
+    service_daemon::CALLOUT_LEVEL_ERROR, "unable to query local mirror uuid");
+
+  MockPoolReplayer pool_replayer(&mock_threads, &mock_service_daemon, nullptr,
+                                 &m_pool_meta_cache,
+                                 m_local_io_ctx.get_id(), peer_spec, {});
+  pool_replayer.init("siteA");
+  pool_replayer.shut_down();
+
+  mock_local_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  mock_remote_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+    m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "", -EPERM);
+  expect_service_daemon_add_or_update_callout(
+    mock_service_daemon, 123, 123, service_daemon::CALLOUT_LEVEL_ERROR,
+    "unable to query local mirror uuid");
+
+  pool_replayer.init("siteA");
+  pool_replayer.shut_down();
+
+  expect_service_daemon_remove_callout(mock_service_daemon, 123);
+}
+
 } // namespace mirror
 } // namespace rbd
diff --git a/src/tools/rbd_mirror/PoolReplayer.cc b/src/tools/rbd_mirror/PoolReplayer.cc
index 8a04219da0686..87001b8af45c0 100644
--- a/src/tools/rbd_mirror/PoolReplayer.cc
+++ b/src/tools/rbd_mirror/PoolReplayer.cc
@@ -245,6 +245,13 @@ PoolReplayer<I>::~PoolReplayer()
 {
   shut_down();

+  // pool replayer instances are generally (unless the peer gets
+  // updated) preserved across errors to reduce ping-ponging of callout
+  // error notifications, so this can't be done in shut_down()
+  if (m_callout_id != service_daemon::CALLOUT_ID_NONE) {
+    m_service_daemon->remove_callout(m_local_pool_id, m_callout_id);
+  }
+
   ceph_assert(m_asok_hook == nullptr);
 }

-- 
2.39.5