From: Ilya Dryomov
Date: Wed, 24 Apr 2024 10:45:27 +0000 (+0200)
Subject: rbd-mirror: remove callout when destroying pool replayer
X-Git-Tag: testing/wip-pdonnell-testing-20240508.183908-debug~27^2~1
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=b7e79642d53c1802ae1202c3f013677fcfd26b1b;p=ceph-ci.git

rbd-mirror: remove callout when destroying pool replayer

If a pool replayer is removed in an error state (e.g. after failing
to connect to the remote cluster), its callout should be removed as
well.  Otherwise, the error would persist causing "daemon health:
ERROR" status to be reported even after a new pool replayer is
created and started successfully.

Fixes: https://tracker.ceph.com/issues/65487
Signed-off-by: Ilya Dryomov
---

diff --git a/qa/workunits/rbd/rbd_mirror_bootstrap.sh b/qa/workunits/rbd/rbd_mirror_bootstrap.sh
index f4c1070bc95..b1aeeaac927 100755
--- a/qa/workunits/rbd/rbd_mirror_bootstrap.sh
+++ b/qa/workunits/rbd/rbd_mirror_bootstrap.sh
@@ -58,3 +58,38 @@ wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${PARENT_POOL} image1
 wait_for_image_replay_started ${CLUSTER1} ${PARENT_POOL} image2
 write_image ${CLUSTER2} ${PARENT_POOL} image2 100
 wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} image2
+
+testlog "TEST: pool replayer and callout cleanup when peer is updated"
+test_health_state ${CLUSTER1} ${PARENT_POOL} 'OK'
+test_health_state ${CLUSTER2} ${PARENT_POOL} 'OK'
+POOL_STATUS=$(get_pool_status_json ${CLUSTER2} ${PARENT_POOL})
+jq -e '.summary.health == "OK"' <<< ${POOL_STATUS}
+jq -e '.summary.daemon_health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0] | has("callouts") | not' <<< ${POOL_STATUS}
+OLD_SERVICE_ID=$(jq -r '.daemons[0].service_id' <<< ${POOL_STATUS})
+OLD_INSTANCE_ID=$(jq -r '.daemons[0].instance_id' <<< ${POOL_STATUS})
+# mess up the peer on one of the clusters by setting a bogus user name
+PEER_UUID=$(rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool info --format json | jq -r '.peers[0].uuid')
+rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool peer set ${PEER_UUID} client client.invalid
+wait_for_health_state ${CLUSTER2} ${PARENT_POOL} 'ERROR'
+test_health_state ${CLUSTER1} ${PARENT_POOL} 'WARNING'
+POOL_STATUS=$(get_pool_status_json ${CLUSTER2} ${PARENT_POOL})
+jq -e '.summary.health == "ERROR"' <<< ${POOL_STATUS}
+jq -e '.summary.daemon_health == "ERROR"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].health == "ERROR"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].callouts == ["unable to connect to remote cluster"]' <<< ${POOL_STATUS}
+# restore the correct user name
+rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool peer set ${PEER_UUID} client client.rbd-mirror-peer
+wait_for_health_state ${CLUSTER2} ${PARENT_POOL} 'OK'
+test_health_state ${CLUSTER1} ${PARENT_POOL} 'OK'
+POOL_STATUS=$(get_pool_status_json ${CLUSTER2} ${PARENT_POOL})
+jq -e '.summary.health == "OK"' <<< ${POOL_STATUS}
+jq -e '.summary.daemon_health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0] | has("callouts") | not' <<< ${POOL_STATUS}
+NEW_SERVICE_ID=$(jq -r '.daemons[0].service_id' <<< ${POOL_STATUS})
+NEW_INSTANCE_ID=$(jq -r '.daemons[0].instance_id' <<< ${POOL_STATUS})
+# check that we are running the same service (daemon) but a newer pool replayer
+((OLD_SERVICE_ID == NEW_SERVICE_ID))
+((OLD_INSTANCE_ID < NEW_INSTANCE_ID))
diff --git a/qa/workunits/rbd/rbd_mirror_helpers.sh b/qa/workunits/rbd/rbd_mirror_helpers.sh
index b6e1fa87de2..a1fa099bc42 100755
--- a/qa/workunits/rbd/rbd_mirror_helpers.sh
+++ b/qa/workunits/rbd/rbd_mirror_helpers.sh
@@ -632,6 +632,39 @@ flush()
     admin_daemons "${cluster}" ${cmd}
 }
 
+get_pool_status_json()
+{
+    local cluster="$1"
+    local pool="$2"
+
+    CEPH_ARGS='' rbd --cluster "${cluster}" mirror pool status "${pool}" --verbose --format json
+}
+
+test_health_state()
+{
+    local cluster="$1"
+    local pool="$2"
+    local state="$3"
+
+    local status
+    status="$(get_pool_status_json "${cluster}" "${pool}")"
+    jq -e '.summary.health == "'"${state}"'"' <<< "${status}"
+}
+
+wait_for_health_state()
+{
+    local cluster="$1"
+    local pool="$2"
+    local state="$3"
+    local s
+
+    for s in 1 2 4 8 8 8 8 8 8 8 8 16 16; do
+        sleep "${s}"
+        test_health_state "${cluster}" "${pool}" "${state}" && return 0
+    done
+    return 1
+}
+
 test_image_replay_state()
 {
     local cluster=$1
diff --git a/src/test/rbd_mirror/test_mock_PoolReplayer.cc b/src/test/rbd_mirror/test_mock_PoolReplayer.cc
index ebd27d7e1cb..5a7d2348a17 100644
--- a/src/test/rbd_mirror/test_mock_PoolReplayer.cc
+++ b/src/test/rbd_mirror/test_mock_PoolReplayer.cc
@@ -533,6 +533,22 @@ public:
         mock_service_daemon, "instance_id", {instance_id});
   }
 
+  void expect_service_daemon_add_or_update_callout(
+      MockServiceDaemon& mock_service_daemon,
+      uint64_t callout_id, uint64_t out_callout_id,
+      service_daemon::CalloutLevel callout_level, const std::string& text) {
+    EXPECT_CALL(mock_service_daemon,
+                add_or_update_callout(m_local_io_ctx.get_id(), callout_id,
+                                      callout_level, text))
+      .WillOnce(Return(out_callout_id));
+  }
+
+  void expect_service_daemon_remove_callout(
+      MockServiceDaemon& mock_service_daemon, uint64_t callout_id) {
+    EXPECT_CALL(mock_service_daemon,
+                remove_callout(m_local_io_ctx.get_id(), callout_id));
+  }
+
   PoolMetaCache m_pool_meta_cache{g_ceph_context};
 };
 
@@ -930,5 +946,143 @@ TEST_F(TestMockPoolReplayer, NamespacesError) {
   pool_replayer.shut_down();
 }
 
+TEST_F(TestMockPoolReplayer, RemoveCalloutOnInit) {
+  PeerSpec peer_spec{"uuid", "cluster name", "client.name"};
+
+  auto mock_default_namespace_replayer = new MockNamespaceReplayer();
+  expect_namespace_replayer_is_blocklisted(*mock_default_namespace_replayer,
+                                           false);
+
+  auto mock_leader_watcher = new MockLeaderWatcher();
+  expect_leader_watcher_get_leader_instance_id(*mock_leader_watcher);
+  expect_leader_watcher_is_blocklisted(*mock_leader_watcher, false);
+
+  MockThreads mock_threads(m_threads);
+  expect_work_queue(mock_threads);
+
+  InSequence seq;
+
+  auto& mock_cluster = get_mock_cluster();
+  auto mock_local_rados_client = mock_cluster.do_create_rados_client(
+      g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  auto mock_remote_rados_client = mock_cluster.do_create_rados_client(
+      g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  auto mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+      m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "", -EPERM);
+
+  MockServiceDaemon mock_service_daemon;
+  expect_service_daemon_add_or_update_callout(
+      mock_service_daemon, service_daemon::CALLOUT_ID_NONE, 123,
+      service_daemon::CALLOUT_LEVEL_ERROR, "unable to query local mirror uuid");
+
+  MockPoolReplayer pool_replayer(&mock_threads, &mock_service_daemon, nullptr,
+                                 &m_pool_meta_cache,
+                                 m_local_io_ctx.get_id(), peer_spec, {});
+  pool_replayer.init("siteA");
+  pool_replayer.shut_down();
+
+  mock_local_rados_client = mock_cluster.do_create_rados_client(
+      g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  mock_remote_rados_client = mock_cluster.do_create_rados_client(
+      g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+      m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "uuid", 0);
+
+  auto mock_remote_pool_poller = new MockRemotePoolPoller();
+  expect_remote_pool_poller_init(*mock_remote_pool_poller,
+                                 {"remote mirror uuid", ""}, 0);
+
+  expect_namespace_replayer_init(*mock_default_namespace_replayer, 0);
+  expect_leader_watcher_init(*mock_leader_watcher, 0);
+
+  expect_service_daemon_remove_callout(mock_service_daemon, 123);
+  std::string instance_id = stringify(mock_local_io_ctx->get_instance_id());
+  expect_service_daemon_add_or_update_instance_id_attribute(
+      mock_service_daemon, instance_id);
+
+  pool_replayer.init("siteA");
+
+  expect_leader_watcher_shut_down(*mock_leader_watcher);
+  expect_namespace_replayer_shut_down(*mock_default_namespace_replayer);
+  expect_remote_pool_poller_shut_down(*mock_remote_pool_poller, 0);
+
+  pool_replayer.shut_down();
+}
+
+TEST_F(TestMockPoolReplayer, RemoveCalloutOnDestruction) {
+  PeerSpec peer_spec{"uuid", "cluster name", "client.name"};
+
+  MockThreads mock_threads(m_threads);
+  expect_work_queue(mock_threads);
+
+  InSequence seq;
+
+  auto& mock_cluster = get_mock_cluster();
+  auto mock_local_rados_client = mock_cluster.do_create_rados_client(
+      g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  auto mock_remote_rados_client = mock_cluster.do_create_rados_client(
+      g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  auto mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+      m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "", -EPERM);
+
+  MockServiceDaemon mock_service_daemon;
+  expect_service_daemon_add_or_update_callout(
+      mock_service_daemon, service_daemon::CALLOUT_ID_NONE, 123,
+      service_daemon::CALLOUT_LEVEL_ERROR, "unable to query local mirror uuid");
+
+  MockPoolReplayer pool_replayer(&mock_threads, &mock_service_daemon, nullptr,
+                                 &m_pool_meta_cache,
+                                 m_local_io_ctx.get_id(), peer_spec, {});
+  pool_replayer.init("siteA");
+  pool_replayer.shut_down();
+
+  mock_local_rados_client = mock_cluster.do_create_rados_client(
+      g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  mock_remote_rados_client = mock_cluster.do_create_rados_client(
+      g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+      m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "", -EPERM);
+  expect_service_daemon_add_or_update_callout(
+      mock_service_daemon, 123, 123, service_daemon::CALLOUT_LEVEL_ERROR,
+      "unable to query local mirror uuid");
+
+  pool_replayer.init("siteA");
+  pool_replayer.shut_down();
+
+  expect_service_daemon_remove_callout(mock_service_daemon, 123);
+}
+
 } // namespace mirror
 } // namespace rbd
diff --git a/src/tools/rbd_mirror/PoolReplayer.cc b/src/tools/rbd_mirror/PoolReplayer.cc
index 81292ba07e2..cb88e6e96f7 100644
--- a/src/tools/rbd_mirror/PoolReplayer.cc
+++ b/src/tools/rbd_mirror/PoolReplayer.cc
@@ -245,6 +245,13 @@
 PoolReplayer<I>::~PoolReplayer() {
   shut_down();
 
+  // pool replayer instances are generally (unless the peer gets
+  // updated) preserved across errors to reduce ping-ponging of callout
+  // error notifications, so this can't be done in shut_down()
+  if (m_callout_id != service_daemon::CALLOUT_ID_NONE) {
+    m_service_daemon->remove_callout(m_local_pool_id, m_callout_id);
+  }
+
   ceph_assert(m_asok_hook == nullptr);
 }