rbd-mirror: remove callout when destroying pool replayer
author     Ilya Dryomov <idryomov@gmail.com>
           Wed, 24 Apr 2024 10:45:27 +0000 (12:45 +0200)
committer  Ilya Dryomov <idryomov@gmail.com>
           Tue, 7 May 2024 08:01:27 +0000 (10:01 +0200)
If a pool replayer is removed in an error state (e.g. after failing to
connect to the remote cluster), its callout should be removed as well.
Otherwise, the error would persist, causing a "daemon health: ERROR"
status to be reported even after a new pool replayer is created and
started successfully.

Fixes: https://tracker.ceph.com/issues/65487
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
(cherry picked from commit b7e79642d53c1802ae1202c3f013677fcfd26b1b)
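
For reference, the stale state described above can be observed through the
verbose pool status output that the qa test below also checks; the pool name
here is only a placeholder:

    # Hypothetical pool name; substitute the affected pool.
    POOL=mypool
    # Before this fix, .summary.daemon_health could stay "ERROR" and
    # .daemons[0].callouts could keep listing "unable to connect to remote
    # cluster" even after a replacement pool replayer had started successfully.
    rbd mirror pool status "${POOL}" --verbose --format json |
        jq '{daemon_health: .summary.daemon_health, callouts: .daemons[0].callouts}'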

qa/workunits/rbd/rbd_mirror_bootstrap.sh
qa/workunits/rbd/rbd_mirror_helpers.sh
src/test/rbd_mirror/test_mock_PoolReplayer.cc
src/tools/rbd_mirror/PoolReplayer.cc

diff --git a/qa/workunits/rbd/rbd_mirror_bootstrap.sh b/qa/workunits/rbd/rbd_mirror_bootstrap.sh
index f4c1070bc95bac7764178b8b0e2f44edf2c268a4..b1aeeaac9272f25deb0cef702811c52b4084bc29 100755 (executable)
@@ -58,3 +58,38 @@ wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${PARENT_POOL} image1
 wait_for_image_replay_started ${CLUSTER1} ${PARENT_POOL} image2
 write_image ${CLUSTER2} ${PARENT_POOL} image2 100
 wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} image2
+
+testlog "TEST: pool replayer and callout cleanup when peer is updated"
+test_health_state ${CLUSTER1} ${PARENT_POOL} 'OK'
+test_health_state ${CLUSTER2} ${PARENT_POOL} 'OK'
+POOL_STATUS=$(get_pool_status_json ${CLUSTER2} ${PARENT_POOL})
+jq -e '.summary.health == "OK"' <<< ${POOL_STATUS}
+jq -e '.summary.daemon_health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0] | has("callouts") | not' <<< ${POOL_STATUS}
+OLD_SERVICE_ID=$(jq -r '.daemons[0].service_id' <<< ${POOL_STATUS})
+OLD_INSTANCE_ID=$(jq -r '.daemons[0].instance_id' <<< ${POOL_STATUS})
+# mess up the peer on one of the clusters by setting a bogus user name
+PEER_UUID=$(rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool info --format json | jq -r '.peers[0].uuid')
+rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool peer set ${PEER_UUID} client client.invalid
+wait_for_health_state ${CLUSTER2} ${PARENT_POOL} 'ERROR'
+test_health_state ${CLUSTER1} ${PARENT_POOL} 'WARNING'
+POOL_STATUS=$(get_pool_status_json ${CLUSTER2} ${PARENT_POOL})
+jq -e '.summary.health == "ERROR"' <<< ${POOL_STATUS}
+jq -e '.summary.daemon_health == "ERROR"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].health == "ERROR"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].callouts == ["unable to connect to remote cluster"]' <<< ${POOL_STATUS}
+# restore the correct user name
+rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool peer set ${PEER_UUID} client client.rbd-mirror-peer
+wait_for_health_state ${CLUSTER2} ${PARENT_POOL} 'OK'
+test_health_state ${CLUSTER1} ${PARENT_POOL} 'OK'
+POOL_STATUS=$(get_pool_status_json ${CLUSTER2} ${PARENT_POOL})
+jq -e '.summary.health == "OK"' <<< ${POOL_STATUS}
+jq -e '.summary.daemon_health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0] | has("callouts") | not' <<< ${POOL_STATUS}
+NEW_SERVICE_ID=$(jq -r '.daemons[0].service_id' <<< ${POOL_STATUS})
+NEW_INSTANCE_ID=$(jq -r '.daemons[0].instance_id' <<< ${POOL_STATUS})
+# check that we are running the same service (daemon) but a newer pool replayer
+((OLD_SERVICE_ID == NEW_SERVICE_ID))
+((OLD_INSTANCE_ID < NEW_INSTANCE_ID))
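
The last two checks rely on the shape of the verbose status JSON: service_id
identifies the rbd-mirror daemon itself, while the mock expectations in the
test below suggest that instance_id is derived from the pool replayer's local
RADOS connection, so a surviving daemon with a recreated pool replayer reports
the same service_id but a larger instance_id. A quick way to inspect just
those two fields (cluster and pool names are placeholders for the qa
variables used above):

    rbd --cluster cluster2 mirror pool status mirror_parent \
        --verbose --format json | jq '.daemons[0] | {service_id, instance_id}'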
diff --git a/qa/workunits/rbd/rbd_mirror_helpers.sh b/qa/workunits/rbd/rbd_mirror_helpers.sh
index b6abff96d6dda80890a237bb7eb0abcbcb00b6c3..2dfee5181d528841c46bed9d7a3721385f4eaf07 100755 (executable)
@@ -623,6 +623,39 @@ flush()
     admin_daemons "${cluster}" ${cmd}
 }
 
+get_pool_status_json()
+{
+    local cluster="$1"
+    local pool="$2"
+
+    CEPH_ARGS='' rbd --cluster "${cluster}" mirror pool status "${pool}" --verbose --format json
+}
+
+test_health_state()
+{
+    local cluster="$1"
+    local pool="$2"
+    local state="$3"
+
+    local status
+    status="$(get_pool_status_json "${cluster}" "${pool}")"
+    jq -e '.summary.health == "'"${state}"'"' <<< "${status}"
+}
+
+wait_for_health_state()
+{
+    local cluster="$1"
+    local pool="$2"
+    local state="$3"
+    local s
+
+    for s in 1 2 4 8 8 8 8 8 8 8 8 16 16; do
+        sleep "${s}"
+        test_health_state "${cluster}" "${pool}" "${state}" && return 0
+    done
+    return 1
+}
+
 test_image_replay_state()
 {
     local cluster=$1
diff --git a/src/test/rbd_mirror/test_mock_PoolReplayer.cc b/src/test/rbd_mirror/test_mock_PoolReplayer.cc
index ebd27d7e1cba081d9b4f35cf956dd03572ac7493..5a7d2348a17797cd7e6e6cca402df290fc3a014d 100644 (file)
@@ -533,6 +533,22 @@ public:
         mock_service_daemon, "instance_id", {instance_id});
   }
 
+  void expect_service_daemon_add_or_update_callout(
+      MockServiceDaemon& mock_service_daemon,
+      uint64_t callout_id, uint64_t out_callout_id,
+      service_daemon::CalloutLevel callout_level, const std::string& text) {
+    EXPECT_CALL(mock_service_daemon,
+                add_or_update_callout(m_local_io_ctx.get_id(), callout_id,
+                                      callout_level, text))
+      .WillOnce(Return(out_callout_id));
+  }
+
+  void expect_service_daemon_remove_callout(
+      MockServiceDaemon& mock_service_daemon, uint64_t callout_id) {
+    EXPECT_CALL(mock_service_daemon,
+                remove_callout(m_local_io_ctx.get_id(), callout_id));
+  }
+
   PoolMetaCache m_pool_meta_cache{g_ceph_context};
 };
 
@@ -930,5 +946,143 @@ TEST_F(TestMockPoolReplayer, NamespacesError) {
   pool_replayer.shut_down();
 }
 
+TEST_F(TestMockPoolReplayer, RemoveCalloutOnInit) {
+  PeerSpec peer_spec{"uuid", "cluster name", "client.name"};
+
+  auto mock_default_namespace_replayer = new MockNamespaceReplayer();
+  expect_namespace_replayer_is_blocklisted(*mock_default_namespace_replayer,
+                                           false);
+
+  auto mock_leader_watcher = new MockLeaderWatcher();
+  expect_leader_watcher_get_leader_instance_id(*mock_leader_watcher);
+  expect_leader_watcher_is_blocklisted(*mock_leader_watcher, false);
+
+  MockThreads mock_threads(m_threads);
+  expect_work_queue(mock_threads);
+
+  InSequence seq;
+
+  auto& mock_cluster = get_mock_cluster();
+  auto mock_local_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  auto mock_remote_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  auto mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+    m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "", -EPERM);
+
+  MockServiceDaemon mock_service_daemon;
+  expect_service_daemon_add_or_update_callout(
+    mock_service_daemon, service_daemon::CALLOUT_ID_NONE, 123,
+    service_daemon::CALLOUT_LEVEL_ERROR, "unable to query local mirror uuid");
+
+  MockPoolReplayer pool_replayer(&mock_threads, &mock_service_daemon, nullptr,
+                                 &m_pool_meta_cache,
+                                 m_local_io_ctx.get_id(), peer_spec, {});
+  pool_replayer.init("siteA");
+  pool_replayer.shut_down();
+
+  mock_local_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  mock_remote_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+    m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "uuid", 0);
+
+  auto mock_remote_pool_poller = new MockRemotePoolPoller();
+  expect_remote_pool_poller_init(*mock_remote_pool_poller,
+                                 {"remote mirror uuid", ""}, 0);
+
+  expect_namespace_replayer_init(*mock_default_namespace_replayer, 0);
+  expect_leader_watcher_init(*mock_leader_watcher, 0);
+
+  expect_service_daemon_remove_callout(mock_service_daemon, 123);
+  std::string instance_id = stringify(mock_local_io_ctx->get_instance_id());
+  expect_service_daemon_add_or_update_instance_id_attribute(
+    mock_service_daemon, instance_id);
+
+  pool_replayer.init("siteA");
+
+  expect_leader_watcher_shut_down(*mock_leader_watcher);
+  expect_namespace_replayer_shut_down(*mock_default_namespace_replayer);
+  expect_remote_pool_poller_shut_down(*mock_remote_pool_poller, 0);
+
+  pool_replayer.shut_down();
+}
+
+TEST_F(TestMockPoolReplayer, RemoveCalloutOnDestruction) {
+  PeerSpec peer_spec{"uuid", "cluster name", "client.name"};
+
+  MockThreads mock_threads(m_threads);
+  expect_work_queue(mock_threads);
+
+  InSequence seq;
+
+  auto& mock_cluster = get_mock_cluster();
+  auto mock_local_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  auto mock_remote_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  auto mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+    m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "", -EPERM);
+
+  MockServiceDaemon mock_service_daemon;
+  expect_service_daemon_add_or_update_callout(
+    mock_service_daemon, service_daemon::CALLOUT_ID_NONE, 123,
+    service_daemon::CALLOUT_LEVEL_ERROR, "unable to query local mirror uuid");
+
+  MockPoolReplayer pool_replayer(&mock_threads, &mock_service_daemon, nullptr,
+                                 &m_pool_meta_cache,
+                                 m_local_io_ctx.get_id(), peer_spec, {});
+  pool_replayer.init("siteA");
+  pool_replayer.shut_down();
+
+  mock_local_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  mock_remote_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+    m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "", -EPERM);
+  expect_service_daemon_add_or_update_callout(
+    mock_service_daemon, 123, 123, service_daemon::CALLOUT_LEVEL_ERROR,
+    "unable to query local mirror uuid");
+
+  pool_replayer.init("siteA");
+  pool_replayer.shut_down();
+
+  expect_service_daemon_remove_callout(mock_service_daemon, 123);
+}
+
 } // namespace mirror
 } // namespace rbd
diff --git a/src/tools/rbd_mirror/PoolReplayer.cc b/src/tools/rbd_mirror/PoolReplayer.cc
index 8a04219da06861487371000cc0dff01c31495dbe..87001b8af45c08ec8f998b0168a81177914c9faf 100644 (file)
@@ -245,6 +245,13 @@ PoolReplayer<I>::~PoolReplayer()
 {
   shut_down();
 
+  // pool replayer instances are generally (unless the peer gets
+  // updated) preserved across errors to reduce ping-ponging of callout
+  // error notifications, so this can't be done in shut_down()
+  if (m_callout_id != service_daemon::CALLOUT_ID_NONE) {
+    m_service_daemon->remove_callout(m_local_pool_id, m_callout_id);
+  }
+
   ceph_assert(m_asok_hook == nullptr);
 }
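
As a rough manual check of the behavior covered by the qa test above (cluster,
pool and peer user names are placeholders mirroring that test):

    # Hypothetical names; adjust to the actual secondary cluster and pool.
    CLUSTER=cluster2
    POOL=mirror_parent
    PEER_UUID=$(rbd --cluster "${CLUSTER}" --pool "${POOL}" mirror pool info \
                    --format json | jq -r '.peers[0].uuid')
    # Break the peer with a bogus user name and wait for daemon health to
    # report ERROR ...
    rbd --cluster "${CLUSTER}" --pool "${POOL}" mirror pool peer set \
        "${PEER_UUID}" client client.invalid
    # ... then restore the correct user name; with this fix the callout is
    # removed and daemon health returns to OK instead of staying ERROR.
    rbd --cluster "${CLUSTER}" --pool "${POOL}" mirror pool peer set \
        "${PEER_UUID}" client client.rbd-mirror-peer
    rbd --cluster "${CLUSTER}" mirror pool status "${POOL}" --verbose \
        --format json | jq '.summary.daemon_health, .daemons[0]'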