rbd-mirror: remove callout when destroying pool replayer
author     Ilya Dryomov <idryomov@gmail.com>
           Wed, 24 Apr 2024 10:45:27 +0000 (12:45 +0200)
committer  Ilya Dryomov <idryomov@gmail.com>
           Tue, 7 May 2024 08:02:34 +0000 (10:02 +0200)
If a pool replayer is removed in an error state (e.g. after failing to
connect to the remote cluster), its callout should be removed as well.
Otherwise, the error would persist, causing "daemon health: ERROR"
status to be reported even after a new pool replayer is created and
started successfully.

Fixes: https://tracker.ceph.com/issues/65487
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
(cherry picked from commit b7e79642d53c1802ae1202c3f013677fcfd26b1b)
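
For reference, the "daemon health" and callout state described above are exposed in the verbose pool status JSON that the new test below parses with jq. A minimal sketch of the same check, with the cluster name "site-a" and pool name "mypool" used as placeholders:

    # Query verbose mirroring status for a pool; the field names match what
    # the new test asserts on.
    status=$(rbd --cluster site-a mirror pool status mypool --verbose --format json)

    # Per-daemon health; before this fix it could stay "ERROR" because a
    # stale callout outlived the failed pool replayer.
    jq -r '.summary.daemon_health' <<< "${status}"
    jq -r '.daemons[0].health' <<< "${status}"

    # Outstanding callouts, if any (the key is absent when the daemon is healthy).
    jq -r '.daemons[0].callouts // empty' <<< "${status}"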

qa/workunits/rbd/rbd_mirror_bootstrap.sh
qa/workunits/rbd/rbd_mirror_helpers.sh
src/test/rbd_mirror/test_mock_PoolReplayer.cc
src/tools/rbd_mirror/PoolReplayer.cc

diff --git a/qa/workunits/rbd/rbd_mirror_bootstrap.sh b/qa/workunits/rbd/rbd_mirror_bootstrap.sh
index f4c1070bc95bac7764178b8b0e2f44edf2c268a4..b1aeeaac9272f25deb0cef702811c52b4084bc29 100755 (executable)
@@ -58,3 +58,38 @@ wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${PARENT_POOL} image1
 wait_for_image_replay_started ${CLUSTER1} ${PARENT_POOL} image2
 write_image ${CLUSTER2} ${PARENT_POOL} image2 100
 wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} image2
+
+testlog "TEST: pool replayer and callout cleanup when peer is updated"
+test_health_state ${CLUSTER1} ${PARENT_POOL} 'OK'
+test_health_state ${CLUSTER2} ${PARENT_POOL} 'OK'
+POOL_STATUS=$(get_pool_status_json ${CLUSTER2} ${PARENT_POOL})
+jq -e '.summary.health == "OK"' <<< ${POOL_STATUS}
+jq -e '.summary.daemon_health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0] | has("callouts") | not' <<< ${POOL_STATUS}
+OLD_SERVICE_ID=$(jq -r '.daemons[0].service_id' <<< ${POOL_STATUS})
+OLD_INSTANCE_ID=$(jq -r '.daemons[0].instance_id' <<< ${POOL_STATUS})
+# mess up the peer on one of the clusters by setting a bogus user name
+PEER_UUID=$(rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool info --format json | jq -r '.peers[0].uuid')
+rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool peer set ${PEER_UUID} client client.invalid
+wait_for_health_state ${CLUSTER2} ${PARENT_POOL} 'ERROR'
+test_health_state ${CLUSTER1} ${PARENT_POOL} 'WARNING'
+POOL_STATUS=$(get_pool_status_json ${CLUSTER2} ${PARENT_POOL})
+jq -e '.summary.health == "ERROR"' <<< ${POOL_STATUS}
+jq -e '.summary.daemon_health == "ERROR"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].health == "ERROR"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].callouts == ["unable to connect to remote cluster"]' <<< ${POOL_STATUS}
+# restore the correct user name
+rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool peer set ${PEER_UUID} client client.rbd-mirror-peer
+wait_for_health_state ${CLUSTER2} ${PARENT_POOL} 'OK'
+test_health_state ${CLUSTER1} ${PARENT_POOL} 'OK'
+POOL_STATUS=$(get_pool_status_json ${CLUSTER2} ${PARENT_POOL})
+jq -e '.summary.health == "OK"' <<< ${POOL_STATUS}
+jq -e '.summary.daemon_health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0].health == "OK"' <<< ${POOL_STATUS}
+jq -e '.daemons[0] | has("callouts") | not' <<< ${POOL_STATUS}
+NEW_SERVICE_ID=$(jq -r '.daemons[0].service_id' <<< ${POOL_STATUS})
+NEW_INSTANCE_ID=$(jq -r '.daemons[0].instance_id' <<< ${POOL_STATUS})
+# check that we are running the same service (daemon) but a newer pool replayer
+((OLD_SERVICE_ID == NEW_SERVICE_ID))
+((OLD_INSTANCE_ID < NEW_INSTANCE_ID))
diff --git a/qa/workunits/rbd/rbd_mirror_helpers.sh b/qa/workunits/rbd/rbd_mirror_helpers.sh
index b6e1fa87de2a2716dbce26694c6e9c2094934ca2..a1fa099bc4252cd8f453d1297378e090c469164f 100755 (executable)
@@ -632,6 +632,39 @@ flush()
     admin_daemons "${cluster}" ${cmd}
 }
 
+get_pool_status_json()
+{
+    local cluster="$1"
+    local pool="$2"
+
+    CEPH_ARGS='' rbd --cluster "${cluster}" mirror pool status "${pool}" --verbose --format json
+}
+
+test_health_state()
+{
+    local cluster="$1"
+    local pool="$2"
+    local state="$3"
+
+    local status
+    status="$(get_pool_status_json "${cluster}" "${pool}")"
+    jq -e '.summary.health == "'"${state}"'"' <<< "${status}"
+}
+
+wait_for_health_state()
+{
+    local cluster="$1"
+    local pool="$2"
+    local state="$3"
+    local s
+
+    for s in 1 2 4 8 8 8 8 8 8 8 8 16 16; do
+        sleep "${s}"
+        test_health_state "${cluster}" "${pool}" "${state}" && return 0
+    done
+    return 1
+}
+
 test_image_replay_state()
 {
     local cluster=$1
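
As a rough usage sketch of the new test_health_state and wait_for_health_state helpers (cluster name "site-a" and pool name "mypool" are placeholders): test_health_state checks .summary.health once, while wait_for_health_state retries with increasing sleeps, roughly 100 seconds in total, and returns non-zero if the state is never reached.

    # Expect the pool to be healthy right now.
    test_health_state site-a mypool 'OK'

    # After breaking the peer, wait for the daemon to report the error.
    wait_for_health_state site-a mypool 'ERROR' || echo "never reached ERROR state"
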
diff --git a/src/test/rbd_mirror/test_mock_PoolReplayer.cc b/src/test/rbd_mirror/test_mock_PoolReplayer.cc
index ebd27d7e1cba081d9b4f35cf956dd03572ac7493..5a7d2348a17797cd7e6e6cca402df290fc3a014d 100644 (file)
@@ -533,6 +533,22 @@ public:
         mock_service_daemon, "instance_id", {instance_id});
   }
 
+  void expect_service_daemon_add_or_update_callout(
+      MockServiceDaemon& mock_service_daemon,
+      uint64_t callout_id, uint64_t out_callout_id,
+      service_daemon::CalloutLevel callout_level, const std::string& text) {
+    EXPECT_CALL(mock_service_daemon,
+                add_or_update_callout(m_local_io_ctx.get_id(), callout_id,
+                                      callout_level, text))
+      .WillOnce(Return(out_callout_id));
+  }
+
+  void expect_service_daemon_remove_callout(
+      MockServiceDaemon& mock_service_daemon, uint64_t callout_id) {
+    EXPECT_CALL(mock_service_daemon,
+                remove_callout(m_local_io_ctx.get_id(), callout_id));
+  }
+
   PoolMetaCache m_pool_meta_cache{g_ceph_context};
 };
 
@@ -930,5 +946,143 @@ TEST_F(TestMockPoolReplayer, NamespacesError) {
   pool_replayer.shut_down();
 }
 
+TEST_F(TestMockPoolReplayer, RemoveCalloutOnInit) {
+  PeerSpec peer_spec{"uuid", "cluster name", "client.name"};
+
+  auto mock_default_namespace_replayer = new MockNamespaceReplayer();
+  expect_namespace_replayer_is_blocklisted(*mock_default_namespace_replayer,
+                                           false);
+
+  auto mock_leader_watcher = new MockLeaderWatcher();
+  expect_leader_watcher_get_leader_instance_id(*mock_leader_watcher);
+  expect_leader_watcher_is_blocklisted(*mock_leader_watcher, false);
+
+  MockThreads mock_threads(m_threads);
+  expect_work_queue(mock_threads);
+
+  InSequence seq;
+
+  auto& mock_cluster = get_mock_cluster();
+  auto mock_local_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  auto mock_remote_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  auto mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+    m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "", -EPERM);
+
+  MockServiceDaemon mock_service_daemon;
+  expect_service_daemon_add_or_update_callout(
+    mock_service_daemon, service_daemon::CALLOUT_ID_NONE, 123,
+    service_daemon::CALLOUT_LEVEL_ERROR, "unable to query local mirror uuid");
+
+  MockPoolReplayer pool_replayer(&mock_threads, &mock_service_daemon, nullptr,
+                                 &m_pool_meta_cache,
+                                 m_local_io_ctx.get_id(), peer_spec, {});
+  pool_replayer.init("siteA");
+  pool_replayer.shut_down();
+
+  mock_local_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  mock_remote_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+    m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "uuid", 0);
+
+  auto mock_remote_pool_poller = new MockRemotePoolPoller();
+  expect_remote_pool_poller_init(*mock_remote_pool_poller,
+                                 {"remote mirror uuid", ""}, 0);
+
+  expect_namespace_replayer_init(*mock_default_namespace_replayer, 0);
+  expect_leader_watcher_init(*mock_leader_watcher, 0);
+
+  expect_service_daemon_remove_callout(mock_service_daemon, 123);
+  std::string instance_id = stringify(mock_local_io_ctx->get_instance_id());
+  expect_service_daemon_add_or_update_instance_id_attribute(
+    mock_service_daemon, instance_id);
+
+  pool_replayer.init("siteA");
+
+  expect_leader_watcher_shut_down(*mock_leader_watcher);
+  expect_namespace_replayer_shut_down(*mock_default_namespace_replayer);
+  expect_remote_pool_poller_shut_down(*mock_remote_pool_poller, 0);
+
+  pool_replayer.shut_down();
+}
+
+TEST_F(TestMockPoolReplayer, RemoveCalloutOnDestruction) {
+  PeerSpec peer_spec{"uuid", "cluster name", "client.name"};
+
+  MockThreads mock_threads(m_threads);
+  expect_work_queue(mock_threads);
+
+  InSequence seq;
+
+  auto& mock_cluster = get_mock_cluster();
+  auto mock_local_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  auto mock_remote_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  auto mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+    m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "", -EPERM);
+
+  MockServiceDaemon mock_service_daemon;
+  expect_service_daemon_add_or_update_callout(
+    mock_service_daemon, service_daemon::CALLOUT_ID_NONE, 123,
+    service_daemon::CALLOUT_LEVEL_ERROR, "unable to query local mirror uuid");
+
+  MockPoolReplayer pool_replayer(&mock_threads, &mock_service_daemon, nullptr,
+                                 &m_pool_meta_cache,
+                                 m_local_io_ctx.get_id(), peer_spec, {});
+  pool_replayer.init("siteA");
+  pool_replayer.shut_down();
+
+  mock_local_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_local_rados_client, "ceph", nullptr);
+
+  mock_remote_rados_client = mock_cluster.do_create_rados_client(
+    g_ceph_context);
+  expect_connect(mock_cluster, mock_remote_rados_client, "cluster name",
+                 nullptr);
+
+  mock_local_io_ctx = mock_local_rados_client->do_create_ioctx(
+    m_local_io_ctx.get_id(), m_local_io_ctx.get_pool_name());
+  expect_create_ioctx(mock_local_rados_client, mock_local_io_ctx);
+
+  expect_mirror_uuid_get(mock_local_io_ctx, "", -EPERM);
+  expect_service_daemon_add_or_update_callout(
+    mock_service_daemon, 123, 123, service_daemon::CALLOUT_LEVEL_ERROR,
+    "unable to query local mirror uuid");
+
+  pool_replayer.init("siteA");
+  pool_replayer.shut_down();
+
+  expect_service_daemon_remove_callout(mock_service_daemon, 123);
+}
+
 } // namespace mirror
 } // namespace rbd
diff --git a/src/tools/rbd_mirror/PoolReplayer.cc b/src/tools/rbd_mirror/PoolReplayer.cc
index 81292ba07e2123d06708b380ec278014e1e77154..cb88e6e96f7da6db75528fe36d383a2da2e5d1b9 100644 (file)
@@ -245,6 +245,13 @@ PoolReplayer<I>::~PoolReplayer()
 {
   shut_down();
 
+  // pool replayer instances are generally (unless the peer gets
+  // updated) preserved across errors to reduce ping-ponging of callout
+  // error notifications, so this can't be done in shut_down()
+  if (m_callout_id != service_daemon::CALLOUT_ID_NONE) {
+    m_service_daemon->remove_callout(m_local_pool_id, m_callout_id);
+  }
+
   ceph_assert(m_asok_hook == nullptr);
 }