From 9d209be7f29c4aed3bbf1588f27f2b3564e42d94 Mon Sep 17 00:00:00 2001
From: Jos Collin
Date: Fri, 8 Mar 2024 09:34:57 +0530
Subject: [PATCH] qa: enhance per-client labelled perf counters test

Fixes: https://tracker.ceph.com/issues/65497
Signed-off-by: Jos Collin
(cherry picked from commit 5337a127f0d3a737b4ed14efc9c1aee5b54da32f)
---
 qa/tasks/cephfs/test_admin.py | 107 ++++++++++++++++++++++++++++------
 1 file changed, 90 insertions(+), 17 deletions(-)

diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py
index 6bf8afc73eefa..132410127bd56 100644
--- a/qa/tasks/cephfs/test_admin.py
+++ b/qa/tasks/cephfs/test_admin.py
@@ -15,23 +15,88 @@ from tasks.cephfs.fuse_mount import FuseMount
 from tasks.cephfs.caps_helper import CapTester
 
 log = logging.getLogger(__name__)
+MDS_RESTART_GRACE = 60
 
 class TestLabeledPerfCounters(CephFSTestCase):
     CLIENTS_REQUIRED = 2
     MDSS_REQUIRED = 1
 
-    def test_per_client_labeled_perf_counters(self):
+    def _get_counters_for(self, filesystem, client_id):
+        dump = self.fs.rank_tell(["counter", "dump"])
+        per_client_metrics_key = f'mds_client_metrics-{filesystem}'
+        counters = [c["counters"] for \
+                    c in dump[per_client_metrics_key] if c["labels"]["client"] == client_id]
+        return counters[0]
+
+    def test_per_client_labeled_perf_counters_on_client_disconnect(self):
+        """
+        That the per-client labelled metrics are unavailable during client disconnect
+        """
+        mount_a_id = f'client.{self.mount_a.get_global_id()}'
+        self.mount_a.teardown()
+        with safe_while(sleep=1, tries=30, action=f'wait for counters - {mount_a_id}') as proceed:
+            while proceed():
+                dump = self.fs.rank_tell(["counter", "dump"])
+                per_client_metrics_key = f"mds_client_metrics-{dump['mds_client_metrics'][0]['labels']['fs_name']}"
+                clients = [c["labels"]["client"] for c in dump.get(per_client_metrics_key, [])]
+                if clients and mount_a_id not in clients:
+                    # success, no metrics.
+                    return True
+
+    def test_per_client_labeled_perf_counters_on_client_reconnect(self):
         """
-        That the per-client labelled perf counters depict the clients
-        performaing IO.
+        That the per-client labelled metrics are generated during client reconnect
         """
-        def get_counters_for(filesystem, client_id):
-            dump = self.fs.rank_tell(["counter", "dump"])
-            per_client_metrics_key = f'mds_client_metrics-{filesystem}'
-            counters = [c["counters"] for \
-                        c in dump[per_client_metrics_key] if c["labels"]["client"] == client_id]
-            return counters[0]
+        # fail active mds and wait for reconnect
+        mds = self.fs.get_active_names()[0]
+        self.mds_cluster.mds_fail(mds)
+        self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+
+        mount_a_id = f'client.{self.mount_a.get_global_id()}'
+        mount_b_id = f'client.{self.mount_b.get_global_id()}'
+        fs_suffix = ""
+
+        with safe_while(sleep=1, tries=30, action='wait for counters') as proceed:
+            while proceed():
+                dump = self.fs.rank_tell(["counter", "dump"])
+                fs_suffix = dump['mds_client_metrics'][0]['labels']['fs_name']
+                per_client_metrics_key = f"mds_client_metrics-{fs_suffix}"
+                clients = [c["labels"]["client"] for c in dump.get(per_client_metrics_key, [])]
+                if mount_a_id in clients and mount_b_id in clients:
+                    # success, got metrics.
+                    break # break to continue the test
+
+        # After reconnecting, validate the IO perf counters
+        # write workload
+        self.mount_a.create_n_files("test_dir/test_file", 1000, sync=True)
+        with safe_while(sleep=1, tries=30, action=f'wait for counters - {mount_a_id}') as proceed:
+            while proceed():
+                counters_dump_a = self._get_counters_for(fs_suffix, mount_a_id)
+                if counters_dump_a["total_write_ops"] > 0 and counters_dump_a["total_write_size"] > 0 and \
+                   counters_dump_a["avg_write_latency"] >= 0 and counters_dump_a["avg_metadata_latency"] >= 0 and \
+                   counters_dump_a["opened_files"] >= 0 and counters_dump_a["opened_inodes"] > 0 and \
+                   counters_dump_a["cap_hits"] > 0 and counters_dump_a["dentry_lease_hits"] > 0 and \
+                   counters_dump_a["pinned_icaps"] > 0:
+                    break # break to continue the test
+
+        # read from the other client
+        for i in range(100):
+            self.mount_b.open_background(basename=f'test_dir/test_file_{i}', write=False)
+        with safe_while(sleep=1, tries=30, action=f'wait for counters - {mount_b_id}') as proceed:
+            while proceed():
+                counters_dump_b = self._get_counters_for(fs_suffix, mount_b_id)
+                if counters_dump_b["total_read_ops"] >= 0 and counters_dump_b["total_read_size"] >= 0 and \
+                   counters_dump_b["avg_read_latency"] >= 0 and counters_dump_b["avg_metadata_latency"] >= 0 and \
+                   counters_dump_b["opened_files"] >= 0 and counters_dump_b["opened_inodes"] >= 0 and \
+                   counters_dump_b["cap_hits"] > 0 and counters_dump_b["dentry_lease_hits"] > 0 and \
+                   counters_dump_b["pinned_icaps"] > 0:
+                    break # break to continue the test
+
+        self.mount_a.teardown()
+        self.mount_b.teardown()
+
+    def test_per_client_labeled_perf_counters_io(self):
+        """
+        That the per-client labelled perf counters depict the clients performing IO.
+        """
         # sleep a bit so that we get updated clients...
         sleep(10)
@@ -52,21 +117,29 @@ class TestLabeledPerfCounters(CephFSTestCase):
         # write workload
         self.mount_a.create_n_files("test_dir/test_file", 1000, sync=True)
         with safe_while(sleep=1, tries=30, action=f'wait for counters - {mount_a_id}') as proceed:
-            counters_dump_a = get_counters_for(fs_suffix, mount_a_id)
             while proceed():
-                if counters_dump_a["total_write_ops"] > 0 and counters_dump_a["total_write_size"] > 0:
-                    return True
+                counters_dump_a = self._get_counters_for(fs_suffix, mount_a_id)
+                if counters_dump_a["total_write_ops"] > 0 and counters_dump_a["total_write_size"] > 0 and \
+                   counters_dump_a["avg_write_latency"] >= 0 and counters_dump_a["avg_metadata_latency"] >= 0 and \
+                   counters_dump_a["opened_files"] >= 0 and counters_dump_a["opened_inodes"] > 0 and \
+                   counters_dump_a["cap_hits"] > 0 and counters_dump_a["dentry_lease_hits"] > 0 and \
+                   counters_dump_a["pinned_icaps"] > 0:
+                    break # break to continue the test
 
         # read from the other client
         for i in range(100):
             self.mount_b.open_background(basename=f'test_dir/test_file_{i}', write=False)
         with safe_while(sleep=1, tries=30, action=f'wait for counters - {mount_b_id}') as proceed:
-            counters_dump_b = get_counters_for(fs_suffix, mount_b_id)
             while proceed():
-                if counters_dump_b["total_read_ops"] > 0 and counters_dump_b["total_read_size"] > 0:
-                    return True
-
-        self.fs.teardown()
+                counters_dump_b = self._get_counters_for(fs_suffix, mount_b_id)
+                if counters_dump_b["total_read_ops"] >= 0 and counters_dump_b["total_read_size"] >= 0 and \
+                   counters_dump_b["avg_read_latency"] >= 0 and counters_dump_b["avg_metadata_latency"] >= 0 and \
+                   counters_dump_b["opened_files"] >= 0 and counters_dump_b["opened_inodes"] >= 0 and \
+                   counters_dump_b["cap_hits"] > 0 and counters_dump_b["dentry_lease_hits"] > 0 and \
+                   counters_dump_b["pinned_icaps"] > 0:
+                    break # break to continue the test
+
+        self.mount_a.teardown()
+        self.mount_b.teardown()
 
 class TestAdminCommands(CephFSTestCase):
     """
-- 
2.39.5