From: Igor Golikov
Date: Thu, 7 Aug 2025 16:35:47 +0000 (+0000)
Subject: test: add subvolume metrics sanity test
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=1327fc8897c8779a8f31ea7f5e8759de2e048515;p=ceph.git

test: add subvolume metrics sanity test

Signed-off-by: Igor Golikov
Fixes: https://tracker.ceph.com/issues/68929
---

diff --git a/qa/suites/fs/volumes/tasks/volumes/test/metrics.yaml b/qa/suites/fs/volumes/tasks/volumes/test/metrics.yaml
new file mode 100644
index 0000000000000..78162fd25e702
--- /dev/null
+++ b/qa/suites/fs/volumes/tasks/volumes/test/metrics.yaml
@@ -0,0 +1,12 @@
+overrides:
+  install:
+    extra_system_packages:
+      rpm:
+        - fio
+      deb:
+        - fio
+tasks:
+  - cephfs_test_runner:
+      fail_on_skip: false
+      modules:
+        - tasks.cephfs.test_subvolume.TestSubvolumeMetrics
\ No newline at end of file
diff --git a/qa/tasks/cephfs/mount.py b/qa/tasks/cephfs/mount.py
index ae3f2901ed3ed..35d7c630dff2a 100644
--- a/qa/tasks/cephfs/mount.py
+++ b/qa/tasks/cephfs/mount.py
@@ -1713,4 +1713,7 @@ class CephFSMountBase(object):
             path_to_mount = subvol_paths[mount_subvol_num]
         self.cephfs_mntpt = path_to_mount
 
+    def get_mount_point(self):
+        return self.hostfs_mntpt
+
 CephFSMount = CephFSMountBase
diff --git a/qa/tasks/cephfs/test_subvolume.py b/qa/tasks/cephfs/test_subvolume.py
index ed71ed6f43761..9a1755b21f4b8 100644
--- a/qa/tasks/cephfs/test_subvolume.py
+++ b/qa/tasks/cephfs/test_subvolume.py
@@ -1,8 +1,10 @@
 import logging
+import os
 from time import sleep
 
 from tasks.cephfs.cephfs_test_case import CephFSTestCase
 from teuthology.exceptions import CommandFailedError
+from teuthology.contextutil import safe_while
 
 log = logging.getLogger(__name__)
 
@@ -245,3 +247,112 @@
         ino0 = self.fs.read_cache("/dir1/dir2", depth=0, rank=0)[0]
         self.assertFalse(ino0['is_auth'])
         self.assertTrue(ino0['is_subvolume'])
+
+class TestSubvolumeMetrics(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def get_subvolume_metrics(self, mds_rank=0):
+        """
+        Helper to fetch the current subvolume metrics from the MDS counters via 'counter dump'.
+        """
+        mds_info = self.fs.get_rank(rank=mds_rank)
+        mds_name = mds_info['name']
+        counters = self.fs.mds_tell(["counter", "dump"], mds_id=mds_name)
+        return counters.get("mds_subvolume_metrics")
+
+    def test_subvolume_metrics_lifecycle(self):
+        """
+        Verify that subvolume metrics are initially absent, appear after I/O,
+        and disappear after the aggregation window expires.
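+
+        Metrics are read from the MDS 'counter dump' output (see
+        get_subvolume_metrics above); each entry is expected to carry
+        'labels' (fs_name, subvolume_path) and averaged I/O 'counters'.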
+ """ + subvol_name = "metrics_subv" + subv_path = "/volumes/_nogroup/metrics_subv" + + # no metrics initially + subvol_metrics = self.get_subvolume_metrics() + self.assertFalse(subvol_metrics, "Subvolume metrics should not be present before I/O") + + # create subvolume + self.fs.run_ceph_cmd('fs', 'subvolume', 'create', 'cephfs', subvol_name) + + # generate some I/O + mount_point = self.mount_a.get_mount_point() + suvolume_fs_path = self.fs.get_ceph_cmd_stdout('fs', 'subvolume', 'getpath', 'cephfs', subvol_name).strip() + suvolume_fs_path = os.path.join(mount_point, suvolume_fs_path.strip('/')) + + # do some writes + filename = os.path.join(suvolume_fs_path, "file0") + self.mount_a.run_shell_payload("sudo fio " + "--name test -rw=write " + "--bs=4k --numjobs=1 --time_based " + "--runtime=20s --verify=0 --size=1G " + f"--filename={filename}", wait=True) + + subvol_metrics = None + with safe_while(sleep=1, tries=30, action='wait for subvolume write counters') as proceed: + while proceed(): + # verify that metrics are available + subvol_metrics = self.get_subvolume_metrics() + if subvol_metrics: + break + + log.debug(f'verifying for write: subvol_metrics={subvol_metrics}') + + # Extract first metric entry + metric = subvol_metrics[0] + counters = metric["counters"] + labels = metric["labels"] + + # Label checks + self.assertEqual(labels["fs_name"], "cephfs", "Unexpected fs_name in subvolume metrics") + self.assertEqual(labels["subvolume_path"], subv_path, "Unexpected subvolume_path in subvolume metrics") + + # Counter presence and value checks + self.assertIn("avg_read_iops", counters) + self.assertIn("avg_read_tp_Bps", counters) + self.assertIn("avg_read_lat_msec", counters) + self.assertIn("avg_write_iops", counters) + self.assertIn("avg_write_tp_Bps", counters) + self.assertIn("avg_write_lat_msec", counters) + + # check write metrics + self.assertGreater(counters["avg_write_iops"], 0, "Expected avg_write_iops to be > 0") + self.assertGreater(counters["avg_write_tp_Bps"], 0, "Expected avg_write_tp_Bps to be > 0") + self.assertGreaterEqual(counters["avg_write_lat_msec"], 0, "Expected avg_write_lat_msec to be > 0") + + # do some reads + self.mount_a.run_shell_payload("sudo fio " + "--name test -rw=read " + "--bs=4k --numjobs=1 --time_based " + "--runtime=20s --verify=0 --size=1G " + f"--filename={filename}", wait=True) + + subvol_metrics = None + with safe_while(sleep=1, tries=30, action='wait for subvolume read counters') as proceed: + while proceed(): + # verify that metrics are available + subvol_metrics = self.get_subvolume_metrics() + if subvol_metrics: + break + + log.debug(f'verifying for read: subvol_metrics={subvol_metrics}') + + metric = subvol_metrics[0] + counters = metric["counters"] + + # Assert expected values (example: write I/O occurred, read did not) + self.assertGreater(counters["avg_read_iops"], 0, "Expected avg_read_iops to be >= 0") + self.assertGreater(counters["avg_read_tp_Bps"], 0, "Expected avg_read_tp_Bps to be >= 0") + self.assertGreaterEqual(counters["avg_read_lat_msec"], 0, "Expected avg_read_lat_msec to be >= 0") + + # wait for metrics to expire after inactivity + sleep(60) + + # verify that metrics are not present anymore + subvolume_metrics = self.get_subvolume_metrics() + self.assertFalse(subvolume_metrics, "Subvolume metrics should be gone after inactivity window")