test: add subvolume metrics sanity test

author Igor Golikov <igolikov@ibm.com>

Thu, 7 Aug 2025 16:35:47 +0000 (16:35 +0000)

committer Venky Shankar <vshankar@redhat.com>

Wed, 10 Sep 2025 16:47:16 +0000 (16:47 +0000)
author Igor Golikov <igolikov@ibm.com>
Thu, 7 Aug 2025 16:35:47 +0000 (16:35 +0000)
committer Venky Shankar <vshankar@redhat.com>
Wed, 10 Sep 2025 16:47:16 +0000 (16:47 +0000)
diff --git a/qa/suites/fs/volumes/tasks/volumes/test/metrics.yaml b/qa/suites/fs/volumes/tasks/volumes/test/metrics.yaml

new file mode 100644 (file)

index 0000000..78162fd
--- /dev/null
+++ b/qa/suites/fs/volumes/tasks/volumes/test/metrics.yaml
@@ -0,0 +1,12 @@
+overrides:
+  install:
+    extra_system_packages:  # fio drives the I/O in TestSubvolumeMetrics (`sudo fio ...`)
+      rpm:
+        - fio
+      deb:
+        - fio
+tasks:
+  - cephfs_test_runner:
+      fail_on_skip: false
+      modules:
+        - tasks.cephfs.test_subvolume.TestSubvolumeMetrics
+\ No newline at end of file
diff --git a/qa/tasks/cephfs/mount.py b/qa/tasks/cephfs/mount.py

index ae3f2901ed3ed0b1377faba0e280c7e3b63a273e..35d7c630dff2ae80337d7aeaba135dfad11a37fd 100644 (file)
--- a/qa/tasks/cephfs/mount.py
+++ b/qa/tasks/cephfs/mount.py
@@ -1713,4 +1713,7 @@ class CephFSMountBase(object):
              path_to_mount = subvol_paths[mount_subvol_num]
              self.cephfs_mntpt = path_to_mount
  
+    def get_mount_point(self):
+        return self.hostfs_mntpt  # accessor used by tests to build paths under the mount
+
  CephFSMount = CephFSMountBase
diff --git a/qa/tasks/cephfs/test_subvolume.py b/qa/tasks/cephfs/test_subvolume.py

index ed71ed6f4376167c6d5209b3a8f7a89d2b55e8f4..9a1755b21f4b822448539e323c5154c6b8dc1090 100644 (file)
--- a/qa/tasks/cephfs/test_subvolume.py
+++ b/qa/tasks/cephfs/test_subvolume.py
@@ -1,8 +1,10 @@
  import logging
  from time import sleep
+import os
  
  from tasks.cephfs.cephfs_test_case import CephFSTestCase
  from teuthology.exceptions import CommandFailedError
+from teuthology.contextutil import safe_while
  
  log = logging.getLogger(__name__)
  
@@ -16,6 +18,7 @@ class TestSubvolume(CephFSTestCase):
          self.setup_test()
  
      def tearDown(self):
+        #pass
          # clean up
          self.cleanup_test()
          super().tearDown()
@@ -185,7 +188,7 @@ class TestSubvolume(CephFSTestCase):
          self.mount_a.run_shell(['mkdir', 'group/subvol2/dir/.snap/s2'])
  
          # override subdir subvolume with parent subvolume
-        self.mount_a.run_shell(['setfattr', '-n', 'ceph.dir.subvolume',
+        (['setfattr', '-n', 'ceph.dir.subvolume',
                                  '-v', '1', 'group/subvol2/dir'])
          self.mount_a.run_shell(['setfattr', '-n', 'ceph.dir.subvolume',
                                  '-v', '1', 'group/subvol2'])
@@ -197,6 +200,7 @@ class TestSubvolume(CephFSTestCase):
          # clean up
          self.mount_a.run_shell(['rmdir', 'group/subvol2/dir/.snap/s2'])
  
+
      def test_subvolume_vxattr_removal_without_setting(self):
          """
          To verify that the ceph.dir.subvolume vxattr removal without setting doesn't cause mds crash
@@ -209,7 +213,6 @@ class TestSubvolume(CephFSTestCase):
          # cleanup
          self.mount_a.run_shell(['rm', '-rf', 'group/subvol3'])
  
-
  class TestSubvolumeReplicated(CephFSTestCase):
      CLIENTS_REQUIRED = 1
      MDSS_REQUIRED = 2
@@ -245,3 +248,117 @@ class TestSubvolumeReplicated(CephFSTestCase):
          ino0 = self.fs.read_cache("/dir1/dir2", depth=0, rank=0)[0]
          self.assertFalse(ino0['is_auth'])
          self.assertTrue(ino0['is_subvolume'])
+
+class TestSubvolumeMetrics(CephFSTestCase):
+    """
+    Sanity test for the subvolume I/O metrics exposed in the MDS counter dump.
+    """
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def get_subvolume_metrics(self, mds_rank=0):
+        """
+        Fetch the current subvolume metrics from the MDS holding ``mds_rank``.
+
+        Runs ``counter dump`` on that MDS via mds_tell and returns its
+        ``mds_subvolume_metrics`` section, or None when the section is absent.
+        """
+        mds_info = self.fs.get_rank(rank=mds_rank)
+        mds_name = mds_info['name']
+        counters = self.fs.mds_tell(["counter", "dump"], mds_id=mds_name)
+        return counters.get("mds_subvolume_metrics")
+
+    def _wait_for_subvolume_metrics(self, action, ready=bool):
+        """
+        Poll the MDS (30 tries, 1s apart) until ``ready(metrics)`` is true
+        and return those metrics.
+
+        ``action`` is the description passed to safe_while; the default
+        ``ready`` accepts any non-empty metrics.
+        """
+        with safe_while(sleep=1, tries=30, action=action) as proceed:
+            while proceed():
+                subvol_metrics = self.get_subvolume_metrics()
+                if ready(subvol_metrics):
+                    return subvol_metrics
+
+    def _run_fio(self, rw, filename):
+        """
+        Run a 20s time-based fio job of type ``rw`` against ``filename``.
+        """
+        self.mount_a.run_shell_payload("sudo fio "
+                                       f"--name test -rw={rw} "
+                                       "--bs=4k --numjobs=1 --time_based "
+                                       "--runtime=20s --verify=0 --size=1G "
+                                       f"--filename={filename}", wait=True)
+
+    def test_subvolume_metrics_lifecycle(self):
+        """
+        Verify that subvolume metrics are initially absent, appear after IO,
+        and disappear after the aggregation window expires.
+        """
+        subvol_name = "metrics_subv"
+        subv_path = f"/volumes/_nogroup/{subvol_name}"
+
+        # no metrics initially
+        subvol_metrics = self.get_subvolume_metrics()
+        self.assertFalse(subvol_metrics, "Subvolume metrics should not be present before I/O")
+
+        # create subvolume
+        self.fs.run_ceph_cmd('fs', 'subvolume', 'create', 'cephfs', subvol_name)
+
+        # resolve the subvolume path underneath the client mount point
+        mount_point = self.mount_a.get_mount_point()
+        subvolume_fs_path = self.fs.get_ceph_cmd_stdout('fs', 'subvolume', 'getpath', 'cephfs', subvol_name).strip()
+        subvolume_fs_path = os.path.join(mount_point, subvolume_fs_path.strip('/'))
+        filename = os.path.join(subvolume_fs_path, "file0")
+
+        # do some writes
+        self._run_fio("write", filename)
+        subvol_metrics = self._wait_for_subvolume_metrics('wait for subvolume write counters')
+        log.debug(f'verifying for write: subvol_metrics={subvol_metrics}')
+
+        # Extract first metric entry
+        metric = subvol_metrics[0]
+        counters = metric["counters"]
+        labels = metric["labels"]
+
+        # Label checks
+        self.assertEqual(labels["fs_name"], "cephfs", "Unexpected fs_name in subvolume metrics")
+        self.assertEqual(labels["subvolume_path"], subv_path, "Unexpected subvolume_path in subvolume metrics")
+
+        # Counter presence checks
+        for counter_name in ("avg_read_iops", "avg_read_tp_Bps", "avg_read_lat_msec",
+                             "avg_write_iops", "avg_write_tp_Bps", "avg_write_lat_msec"):
+            self.assertIn(counter_name, counters)
+
+        # check write metrics
+        self.assertGreater(counters["avg_write_iops"], 0, "Expected avg_write_iops to be > 0")
+        self.assertGreater(counters["avg_write_tp_Bps"], 0, "Expected avg_write_tp_Bps to be > 0")
+        self.assertGreaterEqual(counters["avg_write_lat_msec"], 0, "Expected avg_write_lat_msec to be >= 0")
+
+        # do some reads
+        self._run_fio("read", filename)
+
+        # Entries left over from the write phase may already make the metrics
+        # non-empty, so wait until read activity is actually reported.
+        def has_read_io(metrics):
+            return bool(metrics) and metrics[0]["counters"].get("avg_read_iops", 0) > 0
+
+        subvol_metrics = self._wait_for_subvolume_metrics('wait for subvolume read counters',
+                                                          ready=has_read_io)
+        log.debug(f'verifying for read: subvol_metrics={subvol_metrics}')
+
+        counters = subvol_metrics[0]["counters"]
+
+        # the read workload must now show up in the read counters
+        self.assertGreater(counters["avg_read_iops"], 0, "Expected avg_read_iops to be > 0")
+        self.assertGreater(counters["avg_read_tp_Bps"], 0, "Expected avg_read_tp_Bps to be > 0")
+        self.assertGreaterEqual(counters["avg_read_lat_msec"], 0, "Expected avg_read_lat_msec to be >= 0")
+
+        # wait for metrics to expire after inactivity
+        sleep(60)
+
+        # verify that metrics are not present anymore
+        subvol_metrics = self.get_subvolume_metrics()
+        self.assertFalse(subvol_metrics, "Subvolume metrics should be gone after inactivity window")
author	Igor Golikov <igolikov@ibm.com>
	Thu, 7 Aug 2025 16:35:47 +0000 (16:35 +0000)
committer	Venky Shankar <vshankar@redhat.com>
	Wed, 10 Sep 2025 16:47:16 +0000 (16:47 +0000)
qa/suites/fs/volumes/tasks/volumes/test/metrics.yaml	[new file with mode: 0644]	patch \| blob
qa/tasks/cephfs/mount.py		patch \| blob \| history
qa/tasks/cephfs/test_subvolume.py		patch \| blob \| history