git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
qa/tasks/ceph.healthy: indicate expected failing checks
author: Patrick Donnelly <pdonnell@ibm.com>
Mon, 7 Jul 2025 19:15:31 +0000 (15:15 -0400)
committer: Patrick Donnelly <pdonnell@ibm.com>
Mon, 22 Sep 2025 16:36:27 +0000 (12:36 -0400)
We will want to confirm the cluster is healthy despite some checks that we
expect to be failing.

Signed-off-by: Patrick Donnelly <pdonnell@ibm.com>
(cherry picked from commit d78eeb3311650bcde207cab55413421b7028c3c7)

qa/tasks/ceph.py
qa/tasks/ceph_manager.py

index ae681d8febdb0559804958289ab86deae75792f9..d40c934affe0b0ebc880611b159a18e7e6b6db93 100644 (file)
@@ -1539,6 +1539,7 @@ def healthy(ctx, config):
     """
     config = config if isinstance(config, dict) else dict()
     cluster_name = config.get('cluster', 'ceph')
+    expected_checks = config.get('expected_checks', [])
     log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
     manager = ctx.managers[cluster_name]
     try:
@@ -1556,7 +1557,7 @@ def healthy(ctx, config):
 
     if config.get('wait-for-healthy', True):
         log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
-        manager.wait_until_healthy(timeout=300)
+        manager.wait_until_healthy(timeout=300, expected_checks=expected_checks)
 
     if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
         # Some MDSs exist, wait for them to be healthy
index 57d22f3b5e637cc45551ae39585185ddef45076e..49543446c655c599a3c9f41d7af900bcf17c22a3 100644 (file)
@@ -3368,14 +3368,52 @@ class CephManager:
             self.log('health:\n{h}'.format(h=out))
         return json.loads(out)
 
-    def wait_until_healthy(self, timeout=None):
+    def wait_until_healthy(self, timeout=None, expected_checks=None):
+        """
+        Poll mon health until the cluster is healthy apart from expected checks.
+
+        The cluster counts as healthy when the status is HEALTH_OK or when
+        every reported check is either muted or named in expected_checks.
+
+        :param timeout: seconds to wait before giving up; None waits forever
+        :param expected_checks: names of health checks expected to be failing
+        :raises RuntimeError: if the timeout expires, or if the set of
+            expected checks seen failing differs from expected_checks
+        """
         self.log("wait_until_healthy")
         start = time.time()
-        while self.get_mon_health()['status'] != 'HEALTH_OK':
+        # Copy into a set: avoids a shared mutable default and allows any
+        # iterable of names to be compared against what was found.
+        expected = set(expected_checks or ())
+        while True:
+            # Reset on every poll so the final comparison only reflects the
+            # most recent health report, including a HEALTH_OK one.
+            found = set()
+            unhealthy = []
+            health = self.get_mon_health()
+            if health['status'] == 'HEALTH_OK':
+                break
+            for name, check in health['checks'].items():
+                if check.get('muted'):
+                    log.debug("%s is muted", name)
+                elif name in expected:
+                    log.info("%s in expected_checks", name)
+                    found.add(name)
+                else:
+                    unhealthy.append(name)
+            if not unhealthy:
+                break
             if timeout is not None:
-                assert time.time() - start < timeout, \
-                    'timeout expired in wait_until_healthy'
+                if timeout < (time.time() - start):
+                    what = ", ".join(unhealthy)
+                    err = f"timeout {timeout}s expired waiting for healthy cluster with these unhealthy checks: {what}"
+                    raise RuntimeError(err)
             time.sleep(3)
+        if found != expected:
+            exp = ", ".join(sorted(expected))
+            fnd = ", ".join(sorted(found))
+            err = f"healthy cluster but expected_checks ({exp}) not equal to {fnd}"
+            raise RuntimeError(err)
         self.log("wait_until_healthy done")
 
     def get_filepath(self):