qa/tasks/ceph.healthy: indicate expected failing checks

author Patrick Donnelly <pdonnell@ibm.com>

Mon, 7 Jul 2025 19:15:31 +0000 (15:15 -0400)

committer Patrick Donnelly <pdonnell@ibm.com>

Mon, 22 Sep 2025 16:36:27 +0000 (12:36 -0400)
author Patrick Donnelly <pdonnell@ibm.com>
Mon, 7 Jul 2025 19:15:31 +0000 (15:15 -0400)
committer Patrick Donnelly <pdonnell@ibm.com>
Mon, 22 Sep 2025 16:36:27 +0000 (12:36 -0400)
diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py

index ae681d8febdb0559804958289ab86deae75792f9..d40c934affe0b0ebc880611b159a18e7e6b6db93 100644 (file)
--- a/qa/tasks/ceph.py
+++ b/qa/tasks/ceph.py
@@ -1539,6 +1539,7 @@ def healthy(ctx, config):
      """
      config = config if isinstance(config, dict) else dict()
      cluster_name = config.get('cluster', 'ceph')
+    expected_checks = config.get('expected_checks', [])
      log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
      manager = ctx.managers[cluster_name]
      try:
@@ -1556,7 +1557,7 @@ def healthy(ctx, config):
  
      if config.get('wait-for-healthy', True):
          log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
-        manager.wait_until_healthy(timeout=300)
+        manager.wait_until_healthy(timeout=300, expected_checks=expected_checks)
  
      if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
          # Some MDSs exist, wait for them to be healthy
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py

index 57d22f3b5e637cc45551ae39585185ddef45076e..49543446c655c599a3c9f41d7af900bcf17c22a3 100644 (file)
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -3368,14 +3368,52 @@ class CephManager:
              self.log('health:\n{h}'.format(h=out))
          return json.loads(out)
  
-    def wait_until_healthy(self, timeout=None):
+    def wait_until_healthy(self, timeout=None, expected_checks=None):
+        """
+        Poll the monitors until the cluster is healthy.
+
+        The cluster counts as healthy when its status is HEALTH_OK or when
+        every check it reports is either muted or listed in expected_checks.
+
+        :param timeout: seconds to wait before giving up; None waits forever
+        :param expected_checks: names of health checks that are expected to
+                                be failing (and unmuted) once the wait ends
+        :raises RuntimeError: if the timeout expires, or if the set of
+                              expected checks seen on the last poll differs
+                              from expected_checks
+        """
          self.log("wait_until_healthy")
          start = time.time()
-        while self.get_mon_health()['status'] != 'HEALTH_OK':
+        expected = set(expected_checks or ())
+        while True:
+            # Start each poll from scratch so that a check seen on an
+            # earlier poll is not reported as found after it has cleared.
+            found = set()
+            unhealthy = []
+            health = self.get_mon_health()
+            if health['status'] == 'HEALTH_OK':
+                break
+            for name, check in health['checks'].items():
+                if check['muted']:
+                    log.debug("%s is muted", name)
+                elif name in expected:
+                    log.info("%s in expected_checks", name)
+                    found.add(name)
+                else:
+                    unhealthy.append(name)
+            if not unhealthy:
+                break
              if timeout is not None:
-                assert time.time() - start < timeout, \
-                    'timeout expired in wait_until_healthy'
+                if timeout < (time.time() - start):
+                    what = ", ".join(unhealthy)
+                    err = f"timeout {timeout}s expired waiting for healthy cluster with these unhealthy checks: {what}"
+                    raise RuntimeError(err)
              time.sleep(3)
+        if found != expected:
+            exp = ", ".join(sorted(expected))
+            fnd = ", ".join(sorted(found))
+            err = f"healthy cluster but expected_checks ({exp}) not equal to {fnd}"
+            raise RuntimeError(err)
          self.log("wait_until_healthy done")
  
      def get_filepath(self):
author	Patrick Donnelly <pdonnell@ibm.com>
	Mon, 7 Jul 2025 19:15:31 +0000 (15:15 -0400)
committer	Patrick Donnelly <pdonnell@ibm.com>
	Mon, 22 Sep 2025 16:36:27 +0000 (12:36 -0400)
qa/tasks/ceph.py		patch \| blob \| history
qa/tasks/ceph_manager.py		patch \| blob \| history