From c9e62c7d02d77deabc52693b011cd02c6356d0c7 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly
Date: Mon, 7 Jul 2025 15:15:31 -0400
Subject: [PATCH] qa/tasks/ceph.healthy: indicate expected failing checks

We will want to confirm the cluster is healthy despite some checks
that we expect to be failing.

Signed-off-by: Patrick Donnelly
---
 qa/tasks/ceph.py         |  3 ++-
 qa/tasks/ceph_manager.py | 33 +++++++++++++++++++++++++++++----
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py
index 75fea606dfb..cfe790fb380 100644
--- a/qa/tasks/ceph.py
+++ b/qa/tasks/ceph.py
@@ -1538,6 +1538,7 @@ def healthy(ctx, config):
     """
     config = config if isinstance(config, dict) else dict()
     cluster_name = config.get('cluster', 'ceph')
+    expected_checks = config.get('expected_checks', [])
     log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
     manager = ctx.managers[cluster_name]
     try:
@@ -1555,7 +1556,7 @@ def healthy(ctx, config):
 
     if config.get('wait-for-healthy', True):
         log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
-        manager.wait_until_healthy(timeout=300)
+        manager.wait_until_healthy(timeout=300, expected_checks=expected_checks)
 
     if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
         # Some MDSs exist, wait for them to be healthy
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index 0f7e92c5c2f..9fc21230c02 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -3374,14 +3374,39 @@ class CephManager:
         self.log('health:\n{h}'.format(h=out))
         return json.loads(out)
 
-    def wait_until_healthy(self, timeout=None):
+    def wait_until_healthy(self, timeout=None, expected_checks=[]):
         self.log("wait_until_healthy")
         start = time.time()
-        while self.get_mon_health()['status'] != 'HEALTH_OK':
+        found = set()
+        while True:
+            health = self.get_mon_health()
+            if health['status'] == 'HEALTH_OK':
+                break
+            found = set()
+            okay = True
+            unhealthy = []
+            for name, check in health['checks'].items():
+                if check['muted']:
+                    log.debug("%s is muted", name)
+                elif name in expected_checks:
+                    log.info("%s in expected_checks", name)
+                    found.add(name)
+                else:
+                    unhealthy.append(name)
+                    okay = False
+            if okay:
+                break
             if timeout is not None:
-                assert time.time() - start < timeout, \
-                    'timeout expired in wait_until_healthy'
+                if timeout < (time.time() - start):
+                    what = ", ".join(unhealthy)
+                    err = f"timeout {timeout}s expired waiting for healthy cluster with these unhealthy checks: {what}"
+                    raise RuntimeError(err)
             time.sleep(3)
+        if found != set(expected_checks):
+            exp = ", ".join(expected_checks)
+            fnd = ", ".join(found)
+            err = f"healthy cluster but expected_checks ({exp}) not equal to {fnd}"
+            raise RuntimeError(err)
         self.log("wait_until_healthy done")
 
     def get_filepath(self):
-- 
2.47.3
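
Illustration (not applied by the patch): the new loop in wait_until_healthy()
sorts each entry of health['checks'] into one of three buckets: muted checks
are ignored, checks named in expected_checks are collected into found, and
anything else keeps the cluster counted as unhealthy. The sketch below replays
that triage against a hand-written payload that has only the fields the patch
reads (a 'status' string and a 'checks' map whose values carry a 'muted'
flag); the check names and the payload itself are assumptions for
illustration, not output captured from a real cluster.

import json

# Hypothetical health payload; only the fields wait_until_healthy() touches.
SAMPLE_HEALTH = json.loads("""
{
  "status": "HEALTH_WARN",
  "checks": {
    "OSD_DOWN":     {"muted": false},
    "MDS_ALL_DOWN": {"muted": false},
    "OSDMAP_FLAGS": {"muted": true}
  }
}
""")

def classify(health, expected_checks):
    """Mirror the triage performed inside the new wait loop."""
    found = set()
    unhealthy = []
    for name, check in health['checks'].items():
        if check['muted']:
            continue                   # ignored, like the log.debug branch
        elif name in expected_checks:
            found.add(name)            # tolerated; must all appear by the end
        else:
            unhealthy.append(name)     # keeps the wait loop spinning
    return found, unhealthy

found, unhealthy = classify(SAMPLE_HEALTH, expected_checks=["MDS_ALL_DOWN"])
print(sorted(found))   # ['MDS_ALL_DOWN']
print(unhealthy)       # ['OSD_DOWN'] -> wait_until_healthy keeps waiting

The closing found != set(expected_checks) guard means a run also fails when an
expected check never actually fired, so a stale entry in expected_checks shows
up as an error instead of passing silently.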