From: Patrick Donnelly
Date: Mon, 7 Jul 2025 19:15:31 +0000 (-0400)
Subject: qa/tasks/ceph.healthy: indicate expected failing checks
X-Git-Tag: testing/wip-pdonnell-testing-20260210.212535~36
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=dfe352081466a045c4521dadcf754d1b3183f4c9;p=ceph-ci.git

qa/tasks/ceph.healthy: indicate expected failing checks

We will want to confirm the cluster is healthy despite some checks that
we expect to be failing.

Signed-off-by: Patrick Donnelly
---
Review notes (not part of the commit message):
 - log.debug()/log.info() now use %s placeholders; the logging module
   interpolates with the % operator and does not expand "{}".
 - expected_checks defaults to None instead of a shared mutable list.
 - found/unhealthy are rebuilt at the top of every poll, so a HEALTH_OK
   result is never compared against checks seen in an earlier poll.
 - The index blob ids below are those of the original commit.

diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py
index 2b3f7495f94..3654a24ae88 100644
--- a/qa/tasks/ceph.py
+++ b/qa/tasks/ceph.py
@@ -1538,6 +1538,7 @@ def healthy(ctx, config):
     """
     config = config if isinstance(config, dict) else dict()
     cluster_name = config.get('cluster', 'ceph')
+    expected_checks = config.get('expected_checks', [])
     log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
     manager = ctx.managers[cluster_name]
     try:
@@ -1555,7 +1556,7 @@ def healthy(ctx, config):
 
     if config.get('wait-for-healthy', True):
         log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
-        manager.wait_until_healthy(timeout=300)
+        manager.wait_until_healthy(timeout=300, expected_checks=expected_checks)
 
     if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
         # Some MDSs exist, wait for them to be healthy
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index 38672785163..4d3b88aa30a 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -3385,14 +3385,52 @@ class CephManager:
         self.log('health:\n{h}'.format(h=out))
         return json.loads(out)
 
-    def wait_until_healthy(self, timeout=None):
+    def wait_until_healthy(self, timeout=None, expected_checks=None):
+        """
+        Poll mon health until the cluster is healthy.
+
+        The cluster counts as healthy when its status is HEALTH_OK or when
+        every reported check is either muted or named in expected_checks.
+
+        :param timeout: seconds to wait before giving up; None waits forever
+        :param expected_checks: names of health checks expected to be failing
+        :raises RuntimeError: if the timeout expires, or if the cluster is
+                              healthy but the failing checks seen in the last
+                              poll do not match expected_checks exactly
+        """
         self.log("wait_until_healthy")
+        expected = set(expected_checks or ())
         start = time.time()
-        while self.get_mon_health()['status'] != 'HEALTH_OK':
+        while True:
+            health = self.get_mon_health()
+            # Rebuilt on every poll so results from an earlier poll can never
+            # satisfy (or fail) the final expected_checks comparison.
+            found = set()
+            unhealthy = []
+            if health['status'] == 'HEALTH_OK':
+                break
+            for name, check in health['checks'].items():
+                if check['muted']:
+                    # logging interpolates %-style, not str.format-style
+                    log.debug("%s is muted", name)
+                elif name in expected:
+                    log.info("%s in expected_checks", name)
+                    found.add(name)
+                else:
+                    unhealthy.append(name)
+            if not unhealthy:
+                break
             if timeout is not None:
-                assert time.time() - start < timeout, \
-                    'timeout expired in wait_until_healthy'
+                if timeout < (time.time() - start):
+                    what = ", ".join(unhealthy)
+                    err = f"timeout {timeout}s expired waiting for healthy cluster with these unhealthy checks: {what}"
+                    raise RuntimeError(err)
             time.sleep(3)
+        if found != expected:
+            exp = ", ".join(sorted(expected))
+            fnd = ", ".join(sorted(found))
+            err = f"healthy cluster but expected_checks ({exp}) not equal to {fnd}"
+            raise RuntimeError(err)
         self.log("wait_until_healthy done")
 
     def get_filepath(self):