From: Patrick Donnelly Date: Mon, 7 Jul 2025 19:15:31 +0000 (-0400) Subject: qa/tasks/ceph.healthy: indicate expected failing checks X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=43763e9c03f080f841fcbc1a654d5d28dca1b0bc;p=ceph-ci.git qa/tasks/ceph.healthy: indicate expected failing checks We will want to confirm the cluster is healthy despite some checks that we expect to be failing. Signed-off-by: Patrick Donnelly (cherry picked from commit d78eeb3311650bcde207cab55413421b7028c3c7) --- diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py index ae681d8febd..d40c934affe 100644 --- a/qa/tasks/ceph.py +++ b/qa/tasks/ceph.py @@ -1539,6 +1539,7 @@ def healthy(ctx, config): """ config = config if isinstance(config, dict) else dict() cluster_name = config.get('cluster', 'ceph') + expected_checks = config.get('expected_checks', []) log.info('Waiting until %s daemons up and pgs clean...', cluster_name) manager = ctx.managers[cluster_name] try: @@ -1556,7 +1557,7 @@ def healthy(ctx, config): if config.get('wait-for-healthy', True): log.info('Waiting until ceph cluster %s is healthy...', cluster_name) - manager.wait_until_healthy(timeout=300) + manager.wait_until_healthy(timeout=300, expected_checks=expected_checks) if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes: # Some MDSs exist, wait for them to be healthy diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py index 57d22f3b5e6..49543446c65 100644 --- a/qa/tasks/ceph_manager.py +++ b/qa/tasks/ceph_manager.py @@ -3368,14 +3368,39 @@ class CephManager: self.log('health:\n{h}'.format(h=out)) return json.loads(out) - def wait_until_healthy(self, timeout=None): + def wait_until_healthy(self, timeout=None, expected_checks=[]): self.log("wait_until_healthy") start = time.time() - while self.get_mon_health()['status'] != 'HEALTH_OK': + found = set() + while True: + health = self.get_mon_health() + if health['status'] == 'HEALTH_OK': + break + found = set() + okay = True + unhealthy = [] + for name, check in health['checks'].items(): + if check['muted']: + log.debug("{} is muted", name) + elif name in expected_checks: + log.info("{} in expected_checks", name) + found.add(name) + else: + unhealthy.append(name) + okay = False + if okay: + break if timeout is not None: - assert time.time() - start < timeout, \ - 'timeout expired in wait_until_healthy' + if timeout < (time.time() - start): + what = ", ".join(unhealthy) + err = f"timeout {timeout}s expired waiting for healthy cluster with these unhealthy checks: {what}" + raise RuntimeError(err) time.sleep(3) + if found != set(expected_checks): + exp = ", ".join(expected_checks) + fnd = ", ".join(found) + err = f"healthy cluster but expected_checks ({exp}) not equal to {fnd}" + raise RuntimeError(err) self.log("wait_until_healthy done") def get_filepath(self):