From: Sage Weil
Date: Mon, 24 Jul 2017 21:05:11 +0000 (-0400)
Subject: qa/tasks/ceph: wait for pg stats to flush in healthy check
X-Git-Tag: v12.1.2~63^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a40d94b1632d9cf71e6f6683d41f1fc244103f00;p=ceph.git

qa/tasks/ceph: wait for pg stats to flush in healthy check

Signed-off-by: Sage Weil
---

diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py
index 5fe19c1ed1f8..05c4c7d90abf 100644
--- a/qa/tasks/ceph.py
+++ b/qa/tasks/ceph.py
@@ -1222,7 +1222,13 @@ def healthy(ctx, config):
     """
     config = config if isinstance(config, dict) else dict()
     cluster_name = config.get('cluster', 'ceph')
-    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
+    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
+    manager = ctx.managers[cluster_name]
+    try:
+        manager.wait_for_mgr_available()
+    except run.CommandFailedError:
+        log.info('ignoring mgr wait error, probably testing upgrade')
+
     firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
     (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
     teuthology.wait_until_osds_up(
@@ -1231,6 +1237,14 @@ def healthy(ctx, config):
         remote=mon0_remote,
         ceph_cluster=cluster_name,
     )
+
+    try:
+        manager.flush_all_pg_stats()
+    except run.CommandFailedError:
+        log.info('ignoring flush pg stats error, probably testing upgrade')
+    manager.wait_for_clean()
+
+    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
     teuthology.wait_until_healthy(
         ctx,
         remote=mon0_remote,
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index 757a7faf23af..b2f687e49bb2 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -1989,6 +1989,10 @@ class CephManager:
         """
         return self.get_osd_dump_json()['osds']

+    def get_mgr_dump(self):
+        out = self.raw_cluster_cmd('mgr', 'dump', '--format=json')
+        return json.loads(out)
+
     def get_stuck_pgs(self, type_, threshold):
         """
         :returns: stuck pg information from the cluster
@@ -2182,6 +2186,20 @@ class CephManager:
         for pool in pools:
             self.wait_for_pool(pool)

+    def is_mgr_available(self):
+        x = self.get_mgr_dump()
+        return x.get('available', False)
+
+    def wait_for_mgr_available(self, timeout=None):
+        self.log("waiting for mgr available")
+        start = time.time()
+        while not self.is_mgr_available():
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'timeout expired in wait_for_mgr_available'
+            time.sleep(3)
+        self.log("mgr available!")
+
     def wait_for_recovery(self, timeout=None):
         """
         Check peering. When this exists, we have recovered.
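
The new healthy() flow relies on two CephManager helpers added here: wait_for_mgr_available(), which polls the mgr map until the active mgr reports 'available', and flush_all_pg_stats(), which forces fresh pg stats before wait_for_clean(). The standalone sketch below (not part of the commit) mirrors the mgr-availability poll against the `ceph` CLI; the subprocess wrapper, the --cluster/interval parameters, and the RuntimeError on timeout are illustrative assumptions, not code from the patch, which goes through CephManager.raw_cluster_cmd() instead.

    #!/usr/bin/env python
    # Standalone sketch: poll `ceph mgr dump` until the active mgr reports
    # available, mirroring is_mgr_available()/wait_for_mgr_available() above.
    # Assumes a local `ceph` CLI with access to the cluster.

    import json
    import subprocess
    import time


    def mgr_dump(cluster='ceph'):
        # `ceph mgr dump --format=json` returns the mgr map as JSON,
        # including the 'available' flag checked by the patch.
        out = subprocess.check_output(
            ['ceph', '--cluster', cluster, 'mgr', 'dump', '--format=json'])
        return json.loads(out)


    def wait_for_mgr_available(cluster='ceph', timeout=None, interval=3):
        # Poll every few seconds, like the new CephManager helper, and
        # optionally give up after `timeout` seconds.
        start = time.time()
        while not mgr_dump(cluster).get('available', False):
            if timeout is not None and time.time() - start > timeout:
                raise RuntimeError('timed out waiting for mgr availability')
            time.sleep(interval)


    if __name__ == '__main__':
        wait_for_mgr_available(timeout=300)
        print('mgr available')

Both the mgr wait and the pg stats flush are wrapped in try/except run.CommandFailedError in healthy() because, during upgrade tests, the monitors or mgr may still be running a release that does not support these commands; in that case the check falls back to the plain wait_until_healthy() path.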