From 207cf2d6911103f1757d77c7e5425af555adc2db Mon Sep 17 00:00:00 2001
From: Zack Cerza
Date: Thu, 23 Feb 2017 17:36:05 -0700
Subject: [PATCH] misc.wait_until_osds_up(): timeout after 5min

It doesn't make any sense to wait more than a few minutes for OSDs to
come up. If they take more than five minutes, fail the job.

Signed-off-by: Zack Cerza
---
 teuthology/misc.py | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/teuthology/misc.py b/teuthology/misc.py
index b9b5baa031..35d9f0e00e 100644
--- a/teuthology/misc.py
+++ b/teuthology/misc.py
@@ -910,26 +910,26 @@ def wait_until_osds_up(ctx, cluster, remote, ceph_cluster='ceph'):
     """Wait until all Ceph OSDs are booted."""
     num_osds = num_instances_of_type(cluster, 'osd', ceph_cluster)
     testdir = get_testdir(ctx)
-    while True:
-        r = remote.run(
-            args=[
-                'adjust-ulimits',
-                'ceph-coverage',
-                '{tdir}/archive/coverage'.format(tdir=testdir),
-                'ceph',
-                '--cluster', ceph_cluster,
-                'osd', 'dump', '--format=json'
-            ],
-            stdout=StringIO(),
-            logger=log.getChild('health'),
-        )
-        out = r.stdout.getvalue()
-        j = json.loads('\n'.join(out.split('\n')[1:]))
-        up = len(filter(lambda o: 'up' in o['state'], j['osds']))
-        log.debug('%d of %d OSDs are up' % (up, num_osds))
-        if up == num_osds:
-            break
-        time.sleep(1)
+    with safe_while(sleep=6, tries=50) as proceed:
+        while proceed():
+            r = remote.run(
+                args=[
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    '{tdir}/archive/coverage'.format(tdir=testdir),
+                    'ceph',
+                    '--cluster', ceph_cluster,
+                    'osd', 'dump', '--format=json'
+                ],
+                stdout=StringIO(),
+                logger=log.getChild('health'),
+            )
+            out = r.stdout.getvalue()
+            j = json.loads('\n'.join(out.split('\n')[1:]))
+            up = len(filter(lambda o: 'up' in o['state'], j['osds']))
+            log.debug('%d of %d OSDs are up' % (up, num_osds))
+            if up == num_osds:
+                break
 
 
 def reboot(node, timeout=300, interval=30):
-- 
2.39.5