git.apps.os.sepia.ceph.com Git - teuthology.git/commitdiff
verify all osds start before checking health
author Sage Weil <sage@newdream.net>
Sun, 8 Jan 2012 23:14:18 +0000 (15:14 -0800)
committer Sage Weil <sage@newdream.net>
Wed, 11 Jan 2012 20:54:08 +0000 (12:54 -0800)
Just checking health isn't good enough, since it races with OSD startup:
we can have a healthy cluster with 0 (or something else < total) OSDs.

teuthology/misc.py
teuthology/task/ceph.py

index 00c674f5e2425aec56b02bf1694506c02de9ab61..2b144c73b5c92be2c9f24e869a74d63123983afc 100644 (file)
@@ -9,6 +9,7 @@ import time
 import urllib2
 import urlparse
 import yaml
+import json
 
 from .orchestra import run
 
@@ -286,6 +287,31 @@ def wait_until_healthy(remote):
             break
         time.sleep(1)
 
+def wait_until_osds_up(cluster, remote):
+    """Wait until all Ceph OSDs are booted, polling `ceph osd dump` via `remote` once per second."""
+    num_osds = num_instances_of_type(cluster, 'osd')  # target count: 'osd' instances reported for `cluster`
+    while True:
+        r = remote.run(
+            args=[
+                '/tmp/cephtest/enable-coredump',
+                '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
+                '/tmp/cephtest/archive/coverage',
+                '/tmp/cephtest/binary/usr/local/bin/ceph',
+                '-c', '/tmp/cephtest/ceph.conf',
+                '--concise',
+                'osd', 'dump', '--format=json'
+                ],
+            stdout=StringIO(),  # capture the command's stdout so it can be parsed below
+            logger=log.getChild('health'),
+            )
+        out = r.stdout.getvalue()
+        j = json.loads('\n'.join(out.split('\n')[1:]))  # discard the first output line; parse the remainder as JSON
+        up = len(j['osds'])  # NOTE(review): counts every entry under 'osds'; no per-OSD up/down state is inspected -- confirm listed implies booted
+        log.debug('%d of %d OSDs are up' % (up, num_osds))
+        if up == num_osds:
+            break
+        time.sleep(1)  # retry after one second; there is no timeout, so this loops until the counts match
+
 def wait_until_fuse_mounted(remote, fuse, mountpoint):
     while True:
         proc = remote.run(
index ffd7919c284bb7ed528608eba58c4ba513315e3a..931212650bfa7384d6930904dc71b1aa247b0a6c 100644 (file)
@@ -904,6 +904,10 @@ def healthy(ctx, config):
     log.info('Waiting until ceph is healthy...')
     firstmon = teuthology.get_first_mon(ctx, config)
     (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+    teuthology.wait_until_osds_up(
+        cluster=ctx.cluster,
+        remote=mon0_remote
+        )
     teuthology.wait_until_healthy(
         remote=mon0_remote,
         )