git.apps.os.sepia.ceph.com Git - teuthology.git/commitdiff
wait_till_clean -> wait_for_clean and wait_for_recovery
author Sage Weil <sage@newdream.net>
Sat, 18 Feb 2012 05:53:25 +0000 (21:53 -0800)
committer Sage Weil <sage@newdream.net>
Sat, 18 Feb 2012 05:53:25 +0000 (21:53 -0800)
Clean now also means we have the correct number of replicas, whereas
recovered means we have done all the recovery work we can given the
replicas/OSDs we have. In particular, degraded and clean are now
mutually exclusive.

Also move away from 'till'.
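
In placement-group terms the distinction boils down to two state tests. The
sketch below is illustrative only and not part of the commit; it assumes
get_num_active_clean() checks the PG state string for 'active' and 'clean',
the same way the new get_num_active_recovered() checks for 'active' without
'recovering' (see the ceph_manager.py hunk further down):

    # Illustrative only: condensed PG-state tests, given a pg dict as
    # returned by CephManager.get_pg_stats().
    def pg_is_clean(pg):
        # clean: active with the full complement of replicas ("active+clean")
        return 'active' in pg['state'] and 'clean' in pg['state']

    def pg_is_recovered(pg):
        # recovered: active, and no recovery work left for the OSDs we have;
        # the PG may still be degraded, so recovered does not imply clean
        return 'active' in pg['state'] and 'recovering' not in pg['state']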

teuthology/task/backfill.py
teuthology/task/ceph_manager.py
teuthology/task/lost_unfound.py
teuthology/task/thrashosds.py

diff --git a/teuthology/task/backfill.py b/teuthology/task/backfill.py
index b188ed58eb474da94903d5fa2905da768c63260b..f7dda6643a13fec86b8a38b94969bc0325343fae 100644 (file)
@@ -50,7 +50,7 @@ def task(ctx, config):
     manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_clean()
 
     # write some data
     p = rados_start(mon, ['-p', 'data', 'bench', '15', 'write', '-b', '4096'])
@@ -67,7 +67,7 @@ def task(ctx, config):
     # wait for everything to peer and be happy...
     manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_recovery()
 
     # write some new data
     p = rados_start(mon, ['-p', 'data', 'bench', '30', 'write', '-b', '4096'])
@@ -87,12 +87,12 @@ def task(ctx, config):
     # cluster must recover
     manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_recovery()
 
     # re-add osd.0
     manager.revive_osd(0)
     manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_clean()
 
 
diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py
index 13273afc8f83e81d09880e0ae314dc66578a459d..b946b49295de16ed0f2abb307a36762f81b9abeb 100644 (file)
@@ -8,7 +8,7 @@ import json
 class Thrasher(gevent.Greenlet):
     def __init__(self, manager, config, logger=None):
         self.ceph_manager = manager
-        self.ceph_manager.wait_till_clean()
+        self.ceph_manager.wait_for_clean()
         osd_status = self.ceph_manager.get_osd_status()
         self.in_osds = osd_status['in']
         self.live_osds = osd_status['live']
@@ -124,7 +124,7 @@ class Thrasher(gevent.Greenlet):
             if random.uniform(0,1) < (float(delay) / cleanint):
                 while len(self.dead_osds) > maxdead:
                     self.revive_osd()
-                self.ceph_manager.wait_till_clean(
+                self.ceph_manager.wait_for_recovery(
                     timeout=self.config.get('timeout')
                     )
             self.choose_action()()
@@ -226,6 +226,14 @@ class CephManager:
                 num += 1
         return num
 
+    def get_num_active_recovered(self):
+        pgs = self.get_pg_stats()
+        num = 0
+        for pg in pgs:
+            if pg['state'].count('active') and not pg['state'].count('recovering'):
+                num += 1
+        return num
+
     def get_num_active(self):
         pgs = self.get_pg_stats()
         num = 0
@@ -237,8 +245,11 @@ class CephManager:
     def is_clean(self):
         return self.get_num_active_clean() == self.get_num_pgs()
 
-    def wait_till_clean(self, timeout=None):
-        self.log("waiting till clean")
+    def is_recovered(self):
+        return self.get_num_active_recovered() == self.get_num_pgs()
+
+    def wait_for_clean(self, timeout=None):
+        self.log("waiting for clean")
         start = time.time()
         num_active_clean = self.get_num_active_clean()
         while not self.is_clean():
@@ -252,6 +263,21 @@ class CephManager:
             time.sleep(3)
         self.log("clean!")
 
+    def wait_for_recovery(self, timeout=None):
+        self.log("waiting for recovery to complete")
+        start = time.time()
+        num_active_recovered = self.get_num_active_recovered()
+        while not self.is_recovered():
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'failed to recover before timeout expired'
+            cur_active_recovered = self.get_num_active_recovered()
+            if cur_active_recovered != num_active_recovered:
+                start = time.time()
+                num_active_recovered = cur_active_recovered
+            time.sleep(3)
+        self.log("recovered!")
+
     def osd_is_up(self, osd):
         osds = self.get_osd_dump()
         return osds[osd]['up'] > 0
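
Taken together with the backfill.py hunk above, the intended division of
labour is: call wait_for_recovery() while an OSD is down or out (the cluster
stays degraded, so clean is unreachable), and wait_for_clean() once every OSD
is back in. A condensed, hypothetical sketch using only helpers visible in
this commit (how osd.0 gets killed and marked out is elided):

    # ... osd.0 is killed and marked out here (elided) ...
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_recovery()   # degraded is acceptable; recovery work done
    manager.revive_osd(0)
    manager.wait_till_osd_is_up(0)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.wait_for_clean()      # full replica count again; expect active+clean
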
diff --git a/teuthology/task/lost_unfound.py b/teuthology/task/lost_unfound.py
index 4ea751112b45cb21d92bb681c93ef7992d23584c..973f241af93457ef3fc4d62c00661e0706f7565e 100644 (file)
@@ -45,7 +45,7 @@ def task(ctx, config):
     manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_clean()
 
     # something that is always there
     dummyfile = '/etc/fstab'
@@ -60,7 +60,7 @@ def task(ctx, config):
 
     manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_recovery()
 
     # create old objects
     for f in range(1, 10):
@@ -135,7 +135,7 @@ def task(ctx, config):
     manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
     manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_recovery()
 
     # verify result
     for f in range(1, 10):
@@ -149,4 +149,4 @@ def task(ctx, config):
     # see if osd.1 can cope
     manager.revive_osd(1)
     manager.wait_till_osd_is_up(1)
-    manager.wait_till_clean()
+    manager.wait_for_clean()
diff --git a/teuthology/task/thrashosds.py b/teuthology/task/thrashosds.py
index ab5fc9b357ee8ce66dbc936fd0af1c9eccf800d8..2678e60e56b5eed6aecb58133b1e0648cc1a2fd2 100644 (file)
@@ -82,4 +82,4 @@ def task(ctx, config):
     finally:
         log.info('joining thrashosds')
         thrash_proc.do_join()
-        manager.wait_till_clean(config.get('timeout', 360))
+        manager.wait_for_recovery(config.get('timeout', 360))
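
One behavioural note on this last change: the value passed here bounds time
without progress, not total wall-clock time, because wait_for_recovery()
resets its start time whenever the count of active+recovered PGs changes
(see the ceph_manager.py hunk above). For example:

    # Gives up only after 360 seconds with no change in the number of
    # active+recovered PGs; total recovery time may be longer than that.
    manager.wait_for_recovery(timeout=360)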