]> git-server-git.apps.pok.os.sepia.ceph.com Git - teuthology.git/commitdiff
peer: wait for peering to complete, or block
author Sage Weil <sage@newdream.net>
Sun, 26 Feb 2012 05:05:00 +0000 (21:05 -0800)
committer Sage Weil <sage@newdream.net>
Sun, 26 Feb 2012 05:05:00 +0000 (21:05 -0800)
We need to wait for peering to either complete, or block because it is
waiting for another PG.  _Then_ look at all the PG states and compare the
mon values with what we get from querying the OSDs directly.

teuthology/task/ceph_manager.py
teuthology/task/peer.py

index cf4065d1c0ffa4355c75ef3256b91dc1e1ed377b..f4d85a681da1c297e51532ce2a7b06afd39d4560 100644 (file)
@@ -267,12 +267,24 @@ class CephManager:
                 num += 1
         return num
 
+    def get_num_active_down(self):
+        """
+        Count the PGs whose state mentions 'active' or 'down' but not 'stale'.
+
+        :returns: number of matching entries from self.get_pg_stats()
+        """
+        pgs = self.get_pg_stats()
+        num = 0
+        for pg in pgs:
+            # .count() serves as a containment test here; this assumes
+            # pg['state'] is a state string such as 'active+clean' --
+            # TODO confirm.  An entry flagged 'stale' is never counted.
+            if (pg['state'].count('active') and not pg['state'].count('stale')) or \
+                    (pg['state'].count('down') and not pg['state'].count('stale')):
+                num += 1
+        return num
+
     def is_clean(self):
         """
         Return True when get_num_active_clean() equals get_num_pgs().
         """
         return self.get_num_active_clean() == self.get_num_pgs()
 
     def is_recovered(self):
         """
         Return True when get_num_active_recovered() equals get_num_pgs().
         """
         return self.get_num_active_recovered() == self.get_num_pgs()
 
+    def is_active_or_down(self):
+        """
+        Return True when get_num_active_down() equals get_num_pgs(), i.e.
+        every PG is counted as non-stale and either 'active' or 'down'.
+        """
+        return self.get_num_active_down() == self.get_num_pgs()
+
     def wait_for_clean(self, timeout=None):
         self.log("waiting for clean")
         start = time.time()
@@ -303,6 +315,21 @@ class CephManager:
             time.sleep(3)
         self.log("recovered!")
 
+    def wait_for_active_or_down(self, timeout=None):
+        """
+        Block until is_active_or_down() returns True, polling every 3 seconds.
+
+        :param timeout: maximum number of seconds to tolerate without any
+                        change in get_num_active_down(); the clock restarts
+                        each time that count changes.  None (the default)
+                        disables the check and waits indefinitely.
+        :raises AssertionError: if the timeout elapses with no change in
+                                the count.
+        """
+        self.log("waiting for peering to complete or become blocked")
+        start = time.time()
+        num_active_down = self.get_num_active_down()
+        while not self.is_active_or_down():
+            if timeout is not None:
+                # NOTE(review): the message says "recover" although this
+                # loop waits for the active-or-down condition -- it looks
+                # carried over from another wait loop; confirm before
+                # relying on it when reading logs.
+                assert time.time() - start < timeout, \
+                    'failed to recover before timeout expired'
+            cur_active_down = self.get_num_active_down()
+            if cur_active_down != num_active_down:
+                # The count moved, so restart the timeout window.
+                start = time.time()
+                num_active_down = cur_active_down
+            time.sleep(3)
+        self.log("active or down!")
+
     def osd_is_up(self, osd):
         """
         Return True when the 'up' field of this OSD's entry in
         get_osd_dump() is greater than zero.

         :param osd: key/index into the result of get_osd_dump()
                     (presumably the numeric OSD id -- TODO confirm)
         """
         osds = self.get_osd_dump()
         return osds[osd]['up'] > 0
index 9b820bd6688474020f77bf06b5c72c1e79e89354..096c285a01db5def0165f26bafc9d95df274840e 100644 (file)
@@ -68,6 +68,11 @@ def task(ctx, config):
     manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
 
+    manager.wait_for_active_or_down()
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
+
     # look for down pgs
     num_down_pgs = 0
     pgs = manager.get_pg_stats()