From: Sage Weil
Date: Mon, 24 Jul 2017 21:05:11 +0000 (-0400)
Subject: qa/tasks/ceph: wait for pg stats to flush in healthy check
X-Git-Tag: v12.1.2~63^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a40d94b1632d9cf71e6f6683d41f1fc244103f00;p=ceph.git

qa/tasks/ceph: wait for pg stats to flush in healthy check

Signed-off-by: Sage Weil
---

diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py
index 5fe19c1ed1f8..05c4c7d90abf 100644
--- a/qa/tasks/ceph.py
+++ b/qa/tasks/ceph.py
@@ -1222,7 +1222,13 @@ def healthy(ctx, config):
     """
     config = config if isinstance(config, dict) else dict()
     cluster_name = config.get('cluster', 'ceph')
-    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
+    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
+    manager = ctx.managers[cluster_name]
+    try:
+        manager.wait_for_mgr_available()
+    except run.CommandFailedError:
+        log.info('ignoring mgr wait error, probably testing upgrade')
+
     firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
     (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
     teuthology.wait_until_osds_up(
@@ -1231,6 +1237,14 @@ def healthy(ctx, config):
         remote=mon0_remote,
         ceph_cluster=cluster_name,
     )
+
+    try:
+        manager.flush_all_pg_stats()
+    except run.CommandFailedError:
+        log.info('ignoring flush pg stats error, probably testing upgrade')
+    manager.wait_for_clean()
+
+    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
     teuthology.wait_until_healthy(
         ctx,
         remote=mon0_remote,
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index 757a7faf23af..b2f687e49bb2 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -1989,6 +1989,10 @@ class CephManager:
         """
         return self.get_osd_dump_json()['osds']

+    def get_mgr_dump(self):
+        out = self.raw_cluster_cmd('mgr', 'dump', '--format=json')
+        return json.loads(out)
+
     def get_stuck_pgs(self, type_, threshold):
         """
         :returns: stuck pg information from the cluster
@@ -2182,6 +2186,20 @@ class CephManager:
         for pool in pools:
             self.wait_for_pool(pool)

+    def is_mgr_available(self):
+        x = self.get_mgr_dump()
+        return x.get('available', False)
+
+    def wait_for_mgr_available(self, timeout=None):
+        self.log("waiting for mgr available")
+        start = time.time()
+        while not self.is_mgr_available():
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'timeout expired in wait_for_mgr_available'
+            time.sleep(3)
+        self.log("mgr available!")
+
     def wait_for_recovery(self, timeout=None):
         """
         Check peering. When this exists, we have recovered.
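
The new healthy() flow relies on two CephManager helpers added here: wait_for_mgr_available(), which polls the mgr map until the active mgr reports 'available', and flush_all_pg_stats(), which forces fresh pg stats before wait_for_clean(). The standalone sketch below (not part of the commit) mirrors the mgr-availability poll against the `ceph` CLI; the subprocess wrapper, the --cluster/interval parameters, and the RuntimeError on timeout are illustrative assumptions, not code from the patch, which goes through CephManager.raw_cluster_cmd() instead.

    #!/usr/bin/env python
    # Standalone sketch: poll `ceph mgr dump` until the active mgr reports
    # available, mirroring is_mgr_available()/wait_for_mgr_available() above.
    # Assumes a local `ceph` CLI with access to the cluster.

    import json
    import subprocess
    import time


    def mgr_dump(cluster='ceph'):
        # `ceph mgr dump --format=json` returns the mgr map as JSON,
        # including the 'available' flag checked by the patch.
        out = subprocess.check_output(
            ['ceph', '--cluster', cluster, 'mgr', 'dump', '--format=json'])
        return json.loads(out)


    def wait_for_mgr_available(cluster='ceph', timeout=None, interval=3):
        # Poll every few seconds, like the new CephManager helper, and
        # optionally give up after `timeout` seconds.
        start = time.time()
        while not mgr_dump(cluster).get('available', False):
            if timeout is not None and time.time() - start > timeout:
                raise RuntimeError('timed out waiting for mgr availability')
            time.sleep(interval)


    if __name__ == '__main__':
        wait_for_mgr_available(timeout=300)
        print('mgr available')

Both the mgr wait and the pg stats flush are wrapped in try/except run.CommandFailedError in healthy() because, during upgrade tests, the monitors or mgr may still be running a release that does not support these commands; in that case the check falls back to the plain wait_until_healthy() path.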