qa/tasks/ceph: wait for pg stats to flush in healthy check
author Sage Weil <sage@redhat.com>
Mon, 24 Jul 2017 21:05:11 +0000 (17:05 -0400)
committer Sage Weil <sage@redhat.com>
Thu, 27 Jul 2017 16:10:27 +0000 (12:10 -0400)
Signed-off-by: Sage Weil <sage@redhat.com>
qa/tasks/ceph.py
qa/tasks/ceph_manager.py

diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py
index 5fe19c1ed1f8a474da65a3769d9c054a70c11b11..05c4c7d90abf3eda87c8a7e81bedec39637e0715 100644
@@ -1222,7 +1222,13 @@ def healthy(ctx, config):
     """
     config = config if isinstance(config, dict) else dict()
     cluster_name = config.get('cluster', 'ceph')
-    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
+    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
+    manager = ctx.managers[cluster_name]
+    try:
+        manager.wait_for_mgr_available()
+    except run.CommandFailedError:
+        log.info('ignoring mgr wait error, probably testing upgrade')
+
     firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
     (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
     teuthology.wait_until_osds_up(
@@ -1231,6 +1237,14 @@ def healthy(ctx, config):
         remote=mon0_remote,
         ceph_cluster=cluster_name,
     )
+
+    try:
+        manager.flush_all_pg_stats()
+    except run.CommandFailedError:
+        log.info('ignoring flush pg stats error, probably testing upgrade')
+    manager.wait_for_clean()
+
+    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
     teuthology.wait_until_healthy(
         ctx,
         remote=mon0_remote,
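
The new gate in healthy() first flushes outstanding PG stats (so the mon/mgr sees current PG state) and waits for every PG to report clean before the existing wait_until_healthy() call. A rough standalone approximation of that sequence, outside teuthology, is sketched below; it assumes the ceph CLI is on PATH, and the `ceph status` pgmap fields (num_pgs, pgs_by_state) reflect typical luminous-era output rather than anything this commit guarantees.

# Rough standalone approximation of the flush + wait-for-clean gate
# (a sketch, not the teuthology code).  Assumes the ceph CLI is in PATH.
import json
import subprocess
import time

def ceph(*args):
    return subprocess.check_output(('ceph',) + args)

def flush_all_pg_stats():
    # Ask every OSD to push its PG stats to the mon/mgr immediately.
    for osd in json.loads(ceph('osd', 'ls', '--format=json')):
        ceph('tell', 'osd.%d' % osd, 'flush_pg_stats')

def wait_for_clean(timeout=300, interval=3):
    start = time.time()
    while True:
        pgmap = json.loads(ceph('status', '--format=json'))['pgmap']
        clean = sum(s['count'] for s in pgmap.get('pgs_by_state', [])
                    if s['state_name'] == 'active+clean')
        if clean == pgmap['num_pgs']:
            return
        assert time.time() - start < timeout, 'timed out waiting for clean PGs'
        time.sleep(interval)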
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index 757a7faf23af9fd257f281be6525a7e9b476aa88..b2f687e49bb2a89030d30a8286248f871402e3a3 100644
@@ -1989,6 +1989,10 @@ class CephManager:
         """
         return self.get_osd_dump_json()['osds']
 
+    def get_mgr_dump(self):
+        out = self.raw_cluster_cmd('mgr', 'dump', '--format=json')
+        return json.loads(out)
+
     def get_stuck_pgs(self, type_, threshold):
         """
         :returns: stuck pg information from the cluster
@@ -2182,6 +2186,20 @@ class CephManager:
         for pool in pools:
             self.wait_for_pool(pool)
 
+    def is_mgr_available(self):
+        x = self.get_mgr_dump()
+        return x.get('available', False)
+
+    def wait_for_mgr_available(self, timeout=None):
+        self.log("waiting for mgr available")
+        start = time.time()
+        while not self.is_mgr_available():
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'timeout expired in wait_for_mgr_available'
+            time.sleep(3)
+        self.log("mgr available!")
+
     def wait_for_recovery(self, timeout=None):
         """
         Check peering. When this exists, we have recovered.