From: Xiubo Li <xiubli@redhat.com>
Date: Wed, 23 Dec 2020 01:25:37 +0000 (+0800)
Subject: qa: move wait_until_scrub_complete helper to filesystem class
X-Git-Tag: v16.2.0~20^2~5
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=de84ed971f151acae299efb3e14afad559fbbb49;p=ceph.git

qa: move wait_until_scrub_complete helper to filesystem class

Fixes: https://tracker.ceph.com/issues/48559
Signed-off-by: Xiubo Li <xiubli@redhat.com>
(cherry picked from commit b1f20a37c729f29739d3782f688dcfa5b2992d74)
---

diff --git a/qa/tasks/cephfs/cephfs_test_case.py b/qa/tasks/cephfs/cephfs_test_case.py
index 9d689456c9d..f72ca26c3da 100644
--- a/qa/tasks/cephfs/cephfs_test_case.py
+++ b/qa/tasks/cephfs/cephfs_test_case.py
@@ -13,9 +13,6 @@ from teuthology.misc import sudo_write_file
 from teuthology.orchestra import run
 from teuthology.orchestra.run import CommandFailedError
 
-from teuthology.contextutil import safe_while
-
-
 log = logging.getLogger(__name__)
 
 def for_teuthology(f):
@@ -381,13 +378,11 @@ class CephFSTestCase(CephTestCase):
         except contextutil.MaxWhileTries as e:
             raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e
 
-    def _wait_until_scrub_complete(self, path="/", recursive=True):
+    def _wait_until_scrub_complete(self, path="/", recursive=True, timeout=100):
         out_json = self.fs.rank_tell(["scrub", "start", path] + ["recursive"] if recursive else [])
-        with safe_while(sleep=10, tries=10) as proceed:
-            while proceed():
-                out_json = self.fs.rank_tell(["scrub", "status"])
-                if out_json['status'] == "no active scrubs running":
-                    break;
+        if not self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"],
+                                                 sleep=10, timeout=timeout):
+            log.info("timed out waiting for scrub to complete")
 
     def _wait_distributed_subtrees(self, count, status=None, rank=None, path=None):
         try:
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index 226eb02d192..0dd9d5e048e 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -18,6 +18,7 @@ from teuthology.exceptions import CommandFailedError
 from teuthology import misc
 from teuthology.nuke import clear_firewall
 from teuthology.parallel import parallel
+from teuthology import contextutil
 from tasks.ceph_manager import write_conf
 from tasks import ceph_manager
 
@@ -1575,3 +1576,26 @@ class Filesystem(MDSCluster):
         assert(new_max_mds < oldmax)
         self.set_max_mds(new_max_mds)
         return self.wait_for_daemons()
+
+    def wait_until_scrub_complete(self, result=None, tag=None, rank=0, sleep=30, timeout=300):
+        # poll scrub status every "sleep" seconds, for at most "timeout" seconds
+        if result is None:
+            result = "no active scrubs running"
+        with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
+            while proceed():
+                out_json = self.rank_tell(["scrub", "status"], rank=rank)
+                assert out_json is not None
+                if result in out_json['status']:
+                    log.info("all active scrubs completed")
+                    return True
+
+                if tag is not None:
+                    status = out_json['scrubs'][tag]
+                    if status is not None:
+                        log.info(f"scrub status for tag:{tag} - {status}")
+                    else:
+                        log.info(f"scrub has completed for tag:{tag}")
+                        return True
+
+        # timed out waiting for scrub to complete
+        return False
diff --git a/qa/tasks/cephfs/test_scrub_checks.py b/qa/tasks/cephfs/test_scrub_checks.py
index 381dcbaf85e..6463815a53e 100644
--- a/qa/tasks/cephfs/test_scrub_checks.py
+++ b/qa/tasks/cephfs/test_scrub_checks.py
@@ -77,7 +77,7 @@ done
 
         # abort and verify
         self._abort_scrub(0)
-        self.wait_until_true(lambda: "no active" in self._get_scrub_status()['status'], 30)
+        self.fs.wait_until_scrub_complete(sleep=5, timeout=30)
 
         # sleep enough to fetch updated task status
         checked = self._check_task_status_na()
@@ -298,7 +298,7 @@ class TestScrubChecks(CephFSTestCase):
         self.assertFalse(_check_and_clear_damage(ino, "backtrace"));
         self.fs.rados(["rmxattr", rados_obj_name, "parent"], pool=self.fs.get_data_pool_name())
         self.tell_command(mds_rank, command, success_validator)
-        self.wait_until_true(lambda: "no active" in _get_scrub_status()['status'], 30)
+        self.fs.wait_until_scrub_complete(sleep=5, timeout=30)
         self.assertTrue(_check_and_clear_damage(ino, "backtrace"));
 
         command = "flush_path /"
diff --git a/qa/tasks/fwd_scrub.py b/qa/tasks/fwd_scrub.py
index 15393c20992..c8866c1901a 100644
--- a/qa/tasks/fwd_scrub.py
+++ b/qa/tasks/fwd_scrub.py
@@ -8,8 +8,6 @@ from gevent import sleep, GreenletExit
 from gevent.greenlet import Greenlet
 from gevent.event import Event
 from teuthology import misc as teuthology
-from teuthology import contextutil
-from teuthology.orchestra.run import CommandFailedError
 
 from tasks import ceph_manager
 from tasks.cephfs.filesystem import MDSCluster, Filesystem
@@ -73,29 +71,8 @@ class ForwardScrubber(Thrasher, Greenlet):
         assert out_json['return_code'] == 0
         assert out_json['mode'] == 'asynchronous'
 
-        return self._wait_until_scrub_complete(tag)
-
-    def _wait_until_scrub_complete(self, tag):
-        # time out after scrub_timeout seconds and assume as done
-        with contextutil.safe_while(sleep=30, tries=self.scrub_timeout//30) as proceed:
-            while proceed():
-                try:
-                    out_json = self.fs.rank_tell(["scrub", "status"])
-                    assert out_json is not None
-                    if out_json['status'] == "no active scrubs running":
-                        self.logger.info("all active scrubs completed")
-                        return
-
-                    status = out_json['scrubs'][tag]
-                    if status is not None:
-                        self.logger.info(f"scrub status for tag:{tag} - {status}")
-                    else:
-                        self.logger.info(f"scrub has completed for tag:{tag}")
-                        return
-                except CommandFailedError as e:
-                    self.logger.exception(f"exception while getting scrub status: {e}")
-                    self.logger.info("retrying scrub status command in a while")
-                    pass
+        return self.fs.wait_until_scrub_complete(tag=tag, sleep=30,
+                                                 timeout=self.scrub_timeout)
 
 def stop_all_fwd_scrubbers(thrashers):
     for thrasher in thrashers: