From: Xiubo Li Date: Wed, 23 Dec 2020 01:25:37 +0000 (+0800) Subject: qa: move wait_until_scrub_complete helper to filesystem class X-Git-Tag: v16.2.0~20^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=de84ed971f151acae299efb3e14afad559fbbb49;p=ceph.git qa: move wait_until_scrub_complete helper to filesystem class Fixes: https://tracker.ceph.com/issues/48559 Signed-off-by: Xiubo Li (cherry picked from commit b1f20a37c729f29739d3782f688dcfa5b2992d74) --- diff --git a/qa/tasks/cephfs/cephfs_test_case.py b/qa/tasks/cephfs/cephfs_test_case.py index 9d689456c9d..f72ca26c3da 100644 --- a/qa/tasks/cephfs/cephfs_test_case.py +++ b/qa/tasks/cephfs/cephfs_test_case.py @@ -13,9 +13,6 @@ from teuthology.misc import sudo_write_file from teuthology.orchestra import run from teuthology.orchestra.run import CommandFailedError -from teuthology.contextutil import safe_while - - log = logging.getLogger(__name__) def for_teuthology(f): @@ -381,13 +378,11 @@ class CephFSTestCase(CephTestCase): except contextutil.MaxWhileTries as e: raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e - def _wait_until_scrub_complete(self, path="/", recursive=True): + def _wait_until_scrub_complete(self, path="/", recursive=True, timeout=100): out_json = self.fs.rank_tell(["scrub", "start", path] + ["recursive"] if recursive else []) - with safe_while(sleep=10, tries=10) as proceed: - while proceed(): - out_json = self.fs.rank_tell(["scrub", "status"]) - if out_json['status'] == "no active scrubs running": - break; + if not self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"], + sleep=10, timeout=timeout): + log.info("timed out waiting for scrub to complete") def _wait_distributed_subtrees(self, count, status=None, rank=None, path=None): try: diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py index 226eb02d192..0dd9d5e048e 100644 --- a/qa/tasks/cephfs/filesystem.py +++ b/qa/tasks/cephfs/filesystem.py @@ -18,6 +18,7 @@ from teuthology.exceptions import CommandFailedError from teuthology import misc from teuthology.nuke import clear_firewall from teuthology.parallel import parallel +from teuthology import contextutil from tasks.ceph_manager import write_conf from tasks import ceph_manager @@ -1575,3 +1576,26 @@ class Filesystem(MDSCluster): assert(new_max_mds < oldmax) self.set_max_mds(new_max_mds) return self.wait_for_daemons() + + def wait_until_scrub_complete(self, result=None, tag=None, rank=0, sleep=30, timeout=300): + # time out after "timeout" seconds and assume as done + if result is None: + result = "no active scrubs running" + with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed: + while proceed(): + out_json = self.rank_tell(["scrub", "status"], rank=rank) + assert out_json is not None + if result in out_json['status']: + log.info("all active scrubs completed") + return True + + if tag is not None: + status = out_json['scrubs'][tag] + if status is not None: + log.info(f"scrub status for tag:{tag} - {status}") + else: + log.info(f"scrub has completed for tag:{tag}") + return True + + # timed out waiting for scrub to complete + return False diff --git a/qa/tasks/cephfs/test_scrub_checks.py b/qa/tasks/cephfs/test_scrub_checks.py index 381dcbaf85e..6463815a53e 100644 --- a/qa/tasks/cephfs/test_scrub_checks.py +++ b/qa/tasks/cephfs/test_scrub_checks.py @@ -77,7 +77,7 @@ done # abort and verify self._abort_scrub(0) - self.wait_until_true(lambda: "no active" in self._get_scrub_status()['status'], 30) + self.fs.wait_until_scrub_complete(sleep=5, timeout=30) # sleep enough to fetch updated task status checked = self._check_task_status_na() @@ -298,7 +298,7 @@ class TestScrubChecks(CephFSTestCase): self.assertFalse(_check_and_clear_damage(ino, "backtrace")); self.fs.rados(["rmxattr", rados_obj_name, "parent"], pool=self.fs.get_data_pool_name()) self.tell_command(mds_rank, command, success_validator) - self.wait_until_true(lambda: "no active" in _get_scrub_status()['status'], 30) + self.fs.wait_until_scrub_complete(sleep=5, timeout=30) self.assertTrue(_check_and_clear_damage(ino, "backtrace")); command = "flush_path /" diff --git a/qa/tasks/fwd_scrub.py b/qa/tasks/fwd_scrub.py index 15393c20992..c8866c1901a 100644 --- a/qa/tasks/fwd_scrub.py +++ b/qa/tasks/fwd_scrub.py @@ -8,8 +8,6 @@ from gevent import sleep, GreenletExit from gevent.greenlet import Greenlet from gevent.event import Event from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.orchestra.run import CommandFailedError from tasks import ceph_manager from tasks.cephfs.filesystem import MDSCluster, Filesystem @@ -73,29 +71,8 @@ class ForwardScrubber(Thrasher, Greenlet): assert out_json['return_code'] == 0 assert out_json['mode'] == 'asynchronous' - return self._wait_until_scrub_complete(tag) - - def _wait_until_scrub_complete(self, tag): - # time out after scrub_timeout seconds and assume as done - with contextutil.safe_while(sleep=30, tries=self.scrub_timeout//30) as proceed: - while proceed(): - try: - out_json = self.fs.rank_tell(["scrub", "status"]) - assert out_json is not None - if out_json['status'] == "no active scrubs running": - self.logger.info("all active scrubs completed") - return - - status = out_json['scrubs'][tag] - if status is not None: - self.logger.info(f"scrub status for tag:{tag} - {status}") - else: - self.logger.info(f"scrub has completed for tag:{tag}") - return - except CommandFailedError as e: - self.logger.exception(f"exception while getting scrub status: {e}") - self.logger.info("retrying scrub status command in a while") - pass + return self.fs.wait_until_scrub_complete(tag=tag, sleep=30, + timeout=self.scrub_timeout) def stop_all_fwd_scrubbers(thrashers): for thrasher in thrashers: