From: Sridhar Seshasayee Date: Wed, 19 May 2021 15:22:15 +0000 (+0530) Subject: qa/tasks: Enhance wait_until_true() to check & retry recovery progress X-Git-Tag: v17.1.0~1765^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=328271d587d099e78dcd020c17e7465043c1bb6b;p=ceph.git qa/tasks: Enhance wait_until_true() to check & retry recovery progress With the mclock scheduler enabled, the recovery throughput is throttled based on factors like the type of mclock profile enabled and the OSD capacity, among others. Due to this, the recovery times may vary and therefore the existing timeout of 120 secs may not be sufficient. To address the above, a new method called _is_inprogress_or_complete() is introduced in the TestProgress class that checks if the event with the specified 'id' is in progress by checking the 'progress' key of the progress command response. This method also handles the corner case where the event completes just before it's called. The existing wait_until_true() method in the CephTestCase class is modified to accept another function argument called "check_fn". This is set to the _is_inprogress_or_complete() function described earlier in the "test_turn_off_module" test that has been observed to fail due to the reasons already described above. A retry mechanism of a maximum of 5 attempts is introduced after the first timeout is hit. This means that the wait can extend by up to an additional 600 secs (120 secs * 5 retries) beyond the initial 120 secs timeout — i.e. up to 720 secs in total — as long as there is recovery progress reported by the 'ceph progress' command result. 
Signed-off-by: Sridhar Seshasayee --- diff --git a/qa/tasks/ceph_test_case.py b/qa/tasks/ceph_test_case.py index 7f1dccc986d9..2ca17b34ce37 100644 --- a/qa/tasks/ceph_test_case.py +++ b/qa/tasks/ceph_test_case.py @@ -192,16 +192,22 @@ class CephTestCase(unittest.TestCase): log.debug("wait_until_equal: success") @classmethod - def wait_until_true(cls, condition, timeout, period=5): + def wait_until_true(cls, condition, timeout, check_fn=None, period=5): elapsed = 0 + retry_count = 0 while True: if condition(): - log.debug("wait_until_true: success in {0}s".format(elapsed)) + log.debug("wait_until_true: success in {0}s and {1} retries".format(elapsed, retry_count)) return else: if elapsed >= timeout: - raise TestTimeoutError("Timed out after {0}s".format(elapsed)) + if check_fn and check_fn() and retry_count < 5: + elapsed = 0 + retry_count += 1 + log.debug("wait_until_true: making progress, waiting (timeout={0} retry_count={1})...".format(timeout, retry_count)) + else: + raise TestTimeoutError("Timed out after {0}s and {1} retries".format(elapsed, retry_count)) else: - log.debug("wait_until_true: waiting (timeout={0})...".format(timeout)) + log.debug("wait_until_true: waiting (timeout={0} retry_count={1})...".format(timeout, retry_count)) time.sleep(period) elapsed += period diff --git a/qa/tasks/mgr/test_progress.py b/qa/tasks/mgr/test_progress.py index 0e03c2b663ac..cf992e22d5cb 100644 --- a/qa/tasks/mgr/test_progress.py +++ b/qa/tasks/mgr/test_progress.py @@ -243,6 +243,13 @@ class TestProgress(MgrTestCase): assert ev_id in live_ids return False + def _is_inprogress_or_complete(self, ev_id): + for ev in self._events_in_progress(): + if ev['id'] == ev_id: + return ev['progress'] > 0 + # check if the event completed + return self._is_complete(ev_id) + def tearDown(self): if self.POOL in self.mgr_cluster.mon_manager.pools: self.mgr_cluster.mon_manager.remove_pool(self.POOL) @@ -396,5 +403,6 @@ class TestProgress(MgrTestCase): log.info(json.dumps(ev1, indent=1)) 
self.wait_until_true(lambda: self._is_complete(ev1['id']), + check_fn=lambda: self._is_inprogress_or_complete(ev1['id']), timeout=self.RECOVERY_PERIOD) self.assertTrue(self._is_quiet())