]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
qa: bump up scrub status command timeout 54446/head
authorMilind Changire <mchangir@redhat.com>
Sat, 11 Nov 2023 03:39:55 +0000 (09:09 +0530)
committerMilind Changire <mchangir@redhat.com>
Sat, 11 Nov 2023 03:39:55 +0000 (09:09 +0530)
A journal flush sometimes takes more than 120 seconds and so the 'scrub
status' command after blocking for more than 120 seconds is declared
failed causing the job to be declared as failed.

This bumping up of the timeout gives more time for the 'scrub status'
command to wait and eventually let the journal flush to complete.

Fixes: https://tracker.ceph.com/issues/63411
Signed-off-by: Milind Changire <mchangir@redhat.com>
qa/tasks/ceph_manager.py
qa/tasks/cephfs/filesystem.py
qa/tasks/vstart_runner.py

index 86b57028ee2703a2a33da476c46af8fcea3aa1bd..fa909c1a77ca336a557bd9e4dc264844443b6b49 100644 (file)
@@ -1540,11 +1540,9 @@ class CephManager:
         self.cephadm = cephadm
         self.testdir = teuthology.get_testdir(self.ctx)
         # prefix args for ceph cmds to be executed
-        pre = ['adjust-ulimits', 'ceph-coverage',
-               f'{self.testdir}/archive/coverage']
-        self.CEPH_CMD = ['sudo'] + pre + ['timeout', '120', 'ceph',
-                                          '--cluster', self.cluster]
-        self.RADOS_CMD = pre + ['rados', '--cluster', self.cluster]
+        self.pre = ['adjust-ulimits', 'ceph-coverage',
+                    f'{self.testdir}/archive/coverage']
+        self.RADOS_CMD = self.pre + ['rados', '--cluster', self.cluster]
 
         pools = self.list_pools()
         self.pools = {}
@@ -1555,6 +1553,11 @@ class CephManager:
             except CommandFailedError:
                 self.log('Failed to get pg_num from pool %s, ignoring' % pool)
 
+    def get_ceph_cmd(self, **kwargs):
+        timeout = kwargs.pop('timeout', 120)
+        return ['sudo'] + self.pre + ['timeout', f'{timeout}', 'ceph',
+                                      '--cluster', self.cluster]
+
     def ceph(self, cmd, **kwargs):
         """
         Simple Ceph admin command wrapper around run_cluster_cmd.
@@ -1598,7 +1601,7 @@ class CephManager:
                            stdout=StringIO(),
                            check_status=kwargs.get('check_status', True))
         else:
-            kwargs['args'] = prefixcmd + self.CEPH_CMD + kwargs['args']
+            kwargs['args'] = prefixcmd + self.get_ceph_cmd(**kwargs) + kwargs['args']
             return self.controller.run(**kwargs)
 
     def raw_cluster_cmd(self, *args, **kwargs) -> str:
index 3516bf4b86cded25ba3217d596e2ce69499ef980..7de5653021bff7f31b30bb6fd0c953c9660f137e 100644 (file)
@@ -1295,9 +1295,9 @@ class Filesystem(MDSCluster):
         info = self.get_rank(rank=rank, status=status)
         return self.json_asok(command, 'mds', info['name'], timeout=timeout)
 
-    def rank_tell(self, command, rank=0, status=None):
+    def rank_tell(self, command, rank=0, status=None, timeout=120):
         try:
-            out = self.get_ceph_cmd_stdout("tell", f"mds.{self.id}:{rank}", *command)
+            out = self.get_ceph_cmd_stdout("tell", f"mds.{self.id}:{rank}", *command, timeout=timeout)
             return json.loads(out)
         except json.decoder.JSONDecodeError:
             log.error("could not decode: {}".format(out))
@@ -1712,11 +1712,11 @@ class Filesystem(MDSCluster):
         self.set_max_mds(new_max_mds)
         return self.wait_for_daemons()
 
-    def run_scrub(self, cmd, rank=0):
-        return self.rank_tell(["scrub"] + cmd, rank)
+    def run_scrub(self, cmd, rank=0, timeout=300):
+        return self.rank_tell(["scrub"] + cmd, rank=rank, timeout=timeout)
 
     def get_scrub_status(self, rank=0):
-        return self.run_scrub(["status"], rank)
+        return self.run_scrub(["status"], rank=rank, timeout=300)
 
     def flush(self, rank=0):
         return self.rank_tell(["flush", "journal"], rank=rank)
@@ -1728,7 +1728,7 @@ class Filesystem(MDSCluster):
             result = "no active scrubs running"
         with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
             while proceed():
-                out_json = self.rank_tell(["scrub", "status"], rank=rank)
+                out_json = self.rank_tell(["scrub", "status"], rank=rank, timeout=timeout)
                 assert out_json is not None
                 if not reverse:
                     if result in out_json['status']:
index 96dc9fffba2fd919b0c37c172c42cfb6bfea483c..8d9dc6f1845c54170d993d3d3c8537617906d0d0 100644 (file)
@@ -804,9 +804,11 @@ class LocalCephManager(CephManager):
         self.cephadm = False
         self.rook = False
         self.testdir = None
-        self.CEPH_CMD = [CEPH_CMD]
         self.RADOS_CMD = [RADOS_CMD]
 
+    def get_ceph_cmd(self, **kwargs):
+        return [CEPH_CMD]
+
     def find_remote(self, daemon_type, daemon_id):
         """
         daemon_type like 'mds', 'osd'