git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
qa: bump up scrub status command timeout 55915/head
author Milind Changire <mchangir@redhat.com>
Sat, 11 Nov 2023 03:39:55 +0000 (09:09 +0530)
committer Milind Changire <mchangir@redhat.com>
Mon, 4 Mar 2024 09:43:32 +0000 (15:13 +0530)
A journal flush sometimes takes more than 120 seconds, so the 'scrub
status' command, after blocking for more than 120 seconds, is declared
failed, which in turn causes the job to be declared as failed.

Bumping up the timeout gives the 'scrub status' command more time to
wait, allowing the journal flush to eventually complete.

Fixes: https://tracker.ceph.com/issues/63411
Signed-off-by: Milind Changire <mchangir@redhat.com>
(cherry picked from commit 33899fdaac57db5266940f59f5ef956932aa1714)

Conflicts:
qa/tasks/ceph_manager.py
- fixed diff between main and reef
qa/tasks/cephfs/filesystem.py
- fixed diff between main and reef
qa/tasks/vstart_runner.py
- fixed diff between main and reef

qa/tasks/ceph_manager.py
qa/tasks/cephfs/filesystem.py
qa/tasks/vstart_runner.py

index ac26a96315e50ea2941a6dfae3a09426fc86ac12..e24965026c4b95d0f4af52f6b6b622b1b3b7f5d4 100644 (file)
@@ -1540,11 +1540,9 @@ class CephManager:
         self.cephadm = cephadm
         self.testdir = teuthology.get_testdir(self.ctx)
         # prefix args for ceph cmds to be executed
-        pre = ['adjust-ulimits', 'ceph-coverage',
-               f'{self.testdir}/archive/coverage']
-        self.CEPH_CMD = ['sudo'] + pre + ['timeout', '120', 'ceph',
-                                          '--cluster', self.cluster]
-        self.RADOS_CMD = pre + ['rados', '--cluster', self.cluster]
+        self.pre = ['adjust-ulimits', 'ceph-coverage',
+                    f'{self.testdir}/archive/coverage']
+        self.RADOS_CMD = self.pre + ['rados', '--cluster', self.cluster]
         self.run_ceph_w_prefix = ['sudo', 'daemon-helper', 'kill', 'ceph',
                                   '--cluster', self.cluster]
 
@@ -1557,6 +1555,11 @@ class CephManager:
             except CommandFailedError:
                 self.log('Failed to get pg_num from pool %s, ignoring' % pool)
 
+    def get_ceph_cmd(self, **kwargs):
+        timeout = kwargs.pop('timeout', 120)
+        return ['sudo'] + self.pre + ['timeout', f'{timeout}', 'ceph',
+                                      '--cluster', self.cluster]
+
     def ceph(self, cmd, **kwargs):
         """
         Simple Ceph admin command wrapper around run_cluster_cmd.
@@ -1600,7 +1603,7 @@ class CephManager:
                            stdout=StringIO(),
                            check_status=kwargs.get('check_status', True))
         else:
-            kwargs['args'] = prefixcmd + self.CEPH_CMD + kwargs['args']
+            kwargs['args'] = prefixcmd + self.get_ceph_cmd(**kwargs) + kwargs['args']
             return self.controller.run(**kwargs)
 
     def raw_cluster_cmd(self, *args, **kwargs) -> str:
index a3ef9f1ec104d59ab19065dbd1807d1fd551d75b..813a74d57c88270fcc83bcc3b6b7443e9c05bb0a 100644 (file)
@@ -1262,9 +1262,9 @@ class Filesystem(MDSCluster):
         info = self.get_rank(rank=rank, status=status)
         return self.json_asok(command, 'mds', info['name'], timeout=timeout)
 
-    def rank_tell(self, command, rank=0, status=None):
+    def rank_tell(self, command, rank=0, status=None, timeout=120):
         try:
-            out = self.mon_manager.raw_cluster_cmd("tell", f"mds.{self.id}:{rank}", *command)
+            out = self.mon_manager.raw_cluster_cmd("tell", f"mds.{self.id}:{rank}", *command, timeout=timeout)
             return json.loads(out)
         except json.decoder.JSONDecodeError:
             log.error("could not decode: {}".format(out))
@@ -1679,11 +1679,11 @@ class Filesystem(MDSCluster):
         self.set_max_mds(new_max_mds)
         return self.wait_for_daemons()
 
-    def run_scrub(self, cmd, rank=0):
-        return self.rank_tell(["scrub"] + cmd, rank)
+    def run_scrub(self, cmd, rank=0, timeout=300):
+        return self.rank_tell(["scrub"] + cmd, rank=rank, timeout=timeout)
 
     def get_scrub_status(self, rank=0):
-        return self.run_scrub(["status"], rank)
+        return self.run_scrub(["status"], rank=rank, timeout=300)
 
     def flush(self, rank=0):
         return self.rank_tell(["flush", "journal"], rank=rank)
@@ -1695,7 +1695,7 @@ class Filesystem(MDSCluster):
             result = "no active scrubs running"
         with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
             while proceed():
-                out_json = self.rank_tell(["scrub", "status"], rank=rank)
+                out_json = self.rank_tell(["scrub", "status"], rank=rank, timeout=timeout)
                 assert out_json is not None
                 if not reverse:
                     if result in out_json['status']:
index df4886fb669757bc62ea2b58cf3a5b219ffd9ca0..3d429c2653e382d708e4bc8242c29c98ee53b71f 100644 (file)
@@ -777,9 +777,11 @@ class LocalCephManager(CephManager):
         self.rook = False
         self.testdir = None
         self.run_ceph_w_prefix = self.run_cluster_cmd_prefix = [CEPH_CMD]
-        self.CEPH_CMD = [CEPH_CMD]
         self.RADOS_CMD = [RADOS_CMD]
 
+    def get_ceph_cmd(self, **kwargs):
+        return [CEPH_CMD]
+
     def find_remote(self, daemon_type, daemon_id):
         """
         daemon_type like 'mds', 'osd'