git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
qa/tasks/quiescer: dump ops in parallel 57302/head
author Patrick Donnelly <pdonnell@redhat.com>
Mon, 6 May 2024 18:02:02 +0000 (14:02 -0400)
committer Patrick Donnelly <pdonnell@redhat.com>
Thu, 16 May 2024 16:11:49 +0000 (12:11 -0400)
Since `ops --flags=locks` takes the mds_lock and dumps thousands of ops, the
dump may take a long time to complete for each individual MDS. The entire quiesce
set may time out (and all quiesce ops be killed) before we finish dumping ops.

Fixes: https://tracker.ceph.com/issues/65823
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
qa/tasks/quiescer.py

index dfec4112e6622ea9a31845878da1f70455efabb3..e5c60538e0bacebf166c32af4a6ed0bfc43a79f9 100644 (file)
@@ -186,17 +186,19 @@ class Quiescer(ThrasherGreenlet):
 
             self.logger.debug(f"Dumping ops on rank {rank} ({name}) to a remote file {remote_path}")
             try:
-                _ = self.fs.rank_tell(['ops', '--flags=locks', f'--path={daemon_path}'], rank=rank)
-                remote_dumps.append((info, remote_path))
+                args = ['tell', f'mds.{self.fs.id}:{rank}', 'ops', '--flags=locks', f'--path={daemon_path}']
+                p = self.fs.run_ceph_cmd(args=args, wait=False, stdout=StringIO())
+                remote_dumps.append((info, remote_path, p))
             except Exception as e:
                 self.logger.error(f"Couldn't execute ops dump on rank {rank}, error: {e}")
 
         # now get the ops from the files
-        for info, remote_path in remote_dumps:
+        for info, remote_path, p in remote_dumps:
+            name = info['name']
+            rank = info['rank']
+            mds_remote = self.fs.mon_manager.find_remote('mds', name)
             try:
-                name = info['name']
-                rank = info['rank']
-                mds_remote = self.fs.mon_manager.find_remote('mds', name)
+                p.wait()
                 blob = misc.get_file(mds_remote, remote_path, sudo=True).decode('utf-8')
                 self.logger.debug(f"read {len(blob)}B of ops from '{remote_path}' on mds.{rank} ({name})")
                 ops_dump = json.loads(blob)
@@ -218,7 +220,8 @@ class Quiescer(ThrasherGreenlet):
                 self.logger.info(f"Pulled {len(ops_dump['ops'])} ops from rank {rank} ({name}) into {out_name}")
             except Exception as e:
                 self.logger.error(f"Couldn't pull ops dump at '{remote_path}' on rank {info['rank']} ({info['name']}), error: {e}")
-            misc.delete_file(mds_remote, remote_path, sudo=True, check=False)
+            finally:
+                misc.delete_file(mds_remote, remote_path, sudo=True, check=False)
 
     def get_set_state_name(self, response, set_id = None):
         if isinstance(response, (str, bytes, bytearray)):