From 7dfd3d36e9e3ea9c97e85c8c0dcd74d8ef1a7329 Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Tue, 28 Jan 2025 14:48:15 +0530 Subject: [PATCH] qa/tasks/nvmeof.py: Fix do_checks() method All checks currently run on the initiator node; now run all "ceph" commands on one of the gateway hosts instead of initiator nodes. And run "nvme list" and "nvme list-subsys" checks on the initiator node. Add retry (5 times) to do_checks if any command fails. Signed-off-by: Vallari Agrawal --- qa/tasks/nvmeof.py | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/qa/tasks/nvmeof.py b/qa/tasks/nvmeof.py index 33cb51b8c01..aeca32f1b65 100644 --- a/qa/tasks/nvmeof.py +++ b/qa/tasks/nvmeof.py @@ -346,24 +346,29 @@ class NvmeofThrasher(Thrasher, Greenlet): Run some checks to see if everything is running well during thrashing. """ self.log('display and verify stats:') - for d in self.daemons: - d.remote.sh(d.status_cmd, check_status=False) - check_cmd = [ - 'ceph', 'orch', 'ls', '--refresh', - run.Raw('&&'), 'ceph', 'orch', 'ps', '--daemon-type', 'nvmeof', '--refresh', - run.Raw('&&'), 'ceph', 'health', 'detail', - run.Raw('&&'), 'ceph', '-s', - run.Raw('&&'), 'ceph', 'nvme-gw', 'show', 'mypool', 'mygroup0', - run.Raw('&&'), 'sudo', 'nvme', 'list', - ] - self.checker_host.run(args=check_cmd).wait() - - for dev in self.devices: - device_check_cmd = [ - 'sudo', 'nvme', 'list-subsys', dev, - run.Raw('|'), 'grep', 'live optimized' - ] - self.checker_host.run(args=device_check_cmd) + for retry in range(5): + try: + random_gateway_host = None + initiator_host = self.checker_host + for d in self.daemons: + random_gateway_host = d.remote + d.remote.sh(d.status_cmd, check_status=False) + random_gateway_host.run(args=['ceph', 'orch', 'ls', '--refresh']) + random_gateway_host.run(args=['ceph', 'orch', 'ps', '--daemon-type', 'nvmeof', '--refresh']) + random_gateway_host.run(args=['ceph', 'health', 'detail']) + random_gateway_host.run(args=['ceph', 
'-s']) + random_gateway_host.run(args=['ceph', 'nvme-gw', 'show', 'mypool', 'mygroup0']) + + initiator_host.run(args=['sudo', 'nvme', 'list']) + for dev in self.devices: + device_check_cmd = [ + 'sudo', 'nvme', 'list-subsys', dev, + run.Raw('|'), 'grep', 'live optimized' + ] + initiator_host.run(args=device_check_cmd) + break + except run.CommandFailedError: + self.log(f"retry do_checks() for {retry} time") def switch_task(self): """ -- 2.39.5