From: Vallari Agrawal
Date: Fri, 13 Mar 2026 08:24:31 +0000 (+0530)
Subject: qa/tasks/nvmeof.py: Fix thrasher daemon_rm revival
X-Git-Tag: v21.0.0~11^2~2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=5fcc8b0e46c1c54b3fec2d5ea39ee20f094f0916;p=ceph.git

qa/tasks/nvmeof.py: Fix thrasher daemon_rm revival

Instead of running "ceph orch daemon restart", wait for the daemon
to come back up on its own during revival. Also improve the
do_checks() retry logic and the logging in the nvmeof.thrasher task.

Fixes: https://tracker.ceph.com/issues/75383
Signed-off-by: Vallari Agrawal
---

diff --git a/qa/tasks/nvmeof.py b/qa/tasks/nvmeof.py
index 54b9b83d5dc2..a417cfa988ff 100644
--- a/qa/tasks/nvmeof.py
+++ b/qa/tasks/nvmeof.py
@@ -7,7 +7,7 @@ from textwrap import dedent
 from gevent.event import Event
 from gevent.greenlet import Greenlet
 from teuthology.task import Task
-from teuthology import misc
+from teuthology import misc, contextutil
 from teuthology.exceptions import ConfigError
 from teuthology.orchestra import run
 from tasks.util import get_remote_for_role
@@ -354,9 +354,9 @@ class NvmeofThrasher(Thrasher, Greenlet):
         self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 4))
         self.daemon_max_thrash_period = int(self.config.get('daemon_max_thrash_period', 30 * 60)) # seconds
 
-        self.min_thrash_delay = int(self.config.get('min_thrash_delay', 60))
+        self.min_thrash_delay = int(self.config.get('min_thrash_delay', 100))
         self.max_thrash_delay = int(self.config.get('max_thrash_delay', self.min_thrash_delay + 30))
-        self.min_revive_delay = int(self.config.get('min_revive_delay', 60))
+        self.min_revive_delay = int(self.config.get('min_revive_delay', 100))
         self.max_revive_delay = int(self.config.get('max_revive_delay', self.min_revive_delay + 30))
 
     def _get_devices(self, remote):
@@ -392,7 +392,8 @@ class NvmeofThrasher(Thrasher, Greenlet):
         Run some checks to see if everything is running well during thrashing.
""" self.log('display and verify stats:') - for retry in range(5): + max_retry = 5 + for retry in range(1, max_retry+1): try: random_gateway_host = None initiator_host = self.checker_host @@ -404,7 +405,6 @@ class NvmeofThrasher(Thrasher, Greenlet): random_gateway_host.run(args=['ceph', 'health', 'detail']) random_gateway_host.run(args=['ceph', '-s']) random_gateway_host.run(args=['ceph', 'nvme-gw', 'show', 'mypool', 'mygroup0']) - initiator_host.run(args=['sudo', 'nvme', 'list']) for dev in self.devices: device_check_cmd = [ @@ -413,8 +413,11 @@ class NvmeofThrasher(Thrasher, Greenlet): ] initiator_host.run(args=device_check_cmd) break - except run.CommandFailedError: + except Exception: self.log(f"retry do_checks() for {retry} time") + if retry == max_retry: + raise + time.sleep(30) # blocking wait def switch_task(self): """ @@ -446,19 +449,20 @@ class NvmeofThrasher(Thrasher, Greenlet): ] chosen_method = self.rng.choice(kill_methods) d_name = '%s.%s' % (daemon.type_, daemon.id_) + self.log('kill {dname} by {method}'.format( + dname=d_name, method=chosen_method)) if chosen_method == "ceph_daemon_stop": daemon.remote.run(args=[ - "ceph", "orch", "daemon", "stop", - d_name - ], check_status=False) + "ceph", "orch", "daemon", "stop", d_name + ]) elif chosen_method == "systemctl_stop": # To bypass is_started logic of CephadmUnit daemon.remote.sh(daemon.stop_cmd, check_status=False) elif chosen_method == "daemon_remove": daemon.remote.run(args=[ - "ceph", "orch", "daemon", "rm", - d_name - ], check_status=False) + "ceph", "orch", "daemon", "rm", d_name + ]) + time.sleep(15) # blocking wait return chosen_method def revive_daemon(self, daemon, killed_method): @@ -475,11 +479,25 @@ class NvmeofThrasher(Thrasher, Greenlet): "ceph", "orch", "daemon", "start", name ]) + elif killed_method == "daemon_remove": + self._wait_for_daemon(daemon) else: daemon.remote.run(args=[ "ceph", "orch", "daemon", "start", name ]) + + def _wait_for_daemon(self, daemon, tries=15): + dname = '%s.%s' % (daemon.type_, daemon.id_) + self.log(f'waiting for {dname} to start...') + retry = 1 + with contextutil.safe_while(sleep=20, tries=tries) as proceed: + while proceed(): + out = daemon.remote.sh(daemon.status_cmd, check_status=False) + if "running" in out: + break + retry += 1 + self.log(f'waiting for {dname} to start (retry: {retry}/{tries})') def do_thrash(self): self.log('start thrashing') @@ -515,7 +533,6 @@ class NvmeofThrasher(Thrasher, Greenlet): f'in {self.daemon_max_thrash_period} seconds.') continue - self.log('kill {label}'.format(label=daemon.id_)) kill_method = self.kill_daemon(daemon) killed_daemons[kill_method].append(daemon) @@ -541,8 +558,8 @@ class NvmeofThrasher(Thrasher, Greenlet): # revive after thrashing for kill_method in killed_daemons: for daemon in killed_daemons[kill_method]: - self.log('reviving {label}'.format(label=daemon.id_)) - # daemon.restart() + self.log('reviving {label} from {kmethod}'.format( + label=daemon.id_, kmethod=kill_method)) self.revive_daemon(daemon, kill_method) # delay before thrashing