from gevent.event import Event
from gevent.greenlet import Greenlet
from teuthology.task import Task
-from teuthology import misc
+from teuthology import misc, contextutil
from teuthology.exceptions import ConfigError
from teuthology.orchestra import run
from tasks.util import get_remote_for_role
self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 4))
self.daemon_max_thrash_period = int(self.config.get('daemon_max_thrash_period', 30 * 60)) # seconds
- self.min_thrash_delay = int(self.config.get('min_thrash_delay', 60))
+ self.min_thrash_delay = int(self.config.get('min_thrash_delay', 100))
self.max_thrash_delay = int(self.config.get('max_thrash_delay', self.min_thrash_delay + 30))
- self.min_revive_delay = int(self.config.get('min_revive_delay', 60))
+ self.min_revive_delay = int(self.config.get('min_revive_delay', 100))
self.max_revive_delay = int(self.config.get('max_revive_delay', self.min_revive_delay + 30))
def _get_devices(self, remote):
Run some checks to see if everything is running well during thrashing.
"""
self.log('display and verify stats:')
- for retry in range(5):
+ max_retry = 5
+ for retry in range(1, max_retry+1):
try:
random_gateway_host = None
initiator_host = self.checker_host
random_gateway_host.run(args=['ceph', 'health', 'detail'])
random_gateway_host.run(args=['ceph', '-s'])
random_gateway_host.run(args=['ceph', 'nvme-gw', 'show', 'mypool', 'mygroup0'])
-
initiator_host.run(args=['sudo', 'nvme', 'list'])
for dev in self.devices:
device_check_cmd = [
]
initiator_host.run(args=device_check_cmd)
break
- except run.CommandFailedError:
+ except Exception:
self.log(f"retry do_checks() for {retry} time")
+ if retry == max_retry:
+ raise
+ time.sleep(30) # blocking wait
def switch_task(self):
"""
]
chosen_method = self.rng.choice(kill_methods)
d_name = '%s.%s' % (daemon.type_, daemon.id_)
+ self.log('kill {dname} by {method}'.format(
+ dname=d_name, method=chosen_method))
if chosen_method == "ceph_daemon_stop":
daemon.remote.run(args=[
- "ceph", "orch", "daemon", "stop",
- d_name
- ], check_status=False)
+ "ceph", "orch", "daemon", "stop", d_name
+ ])
elif chosen_method == "systemctl_stop":
# To bypass is_started logic of CephadmUnit
daemon.remote.sh(daemon.stop_cmd, check_status=False)
elif chosen_method == "daemon_remove":
daemon.remote.run(args=[
- "ceph", "orch", "daemon", "rm",
- d_name
- ], check_status=False)
+ "ceph", "orch", "daemon", "rm", d_name
+ ])
+ time.sleep(15) # blocking wait
return chosen_method
def revive_daemon(self, daemon, killed_method):
"ceph", "orch", "daemon", "start",
name
])
+ elif killed_method == "daemon_remove":
+ self._wait_for_daemon(daemon)
else:
daemon.remote.run(args=[
"ceph", "orch", "daemon", "start",
name
])
+
+    def _wait_for_daemon(self, daemon, tries=15):
+        """
+        Poll a daemon's status command until its output contains "running".
+
+        Each attempt runs ``daemon.status_cmd`` on ``daemon.remote``; the
+        pacing and the attempt limit are delegated to
+        ``contextutil.safe_while(sleep=20, tries=tries)``.
+
+        :param daemon: daemon handle; its ``type_``, ``id_``, ``remote`` and
+                       ``status_cmd`` attributes are used.
+        :param tries:  maximum number of status checks, passed to safe_while.
+
+        NOTE(review): presumably safe_while raises once ``tries`` is
+        exhausted, so a daemon that never starts fails the caller -- confirm
+        against teuthology.contextutil.
+        """
+        dname = '%s.%s' % (daemon.type_, daemon.id_)
+        self.log(f'waiting for {dname} to start...')
+        # Count of status checks that have failed so far. Starting at 0 keeps
+        # the logged value within 1..tries; starting at 1 and incrementing
+        # before logging reported "2/N" after the first check and "N+1/N"
+        # after the last one.
+        failed_checks = 0
+        with contextutil.safe_while(sleep=20, tries=tries) as proceed:
+            while proceed():
+                # check_status=False: only the command output is inspected,
+                # a non-zero exit status is not treated as an error here.
+                out = daemon.remote.sh(daemon.status_cmd, check_status=False)
+                if "running" in out:
+                    break
+                failed_checks += 1
+                self.log(f'waiting for {dname} to start (retry: {failed_checks}/{tries})')
def do_thrash(self):
self.log('start thrashing')
f'in {self.daemon_max_thrash_period} seconds.')
continue
- self.log('kill {label}'.format(label=daemon.id_))
kill_method = self.kill_daemon(daemon)
killed_daemons[kill_method].append(daemon)
# revive after thrashing
for kill_method in killed_daemons:
for daemon in killed_daemons[kill_method]:
- self.log('reviving {label}'.format(label=daemon.id_))
- # daemon.restart()
+ self.log('reviving {label} from {kmethod}'.format(
+ label=daemon.id_, kmethod=kill_method))
self.revive_daemon(daemon, kill_method)
# delay before thrashing