From: Zack Cerza
Date: Wed, 15 Sep 2021 17:46:33 +0000 (-0600)
Subject: supervisor: To preserve logs, delay nuking
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=1e2d780cf481cec2b03467422166ab75741450d6;p=teuthology.git

supervisor: To preserve logs, delay nuking

The previous behavior was causing machines to get nuked before any
attempt to fetch logs. If a machine took longer than 60s to become
available, collecting logs would fail. Since we also nuke after this
step, don't bother here.

Fixes: https://tracker.ceph.com/issues/51944
Signed-off-by: Zack Cerza
---

diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py index 4ce9fa8531..8278fe1ed5 100644 --- a/teuthology/dispatcher/supervisor.py +++ b/teuthology/dispatcher/supervisor.py @@ -218,11 +218,11 @@ def run_with_watchdog(process, job_config): log.warning("Job ran longer than {max}s. Killing...".format( max=teuth_config.max_job_time)) try: - # kill processes but do not unlock yet so we can save + # kill processes but do not nuke yet so we can save # the logs, coredumps, etc. 
kill_job(job_info['name'], job_info['job_id'], teuth_config.archive_base, job_config['owner'], - save_logs=True) + skip_nuke=True) except Exception: log.exception('Failed to kill job') diff --git a/teuthology/kill.py b/teuthology/kill.py index b8ae348cf4..770bf8eafa 100755 --- a/teuthology/kill.py +++ b/teuthology/kill.py @@ -67,7 +67,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None, nuke_targets(targets, owner) -def kill_job(run_name, job_id, archive_base=None, owner=None, save_logs=False): +def kill_job(run_name, job_id, archive_base=None, owner=None, skip_nuke=False): serializer = report.ResultsSerializer(archive_base) job_info = serializer.job_info(run_name, job_id) if not owner: @@ -81,7 +81,8 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, save_logs=False): # the necessary nodes ain't locked yet, we do not use job_info to get them, # but use find_targets(): targets = find_targets(run_name, owner, job_id) - nuke_targets(targets, owner, save_logs) + if not skip_nuke: + nuke_targets(targets, owner) def find_run_info(serializer, run_name): @@ -219,7 +220,7 @@ def find_targets(run_name, owner, job_id=None): return out_obj -def nuke_targets(targets_dict, owner, save_logs=False): +def nuke_targets(targets_dict, owner): targets = targets_dict.get('targets') if not targets: log.info("No locked machines. Not nuking anything") @@ -241,10 +242,7 @@ def nuke_targets(targets_dict, owner, save_logs=False): '--owner', owner ] - if save_logs: - nuke_args.extend(['--no-reboot', '--keep-logs']) - else: - nuke_args.extend(['--reboot-all', '--unlock']) + nuke_args.extend(['--reboot-all', '--unlock']) proc = subprocess.Popen( nuke_args,