if total_seconds > teuth_config.max_job_time:
log.warning("Job ran longer than {max}s. Killing...".format(
max=teuth_config.max_job_time))
+ try:
+ # kill processes but do not reboot or unlock the machines yet,
+ # so that we can still save the logs, coredumps, etc.
+ kill_job(job_info['name'], job_info['job_id'],
+ teuth_config.archive_base, job_config['owner'],
+ save_logs=True)
+ except Exception:
+ log.exception('Failed to kill job')
+
try:
transfer_archives(job_info['name'], job_info['job_id'],
teuth_config.archive_base, job_config)
log.exception('Could not save logs')
try:
+ # this time remove everything and unlock the machines
kill_job(job_info['name'], job_info['job_id'],
teuth_config.archive_base, job_config['owner'])
except Exception:
- log.exception('Failed to kill job')
+ log.exception('Failed to kill job and unlock machines')
# calling this without a status just updates the jobs updated time
report.try_push_job_info(job_info)
nuke_targets(targets, owner)
-def kill_job(run_name, job_id, archive_base=None, owner=None):
+def kill_job(run_name, job_id, archive_base=None, owner=None, save_logs=False):
serializer = report.ResultsSerializer(archive_base)
job_info = serializer.job_info(run_name, job_id)
if not owner:
# the necessary nodes ain't locked yet, we do not use job_info to get them,
# but use find_targets():
targets = find_targets(run_name, owner, job_id)
- nuke_targets(targets, owner)
+ nuke_targets(targets, owner, save_logs)
def find_run_info(serializer, run_name):
return out_obj
-def nuke_targets(targets_dict, owner):
+def nuke_targets(targets_dict, owner, save_logs=False):
targets = targets_dict.get('targets')
if not targets:
log.info("No locked machines. Not nuking anything")
'teuthology-nuke',
'-t',
target_file.name,
- '--unlock',
- '-r',
'--owner',
owner
]
+ if save_logs:
+ nuke_args.extend(['--no-reboot', '--keep-logs'])
+ else:
+ nuke_args.extend(['--reboot-all', '--unlock'])
+
proc = subprocess.Popen(
nuke_args,
stdout=subprocess.PIPE,