From 603b864ba0207ec6e3e30806dec0a7960e3a634e Mon Sep 17 00:00:00 2001
From: Josh Durgin <jdurgin@redhat.com>
Date: Tue, 9 Feb 2021 21:16:46 +0000
Subject: [PATCH] supervisor: kill processes before gathering logs

When a job hits the max job timeout, we need to stop the test programs
before collecting logs. Otherwise, compressing the logs fails with
errors like 'file size changed while zipping' (presumably because the
still-running programs keep writing to them), and we can neither save
the logs nor stop the job.

Signed-off-by: Josh Durgin <jdurgin@redhat.com>
---
 teuthology/dispatcher/supervisor.py | 12 +++++++++++-
 teuthology/kill.py                  | 13 ++++++++-----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py
index ff728745f6..975448ad31 100644
--- a/teuthology/dispatcher/supervisor.py
+++ b/teuthology/dispatcher/supervisor.py
@@ -215,6 +215,15 @@ def run_with_watchdog(process, job_config):
         if total_seconds > teuth_config.max_job_time:
             log.warning("Job ran longer than {max}s. Killing...".format(
                 max=teuth_config.max_job_time))
+            try:
+                # kill processes but do not unlock yet so we can save
+                # the logs, coredumps, etc.
+                kill_job(job_info['name'], job_info['job_id'],
+                         teuth_config.archive_base, job_config['owner'],
+                         save_logs=True)
+            except Exception:
+                log.exception('Failed to kill job')
+
             try:
                 transfer_archives(job_info['name'], job_info['job_id'],
                                   teuth_config.archive_base, job_config)
@@ -222,10 +231,11 @@ def run_with_watchdog(process, job_config):
                 log.exception('Could not save logs')
 
             try:
+                # this time remove everything and unlock the machines
                 kill_job(job_info['name'], job_info['job_id'],
                          teuth_config.archive_base, job_config['owner'])
             except Exception:
-                log.exception('Failed to kill job')
+                log.exception('Failed to kill job and unlock machines')
 
         # calling this without a status just updates the jobs updated time
         report.try_push_job_info(job_info)
diff --git a/teuthology/kill.py b/teuthology/kill.py
index 872883347d..c319638014 100755
--- a/teuthology/kill.py
+++ b/teuthology/kill.py
@@ -62,7 +62,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
         nuke_targets(targets, owner)
 
 
-def kill_job(run_name, job_id, archive_base=None, owner=None):
+def kill_job(run_name, job_id, archive_base=None, owner=None, save_logs=False):
     serializer = report.ResultsSerializer(archive_base)
     job_info = serializer.job_info(run_name, job_id)
     if not owner:
@@ -76,7 +76,7 @@ def kill_job(run_name, job_id, archive_base=None, owner=None):
     # the necessary nodes ain't locked yet, we do not use job_info to get them,
     # but use find_targets():
     targets = find_targets(run_name, owner, job_id)
-    nuke_targets(targets, owner)
+    nuke_targets(targets, owner, save_logs)
 
 
 def find_run_info(serializer, run_name):
@@ -214,7 +214,7 @@ def find_targets(run_name, owner, job_id=None):
     return out_obj
 
 
-def nuke_targets(targets_dict, owner):
+def nuke_targets(targets_dict, owner, save_logs=False):
     targets = targets_dict.get('targets')
     if not targets:
         log.info("No locked machines. Not nuking anything")
@@ -233,11 +233,14 @@ def nuke_targets(targets_dict, owner):
         'teuthology-nuke',
         '-t',
         target_file.name,
-        '--unlock',
-        '-r',
         '--owner',
         owner
     ]
+    if save_logs:
+        nuke_args.extend(['--no-reboot', '--keep-logs'])
+    else:
+        nuke_args.extend(['--reboot-all', '--unlock'])
+
     proc = subprocess.Popen(
         nuke_args,
         stdout=subprocess.PIPE,
-- 
2.39.5