supervisor: kill processes before gathering logs
author Josh Durgin <jdurgin@redhat.com>
Tue, 9 Feb 2021 21:16:46 +0000 (21:16 +0000)
committer Josh Durgin <jdurgin@redhat.com>
Tue, 9 Feb 2021 21:16:51 +0000 (21:16 +0000)
When we hit the max job timeout, we need to stop the test programs
before collecting logs; otherwise we run into errors like 'file size
changed while zipping' when trying to compress them, and we can neither
save the logs nor stop the job.

Signed-off-by: Josh Durgin <jdurgin@redhat.com>
teuthology/dispatcher/supervisor.py
teuthology/kill.py
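
In outline, the timeout path in run_with_watchdog() becomes a three-step
shutdown. A condensed sketch of the patched flow (the job_info lookups are
simplified to local names; the error handling matches the diff below):

    try:
        # 1. stop the test processes, but keep logs, coredumps, and the
        #    machine locks so there is still something left to archive
        kill_job(name, job_id, archive_base, owner, save_logs=True)
    except Exception:
        log.exception('Failed to kill job')

    try:
        # 2. copy the now-quiescent logs off the still-locked test nodes
        transfer_archives(name, job_id, archive_base, job_config)
    except Exception:
        log.exception('Could not save logs')

    try:
        # 3. nuke for real: reboot, remove everything, unlock the machines
        kill_job(name, job_id, archive_base, owner)
    except Exception:
        log.exception('Failed to kill job and unlock machines')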

diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py
index ff728745f694c107cc9e527a55cd3339fe2fa04b..975448ad31f48061ea37282485b4813e6b267058 100644
--- a/teuthology/dispatcher/supervisor.py
+++ b/teuthology/dispatcher/supervisor.py
@@ -215,6 +215,15 @@ def run_with_watchdog(process, job_config):
         if total_seconds > teuth_config.max_job_time:
             log.warning("Job ran longer than {max}s. Killing...".format(
                 max=teuth_config.max_job_time))
+            try:
+                # kill processes but do not unlock yet so we can save
+                # the logs, coredumps, etc.
+                kill_job(job_info['name'], job_info['job_id'],
+                         teuth_config.archive_base, job_config['owner'],
+                         save_logs=True)
+            except Exception:
+                log.exception('Failed to kill job')
+
             try:
                 transfer_archives(job_info['name'], job_info['job_id'],
                                   teuth_config.archive_base, job_config)
@@ -222,10 +231,11 @@ def run_with_watchdog(process, job_config):
                 log.exception('Could not save logs')
 
             try:
+                # this time remove everything and unlock the machines
                 kill_job(job_info['name'], job_info['job_id'],
                          teuth_config.archive_base, job_config['owner'])
             except Exception:
-                log.exception('Failed to kill job')
+                log.exception('Failed to kill job and unlock machines')
 
         # calling this without a status just updates the jobs updated time
         report.try_push_job_info(job_info)
diff --git a/teuthology/kill.py b/teuthology/kill.py
index 872883347ddc1139a6da6d8abef60e6e7f855f18..c319638014ea57322561f797bbec4b1a9cf94c65 100755
--- a/teuthology/kill.py
+++ b/teuthology/kill.py
@@ -62,7 +62,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
         nuke_targets(targets, owner)
 
 
-def kill_job(run_name, job_id, archive_base=None, owner=None):
+def kill_job(run_name, job_id, archive_base=None, owner=None, save_logs=False):
     serializer = report.ResultsSerializer(archive_base)
     job_info = serializer.job_info(run_name, job_id)
     if not owner:
@@ -76,7 +76,7 @@ def kill_job(run_name, job_id, archive_base=None, owner=None):
     # the necessary nodes ain't locked yet, we do not use job_info to get them,
     # but use find_targets():
     targets = find_targets(run_name, owner, job_id)
-    nuke_targets(targets, owner)
+    nuke_targets(targets, owner, save_logs)
 
 
 def find_run_info(serializer, run_name):
@@ -214,7 +214,7 @@ def find_targets(run_name, owner, job_id=None):
     return out_obj
 
 
-def nuke_targets(targets_dict, owner):
+def nuke_targets(targets_dict, owner, save_logs=False):
     targets = targets_dict.get('targets')
     if not targets:
         log.info("No locked machines. Not nuking anything")
@@ -233,11 +233,14 @@ def nuke_targets(targets_dict, owner):
         'teuthology-nuke',
         '-t',
         target_file.name,
-        '--unlock',
-        '-r',
         '--owner',
         owner
     ]
+    if save_logs:
+        nuke_args.extend(['--no-reboot', '--keep-logs'])
+    else:
+        nuke_args.extend(['--reboot-all', '--unlock'])
+
     proc = subprocess.Popen(
         nuke_args,
         stdout=subprocess.PIPE,
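
For reference, a minimal sketch of the two teuthology-nuke command lines
nuke_targets() can now build (a hypothetical helper; only the flags come
from the change above, the target file and owner are illustrative):

    def nuke_command(target_file, owner, save_logs=False):
        cmd = ['teuthology-nuke', '-t', target_file, '--owner', owner]
        if save_logs:
            # first pass: stop test processes, keep logs and machine locks
            cmd.extend(['--no-reboot', '--keep-logs'])
        else:
            # final pass (the default): reboot everything and release locks
            cmd.extend(['--reboot-all', '--unlock'])
        return cmd

    # e.g. nuke_command('/tmp/targets.yaml', 'user@host', save_logs=True) ->
    # ['teuthology-nuke', '-t', '/tmp/targets.yaml', '--owner', 'user@host',
    #  '--no-reboot', '--keep-logs']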