supervisor: kill processes before gathering logs
author Josh Durgin <jdurgin@redhat.com>
Tue, 9 Feb 2021 21:16:46 +0000 (21:16 +0000)
committer Josh Durgin <jdurgin@redhat.com>
Tue, 9 Feb 2021 21:16:51 +0000 (21:16 +0000)
When we hit the max job timeout, we need to stop the test programs
before collecting logs; otherwise we run into errors like 'file size
changed while zipping' when trying to compress them, and we can neither
save the logs nor stop the job.

Signed-off-by: Josh Durgin <jdurgin@redhat.com>
teuthology/dispatcher/supervisor.py
teuthology/kill.py
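
In outline, the timeout path in run_with_watchdog() becomes a three-step
shutdown. A condensed sketch of the patched flow (the job_info lookups are
simplified to local names; the error handling matches the diff below):

    try:
        # 1. stop the test processes, but keep logs, coredumps, and the
        #    machine locks so there is still something left to archive
        kill_job(name, job_id, archive_base, owner, save_logs=True)
    except Exception:
        log.exception('Failed to kill job')

    try:
        # 2. copy the now-quiescent logs off the still-locked test nodes
        transfer_archives(name, job_id, archive_base, job_config)
    except Exception:
        log.exception('Could not save logs')

    try:
        # 3. nuke for real: reboot, remove everything, unlock the machines
        kill_job(name, job_id, archive_base, owner)
    except Exception:
        log.exception('Failed to kill job and unlock machines')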

diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py
index ff728745f694c107cc9e527a55cd3339fe2fa04b..975448ad31f48061ea37282485b4813e6b267058 100644
--- a/teuthology/dispatcher/supervisor.py
+++ b/teuthology/dispatcher/supervisor.py
@@ -215,6 +215,15 @@ def run_with_watchdog(process, job_config):
         if total_seconds > teuth_config.max_job_time:
             log.warning("Job ran longer than {max}s. Killing...".format(
                 max=teuth_config.max_job_time))
+            try:
+                # kill processes but do not unlock yet so we can save
+                # the logs, coredumps, etc.
+                kill_job(job_info['name'], job_info['job_id'],
+                         teuth_config.archive_base, job_config['owner'],
+                         save_logs=True)
+            except Exception:
+                log.exception('Failed to kill job')
+
             try:
                 transfer_archives(job_info['name'], job_info['job_id'],
                                   teuth_config.archive_base, job_config)
@@ -222,10 +231,11 @@ def run_with_watchdog(process, job_config):
                 log.exception('Could not save logs')
 
             try:
+                # this time remove everything and unlock the machines
                 kill_job(job_info['name'], job_info['job_id'],
                          teuth_config.archive_base, job_config['owner'])
             except Exception:
-                log.exception('Failed to kill job')
+                log.exception('Failed to kill job and unlock machines')
 
         # calling this without a status just updates the jobs updated time
         report.try_push_job_info(job_info)
diff --git a/teuthology/kill.py b/teuthology/kill.py
index 872883347ddc1139a6da6d8abef60e6e7f855f18..c319638014ea57322561f797bbec4b1a9cf94c65 100755
--- a/teuthology/kill.py
+++ b/teuthology/kill.py
@@ -62,7 +62,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
         nuke_targets(targets, owner)
 
 
-def kill_job(run_name, job_id, archive_base=None, owner=None):
+def kill_job(run_name, job_id, archive_base=None, owner=None, save_logs=False):
     serializer = report.ResultsSerializer(archive_base)
     job_info = serializer.job_info(run_name, job_id)
     if not owner:
@@ -76,7 +76,7 @@ def kill_job(run_name, job_id, archive_base=None, owner=None):
     # the necessary nodes ain't locked yet, we do not use job_info to get them,
     # but use find_targets():
     targets = find_targets(run_name, owner, job_id)
-    nuke_targets(targets, owner)
+    nuke_targets(targets, owner, save_logs)
 
 
 def find_run_info(serializer, run_name):
@@ -214,7 +214,7 @@ def find_targets(run_name, owner, job_id=None):
     return out_obj
 
 
-def nuke_targets(targets_dict, owner):
+def nuke_targets(targets_dict, owner, save_logs=False):
     targets = targets_dict.get('targets')
     if not targets:
         log.info("No locked machines. Not nuking anything")
@@ -233,11 +233,14 @@ def nuke_targets(targets_dict, owner):
         'teuthology-nuke',
         '-t',
         target_file.name,
-        '--unlock',
-        '-r',
         '--owner',
         owner
     ]
+    if save_logs:
+        nuke_args.extend(['--no-reboot', '--keep-logs'])
+    else:
+        nuke_args.extend(['--reboot-all', '--unlock'])
+
     proc = subprocess.Popen(
         nuke_args,
         stdout=subprocess.PIPE,
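
For reference, a minimal sketch of the two teuthology-nuke command lines
nuke_targets() can now build (a hypothetical helper; only the flags come
from the change above, the target file and owner are illustrative):

    def nuke_command(target_file, owner, save_logs=False):
        cmd = ['teuthology-nuke', '-t', target_file, '--owner', owner]
        if save_logs:
            # first pass: stop test processes, keep logs and machine locks
            cmd.extend(['--no-reboot', '--keep-logs'])
        else:
            # final pass (the default): reboot everything and release locks
            cmd.extend(['--reboot-all', '--unlock'])
        return cmd

    # e.g. nuke_command('/tmp/targets.yaml', 'user@host', save_logs=True) ->
    # ['teuthology-nuke', '-t', '/tmp/targets.yaml', '--owner', 'user@host',
    #  '--no-reboot', '--keep-logs']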