git.apps.os.sepia.ceph.com Git - teuthology.git/commitdiff
supervisor: To preserve logs, delay nuking [1674/head]
author Zack Cerza <zack@redhat.com>
Wed, 15 Sep 2021 17:46:33 +0000 (11:46 -0600)
committer Zack Cerza <zack@redhat.com>
Wed, 15 Sep 2021 18:11:09 +0000 (12:11 -0600)
The previous behavior was causing machines to get nuked before any
attempt to fetch logs. If a machine took longer than 60s to become
available, collecting logs would fail. Since we also nuke after this
step, don't bother here.

Fixes: https://tracker.ceph.com/issues/51944
Signed-off-by: Zack Cerza <zack@redhat.com>
teuthology/dispatcher/supervisor.py
teuthology/kill.py

index 4ce9fa8531f44f9af7c242086b2d6b5e0784b946..8278fe1ed58e4267d7224852c2d16c87b3961727 100644 (file)
@@ -218,11 +218,11 @@ def run_with_watchdog(process, job_config):
             log.warning("Job ran longer than {max}s. Killing...".format(
                 max=teuth_config.max_job_time))
             try:
-                # kill processes but do not unlock yet so we can save
+                # kill processes but do not nuke yet so we can save
                 # the logs, coredumps, etc.
                 kill_job(job_info['name'], job_info['job_id'],
                          teuth_config.archive_base, job_config['owner'],
-                         save_logs=True)
+                         skip_nuke=True)
             except Exception:
                 log.exception('Failed to kill job')
 
index b8ae348cf4d62059dd9be61dfa4fbc46928e47e1..770bf8eafa0c5821fd7599da32f599f775d1f598 100755 (executable)
@@ -67,7 +67,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
         nuke_targets(targets, owner)
 
 
-def kill_job(run_name, job_id, archive_base=None, owner=None, save_logs=False):
+def kill_job(run_name, job_id, archive_base=None, owner=None, skip_nuke=False):
     serializer = report.ResultsSerializer(archive_base)
     job_info = serializer.job_info(run_name, job_id)
     if not owner:
@@ -81,7 +81,8 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, save_logs=False):
     # the necessary nodes ain't locked yet, we do not use job_info to get them,
     # but use find_targets():
     targets = find_targets(run_name, owner, job_id)
-    nuke_targets(targets, owner, save_logs)
+    if not skip_nuke:
+        nuke_targets(targets, owner)
 
 
 def find_run_info(serializer, run_name):
@@ -219,7 +220,7 @@ def find_targets(run_name, owner, job_id=None):
     return out_obj
 
 
-def nuke_targets(targets_dict, owner, save_logs=False):
+def nuke_targets(targets_dict, owner):
     targets = targets_dict.get('targets')
     if not targets:
         log.info("No locked machines. Not nuking anything")
@@ -241,10 +242,7 @@ def nuke_targets(targets_dict, owner, save_logs=False):
         '--owner',
         owner
     ]
-    if save_logs:
-        nuke_args.extend(['--no-reboot', '--keep-logs'])
-    else:
-        nuke_args.extend(['--reboot-all', '--unlock'])
+    nuke_args.extend(['--reboot-all', '--unlock'])
 
     proc = subprocess.Popen(
         nuke_args,