git.apps.os.sepia.ceph.com Git - teuthology.git/commitdiff
supervisor: send paddles the reason a job is marked dead 1610/head
author Josh Durgin <jdurgin@redhat.com>
Tue, 9 Feb 2021 21:33:34 +0000 (21:33 +0000)
committer Josh Durgin <jdurgin@redhat.com>
Tue, 9 Feb 2021 21:33:34 +0000 (21:33 +0000)
Signed-off-by: Josh Durgin <jdurgin@redhat.com>
teuthology/dispatcher/supervisor.py

index 975448ad31f48061ea37282485b4813e6b267058..4ce9fa8531f44f9af7c242086b2d6b5e0784b946 100644 (file)
@@ -161,10 +161,10 @@ def reimage(job_config):
     targets = job_config['targets']
     try:
         reimaged = reimage_machines(ctx, targets, job_config['machine_type'])
-    except Exception:
-        log.info('Reimaging error. Nuking machines...')
+    except Exception as e:
+        log.exception('Reimaging error. Nuking machines...')
         # Reimage failures should map to the 'dead' status instead of 'fail'
-        report.try_push_job_info(ctx.config, dict(status='dead'))
+        report.try_push_job_info(ctx.config, dict(status='dead', failure_reason='Error reimaging machines: ' + str(e)))
         nuke(ctx, True)
         raise
     ctx.config['targets'] = reimaged
@@ -208,11 +208,13 @@ def run_with_watchdog(process, job_config):
 
     # Sleep once outside of the loop to avoid double-posting jobs
     time.sleep(teuth_config.watchdog_interval)
+    hit_max_timeout = False
     while process.poll() is None:
         # Kill jobs that have been running longer than the global max
         run_time = datetime.utcnow() - job_start_time
         total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds
         if total_seconds > teuth_config.max_job_time:
+            hit_max_timeout = True
             log.warning("Job ran longer than {max}s. Killing...".format(
                 max=teuth_config.max_job_time))
             try:
@@ -249,7 +251,10 @@ def run_with_watchdog(process, job_config):
     # the status, but if it was a pass or fail it will have already been
     # reported to paddles. In that case paddles ignores the 'dead' status.
     # If the job was killed, paddles will use the 'dead' status.
-    report.try_push_job_info(job_info, dict(status='dead'))
+    extra_info = dict(status='dead')
+    if hit_max_timeout:
+        extra_info['failure_reason'] = 'hit max job timeout'
+    report.try_push_job_info(job_info, extra_info)
 
 
 def create_fake_context(job_config, block=False):