From 19e2ada65accbd2d5cb18687eb196da6be0143a6 Mon Sep 17 00:00:00 2001
From: Josh Durgin <jdurgin@redhat.com>
Date: Tue, 9 Feb 2021 21:33:34 +0000
Subject: [PATCH] supervisor: send paddles the reason a job is marked dead

Signed-off-by: Josh Durgin <jdurgin@redhat.com>
---
 teuthology/dispatcher/supervisor.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py
index 975448ad3..4ce9fa853 100644
--- a/teuthology/dispatcher/supervisor.py
+++ b/teuthology/dispatcher/supervisor.py
@@ -161,10 +161,10 @@ def reimage(job_config):
     targets = job_config['targets']
     try:
         reimaged = reimage_machines(ctx, targets, job_config['machine_type'])
-    except Exception:
-        log.info('Reimaging error. Nuking machines...')
+    except Exception as e:
+        log.exception('Reimaging error. Nuking machines...')
         # Reimage failures should map to the 'dead' status instead of 'fail'
-        report.try_push_job_info(ctx.config, dict(status='dead'))
+        report.try_push_job_info(ctx.config, dict(status='dead', failure_reason='Error reimaging machines: ' + str(e)))
         nuke(ctx, True)
         raise
     ctx.config['targets'] = reimaged
@@ -208,11 +208,13 @@ def run_with_watchdog(process, job_config):
 
     # Sleep once outside of the loop to avoid double-posting jobs
     time.sleep(teuth_config.watchdog_interval)
+    hit_max_timeout = False
     while process.poll() is None:
         # Kill jobs that have been running longer than the global max
         run_time = datetime.utcnow() - job_start_time
         total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds
         if total_seconds > teuth_config.max_job_time:
+            hit_max_timeout = True
             log.warning("Job ran longer than {max}s. Killing...".format(
                 max=teuth_config.max_job_time))
             try:
@@ -249,7 +251,10 @@ def run_with_watchdog(process, job_config):
     # the status, but if it was a pass or fail it will have already been
     # reported to paddles. In that case paddles ignores the 'dead' status.
     # If the job was killed, paddles will use the 'dead' status.
-    report.try_push_job_info(job_info, dict(status='dead'))
+    extra_info = dict(status='dead')
+    if hit_max_timeout:
+        extra_info['failure_reason'] = 'hit max job timeout'
+    report.try_push_job_info(job_info, extra_info)
 
 
 def create_fake_context(job_config, block=False):
-- 
2.47.3