supervisor: send paddles the reason a job is marked dead

author Josh Durgin <jdurgin@redhat.com>

Tue, 9 Feb 2021 21:33:34 +0000 (21:33 +0000)

committer Josh Durgin <jdurgin@redhat.com>

Tue, 9 Feb 2021 21:33:34 +0000 (21:33 +0000)
author Josh Durgin <jdurgin@redhat.com>
Tue, 9 Feb 2021 21:33:34 +0000 (21:33 +0000)
committer Josh Durgin <jdurgin@redhat.com>
Tue, 9 Feb 2021 21:33:34 +0000 (21:33 +0000)
diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py

index 975448ad31f48061ea37282485b4813e6b267058..4ce9fa8531f44f9af7c242086b2d6b5e0784b946 100644 (file)
--- a/teuthology/dispatcher/supervisor.py
+++ b/teuthology/dispatcher/supervisor.py
@@ -161,10 +161,10 @@ def reimage(job_config):
      targets = job_config['targets']
      try:
          reimaged = reimage_machines(ctx, targets, job_config['machine_type'])
-    except Exception:
-        log.info('Reimaging error. Nuking machines...')
+    except Exception as e:
+        log.exception('Reimaging error. Nuking machines...')
          # Reimage failures should map to the 'dead' status instead of 'fail'
-        report.try_push_job_info(ctx.config, dict(status='dead'))
+        report.try_push_job_info(ctx.config, dict(status='dead', failure_reason='Error reimaging machines: ' + str(e)))
          nuke(ctx, True)
          raise
      ctx.config['targets'] = reimaged
@@ -208,11 +208,13 @@ def run_with_watchdog(process, job_config):
  
      # Sleep once outside of the loop to avoid double-posting jobs
      time.sleep(teuth_config.watchdog_interval)
+    hit_max_timeout = False
      while process.poll() is None:
          # Kill jobs that have been running longer than the global max
          run_time = datetime.utcnow() - job_start_time
          total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds
          if total_seconds > teuth_config.max_job_time:
+            hit_max_timeout = True
              log.warning("Job ran longer than {max}s. Killing...".format(
                  max=teuth_config.max_job_time))
              try:
@@ -249,7 +251,10 @@ def run_with_watchdog(process, job_config):
      # the status, but if it was a pass or fail it will have already been
      # reported to paddles. In that case paddles ignores the 'dead' status.
      # If the job was killed, paddles will use the 'dead' status.
-    report.try_push_job_info(job_info, dict(status='dead'))
+    extra_info = dict(status='dead')
+    if hit_max_timeout:
+        extra_info['failure_reason'] = 'hit max job timeout'
+    report.try_push_job_info(job_info, extra_info)
  
  
  def create_fake_context(job_config, block=False):
author	Josh Durgin <jdurgin@redhat.com>
	Tue, 9 Feb 2021 21:33:34 +0000 (21:33 +0000)
committer	Josh Durgin <jdurgin@redhat.com>
	Tue, 9 Feb 2021 21:33:34 +0000 (21:33 +0000)