targets = job_config['targets']
try:
reimaged = reimage_machines(ctx, targets, job_config['machine_type'])
- except Exception:
- log.info('Reimaging error. Nuking machines...')
+ except Exception as e:
+ log.exception('Reimaging error. Nuking machines...')
# Reimage failures should map to the 'dead' status instead of 'fail'
- report.try_push_job_info(ctx.config, dict(status='dead'))
+ report.try_push_job_info(ctx.config, dict(status='dead', failure_reason='Error reimaging machines: ' + str(e)))
nuke(ctx, True)
raise
ctx.config['targets'] = reimaged
# Sleep once outside of the loop to avoid double-posting jobs
time.sleep(teuth_config.watchdog_interval)
+ hit_max_timeout = False
while process.poll() is None:
# Kill jobs that have been running longer than the global max
run_time = datetime.utcnow() - job_start_time
total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds
if total_seconds > teuth_config.max_job_time:
+ hit_max_timeout = True
log.warning("Job ran longer than {max}s. Killing...".format(
max=teuth_config.max_job_time))
try:
# the status, but if it was a pass or fail it will have already been
# reported to paddles. In that case paddles ignores the 'dead' status.
# If the job was killed, paddles will use the 'dead' status.
- report.try_push_job_info(job_info, dict(status='dead'))
+ extra_info = dict(status='dead')
+ if hit_max_timeout:
+ extra_info['failure_reason'] = 'hit max job timeout'
+ report.try_push_job_info(job_info, extra_info)
def create_fake_context(job_config, block=False):