From 856f83449cae8a45c82251e7281f63263772aa48 Mon Sep 17 00:00:00 2001
From: Zack Cerza
Date: Thu, 5 Dec 2013 17:37:25 -0600
Subject: [PATCH] Implement a watchdog for queued jobs

This continually posts the run's status to the results server, if
configured, at an interval defaulting to 600 seconds.

Between posts the watchdog checks on the child once a second, so the
worker can move on as soon as the job exits instead of sleeping through
the rest of the interval.

Signed-off-by: Zack Cerza
---
Review notes (ignored by git am):
 - This patch was amended by hand during review; the blob ids on the
   "index" lines still refer to the original revision.
 - NOTE(review): run_job() reads p.stderr to EOF before it calls
   run_with_watchdog(); confirm that the watchdog actually starts while
   the job is still running.

 teuthology/config.py |  1 +
 teuthology/queue.py  | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/teuthology/config.py b/teuthology/config.py
index 999feed600..635571d4fb 100644
--- a/teuthology/config.py
+++ b/teuthology/config.py
@@ -16,6 +16,7 @@ class Config(object):
         'ceph_git_base_url': 'https://github.com/ceph/',
         'lock_server': 'http://teuthology.front.sepia.ceph.com/locker/lock',
         'verify_host_keys': True,
+        'watchdog_interval': 600,
     }
 
     def __init__(self):
diff --git a/teuthology/queue.py b/teuthology/queue.py
index 26d9e830be..d2da9b4770 100644
--- a/teuthology/queue.py
+++ b/teuthology/queue.py
@@ -10,6 +10,7 @@ import yaml
 
 import beanstalkc
 
+from . import report
 from . import safepath
 from .config import config as teuth_config
 from .misc import read_config
@@ -179,6 +180,45 @@ def worker(ctx):
         job.delete()
 
 
+def run_with_watchdog(process, job_config):
+    """
+    Report a running job's status to the results server until it exits.
+
+    While the child is alive, a 'running' status is pushed once per
+    teuth_config.watchdog_interval seconds. Once the child has exited, a
+    final 'dead' status is pushed.
+
+    :param process:    The job's child process; only its poll() method is
+                       used here.
+    :param job_config: The job's config dict; must contain 'name' and
+                       'job_id'.
+    """
+    # Only push the information that's relevant to the watchdog, to save db
+    # load
+    job_info = dict(
+        name=job_config['name'],
+        job_id=job_config['job_id'],
+    )
+    # Check on the child this often (in seconds) between status pushes, so
+    # that the worker isn't held up for a full watchdog interval after the
+    # job has already exited.
+    poll_interval = 1
+
+    while process.poll() is None:
+        report.try_push_job_info(job_info, dict(status='running'))
+        remaining = teuth_config.watchdog_interval
+        while remaining > 0 and process.poll() is None:
+            nap = min(remaining, poll_interval)
+            time.sleep(nap)
+            remaining -= nap
+
+    # The job finished. We don't know the status, but if it was a pass or fail
+    # it will have already been reported to paddles. In that case paddles
+    # ignores the 'dead' status. If the job was killed, paddles will use the
+    # 'dead' status.
+    report.try_push_job_info(job_info, dict(status='dead'))
+
+
 def run_job(job_config, teuth_bin_path):
     arg = [
         os.path.join(teuth_bin_path, 'teuthology'),
@@ -221,7 +261,12 @@ def run_job(job_config, teuth_bin_path):
         child = logging.getLogger(__name__ + '.child')
         for line in p.stderr:
             child.error(': %s', line.rstrip('\n'))
-        p.wait()
+
+        if teuth_config.results_server:
+            run_with_watchdog(p, job_config)
+        else:
+            p.wait()
+
         if p.returncode != 0:
             log.error('Child exited with code %d', p.returncode)
         else:
-- 
2.39.5