From: Zack Cerza
Date: Thu, 16 Jan 2014 16:38:39 +0000 (-0600)
Subject: Kill jobs that run for over 3 days (configurable)
X-Git-Tag: 1.1.0~1695
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=769ef8a9600d43ebc203a52156ab7a61fa6b5835;p=teuthology.git

Kill jobs that run for over 3 days (configurable)

Signed-off-by: Zack Cerza
---

diff --git a/teuthology/config.py b/teuthology/config.py
index e5c3518d9..017bb6789 100644
--- a/teuthology/config.py
+++ b/teuthology/config.py
@@ -16,6 +16,7 @@ class Config(object):
         'archive_base': '/var/lib/teuthworker/archive',
         'ceph_git_base_url': 'https://github.com/ceph/',
         'lock_server': 'http://teuthology.front.sepia.ceph.com/locker/lock',
+        'max_job_time': 259200,  # 3 days
         'verify_host_keys': True,
         'watchdog_interval': 600,
     }
diff --git a/teuthology/queue.py b/teuthology/queue.py
index f93264b80..9a6b1f833 100644
--- a/teuthology/queue.py
+++ b/teuthology/queue.py
@@ -14,6 +14,7 @@ from datetime import datetime
 from . import report
 from . import safepath
 from .config import config as teuth_config
+from .kill import kill_job
 from .misc import read_config
 
 log = logging.getLogger(__name__)
@@ -142,6 +143,8 @@ def worker(ctx):
                 prog=os.path.basename(sys.argv[0]),
                 path=ctx.archive_dir,
             ))
+    else:
+        teuth_config.archive_base = ctx.archive_dir
 
     read_config(ctx)
 
@@ -207,6 +210,8 @@ def worker(ctx):
 
 
 def run_with_watchdog(process, job_config):
+    job_start_time = datetime.utcnow()
+
     # Only push the information that's relevant to the watchdog, to save db
     # load
     job_info = dict(
@@ -217,6 +222,15 @@ def run_with_watchdog(process, job_config):
     # Sleep once outside of the loop to avoid double-posting jobs
     time.sleep(teuth_config.watchdog_interval)
     while process.poll() is None:
+        # Kill jobs that have been running longer than the global max.
+        # Use total_seconds(): timedelta.seconds excludes whole days and
+        # never exceeds 86399, so it could never pass a multi-day limit.
+        job_run_time = datetime.utcnow() - job_start_time
+        if job_run_time.total_seconds() > teuth_config.max_job_time:
+            kill_job(job_info['name'], job_info['job_id'],
+                     teuth_config.archive_base)
+            break
+
         report.try_push_job_info(job_info, dict(status='running'))
         time.sleep(teuth_config.watchdog_interval)
 