git.apps.os.sepia.ceph.com Git - teuthology.git/commitdiff
Kill jobs that run for over 3 days (configurable)
author Zack Cerza <zack@cerza.org>
Thu, 16 Jan 2014 16:38:39 +0000 (10:38 -0600)
committer Zack Cerza <zack@cerza.org>
Thu, 16 Jan 2014 16:38:39 +0000 (10:38 -0600)
Signed-off-by: Zack Cerza <zack.cerza@inktank.com>
teuthology/config.py
teuthology/queue.py

index e5c3518d9b8bff325d26a0837f63fd80fba9cea5..017bb678915fc3c6f88f11e889f8c5866c18c06f 100644 (file)
@@ -16,6 +16,7 @@ class Config(object):
         'archive_base': '/var/lib/teuthworker/archive',
         'ceph_git_base_url': 'https://github.com/ceph/',
         'lock_server': 'http://teuthology.front.sepia.ceph.com/locker/lock',
+        'max_job_time': 259200,  # 3 days
         'verify_host_keys': True,
         'watchdog_interval': 600,
     }
index f93264b8060aa2ce0fa1c94a1cd1f30b5eeb0b6c..9a6b1f83326f94e8f37f8f687cff7bfc8e17afa2 100644 (file)
@@ -14,6 +14,7 @@ from datetime import datetime
 from . import report
 from . import safepath
 from .config import config as teuth_config
+from .kill import kill_job
 from .misc import read_config
 
 log = logging.getLogger(__name__)
@@ -142,6 +143,8 @@ def worker(ctx):
             prog=os.path.basename(sys.argv[0]),
             path=ctx.archive_dir,
         ))
+    else:
+        teuth_config.archive_base = ctx.archive_dir
 
     read_config(ctx)
 
@@ -207,6 +210,8 @@ def worker(ctx):
 
 
 def run_with_watchdog(process, job_config):
+    job_start_time = datetime.utcnow()
+
     # Only push the information that's relevant to the watchdog, to save db
     # load
     job_info = dict(
@@ -217,6 +222,13 @@ def run_with_watchdog(process, job_config):
     # Sleep once outside of the loop to avoid double-posting jobs
     time.sleep(teuth_config.watchdog_interval)
     while process.poll() is None:
+        # Kill jobs that have been running longer than the global max
+        job_run_time = datetime.utcnow() - job_start_time
+        if job_run_time.seconds > teuth_config.max_job_time:
+            kill_job(job_info['name'], job_info['job_id'],
+                     teuth_config.archive_base)
+            break
+
         report.try_push_job_info(job_info, dict(status='running'))
         time.sleep(teuth_config.watchdog_interval)