git.apps.os.sepia.ceph.com Git - teuthology.git/commitdiff
Implement a watchdog for queued jobs 157/head
author Zack Cerza <zack@cerza.org>
Thu, 5 Dec 2013 23:37:25 +0000 (17:37 -0600)
committer Zack Cerza <zack@cerza.org>
Thu, 5 Dec 2013 23:48:10 +0000 (17:48 -0600)
This continually posts the run's status to the results server, if
configured, at an interval defaulting to 600 seconds.

Signed-off-by: Zack Cerza <zack.cerza@inktank.com>
teuthology/config.py
teuthology/queue.py

index 999feed60095edcd962fe09bd20b4f23c7a56de5..635571d4fb2b457e7704230d48337e27911ff553 100644 (file)
@@ -16,6 +16,7 @@ class Config(object):
         'ceph_git_base_url': 'https://github.com/ceph/',
         'lock_server': 'http://teuthology.front.sepia.ceph.com/locker/lock',
         'verify_host_keys': True,
+        'watchdog_interval': 600,
     }
 
     def __init__(self):
index 26d9e830becf661ef14a01021dc90900f925cc82..d2da9b477087dcb6ae29e9b70b4fc4a68ce917cc 100644 (file)
@@ -10,6 +10,7 @@ import yaml
 
 import beanstalkc
 
+from . import report
 from . import safepath
 from .config import config as teuth_config
 from .misc import read_config
@@ -179,6 +180,25 @@ def worker(ctx):
         job.delete()
 
 
+def run_with_watchdog(process, job_config):
+    # Only push the information that's relevant to the watchdog, to save db
+    # load
+    job_info = dict(
+        name=job_config['name'],
+        job_id=job_config['job_id'],
+    )
+
+    while process.poll() is None:
+        report.try_push_job_info(job_info, dict(status='running'))
+        time.sleep(teuth_config.watchdog_interval)
+
+    # The job finished. We don't know the status, but if it was a pass or fail
+    # it will have already been reported to paddles. In that case paddles
+    # ignores the 'dead' status. If the job was killed, paddles will use the
+    # 'dead' status.
+    report.try_push_job_info(job_info, dict(status='dead'))
+
+
 def run_job(job_config, teuth_bin_path):
     arg = [
         os.path.join(teuth_bin_path, 'teuthology'),
@@ -221,7 +241,12 @@ def run_job(job_config, teuth_bin_path):
         child = logging.getLogger(__name__ + '.child')
         for line in p.stderr:
             child.error(': %s', line.rstrip('\n'))
-        p.wait()
+
+        if teuth_config.results_server:
+            run_with_watchdog(p, job_config)
+        else:
+            p.wait()
+
         if p.returncode != 0:
             log.error('Child exited with code %d', p.returncode)
         else: