git-server-git.apps.pok.os.sepia.ceph.com Git - teuthology.git/commitdiff
teuthology-exporter: Report job status
authorZack Cerza <zack@redhat.com>
Tue, 7 Mar 2023 18:48:22 +0000 (11:48 -0700)
committerZack Cerza <zack@redhat.com>
Wed, 8 Mar 2023 20:30:29 +0000 (13:30 -0700)
Signed-off-by: Zack Cerza <zack@redhat.com>
teuthology/dispatcher/supervisor.py
teuthology/exporter.py
teuthology/kill.py
teuthology/report.py

index bf1c02528724fe14cd02b4874a67291b011e2e4d..e5ea4a3205b0595554e686a6cd6b4cac80fed33b 100644 (file)
@@ -316,7 +316,8 @@ def run_with_watchdog(process, job_config):
     extra_info = dict(status='dead')
     if hit_max_timeout:
         extra_info['failure_reason'] = 'hit max job timeout'
-    report.try_push_job_info(job_info, extra_info)
+    if not (job_config.get('first_in_suite') or job_config.get('last_in_suite')):
+        report.try_push_job_info(job_info, extra_info)
 
 
 def create_fake_context(job_config, block=False):
index 5a7dfea72693ae9ed6c225825a22a6dc7c96e115..d76b65c4fed1ec99f91425f23f4277f31a86540a 100644 (file)
@@ -1,21 +1,35 @@
 import itertools
 import logging
+import os
 import psutil
 import time
 
-from prometheus_client import (
-    start_http_server,
-    Gauge,
-)
+from pathlib import Path
 
 import teuthology.beanstalk as beanstalk
 import teuthology.dispatcher
 from teuthology.config import config
 from teuthology.lock.query import list_locks
 
-
 log = logging.getLogger(__name__)
 
+
+PROMETHEUS_MULTIPROC_DIR = Path("~/.cache/teuthology-exporter").expanduser()
+PROMETHEUS_MULTIPROC_DIR.mkdir(parents=True, exist_ok=True)
+os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(PROMETHEUS_MULTIPROC_DIR)
+
+# We can't import prometheus_client until after we set PROMETHEUS_MULTIPROC_DIR
+from prometheus_client import (  # noqa: E402
+    start_http_server,
+    Gauge,
+    Counter,
+    multiprocess,
+    CollectorRegistry,
+)
+
+registry = CollectorRegistry()
+multiprocess.MultiProcessCollector(registry)
+
 MACHINE_TYPES = list(config.active_machine_types)
 
 
@@ -23,6 +37,8 @@ class TeuthologyExporter:
     port = 61764  # int(''.join([str((ord(c) - 100) % 10) for c in "teuth"]))
 
     def __init__(self, interval=60):
+        for file in PROMETHEUS_MULTIPROC_DIR.iterdir():
+            file.unlink()
         self.interval = interval
         self.metrics = [
             Dispatchers(),
@@ -32,7 +48,7 @@ class TeuthologyExporter:
         ]
 
     def start(self):
-        start_http_server(self.port)
+        start_http_server(self.port, registry=registry)
         self.loop()
 
     def update(self):
@@ -154,6 +170,19 @@ class Nodes(TeuthologyMetric):
                 )
 
 
+class JobResults(TeuthologyMetric):
+    def __init__(self):
+        self.metric = Counter(
+            "teuthology_job_results",
+            "Teuthology Job Results",
+            ["machine_type", "status"],
+        )
+
+    # As this is to be used within job processes, we implement record() rather than update()
+    def record(self, machine_type, status):
+        self.metric.labels(machine_type=machine_type, status=status).inc()
+
+
 def main(args):
     exporter = TeuthologyExporter(interval=int(args["--interval"]))
     exporter.start()
index 5af11b628c95b174e76548ddcef42fae754109db..a51e0fda35c17b19dce40d5f4293b60fc89f3faa 100755 (executable)
@@ -9,6 +9,8 @@ import logging
 import getpass
 
 
+import teuthology.exporter
+
 from teuthology import beanstalk
 from teuthology import report
 from teuthology.config import config
@@ -84,6 +86,10 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_nuke=False):
                 "Please pass --owner <owner>.")
         owner = job_info['owner']
     kill_processes(run_name, [job_info.get('pid')])
+    if 'machine_type' in job_info:
+        teuthology.exporter.JobResults().record(job_info["machine_type"], job_info["status"])
+    else:
+        log.warn(f"Job {job_id} has no machine_type; cannot report via Prometheus")
     # Because targets can be missing for some cases, for example, when all
     # the necessary nodes ain't locked yet, we do not use job_info to get them,
     # but use find_targets():
index edf01cfdb082632c7b1901c6b847577472e13a6c..3b4247e517f452df0c0debc6779ae5bbcbb190d0 100644 (file)
@@ -9,6 +9,7 @@ import socket
 from datetime import datetime
 
 import teuthology
+import teuthology.exporter
 from teuthology.config import config
 from teuthology.contextutil import safe_while
 from teuthology.job_status import get_status, set_status
@@ -471,6 +472,9 @@ def push_job_info(run_name, job_id, job_info, base_uri=None):
     if not reporter.base_uri:
         return
     reporter.report_job(run_name, job_id, job_info)
+    status = get_status(job_info)
+    if status in ["pass", "fail", "dead"] and "machine_type" in job_info:
+        teuthology.exporter.JobResults().record(job_info["machine_type"], status)
 
 
 def try_push_job_info(job_config, extra_info=None):
@@ -579,6 +583,8 @@ def try_mark_run_dead(run_name):
             try:
                 log.info("Marking job {job_id} as dead".format(job_id=job_id))
                 reporter.report_job(run_name, job['job_id'], dead=True)
+                if "machine_type" in job:
+                    teuthology.exporter.JobResults().record(job["machine_type"], job["status"])
             except report_exceptions:
                 log.exception("Could not mark job as dead: {job_id}".format(
                     job_id=job_id))