git-server-git.apps.pok.os.sepia.ceph.com Git - teuthology.git/commitdiff
teuthology-exporter: Report job status
authorZack Cerza <zack@redhat.com>
Tue, 7 Mar 2023 18:48:22 +0000 (11:48 -0700)
committerZack Cerza <zack@redhat.com>
Wed, 8 Mar 2023 20:30:29 +0000 (13:30 -0700)
Signed-off-by: Zack Cerza <zack@redhat.com>
teuthology/dispatcher/supervisor.py
teuthology/exporter.py
teuthology/kill.py
teuthology/report.py

index bf1c02528724fe14cd02b4874a67291b011e2e4d..e5ea4a3205b0595554e686a6cd6b4cac80fed33b 100644 (file)
@@ -316,7 +316,8 @@ def run_with_watchdog(process, job_config):
     extra_info = dict(status='dead')
     if hit_max_timeout:
         extra_info['failure_reason'] = 'hit max job timeout'
-    report.try_push_job_info(job_info, extra_info)
+    if not (job_config.get('first_in_suite') or job_config.get('last_in_suite')):
+        report.try_push_job_info(job_info, extra_info)
 
 
 def create_fake_context(job_config, block=False):
index 5a7dfea72693ae9ed6c225825a22a6dc7c96e115..d76b65c4fed1ec99f91425f23f4277f31a86540a 100644 (file)
@@ -1,21 +1,35 @@
 import itertools
 import logging
+import os
 import psutil
 import time
 
-from prometheus_client import (
-    start_http_server,
-    Gauge,
-)
+from pathlib import Path
 
 import teuthology.beanstalk as beanstalk
 import teuthology.dispatcher
 from teuthology.config import config
 from teuthology.lock.query import list_locks
 
-
 log = logging.getLogger(__name__)
 
+
+PROMETHEUS_MULTIPROC_DIR = Path("~/.cache/teuthology-exporter").expanduser()
+PROMETHEUS_MULTIPROC_DIR.mkdir(parents=True, exist_ok=True)
+os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(PROMETHEUS_MULTIPROC_DIR)
+
+# We can't import prometheus_client until after we set PROMETHEUS_MULTIPROC_DIR
+from prometheus_client import (  # noqa: E402
+    start_http_server,
+    Gauge,
+    Counter,
+    multiprocess,
+    CollectorRegistry,
+)
+
+registry = CollectorRegistry()
+multiprocess.MultiProcessCollector(registry)
+
 MACHINE_TYPES = list(config.active_machine_types)
 
 
@@ -23,6 +37,8 @@ class TeuthologyExporter:
     port = 61764  # int(''.join([str((ord(c) - 100) % 10) for c in "teuth"]))
 
     def __init__(self, interval=60):
+        for file in PROMETHEUS_MULTIPROC_DIR.iterdir():
+            file.unlink()
         self.interval = interval
         self.metrics = [
             Dispatchers(),
@@ -32,7 +48,7 @@ class TeuthologyExporter:
         ]
 
     def start(self):
-        start_http_server(self.port)
+        start_http_server(self.port, registry=registry)
         self.loop()
 
     def update(self):
@@ -154,6 +170,19 @@ class Nodes(TeuthologyMetric):
                 )
 
 
+class JobResults(TeuthologyMetric):
+    def __init__(self):
+        self.metric = Counter(
+            "teuthology_job_results",
+            "Teuthology Job Results",
+            ["machine_type", "status"],
+        )
+
+    # As this is to be used within job processes, we implement record() rather than update()
+    def record(self, machine_type, status):
+        self.metric.labels(machine_type=machine_type, status=status).inc()
+
+
 def main(args):
     exporter = TeuthologyExporter(interval=int(args["--interval"]))
     exporter.start()
index 5af11b628c95b174e76548ddcef42fae754109db..a51e0fda35c17b19dce40d5f4293b60fc89f3faa 100755 (executable)
@@ -9,6 +9,8 @@ import logging
 import getpass
 
 
+import teuthology.exporter
+
 from teuthology import beanstalk
 from teuthology import report
 from teuthology.config import config
@@ -84,6 +86,10 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_nuke=False):
                 "Please pass --owner <owner>.")
         owner = job_info['owner']
     kill_processes(run_name, [job_info.get('pid')])
+    if 'machine_type' in job_info:
+        teuthology.exporter.JobResults().record(job_info["machine_type"], job_info["status"])
+    else:
+        log.warn(f"Job {job_id} has no machine_type; cannot report via Prometheus")
     # Because targets can be missing for some cases, for example, when all
     # the necessary nodes ain't locked yet, we do not use job_info to get them,
     # but use find_targets():
index edf01cfdb082632c7b1901c6b847577472e13a6c..3b4247e517f452df0c0debc6779ae5bbcbb190d0 100644 (file)
@@ -9,6 +9,7 @@ import socket
 from datetime import datetime
 
 import teuthology
+import teuthology.exporter
 from teuthology.config import config
 from teuthology.contextutil import safe_while
 from teuthology.job_status import get_status, set_status
@@ -471,6 +472,9 @@ def push_job_info(run_name, job_id, job_info, base_uri=None):
     if not reporter.base_uri:
         return
     reporter.report_job(run_name, job_id, job_info)
+    status = get_status(job_info)
+    if status in ["pass", "fail", "dead"] and "machine_type" in job_info:
+        teuthology.exporter.JobResults().record(job_info["machine_type"], status)
 
 
 def try_push_job_info(job_config, extra_info=None):
@@ -579,6 +583,8 @@ def try_mark_run_dead(run_name):
             try:
                 log.info("Marking job {job_id} as dead".format(job_id=job_id))
                 reporter.report_job(run_name, job['job_id'], dead=True)
+                if "machine_type" in job:
+                    teuthology.exporter.JobResults().record(job["machine_type"], job["status"])
             except report_exceptions:
                 log.exception("Could not mark job as dead: {job_id}".format(
                     job_id=job_id))