extra_info = dict(status='dead')
if hit_max_timeout:
extra_info['failure_reason'] = 'hit max job timeout'
- report.try_push_job_info(job_info, extra_info)
+ if not (job_config.get('first_in_suite') or job_config.get('last_in_suite')):
+ report.try_push_job_info(job_info, extra_info)
def create_fake_context(job_config, block=False):
import itertools
import logging
+import os
import psutil
import time
-from prometheus_client import (
- start_http_server,
- Gauge,
-)
+from pathlib import Path
import teuthology.beanstalk as beanstalk
import teuthology.dispatcher
from teuthology.config import config
from teuthology.lock.query import list_locks
-
log = logging.getLogger(__name__)
+
+PROMETHEUS_MULTIPROC_DIR = Path("~/.cache/teuthology-exporter").expanduser()
+PROMETHEUS_MULTIPROC_DIR.mkdir(parents=True, exist_ok=True)
+os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(PROMETHEUS_MULTIPROC_DIR)
+
+# We can't import prometheus_client until after we set PROMETHEUS_MULTIPROC_DIR
+from prometheus_client import ( # noqa: E402
+ start_http_server,
+ Gauge,
+ Counter,
+ multiprocess,
+ CollectorRegistry,
+)
+
+registry = CollectorRegistry()
+multiprocess.MultiProcessCollector(registry)
+
MACHINE_TYPES = list(config.active_machine_types)
port = 61764 # int(''.join([str((ord(c) - 100) % 10) for c in "teuth"]))
def __init__(self, interval=60):
+ for file in PROMETHEUS_MULTIPROC_DIR.iterdir():
+ file.unlink()
self.interval = interval
self.metrics = [
Dispatchers(),
]
def start(self):
- start_http_server(self.port)
+ start_http_server(self.port, registry=registry)
self.loop()
def update(self):
)
+class JobResults(TeuthologyMetric):
+    # Singleton: prometheus_client registers each Counter with the registry
+    # at construction time, so instantiating this class more than once in the
+    # same process would raise "Duplicated timeseries in CollectorRegistry".
+    # Callers do `JobResults().record(...)` repeatedly; cache the instance.
+    _instance = None
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self):
+        # __init__ runs on every JobResults() call even though __new__
+        # returns the cached instance; only create the Counter once.
+        if not hasattr(self, "metric"):
+            self.metric = Counter(
+                "teuthology_job_results",
+                "Teuthology Job Results",
+                ["machine_type", "status"],
+            )
+
+    # As this is to be used within job processes, we implement record() rather than update()
+    def record(self, machine_type, status):
+        self.metric.labels(machine_type=machine_type, status=status).inc()
+
+
def main(args):
exporter = TeuthologyExporter(interval=int(args["--interval"]))
exporter.start()
import getpass
+import teuthology.exporter
+
from teuthology import beanstalk
from teuthology import report
from teuthology.config import config
"Please pass --owner <owner>.")
owner = job_info['owner']
kill_processes(run_name, [job_info.get('pid')])
+ if 'machine_type' in job_info:
+ teuthology.exporter.JobResults().record(job_info["machine_type"], job_info["status"])
+ else:
+        log.warning(f"Job {job_id} has no machine_type; cannot report via Prometheus")
# Because targets can be missing for some cases, for example, when all
# the necessary nodes ain't locked yet, we do not use job_info to get them,
# but use find_targets():
from datetime import datetime
import teuthology
+import teuthology.exporter
from teuthology.config import config
from teuthology.contextutil import safe_while
from teuthology.job_status import get_status, set_status
if not reporter.base_uri:
return
reporter.report_job(run_name, job_id, job_info)
+ status = get_status(job_info)
+ if status in ["pass", "fail", "dead"] and "machine_type" in job_info:
+ teuthology.exporter.JobResults().record(job_info["machine_type"], status)
def try_push_job_info(job_config, extra_info=None):
try:
log.info("Marking job {job_id} as dead".format(job_id=job_id))
reporter.report_job(run_name, job['job_id'], dead=True)
+            if "machine_type" in job:
+                # The job is being marked dead here; record "dead" rather than
+                # the stale pre-kill value of job["status"] (e.g. "running").
+                teuthology.exporter.JobResults().record(job["machine_type"], "dead")
except report_exceptions:
log.exception("Could not mark job as dead: {job_id}".format(
job_id=job_id))