From f56e1acda4b8496d83643cfc8952174926cfbff2 Mon Sep 17 00:00:00 2001
From: Zack Cerza
Date: Tue, 21 Feb 2023 13:34:06 -0700
Subject: [PATCH] Add teuthology-exporter, a Prometheus exporter

Signed-off-by: Zack Cerza
---
Review notes (this section is ignored by git-am and is not part of the
commit message):
 - teuthology/exporter.py, TeuthologyExporter.loop(): the overrun check
   compared elapsed against 0, which can never be true, so an update that
   took longer than the interval passed a negative value to time.sleep()
   and the resulting ValueError terminated the exporter. The check now
   compares against the interval and the sleep is clamped at zero.
 - teuthology/exporter.py, JobProcesses._match(): a command line ending in
   "--owner" with no value raised IndexError; it is now a non-match.
 - Both changes are in-place line substitutions, so hunk sizes and the
   diffstat below are unchanged; the blob hash on the "index" line for
   teuthology/exporter.py predates them.

 scripts/exporter.py               |  18 ++++
 setup.cfg                         |   2 +
 teuthology/dispatcher/__init__.py |  21 ++--
 teuthology/exporter.py            | 159 ++++++++++++++++++++++++++++++
 4 files changed, 191 insertions(+), 9 deletions(-)
 create mode 100644 scripts/exporter.py
 create mode 100644 teuthology/exporter.py

diff --git a/scripts/exporter.py b/scripts/exporter.py
new file mode 100644
index 000000000..438d5d3f3
--- /dev/null
+++ b/scripts/exporter.py
@@ -0,0 +1,18 @@
+import docopt
+
+import teuthology.exporter
+
+doc = """
+usage: teuthology-exporter --help
+       teuthology-exporter [--interval INTERVAL]
+
+optional arguments:
+  -h, --help         show this help message and exit
+  --interval INTERVAL    update metrics this often, in seconds
+                         [default: 60]
+"""
+
+
+def main():
+    args = docopt.docopt(doc)
+    teuthology.exporter.main(args)
diff --git a/setup.cfg b/setup.cfg
index 2a322e88b..c21eb2dca 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -58,6 +58,7 @@ install_requires =
     python-openstackclient
     requests>2.13.0
     sentry-sdk
+    prometheus_client>=0.16.0
 python_requires = >=3.6
 
 [options.entry_points]
@@ -81,6 +82,7 @@ console_scripts =
     teuthology-reimage = scripts.reimage:main
     teuthology-dispatcher = scripts.dispatcher:main
     teuthology-wait = scripts.wait:main
+    teuthology-exporter = scripts.exporter:main
 
 [options.extras_require]
 manhole =
diff --git a/teuthology/dispatcher/__init__.py b/teuthology/dispatcher/__init__.py
index 9aea132dd..8ce9b6557 100644
--- a/teuthology/dispatcher/__init__.py
+++ b/teuthology/dispatcher/__init__.py
@@ -1,4 +1,3 @@
-import getpass
 import logging
 import os
 import psutil
@@ -7,6 +6,7 @@ import sys
 import yaml
 
 from datetime import datetime
+from typing import Dict, List
 
 from teuthology import setup_log_file, install_except_hook
 from teuthology import beanstalk
@@ -72,7 +72,7 @@ def main(args):
     archive_dir = teuth_config.archive_base
 
     # Refuse to start more than one dispatcher per machine type
-    procs = find_dispatcher_processes(tube)
+    procs = find_dispatcher_processes().get(tube)
     if procs:
         raise RuntimeError(
             "There is already a teuthology-dispatcher process running:"
@@ -194,11 +194,8 @@ def main(args):
     return max(returncodes)
 
 
-def find_dispatcher_processes(machine_type):
-    user = getpass.getuser()
+def find_dispatcher_processes() -> Dict[str, List[psutil.Process]] :
     def match(proc):
-        if proc.username() != user:
-            return False
         cmdline = proc.cmdline()
         if len(cmdline) < 3:
             return False
@@ -206,14 +203,20 @@ def find_dispatcher_processes(machine_type):
             return False
         if cmdline[2] == "--supervisor":
             return False
-        if machine_type not in cmdline:
+        if "--tube" not in cmdline:
             return False
         if proc.pid == os.getpid():
             return False
         return True
 
-    attrs = ["pid", "username", "cmdline"]
-    procs = list(filter(match, psutil.process_iter(attrs=attrs)))
+    procs = {}
+    attrs = ["pid", "cmdline"]
+    for proc in psutil.process_iter(attrs=attrs):
+        if not match(proc):
+            continue
+        cmdline = proc.cmdline()
+        machine_type = cmdline[cmdline.index("--tube") + 1]
+        procs.setdefault(machine_type, []).append(proc)
     return procs
 
 
diff --git a/teuthology/exporter.py b/teuthology/exporter.py
new file mode 100644
index 000000000..5a7dfea72
--- /dev/null
+++ b/teuthology/exporter.py
@@ -0,0 +1,159 @@
+import itertools
+import logging
+import psutil
+import time
+
+from prometheus_client import (
+    start_http_server,
+    Gauge,
+)
+
+import teuthology.beanstalk as beanstalk
+import teuthology.dispatcher
+from teuthology.config import config
+from teuthology.lock.query import list_locks
+
+
+log = logging.getLogger(__name__)
+
+MACHINE_TYPES = list(config.active_machine_types)
+
+
+class TeuthologyExporter:
+    port = 61764  # int(''.join([str((ord(c) - 100) % 10) for c in "teuth"]))
+
+    def __init__(self, interval=60):
+        self.interval = interval
+        self.metrics = [
+            Dispatchers(),
+            BeanstalkQueue(),
+            JobProcesses(),
+            Nodes(),
+        ]
+
+    def start(self):
+        start_http_server(self.port)
+        self.loop()
+
+    def update(self):
+        log.info("Updating...")
+        for metric in self.metrics:
+            metric.update()
+        log.info("Update finished.")
+
+    def loop(self):
+        log.info("Starting teuthology-exporter...")
+        while True:
+            try:
+                before = time.perf_counter()
+                try:
+                    self.update()
+                except Exception:
+                    log.exception("Failed to update metrics")
+                interval = self.interval
+                # try to deliver metrics _at_ $interval, as opposed to sleeping
+                # for $interval between updates; on overrun, aim for next tick
+                elapsed: float = time.perf_counter() - before
+                if elapsed > interval:
+                    interval *= 2
+                interval -= elapsed
+                time.sleep(max(interval, 0))
+            except KeyboardInterrupt:
+                log.info("Stopping.")
+                raise SystemExit
+
+
+class TeuthologyMetric:
+    def __init__(self):
+        pass
+
+    def update(self):
+        raise NotImplementedError
+
+
+class Dispatchers(TeuthologyMetric):
+    def __init__(self):
+        self.metric = Gauge(
+            "teuthology_dispatchers", "Teuthology Dispatchers", ["machine_type"]
+        )
+
+    def update(self):
+        dispatcher_procs = teuthology.dispatcher.find_dispatcher_processes()
+        for machine_type in MACHINE_TYPES:
+            self.metric.labels(machine_type).set(
+                len(dispatcher_procs.get(machine_type, []))
+            )
+
+
+class BeanstalkQueue(TeuthologyMetric):
+    def __init__(self):
+        self.length = Gauge(
+            "beanstalk_queue_length", "Beanstalk Queue Length", ["machine_type"]
+        )
+        self.paused = Gauge(
+            "beanstalk_queue_paused", "Beanstalk Queue is Paused", ["machine_type"]
+        )
+
+    def update(self):
+        for machine_type in MACHINE_TYPES:
+            queue_stats = beanstalk.stats_tube(
+                beanstalk.connect(), machine_type
+            )
+            self.length.labels(machine_type).set(queue_stats["count"])
+            self.paused.labels(machine_type).set(1 if queue_stats["paused"] else 0)
+
+
+class JobProcesses(TeuthologyMetric):
+    def __init__(self):
+        self.metric = Gauge(
+            "teuthology_job_processes",
+            "Teuthology Job Processes",
+        )
+
+    def update(self):
+
+        attrs = ["pid", "cmdline"]
+        total = 0
+        for proc in psutil.process_iter(attrs=attrs):
+            if self._match(proc):
+                total += 1
+        self.metric.set(total)
+
+    @staticmethod
+    def _match(proc):
+        cmdline = proc.cmdline()
+        if not len(cmdline) > 1:
+            return False
+        if not cmdline[1].endswith("teuthology"):
+            return False
+        if "--archive" not in cmdline:
+            return False
+        if "--name" not in cmdline:
+            return False
+        try:
+            owner_index = cmdline.index("--owner") + 1
+            if not cmdline[owner_index].startswith("scheduled_"):
+                return False
+        except (ValueError, IndexError):
+            return False
+        return True
+
+
+class Nodes(TeuthologyMetric):
+    def __init__(self):
+        self.metric = Gauge(
+            "teuthology_nodes", "Teuthology Nodes", ["machine_type", "locked", "up"]
+        )
+
+    def update(self):
+        for machine_type in MACHINE_TYPES:
+            nodes = list_locks(machine_type=machine_type)
+            for up, locked in itertools.product([True, False], [True, False]):
+                self.metric.labels(machine_type=machine_type, up=up, locked=locked).set(
+                    len([n for n in nodes if n["up"] is up and n["locked"] is locked])
+                )
+
+
+def main(args):
+    exporter = TeuthologyExporter(interval=int(args["--interval"]))
+    exporter.start()
-- 
2.47.3