From 35fc1b03dee1f5d7a8d1c25b9a9beabc48ad1665 Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Tue, 13 Jun 2023 17:49:48 -0600 Subject: [PATCH] exporter: Restart every 24h A design limitation of prometheus-client's multiprocessing mode is that each process creates files to store its own metrics; the exporter then has to read each file, even if the process which created it is dead. This results in request latency growing over time, to the point of multiple seconds when the file count gets into the thousands. This eventually results in prometheus failing to fetch, leaving gaps in our data. We can work around this by restarting at a regular interval; 24h seems like a fine place to start. Signed-off-by: Zack Cerza --- teuthology/dispatcher/__init__.py | 2 +- teuthology/exporter.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/teuthology/dispatcher/__init__.py b/teuthology/dispatcher/__init__.py index 4ec6fc90b9..3f46a74d4e 100644 --- a/teuthology/dispatcher/__init__.py +++ b/teuthology/dispatcher/__init__.py @@ -39,7 +39,7 @@ def sentinel(path): return file_mtime > start_time -def restart(): +def restart(log=log): log.info('Restarting...') args = sys.argv[:] args.insert(0, sys.executable) diff --git a/teuthology/exporter.py b/teuthology/exporter.py index b5986de14b..b688d4d755 100644 --- a/teuthology/exporter.py +++ b/teuthology/exporter.py @@ -47,6 +47,7 @@ class TeuthologyExporter: JobProcesses(), Nodes(), ] + self._created_time = time.perf_counter() def start(self): start_http_server(self.port, registry=registry) @@ -63,6 +64,8 @@ class TeuthologyExporter: while True: try: before = time.perf_counter() + if before - self._created_time > 24 * 60 * 60: + self.restart() try: self.update() except Exception: @@ -79,6 +82,11 @@ class TeuthologyExporter: log.info("Stopping.") raise SystemExit + def restart(self): + # Use the dispatcher's restart function - note that by using this here, + # it restarts the exporter, *not* the dispatcher. + return teuthology.dispatcher.restart(log=log) + class TeuthologyMetric: def __init__(self): -- 2.39.5