git-server-git.apps.pok.os.sepia.ceph.com Git - teuthology.git/commitdiff
exporter: Restart every 24h 1858/head
author Zack Cerza <zack@redhat.com>
Tue, 13 Jun 2023 23:49:48 +0000 (17:49 -0600)
committer Zack Cerza <zack@redhat.com>
Wed, 21 Jun 2023 21:40:39 +0000 (15:40 -0600)
A design limitation of prometheus-client's multiprocessing mode is that
each process creates files to store its own metrics; the exporter then
has to read each file, even if the process which created it is dead.

This results in request latency growing over time, to the point of
multiple seconds when the file count gets into the thousands. This
eventually results in prometheus failing to fetch, leaving gaps in our
data.

We can work around this by restarting at a regular interval; 24h seems
like a fine place to start.

Signed-off-by: Zack Cerza <zack@redhat.com>
teuthology/dispatcher/__init__.py
teuthology/exporter.py

index 4ec6fc90b95c4c55c289e509b35f4336f6e3d3fc..3f46a74d4e12c1e27c57dedcc7ff6935d37e782d 100644 (file)
@@ -39,7 +39,7 @@ def sentinel(path):
     return file_mtime > start_time
 
 
-def restart():
+def restart(log=log):
     log.info('Restarting...')
     args = sys.argv[:]
     args.insert(0, sys.executable)
index b5986de14be2fc49818eb9ad59d7aabcff007efa..b688d4d75513bd80c1a7a59ce9df9faa4f4bd703 100644 (file)
@@ -47,6 +47,7 @@ class TeuthologyExporter:
             JobProcesses(),
             Nodes(),
         ]
+        self._created_time = time.perf_counter()
 
     def start(self):
         start_http_server(self.port, registry=registry)
@@ -63,6 +64,8 @@ class TeuthologyExporter:
         while True:
             try:
                 before = time.perf_counter()
+                if before - self._created_time > 24 * 60 * 60:
+                    self.restart()
                 try:
                     self.update()
                 except Exception:
@@ -79,6 +82,11 @@ class TeuthologyExporter:
                 log.info("Stopping.")
                 raise SystemExit
 
+    def restart(self):
+        # Use the dispatcher's restart function - note that by using this here,
+        # it restarts the exporter, *not* the dispatcher.
+        return teuthology.dispatcher.restart(log=log)
+
 
 class TeuthologyMetric:
     def __init__(self):