git-server-git.apps.pok.os.sepia.ceph.com Git - teuthology.git/commitdiff
exporter: Instrument node reimaging success/fail
author Zack Cerza <zack@redhat.com>
Tue, 23 May 2023 19:53:23 +0000 (13:53 -0600)
committer Zack Cerza <zack@redhat.com>
Tue, 23 May 2023 22:43:39 +0000 (16:43 -0600)
Signed-off-by: Zack Cerza <zack@redhat.com>
teuthology/exporter.py
teuthology/provision/__init__.py

index f1d910da64419af1c2b2ff9c7c5c644825af334a..b5986de14be2fc49818eb9ad59d7aabcff007efa 100644 (file)
@@ -184,6 +184,22 @@ class _JobResults(TeuthologyMetric):
 
 JobResults = _JobResults()
 
+
+class _NodeReimagingResults(TeuthologyMetric):
+    def __init__(self):
+        self.metric = Counter(
+            "teuthology_reimaging_results",
+            "Teuthology Reimaging Results",
+            ["machine_type", "status"],
+        )
+
+    # As this is to be used within job processes, we implement record() rather than update()
+    def record(self, machine_type, status):
+        self.metric.labels(machine_type=machine_type, status=status).inc()
+
+
+NodeReimagingResults = _NodeReimagingResults()
+
 NodeLockingTime = Summary(
     "teuthology_node_locking_duration_seconds",
     "Time spent waiting to lock nodes",
index 325f2c34bf3eb7fc838d933a04b63fbddd780e39..2e9ba23ff55c4655fbcec97ce1f07494d55df702 100644 (file)
@@ -1,5 +1,6 @@
 import logging
 
+import teuthology.exporter
 import teuthology.lock.query
 from teuthology.misc import decanonicalize_hostname, get_distro, get_distro_version
 
@@ -18,9 +19,11 @@ def _logfile(ctx, shortname):
         return os.path.join(ctx.config['archive_path'],
                             shortname + '.downburst.log')
 
+
 def get_reimage_types():
     return pelagos.get_types() + fog.get_types()
 
+
 def reimage(ctx, machine_name, machine_type):
     os_type = get_distro(ctx)
     os_version = get_distro_version(ctx)
@@ -36,7 +39,21 @@ def reimage(ctx, machine_name, machine_type):
     else:
         raise Exception("The machine_type '%s' is not known to any "
                         "of configured provisioners" % machine_type)
-    return obj.create()
+    status = "fail"
+    try:
+        result = obj.create()
+        status = "success"
+    except Exception:
+        # NOTE: this re-raise does not skip the finally clause below; finally
+        # always runs, so KeyboardInterrupt and SystemExit are also recorded
+        # with status "fail"
+        raise
+    finally:
+        teuthology.exporter.NodeReimagingResults.record(
+            ctx.config.get("machine_type"),
+            status,
+        )
+    return result
 
 
 def create_if_vm(ctx, machine_name, _downburst=None):