From d1b85f4126c5f12f96fb22c97f9314a9e7885730 Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Tue, 23 May 2023 13:53:23 -0600 Subject: [PATCH] exporter: Instrument node reimaging success/fail Signed-off-by: Zack Cerza --- teuthology/exporter.py | 16 ++++++++++++++++ teuthology/provision/__init__.py | 19 ++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/teuthology/exporter.py b/teuthology/exporter.py index f1d910da64..b5986de14b 100644 --- a/teuthology/exporter.py +++ b/teuthology/exporter.py @@ -184,6 +184,22 @@ class _JobResults(TeuthologyMetric): JobResults = _JobResults() + +class _NodeReimagingResults(TeuthologyMetric): + def __init__(self): + self.metric = Counter( + "teuthology_reimaging_results", + "Teuthology Reimaging Results", + ["machine_type", "status"], + ) + + # As this is to be used within job processes, we implement record() rather than update() + def record(self, machine_type, status): + self.metric.labels(machine_type=machine_type, status=status).inc() + + +NodeReimagingResults = _NodeReimagingResults() + NodeLockingTime = Summary( "teuthology_node_locking_duration_seconds", "Time spent waiting to lock nodes", diff --git a/teuthology/provision/__init__.py b/teuthology/provision/__init__.py index 325f2c34bf..2e9ba23ff5 100644 --- a/teuthology/provision/__init__.py +++ b/teuthology/provision/__init__.py @@ -1,5 +1,6 @@ import logging +import teuthology.exporter import teuthology.lock.query from teuthology.misc import decanonicalize_hostname, get_distro, get_distro_version @@ -18,9 +19,11 @@ def _logfile(ctx, shortname): return os.path.join(ctx.config['archive_path'], shortname + '.downburst.log') + def get_reimage_types(): return pelagos.get_types() + fog.get_types() + def reimage(ctx, machine_name, machine_type): os_type = get_distro(ctx) os_version = get_distro_version(ctx) @@ -36,7 +39,21 @@ def reimage(ctx, machine_name, machine_type): else: raise Exception("The machine_type '%s' is not known to any " "of 
configured provisioners" % machine_type) - return obj.create() + status = "fail" + try: + result = obj.create() + status = "success" + except Exception: + # This re-raise is a no-op: the finally clause below always runs, even + # for KeyboardInterrupt or SystemExit, so any interruption of create() + # is also recorded with status "fail" + raise + finally: + teuthology.exporter.NodeReimagingResults.record( + ctx.config.get("machine_type"), + status, + ) + return result def create_if_vm(ctx, machine_name, _downburst=None): -- 2.39.5