From: Zack Cerza Date: Thu, 9 Mar 2023 18:28:02 +0000 (-0700) Subject: dispatcher: Add instrumentation for locking time X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=94809225e4fb8df1bcd8b6ebb74ed1cd91bde69b;p=teuthology.git dispatcher: Add instrumentation for locking time Signed-off-by: Zack Cerza --- diff --git a/teuthology/dispatcher/__init__.py b/teuthology/dispatcher/__init__.py index 69b27072bf..f6f27a056d 100644 --- a/teuthology/dispatcher/__init__.py +++ b/teuthology/dispatcher/__init__.py @@ -9,6 +9,7 @@ from datetime import datetime from typing import Dict, List import teuthology.dispatcher.supervisor as supervisor +import teuthology.exporter as exporter import teuthology.lock.ops as lock_ops import teuthology.nuke as nuke import teuthology.worker as worker @@ -224,13 +225,16 @@ def find_dispatcher_processes() -> Dict[str, List[psutil.Process]]: def lock_machines(job_config): report.try_push_job_info(job_config, dict(status='running')) fake_ctx = supervisor.create_fake_context(job_config, block=True) - lock_ops.block_and_lock_machines( - fake_ctx, - len(job_config['roles']), - job_config['machine_type'], - tries=-1, - reimage=False, - ) + machine_type = job_config["machine_type"] + count = len(job_config['roles']) + with exporter.NodeLockingTime.labels(machine_type, count).time(): + lock_ops.block_and_lock_machines( + fake_ctx, + count, + machine_type, + tries=-1, + reimage=False, + ) job_config = fake_ctx.config return job_config diff --git a/teuthology/exporter.py b/teuthology/exporter.py index d76b65c4fe..d40e55b109 100644 --- a/teuthology/exporter.py +++ b/teuthology/exporter.py @@ -23,6 +23,7 @@ from prometheus_client import ( # noqa: E402 start_http_server, Gauge, Counter, + Summary, multiprocess, CollectorRegistry, ) @@ -112,9 +113,7 @@ class BeanstalkQueue(TeuthologyMetric): def update(self): for machine_type in MACHINE_TYPES: - queue_stats = beanstalk.stats_tube( - beanstalk.connect(), machine_type - ) + queue_stats = beanstalk.stats_tube(beanstalk.connect(), machine_type) self.length.labels(machine_type).set(queue_stats["count"]) self.paused.labels(machine_type).set(1 if queue_stats["paused"] else 0) @@ -183,6 +182,13 @@ class JobResults(TeuthologyMetric): self.metric.labels(machine_type=machine_type, status=status).inc() +NodeLockingTime = Summary( + "teuthology_node_locking_duration_seconds", + "Time spent waiting to lock a node", + ["machine_type", "count"], +) + + def main(args): exporter = TeuthologyExporter(interval=int(args["--interval"])) exporter.start()