From bd1aec7ea7f4dec8b8149402439fa465f4099981 Mon Sep 17 00:00:00 2001
From: Zack Cerza
Date: Tue, 4 Mar 2025 17:58:31 -0700
Subject: [PATCH] Remove run name, job id from unlock_* calls

This was simply faulty logic. One effect was that the supervisor would
refuse to unlock nodes that were used by its own job.

Signed-off-by: Zack Cerza
---
 teuthology/dispatcher/__init__.py   |  7 +------
 teuthology/dispatcher/supervisor.py |  2 +-
 teuthology/kill.py                  |  4 ++--
 teuthology/lock/ops.py              | 11 ++++-------
 4 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/teuthology/dispatcher/__init__.py b/teuthology/dispatcher/__init__.py
index 59f8ae3279..e1027ffc9c 100644
--- a/teuthology/dispatcher/__init__.py
+++ b/teuthology/dispatcher/__init__.py
@@ -182,12 +182,7 @@ def main(args):
             log.exception(error_message)
             if 'targets' in job_config:
                 node_names = job_config["targets"].keys()
-                lock_ops.unlock_safe(
-                    node_names,
-                    job_config["owner"],
-                    job_config["name"],
-                    job_config["job_id"]
-                )
+                lock_ops.unlock_safe(node_names, job_config["owner"])
             report.try_push_job_info(job_config, dict(
                 status='fail',
                 failure_reason=error_message))
diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py
index b89c39ac5a..1936fa636f 100644
--- a/teuthology/dispatcher/supervisor.py
+++ b/teuthology/dispatcher/supervisor.py
@@ -269,7 +269,7 @@ def unlock_targets(job_config):
         return
     if job_config.get("unlock_on_failure", True):
         log.info('Unlocking machines...')
-        lock_ops.unlock_safe(locked, job_config["owner"], job_config["name"], job_config["job_id"])
+        lock_ops.unlock_safe(locked, job_config["owner"])
 
 
 def run_with_watchdog(process, job_config):
diff --git a/teuthology/kill.py b/teuthology/kill.py
index 137e49080e..0914a6f9a1 100755
--- a/teuthology/kill.py
+++ b/teuthology/kill.py
@@ -75,7 +75,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
     if owner is not None:
         targets = find_targets(run_name)
         names = list(targets.keys())
-        lock_ops.unlock_safe(names, owner, run_name)
+        lock_ops.unlock_safe(names, owner)
     report.try_mark_run_dead(run_name)
 
 
@@ -103,7 +103,7 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False)
         log.warn(f"Job {job_id} has no machine_type; cannot report via Prometheus")
     if not skip_unlock:
         targets = find_targets(run_name, job_id)
-        lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id)
+        lock_ops.unlock_safe(list(targets.keys()), owner)
 
 
 def find_run_info(serializer, run_name):
diff --git a/teuthology/lock/ops.py b/teuthology/lock/ops.py
index 52dce68575..3b945df120 100644
--- a/teuthology/lock/ops.py
+++ b/teuthology/lock/ops.py
@@ -174,24 +174,21 @@ def lock_one(name, user=None, description=None):
     return response
 
 
-def unlock_safe(names: List[str], owner: str, run_name: str = "", job_id: str = ""):
+def unlock_safe(names: List[str], owner: str):
     with teuthology.parallel.parallel() as p:
         for name in names:
-            p.spawn(unlock_one_safe, name, owner, run_name, job_id)
+            p.spawn(unlock_one_safe, name, owner)
     return all(p)
 
 
-def unlock_one_safe(name: str, owner: str, run_name: str = "", job_id: str = "") -> bool:
+def unlock_one_safe(name: str, owner: str) -> bool:
     node_status = query.get_status(name)
     if node_status.get("locked", False) is False:
-        log.warn(f"Refusing to unlock {name} since it is already unlocked")
+        log.debug(f"Refusing to unlock {name} since it is already unlocked")
         return False
     maybe_job = query.node_active_job(name, node_status)
     if not maybe_job:
         return unlock_one(name, owner, node_status["description"], node_status)
-    if run_name and job_id and maybe_job.endswith(f"{run_name}/{job_id}"):
-        log.error(f"Refusing to unlock {name} since it has an active job: {run_name}/{job_id}")
-        return False
     log.warning(f"Refusing to unlock {name} since it has an active job: {maybe_job}")
     return False
 
-- 
2.39.5
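
For anyone reviewing the branch removed from unlock_one_safe(): here is a
minimal standalone sketch (not teuthology code) of the pre-patch control
flow. It assumes only what the diff shows: query.node_active_job() returns
a job identifier ending in "<run_name>/<job_id>" while the node's job is
considered active, and a falsy value otherwise. The URL shape in the usage
example is hypothetical.

from typing import Optional

def old_decision(maybe_job: Optional[str],
                 run_name: str = "", job_id: str = "") -> str:
    # Replays unlock_one_safe()'s pre-patch branching after the lock check.
    if not maybe_job:
        return "unlock"            # no active job: proceed to unlock_one()
    if run_name and job_id and maybe_job.endswith(f"{run_name}/{job_id}"):
        return "refuse (own job)"  # the removed branch: refuses, never unlocks
    return "refuse (other job)"    # the surviving branch: also refuses

# A supervisor passing its own run/job while tearing down its nodes
# always hit the removed branch:
print(old_decision("http://paddles.example/runs/my-run/jobs/42",
                   "my-run", "42"))
# -> refuse (own job)

Since both refusal branches return False, the run_name/job_id arguments
could never enable an unlock; they only guaranteed an error-level refusal
for the caller's own nodes, which is the behavior this patch removes.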