Remove run name, job id from unlock_* calls
author     Zack Cerza <zack@cerza.org>
           Wed, 5 Mar 2025 00:58:31 +0000 (17:58 -0700)
committer  Zack Cerza <zack@cerza.org>
           Wed, 5 Mar 2025 01:15:39 +0000 (18:15 -0700)
This was simply faulty logic. One effect was that the supervisor would refuse to
unlock nodes that were used by its own job.

Signed-off-by: Zack Cerza <zack@cerza.org>
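
For context, a minimal sketch of the removed check, with illustrative values (the real active-job identifier comes from query.node_active_job(), and its exact format is not shown here):

    # Sketch only: run_name, job_id, and the active-job identifier format
    # are illustrative, not actual teuthology values.
    run_name, job_id = "example-run", "42"
    maybe_job = f"http://paddles.example.com/{run_name}/{job_id}"

    # The removed branch refused the unlock whenever the node's active job
    # matched the caller's own run/job -- exactly the case where the
    # supervisor needs the unlock to succeed.
    if run_name and job_id and maybe_job.endswith(f"{run_name}/{job_id}"):
        print(f"Refusing to unlock: active job {run_name}/{job_id}")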
teuthology/dispatcher/__init__.py
teuthology/dispatcher/supervisor.py
teuthology/kill.py
teuthology/lock/ops.py

diff --git a/teuthology/dispatcher/__init__.py b/teuthology/dispatcher/__init__.py
index 59f8ae3279d8e27509e2322d6cc32c0ea8bf6815..e1027ffc9c64d53b8131a02d235ca9482ed27779 100644
@@ -182,12 +182,7 @@ def main(args):
             log.exception(error_message)
             if 'targets' in job_config:
                 node_names = job_config["targets"].keys()
-                lock_ops.unlock_safe(
-                    node_names,
-                    job_config["owner"],
-                    job_config["name"],
-                    job_config["job_id"]
-                )
+                lock_ops.unlock_safe(node_names, job_config["owner"])
             report.try_push_job_info(job_config, dict(
                 status='fail',
                 failure_reason=error_message))
diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py
index b89c39ac5aa02bb3d9326a37e0f80cabdfd94685..1936fa636fdb7f289c8f73f1ad303417687b16a7 100644
@@ -269,7 +269,7 @@ def unlock_targets(job_config):
         return
     if job_config.get("unlock_on_failure", True):
         log.info('Unlocking machines...')
-        lock_ops.unlock_safe(locked, job_config["owner"], job_config["name"], job_config["job_id"])
+        lock_ops.unlock_safe(locked, job_config["owner"])
 
 
 def run_with_watchdog(process, job_config):
diff --git a/teuthology/kill.py b/teuthology/kill.py
index 137e49080e77a445b846f1f5f8f047f74c0dcca0..0914a6f9a15910c4724a715db8f97ad93a99f8f8 100755
@@ -75,7 +75,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
     if owner is not None:
         targets = find_targets(run_name)
         names = list(targets.keys())
-        lock_ops.unlock_safe(names, owner, run_name)
+        lock_ops.unlock_safe(names, owner)
     report.try_mark_run_dead(run_name)
 
 
@@ -103,7 +103,7 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False)
         log.warn(f"Job {job_id} has no machine_type; cannot report via Prometheus")
     if not skip_unlock:
         targets = find_targets(run_name, job_id)
-        lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id)
+        lock_ops.unlock_safe(list(targets.keys()), owner)
 
 
 def find_run_info(serializer, run_name):
diff --git a/teuthology/lock/ops.py b/teuthology/lock/ops.py
index 52dce685754c772c1aff15cb6b05527a56af48d2..3b945df1201a68b089f70ecc1c784763a2b0af91 100644
@@ -174,24 +174,21 @@ def lock_one(name, user=None, description=None):
     return response
 
 
-def unlock_safe(names: List[str], owner: str, run_name: str = "", job_id: str = ""):
+def unlock_safe(names: List[str], owner: str):
     with teuthology.parallel.parallel() as p:
         for name in names:
-            p.spawn(unlock_one_safe, name, owner, run_name, job_id)
+            p.spawn(unlock_one_safe, name, owner)
         return all(p)
 
 
-def unlock_one_safe(name: str, owner: str, run_name: str = "", job_id: str = "") -> bool:
+def unlock_one_safe(name: str, owner: str) -> bool:
     node_status = query.get_status(name)
     if node_status.get("locked", False) is False:
-        log.warn(f"Refusing to unlock {name} since it is already unlocked")
+        log.debug(f"Refusing to unlock {name} since it is already unlocked")
         return False
     maybe_job = query.node_active_job(name, node_status)
     if not maybe_job:
         return unlock_one(name, owner, node_status["description"], node_status)
-    if run_name and job_id and maybe_job.endswith(f"{run_name}/{job_id}"):
-            log.error(f"Refusing to unlock {name} since it has an active job: {run_name}/{job_id}")
-            return False
     log.warning(f"Refusing to unlock {name} since it has an active job: {maybe_job}")
     return False
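
After this change, callers pass only the node names and the lock owner. A minimal usage sketch, assuming lock_ops aliases teuthology/lock/ops.py as in the callers above (node and owner strings are hypothetical):

    from teuthology.lock import ops as lock_ops

    # Hypothetical node/owner values. unlock_safe() returns True only if
    # every named node was unlocked.
    ok = lock_ops.unlock_safe(
        ["smithi001.front.sepia.ceph.com"],
        "scheduled_teuthology@example",
    )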