The run_name/job_id special-casing in unlock_safe was simply faulty logic. One
effect was that the supervisor would refuse to unlock nodes that were used by
its own job.
Signed-off-by: Zack Cerza <zack@cerza.org>
log.exception(error_message)
if 'targets' in job_config:
node_names = job_config["targets"].keys()
- lock_ops.unlock_safe(
- node_names,
- job_config["owner"],
- job_config["name"],
- job_config["job_id"]
- )
+ lock_ops.unlock_safe(node_names, job_config["owner"])
report.try_push_job_info(job_config, dict(
status='fail',
failure_reason=error_message))
return
if job_config.get("unlock_on_failure", True):
log.info('Unlocking machines...')
- lock_ops.unlock_safe(locked, job_config["owner"], job_config["name"], job_config["job_id"])
+ lock_ops.unlock_safe(locked, job_config["owner"])
def run_with_watchdog(process, job_config):
if owner is not None:
targets = find_targets(run_name)
names = list(targets.keys())
- lock_ops.unlock_safe(names, owner, run_name)
+ lock_ops.unlock_safe(names, owner)
report.try_mark_run_dead(run_name)
log.warn(f"Job {job_id} has no machine_type; cannot report via Prometheus")
if not skip_unlock:
targets = find_targets(run_name, job_id)
- lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id)
+ lock_ops.unlock_safe(list(targets.keys()), owner)
def find_run_info(serializer, run_name):
return response
-def unlock_safe(names: List[str], owner: str, run_name: str = "", job_id: str = ""):
+def unlock_safe(names: List[str], owner: str):
with teuthology.parallel.parallel() as p:
for name in names:
- p.spawn(unlock_one_safe, name, owner, run_name, job_id)
+ p.spawn(unlock_one_safe, name, owner)
return all(p)
-def unlock_one_safe(name: str, owner: str, run_name: str = "", job_id: str = "") -> bool:
+def unlock_one_safe(name: str, owner: str) -> bool:
node_status = query.get_status(name)
if node_status.get("locked", False) is False:
- log.warn(f"Refusing to unlock {name} since it is already unlocked")
+ log.debug(f"Refusing to unlock {name} since it is already unlocked")
return False
maybe_job = query.node_active_job(name, node_status)
if not maybe_job:
return unlock_one(name, owner, node_status["description"], node_status)
- if run_name and job_id and maybe_job.endswith(f"{run_name}/{job_id}"):
- log.error(f"Refusing to unlock {name} since it has an active job: {run_name}/{job_id}")
- return False
log.warning(f"Refusing to unlock {name} since it has an active job: {maybe_job}")
return False