From ab3475a8195a0e4fc8d68719bae07a8591b6281a Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Tue, 4 Mar 2025 16:37:53 -0700 Subject: [PATCH] node-cleanup: Grace period for inactive jobs Once a job is marked finished, the supervisor may still be waiting to unlock its nodes. Give jobs five minutes to clean up nodes before we consider them "stale". Signed-off-by: Zack Cerza --- teuthology/lock/query.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/teuthology/lock/query.py b/teuthology/lock/query.py index d52149a216..7d79ce9aee 100644 --- a/teuthology/lock/query.py +++ b/teuthology/lock/query.py @@ -1,3 +1,4 @@ +import datetime import logging import os import requests @@ -8,6 +9,7 @@ from teuthology import misc from teuthology.config import config from teuthology.contextutil import safe_while from teuthology.util.compat import urlencode +from teuthology.util.time import parse_timestamp log = logging.getLogger(__name__) @@ -153,16 +155,27 @@ def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, N return "node description does not contained scheduled job info" url = f"{config.results_server}/runs/{run_name}/jobs/{job_id}/" job_status = "" + active = True with safe_while( sleep=1, increment=0.5, action='node_is_active') as proceed: while proceed(): resp = requests.get(url) if resp.ok: - job_status = resp.json()["status"] + job_obj = resp.json() + job_status = job_obj["status"] + active = job_status and job_status not in ('pass', 'fail', 'dead') + if active: + break + job_updated = job_obj["updated"] + try: + delta = datetime.datetime.now(datetime.timezone.utc) - parse_timestamp(job_updated) + active = active or delta < datetime.timedelta(minutes=5) + except Exception: + log.exception(f"{run_name}/{job_id} updated={job_updated}") break elif resp.status_code == 404: break else: log.debug(f"Error {resp.status_code} listing job {run_name}/{job_id} for {name}: {resp.text}") - if job_status and job_status not in ('pass', 'fail', 'dead'): + if active: return description -- 2.39.5