From: Zack Cerza
Date: Tue, 4 Mar 2025 23:37:53 +0000 (-0700)
Subject: node-cleanup: Grace period for inactive jobs
X-Git-Tag: 1.2.2~32^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ab3475a8195a0e4fc8d68719bae07a8591b6281a;p=teuthology.git

node-cleanup: Grace period for inactive jobs

Once a job is marked finished, the supervisor may still be waiting to unlock
its nodes. Give jobs five minutes to clean up nodes before we consider them
"stale".

Signed-off-by: Zack Cerza
---

diff --git a/teuthology/lock/query.py b/teuthology/lock/query.py
index d52149a21..7d79ce9ae 100644
--- a/teuthology/lock/query.py
+++ b/teuthology/lock/query.py
@@ -1,3 +1,4 @@
+import datetime
 import logging
 import os
 import requests
@@ -8,6 +9,7 @@ from teuthology import misc
 from teuthology.config import config
 from teuthology.contextutil import safe_while
 from teuthology.util.compat import urlencode
+from teuthology.util.time import parse_timestamp
 
 log = logging.getLogger(__name__)
 
@@ -153,16 +155,27 @@ def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, N
         return "node description does not contained scheduled job info"
     url = f"{config.results_server}/runs/{run_name}/jobs/{job_id}/"
     job_status = ""
+    active = True
     with safe_while(
             sleep=1, increment=0.5, action='node_is_active') as proceed:
         while proceed():
             resp = requests.get(url)
             if resp.ok:
-                job_status = resp.json()["status"]
+                job_obj = resp.json()
+                job_status = job_obj["status"]
+                active = job_status and job_status not in ('pass', 'fail', 'dead')
+                if active:
+                    break
+                job_updated = job_obj["updated"]
+                try:
+                    delta = datetime.datetime.now(datetime.timezone.utc) - parse_timestamp(job_updated)
+                    active = active or delta < datetime.timedelta(minutes=5)
+                except Exception:
+                    log.exception(f"{run_name}/{job_id} updated={job_updated}")
                 break
             elif resp.status_code == 404:
                 break
             else:
                 log.debug(f"Error {resp.status_code} listing job {run_name}/{job_id} for {name}: {resp.text}")
-    if job_status and job_status not in ('pass', 'fail', 'dead'):
+    if active:
         return description