git-server-git.apps.pok.os.sepia.ceph.com Git - teuthology.git/commitdiff
node-cleanup: Grace period for inactive jobs 2033/head
author: Zack Cerza <zack@cerza.org>
Tue, 4 Mar 2025 23:37:53 +0000 (16:37 -0700)
committer: Zack Cerza <zack@cerza.org>
Wed, 5 Mar 2025 01:51:19 +0000 (18:51 -0700)
Once a job is marked finished, the supervisor may still be waiting to unlock its
nodes. Give jobs five minutes to clean up nodes before we consider them "stale".

Signed-off-by: Zack Cerza <zack@cerza.org>
teuthology/lock/query.py

index d52149a2160ef54c1d3cf96c77692da624aa3966..7d79ce9aeeeb7c801bb0b3d954f99a3951385a1f 100644 (file)
@@ -1,3 +1,4 @@
+import datetime
 import logging
 import os
 import requests
@@ -8,6 +9,7 @@ from teuthology import misc
 from teuthology.config import config
 from teuthology.contextutil import safe_while
 from teuthology.util.compat import urlencode
+from teuthology.util.time import parse_timestamp
 
 
 log = logging.getLogger(__name__)
@@ -153,16 +155,27 @@ def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, N
         return "node description does not contained scheduled job info"
     url = f"{config.results_server}/runs/{run_name}/jobs/{job_id}/"
     job_status = ""
+    active = True
     with safe_while(
             sleep=1, increment=0.5, action='node_is_active') as proceed:
         while proceed():
             resp = requests.get(url)
             if resp.ok:
-                job_status = resp.json()["status"]
+                job_obj = resp.json()
+                job_status = job_obj["status"]
+                active = job_status and job_status not in ('pass', 'fail', 'dead')
+                if active:
+                    break
+                job_updated = job_obj["updated"]
+                try:
+                    delta = datetime.datetime.now(datetime.timezone.utc) - parse_timestamp(job_updated)
+                    active = active or delta < datetime.timedelta(minutes=5)
+                except Exception:
+                    log.exception(f"{run_name}/{job_id} updated={job_updated}")
                 break
             elif resp.status_code == 404:
                 break
             else:
                 log.debug(f"Error {resp.status_code} listing job {run_name}/{job_id} for {name}: {resp.text}")
-    if job_status and job_status not in ('pass', 'fail', 'dead'):
+    if active:
         return description