From ff615aae541032c647e78d3959d368f595c93e31 Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Thu, 8 Jan 2026 17:36:30 -0700 Subject: [PATCH] kill: Look for, and kill, the supervisor process Useful if one wants to kill a job that is still waiting for its nodes to be provisioned. Signed-off-by: Zack Cerza --- teuthology/dispatcher/supervisor.py | 6 +++++ teuthology/kill.py | 41 +++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py index 23c7d18c1..b235d4041 100644 --- a/teuthology/dispatcher/supervisor.py +++ b/teuthology/dispatcher/supervisor.py @@ -42,6 +42,12 @@ def main(args): except SkipJob: return 0 + report.try_push_job_info({ + 'name': job_config['name'], + 'job_id': job_config['job_id'], + 'pid': os.getpid(), + }) + # reimage target machines before running the job if 'targets' in job_config: node_count = len(job_config["targets"]) diff --git a/teuthology/kill.py b/teuthology/kill.py index 137e49080..d2b5ea911 100755 --- a/teuthology/kill.py +++ b/teuthology/kill.py @@ -80,20 +80,25 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None, def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False): - serializer = report.ResultsSerializer(archive_base) - job_info = serializer.job_info(run_name, job_id) - # If we can't read the filesystem, job_info will be nearly empty. Ask paddles: - if 'name' not in job_info: - job_info = report.ResultsReporter().get_jobs(run_name, job_id) + job_info = report.ResultsReporter().get_jobs(run_name, job_id) if not owner: if 'owner' not in job_info: raise RuntimeError( "I could not figure out the owner of the requested job. " "Please pass --owner .") owner = job_info['owner'] - if kill_processes(run_name, [job_info.get('pid')]): + if kill_processes(run_name, [int(job_info.get('pid'))], job_info.get('job_id')): return - report.try_push_job_info(job_info, dict(status="dead")) + report.try_push_job_info( + { + 'name': run_name, + 'job_id': job_id, + }, + { + 'status': 'dead', + 'failure_reason': 'killed', + } + ) if 'machine_type' in job_info: teuthology.exporter.JobResults().record( machine_type=job_info["machine_type"], @@ -177,7 +182,7 @@ def remove_beanstalk_jobs(run_name, tube_name): beanstalk_conn.close() -def kill_processes(run_name, pids=None): +def kill_processes(run_name, pids=None, job_id=None): if pids: to_kill = set(pids).intersection(psutil.pids()) else: @@ -185,9 +190,15 @@ def kill_processes(run_name, pids=None): pids_need_sudo = set() for pid in set(to_kill): - if not process_matches_run(pid, run_name): - to_kill.remove(pid) - elif psutil.Process(int(pid)).username() != getpass.getuser(): + if job_id: + if not process_matches_job(pid, run_name, job_id): + to_kill.remove(pid) + continue + else: + if not process_matches_run(pid, run_name): + to_kill.remove(pid) + continue + if psutil.Process(int(pid)).username() != getpass.getuser(): pids_need_sudo.add(pid) survivors = [] @@ -227,6 +238,14 @@ def process_matches_run(pid, run_name): pass return False +def process_matches_job(pid, run_name, job_id): + try: + return f"{run_name}/{job_id}" in ' '.join(psutil.Process(pid).cmdline()) + except psutil.NoSuchProcess: + pass + except psutil.AccessDenied: + pass + return False def find_pids(run_name): run_pids = [] -- 2.47.3