From: Zack Cerza <zack@cerza.org>
Date: Thu, 15 Jan 2026 19:20:16 +0000 (-0700)
Subject: kill: Handle supervisor procs when killing runs
X-Git-Tag: 1.2.3~5^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F2130%2Fhead;p=teuthology.git

kill: Handle supervisor procs when killing runs

This is a follow-up to ff615aae541032c647e78d3959d368f595c93e31, which only
handled killing individual jobs. Since we're using the results server for all
run and job metadata, we can drop all mentions of the archive. This change
is necessary since we've restricted access to the archive from the teuthology
machine for normal users, to avoid resource contention.

Signed-off-by: Zack Cerza <zack@cerza.org>
---

diff --git a/scripts/kill.py b/scripts/kill.py
index 31acc8b1a..2e6ea0bdf 100644
--- a/scripts/kill.py
+++ b/scripts/kill.py
@@ -5,10 +5,10 @@ import teuthology.kill
 
 doc = """
 usage: teuthology-kill -h
-       teuthology-kill [-a ARCHIVE] [-p] -r RUN
-       teuthology-kill [-a ARCHIVE] [-p] -m MACHINE_TYPE -r RUN
-       teuthology-kill [-a ARCHIVE] [-o OWNER] -r RUN -j JOB ...
-       teuthology-kill [-a ARCHIVE] [-o OWNER] -J JOBSPEC
+       teuthology-kill [-p] -r RUN
+       teuthology-kill [-p] -m MACHINE_TYPE -r RUN
+       teuthology-kill [-o OWNER] -r RUN -j JOB ...
+       teuthology-kill [-o OWNER] -J JOBSPEC
        teuthology-kill [-p] -o OWNER -m MACHINE_TYPE -r RUN
 
 Kill running teuthology jobs:
@@ -21,9 +21,6 @@ processes.
 
 optional arguments:
   -h, --help            show this help message and exit
-  -a ARCHIVE, --archive ARCHIVE
-                        The base archive directory
-                        [default: {archive_base}]
   -p, --preserve-queue  Preserve the queue - do not delete queued jobs
   -r, --run RUN         The name(s) of the run(s) to kill
   -j, --job JOB         The job_id of the job to kill
@@ -36,7 +33,7 @@ optional arguments:
                         The type of machine the job(s) are running on.
                         This is required if killing a job that is still
                         entirely in the queue.
-""".format(archive_base=teuthology.config.config.archive_base)
+"""
 
 
 def main():
diff --git a/teuthology/kill.py b/teuthology/kill.py
index d2b5ea911..08abe0db4 100755
--- a/teuthology/kill.py
+++ b/teuthology/kill.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-import os
 import sys
 import yaml
 import psutil
@@ -23,7 +22,6 @@ def main(args):
     run_name = args['--run']
     job = args['--job']
     jobspec = args['--jobspec']
-    archive_base = args['--archive']
     owner = args['--owner']
     machine_type = args['--machine-type']
     preserve_queue = args['--preserve-queue']
@@ -35,42 +33,35 @@ def main(args):
 
     if job:
         for job_id in job:
-            kill_job(run_name, job_id, archive_base, owner)
+            kill_job(
+                run_name,
+                job_id,
+                owner
+            )
     else:
-        kill_run(run_name, archive_base, owner, machine_type,
-                 preserve_queue=preserve_queue)
+        kill_run(
+            run_name,
+            owner,
+            machine_type,
+            preserve_queue=preserve_queue,
+        )
 
 
-def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
+def kill_run(run_name, owner=None, machine_type=None,
              preserve_queue=False):
-    run_info = {}
-    serializer = report.ResultsSerializer(archive_base)
-    if archive_base:
-        run_archive_dir = os.path.join(archive_base, run_name)
-        if os.path.isdir(run_archive_dir):
-            run_info = find_run_info(serializer, run_name)
-            if 'machine_type' in run_info:
-                machine_type = run_info['machine_type']
-                owner = run_info['owner']
-            else:
-                log.warning("The run info does not have machine type: %s" % run_info)
-                log.warning("Run archive used: %s" % run_archive_dir)
-                log.info("Using machine type '%s' and owner '%s'" % (machine_type, owner))
-        elif machine_type is None:
-            # no jobs found in archive and no machine type specified,
-            # so we try paddles to see if there is anything scheduled
-            run_info = report.ResultsReporter().get_run(run_name)
-            machine_type = run_info.get('machine_type', None)
-            if machine_type:
-                log.info(f"Using machine type '{machine_type}' received from paddles.")
-            else:
-                raise RuntimeError(f"Cannot find machine type for the run {run_name}; " +
-                                    "you must also pass --machine-type")
+    run_info = report.ResultsReporter().get_run(run_name)
+    # run: machine_type, owner
+    # job: pid, id
+    machine_type = run_info.get('machine_type', None)
 
     if not preserve_queue:
         remove_beanstalk_jobs(run_name, machine_type)
         remove_paddles_jobs(run_name)
-    if kill_processes(run_name, run_info.get('pids')):
+    pids = []
+    for job in run_info['jobs']:
+        if pid := job.get('pid'):
+            pids.append(int(pid))
+    if kill_processes(run_name, pids):
         return
     if owner is not None:
         targets = find_targets(run_name)
@@ -79,7 +70,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
     report.try_mark_run_dead(run_name)
 
 
-def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
+def kill_job(run_name, job_id, owner=None, skip_unlock=False):
     job_info = report.ResultsReporter().get_jobs(run_name, job_id)
     if not owner:
         if 'owner' not in job_info:
@@ -111,34 +102,6 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False)
         lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id)
 
 
-def find_run_info(serializer, run_name):
-    log.info("Assembling run information...")
-    run_info_fields = [
-        'machine_type',
-        'owner',
-    ]
-
-    pids = []
-    run_info = {}
-    job_info = {}
-    job_num = 0
-    jobs = serializer.jobs_for_run(run_name)
-    job_total = len(jobs)
-    for (job_id, job_dir) in jobs.items():
-        if not os.path.isdir(job_dir):
-            continue
-        job_num += 1
-        beanstalk.print_progress(job_num, job_total, 'Reading Job: ')
-        job_info = serializer.job_info(run_name, job_id, simple=True)
-        for key in job_info.keys():
-            if key in run_info_fields and key not in run_info:
-                run_info[key] = job_info[key]
-        if 'pid' in job_info:
-            pids.append(job_info['pid'])
-    run_info['pids'] = pids
-    return run_info
-
-
 def remove_paddles_jobs(run_name):
     jobs = report.ResultsReporter().get_jobs(run_name, fields=['status'])
     job_ids = [job['job_id'] for job in jobs if job['status'] == 'queued']
@@ -229,7 +192,7 @@ def kill_processes(run_name, pids=None, job_id=None):
 def process_matches_run(pid, run_name):
     try:
         p = psutil.Process(pid)
-        cmd = p.cmdline()
+        cmd = ' '.join(p.cmdline())
         if run_name in cmd and sys.argv[0] not in cmd:
             return True
     except psutil.NoSuchProcess: