git-server-git.apps.pok.os.sepia.ceph.com Git - teuthology.git/commitdiff
kill: Handle supervisor procs when killing runs [kill-multi-supervisor] [2130/head]
author: Zack Cerza <zack@cerza.org>
Thu, 15 Jan 2026 19:20:16 +0000 (12:20 -0700)
committer: Zack Cerza <zack@cerza.org>
Thu, 15 Jan 2026 20:48:12 +0000 (13:48 -0700)
This is a follow-up to ff615aae541032c647e78d3959d368f595c93e31, which only
handled killing individual jobs. Since we're using the results server for all
run and job metadata, we can drop all mentions of the archive. This change
is necessary since we've restricted access to the archive from the teuthology
machine for normal users, to avoid resource contention.

Signed-off-by: Zack Cerza <zack@cerza.org>
scripts/kill.py
teuthology/kill.py

diff --git a/scripts/kill.py b/scripts/kill.py
index 31acc8b1a4a262741518a80064ac33a8f72c8853..2e6ea0bdfc19fb3c3e96ce24efb37724ad5520ff 100644 (file)
@@ -5,10 +5,10 @@ import teuthology.kill
 
 doc = """
 usage: teuthology-kill -h
-       teuthology-kill [-a ARCHIVE] [-p] -r RUN
-       teuthology-kill [-a ARCHIVE] [-p] -m MACHINE_TYPE -r RUN
-       teuthology-kill [-a ARCHIVE] [-o OWNER] -r RUN -j JOB ...
-       teuthology-kill [-a ARCHIVE] [-o OWNER] -J JOBSPEC
+       teuthology-kill [-p] -r RUN
+       teuthology-kill [-p] -m MACHINE_TYPE -r RUN
+       teuthology-kill [-o OWNER] -r RUN -j JOB ...
+       teuthology-kill [-o OWNER] -J JOBSPEC
        teuthology-kill [-p] -o OWNER -m MACHINE_TYPE -r RUN
 
 Kill running teuthology jobs:
@@ -21,9 +21,6 @@ processes.
 
 optional arguments:
   -h, --help            show this help message and exit
-  -a ARCHIVE, --archive ARCHIVE
-                        The base archive directory
-                        [default: {archive_base}]
   -p, --preserve-queue  Preserve the queue - do not delete queued jobs
   -r, --run RUN         The name(s) of the run(s) to kill
   -j, --job JOB         The job_id of the job to kill
@@ -36,7 +33,7 @@ optional arguments:
                         The type of machine the job(s) are running on.
                         This is required if killing a job that is still
                         entirely in the queue.
-""".format(archive_base=teuthology.config.config.archive_base)
+"""
 
 
 def main():
diff --git a/teuthology/kill.py b/teuthology/kill.py
index d2b5ea91147410dc18cfc026d3a2ba983f4e6533..08abe0db4cafee1a586b81c74620213d2dd7f92d 100755 (executable)
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-import os
 import sys
 import yaml
 import psutil
@@ -23,7 +22,6 @@ def main(args):
     run_name = args['--run']
     job = args['--job']
     jobspec = args['--jobspec']
-    archive_base = args['--archive']
     owner = args['--owner']
     machine_type = args['--machine-type']
     preserve_queue = args['--preserve-queue']
@@ -35,42 +33,35 @@ def main(args):
 
     if job:
         for job_id in job:
-            kill_job(run_name, job_id, archive_base, owner)
+            kill_job(
+                run_name,
+                job_id,
+                owner
+            )
     else:
-        kill_run(run_name, archive_base, owner, machine_type,
-                 preserve_queue=preserve_queue)
+        kill_run(
+            run_name,
+            owner,
+            machine_type,
+            preserve_queue=preserve_queue,
+        )
 
 
-def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
+def kill_run(run_name, owner=None, machine_type=None,
              preserve_queue=False):
-    run_info = {}
-    serializer = report.ResultsSerializer(archive_base)
-    if archive_base:
-        run_archive_dir = os.path.join(archive_base, run_name)
-        if os.path.isdir(run_archive_dir):
-            run_info = find_run_info(serializer, run_name)
-            if 'machine_type' in run_info:
-                machine_type = run_info['machine_type']
-                owner = run_info['owner']
-            else:
-                log.warning("The run info does not have machine type: %s" % run_info)
-                log.warning("Run archive used: %s" % run_archive_dir)
-                log.info("Using machine type '%s' and owner '%s'" % (machine_type, owner))
-        elif machine_type is None:
-            # no jobs found in archive and no machine type specified,
-            # so we try paddles to see if there is anything scheduled
-            run_info = report.ResultsReporter().get_run(run_name)
-            machine_type = run_info.get('machine_type', None)
-            if machine_type:
-                log.info(f"Using machine type '{machine_type}' received from paddles.")
-            else:
-                raise RuntimeError(f"Cannot find machine type for the run {run_name}; " +
-                                    "you must also pass --machine-type")
+    run_info = report.ResultsReporter().get_run(run_name)
+    # run: machine_type, owner
+    # job: pid, id
+    machine_type = run_info.get('machine_type', None)
 
     if not preserve_queue:
         remove_beanstalk_jobs(run_name, machine_type)
         remove_paddles_jobs(run_name)
-    if kill_processes(run_name, run_info.get('pids')):
+    pids = []
+    for job in run_info['jobs']:
+        if pid := job.get('pid'):
+            pids.append(int(pid))
+    if kill_processes(run_name, pids):
         return
     if owner is not None:
         targets = find_targets(run_name)
@@ -79,7 +70,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
     report.try_mark_run_dead(run_name)
 
 
-def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
+def kill_job(run_name, job_id, owner=None, skip_unlock=False):
     job_info = report.ResultsReporter().get_jobs(run_name, job_id)
     if not owner:
         if 'owner' not in job_info:
@@ -111,34 +102,6 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False)
         lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id)
 
 
-def find_run_info(serializer, run_name):
-    log.info("Assembling run information...")
-    run_info_fields = [
-        'machine_type',
-        'owner',
-    ]
-
-    pids = []
-    run_info = {}
-    job_info = {}
-    job_num = 0
-    jobs = serializer.jobs_for_run(run_name)
-    job_total = len(jobs)
-    for (job_id, job_dir) in jobs.items():
-        if not os.path.isdir(job_dir):
-            continue
-        job_num += 1
-        beanstalk.print_progress(job_num, job_total, 'Reading Job: ')
-        job_info = serializer.job_info(run_name, job_id, simple=True)
-        for key in job_info.keys():
-            if key in run_info_fields and key not in run_info:
-                run_info[key] = job_info[key]
-        if 'pid' in job_info:
-            pids.append(job_info['pid'])
-    run_info['pids'] = pids
-    return run_info
-
-
 def remove_paddles_jobs(run_name):
     jobs = report.ResultsReporter().get_jobs(run_name, fields=['status'])
     job_ids = [job['job_id'] for job in jobs if job['status'] == 'queued']
@@ -229,7 +192,7 @@ def kill_processes(run_name, pids=None, job_id=None):
 def process_matches_run(pid, run_name):
     try:
         p = psutil.Process(pid)
-        cmd = p.cmdline()
+        cmd = ' '.join(p.cmdline())
         if run_name in cmd and sys.argv[0] not in cmd:
             return True
     except psutil.NoSuchProcess: