kill: Handle supervisor procs when killing runs

author Zack Cerza <zack@cerza.org>

Thu, 15 Jan 2026 19:20:16 +0000 (12:20 -0700)

committer Zack Cerza <zack@cerza.org>

Thu, 15 Jan 2026 20:48:12 +0000 (13:48 -0700)
author Zack Cerza <zack@cerza.org>
Thu, 15 Jan 2026 19:20:16 +0000 (12:20 -0700)
committer Zack Cerza <zack@cerza.org>
Thu, 15 Jan 2026 20:48:12 +0000 (13:48 -0700)
diff --git a/scripts/kill.py b/scripts/kill.py

index 31acc8b1a4a262741518a80064ac33a8f72c8853..2e6ea0bdfc19fb3c3e96ce24efb37724ad5520ff 100644 (file)
--- a/scripts/kill.py
+++ b/scripts/kill.py
@@ -5,10 +5,10 @@ import teuthology.kill
  
  doc = """
  usage: teuthology-kill -h
-       teuthology-kill [-a ARCHIVE] [-p] -r RUN
-       teuthology-kill [-a ARCHIVE] [-p] -m MACHINE_TYPE -r RUN
-       teuthology-kill [-a ARCHIVE] [-o OWNER] -r RUN -j JOB ...
-       teuthology-kill [-a ARCHIVE] [-o OWNER] -J JOBSPEC
+       teuthology-kill [-p] -r RUN
+       teuthology-kill [-p] -m MACHINE_TYPE -r RUN
+       teuthology-kill [-o OWNER] -r RUN -j JOB ...
+       teuthology-kill [-o OWNER] -J JOBSPEC
         teuthology-kill [-p] -o OWNER -m MACHINE_TYPE -r RUN
  
  Kill running teuthology jobs:
@@ -21,9 +21,6 @@ processes.
  
  optional arguments:
    -h, --help            show this help message and exit
-  -a ARCHIVE, --archive ARCHIVE
-                        The base archive directory
-                        [default: {archive_base}]
    -p, --preserve-queue  Preserve the queue - do not delete queued jobs
    -r, --run RUN         The name(s) of the run(s) to kill
    -j, --job JOB         The job_id of the job to kill
@@ -36,7 +33,7 @@ optional arguments:
                          The type of machine the job(s) are running on.
                          This is required if killing a job that is still
                          entirely in the queue.
-""".format(archive_base=teuthology.config.config.archive_base)
+"""
  
  
  def main():
diff --git a/teuthology/kill.py b/teuthology/kill.py

index d2b5ea91147410dc18cfc026d3a2ba983f4e6533..08abe0db4cafee1a586b81c74620213d2dd7f92d 100755 (executable)
--- a/teuthology/kill.py
+++ b/teuthology/kill.py
@@ -1,5 +1,4 @@
  #!/usr/bin/python
-import os
  import sys
  import yaml
  import psutil
@@ -23,7 +22,6 @@ def main(args):
      run_name = args['--run']
      job = args['--job']
      jobspec = args['--jobspec']
-    archive_base = args['--archive']
      owner = args['--owner']
      machine_type = args['--machine-type']
      preserve_queue = args['--preserve-queue']
@@ -35,42 +33,35 @@ def main(args):
  
      if job:
          for job_id in job:
-            kill_job(run_name, job_id, archive_base, owner)
+            kill_job(
+                run_name,
+                job_id,
+                owner
+            )
      else:
-        kill_run(run_name, archive_base, owner, machine_type,
-                 preserve_queue=preserve_queue)
+        kill_run(
+            run_name,
+            owner,
+            machine_type,
+            preserve_queue=preserve_queue,
+        )
  
  
-def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
+def kill_run(run_name, owner=None, machine_type=None,
               preserve_queue=False):
-    run_info = {}
-    serializer = report.ResultsSerializer(archive_base)
-    if archive_base:
-        run_archive_dir = os.path.join(archive_base, run_name)
-        if os.path.isdir(run_archive_dir):
-            run_info = find_run_info(serializer, run_name)
-            if 'machine_type' in run_info:
-                machine_type = run_info['machine_type']
-                owner = run_info['owner']
-            else:
-                log.warning("The run info does not have machine type: %s" % run_info)
-                log.warning("Run archive used: %s" % run_archive_dir)
-                log.info("Using machine type '%s' and owner '%s'" % (machine_type, owner))
-        elif machine_type is None:
-            # no jobs found in archive and no machine type specified,
-            # so we try paddles to see if there is anything scheduled
-            run_info = report.ResultsReporter().get_run(run_name)
-            machine_type = run_info.get('machine_type', None)
-            if machine_type:
-                log.info(f"Using machine type '{machine_type}' received from paddles.")
-            else:
-                raise RuntimeError(f"Cannot find machine type for the run {run_name}; " +
-                                    "you must also pass --machine-type")
+    run_info = report.ResultsReporter().get_run(run_name)
+    # run: machine_type, owner
+    # job: pid, id
+    machine_type = run_info.get('machine_type', None)
  
      if not preserve_queue:
          remove_beanstalk_jobs(run_name, machine_type)
          remove_paddles_jobs(run_name)
-    if kill_processes(run_name, run_info.get('pids')):
+    pids = []
+    for job in run_info['jobs']:
+        if pid := job.get('pid'):
+            pids.append(int(pid))
+    if kill_processes(run_name, pids):
          return
      if owner is not None:
          targets = find_targets(run_name)
@@ -79,7 +70,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
      report.try_mark_run_dead(run_name)
  
  
-def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
+def kill_job(run_name, job_id, owner=None, skip_unlock=False):
      job_info = report.ResultsReporter().get_jobs(run_name, job_id)
      if not owner:
          if 'owner' not in job_info:
@@ -111,34 +102,6 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False)
          lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id)
  
  
-def find_run_info(serializer, run_name):
-    log.info("Assembling run information...")
-    run_info_fields = [
-        'machine_type',
-        'owner',
-    ]
-
-    pids = []
-    run_info = {}
-    job_info = {}
-    job_num = 0
-    jobs = serializer.jobs_for_run(run_name)
-    job_total = len(jobs)
-    for (job_id, job_dir) in jobs.items():
-        if not os.path.isdir(job_dir):
-            continue
-        job_num += 1
-        beanstalk.print_progress(job_num, job_total, 'Reading Job: ')
-        job_info = serializer.job_info(run_name, job_id, simple=True)
-        for key in job_info.keys():
-            if key in run_info_fields and key not in run_info:
-                run_info[key] = job_info[key]
-        if 'pid' in job_info:
-            pids.append(job_info['pid'])
-    run_info['pids'] = pids
-    return run_info
-
-
  def remove_paddles_jobs(run_name):
      jobs = report.ResultsReporter().get_jobs(run_name, fields=['status'])
      job_ids = [job['job_id'] for job in jobs if job['status'] == 'queued']
@@ -229,7 +192,7 @@ def kill_processes(run_name, pids=None, job_id=None):
  def process_matches_run(pid, run_name):
      try:
          p = psutil.Process(pid)
-        cmd = p.cmdline()
+        cmd = ' '.join(p.cmdline())
          if run_name in cmd and sys.argv[0] not in cmd:
              return True
      except psutil.NoSuchProcess:
author	Zack Cerza <zack@cerza.org>
	Thu, 15 Jan 2026 19:20:16 +0000 (12:20 -0700)
committer	Zack Cerza <zack@cerza.org>
	Thu, 15 Jan 2026 20:48:12 +0000 (13:48 -0700)
scripts/kill.py		patch \| blob \| history
teuthology/kill.py		patch \| blob \| history