From: Zack Cerza Date: Thu, 15 Jan 2026 19:20:16 +0000 (-0700) Subject: kill: Handle supervisor procs when killing runs X-Git-Tag: 1.2.3~5^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F2130%2Fhead;p=teuthology.git kill: Handle supervisor procs when killing runs This is a follow-up to ff615aae541032c647e78d3959d368f595c93e31, which only handled killing individual jobs. Since we're using the results server for all run and job metadata, we can drop all mentions of the archive. This change is necessary since we've restricted access to the archive from the teuthology machine for normal users, to avoid resource contention. Signed-off-by: Zack Cerza --- diff --git a/scripts/kill.py b/scripts/kill.py index 31acc8b1a..2e6ea0bdf 100644 --- a/scripts/kill.py +++ b/scripts/kill.py @@ -5,10 +5,10 @@ import teuthology.kill doc = """ usage: teuthology-kill -h - teuthology-kill [-a ARCHIVE] [-p] -r RUN - teuthology-kill [-a ARCHIVE] [-p] -m MACHINE_TYPE -r RUN - teuthology-kill [-a ARCHIVE] [-o OWNER] -r RUN -j JOB ... - teuthology-kill [-a ARCHIVE] [-o OWNER] -J JOBSPEC + teuthology-kill [-p] -r RUN + teuthology-kill [-p] -m MACHINE_TYPE -r RUN + teuthology-kill [-o OWNER] -r RUN -j JOB ... + teuthology-kill [-o OWNER] -J JOBSPEC teuthology-kill [-p] -o OWNER -m MACHINE_TYPE -r RUN Kill running teuthology jobs: @@ -21,9 +21,6 @@ processes. optional arguments: -h, --help show this help message and exit - -a ARCHIVE, --archive ARCHIVE - The base archive directory - [default: {archive_base}] -p, --preserve-queue Preserve the queue - do not delete queued jobs -r, --run RUN The name(s) of the run(s) to kill -j, --job JOB The job_id of the job to kill @@ -36,7 +33,7 @@ optional arguments: The type of machine the job(s) are running on. This is required if killing a job that is still entirely in the queue. -""".format(archive_base=teuthology.config.config.archive_base) +""" def main(): diff --git a/teuthology/kill.py b/teuthology/kill.py index d2b5ea911..08abe0db4 100755 --- a/teuthology/kill.py +++ b/teuthology/kill.py @@ -1,5 +1,4 @@ #!/usr/bin/python -import os import sys import yaml import psutil @@ -23,7 +22,6 @@ def main(args): run_name = args['--run'] job = args['--job'] jobspec = args['--jobspec'] - archive_base = args['--archive'] owner = args['--owner'] machine_type = args['--machine-type'] preserve_queue = args['--preserve-queue'] @@ -35,42 +33,35 @@ def main(args): if job: for job_id in job: - kill_job(run_name, job_id, archive_base, owner) + kill_job( + run_name, + job_id, + owner + ) else: - kill_run(run_name, archive_base, owner, machine_type, - preserve_queue=preserve_queue) + kill_run( + run_name, + owner, + machine_type, + preserve_queue=preserve_queue, + ) -def kill_run(run_name, archive_base=None, owner=None, machine_type=None, +def kill_run(run_name, owner=None, machine_type=None, preserve_queue=False): - run_info = {} - serializer = report.ResultsSerializer(archive_base) - if archive_base: - run_archive_dir = os.path.join(archive_base, run_name) - if os.path.isdir(run_archive_dir): - run_info = find_run_info(serializer, run_name) - if 'machine_type' in run_info: - machine_type = run_info['machine_type'] - owner = run_info['owner'] - else: - log.warning("The run info does not have machine type: %s" % run_info) - log.warning("Run archive used: %s" % run_archive_dir) - log.info("Using machine type '%s' and owner '%s'" % (machine_type, owner)) - elif machine_type is None: - # no jobs found in archive and no machine type specified, - # so we try paddles to see if there is anything scheduled - run_info = report.ResultsReporter().get_run(run_name) - machine_type = run_info.get('machine_type', None) - if machine_type: - log.info(f"Using machine type '{machine_type}' received from paddles.") - else: - raise RuntimeError(f"Cannot find machine type for the run {run_name}; " + - "you must also pass --machine-type") + run_info = report.ResultsReporter().get_run(run_name) + # run: machine_type, owner + # job: pid, id + machine_type = run_info.get('machine_type', None) if not preserve_queue: remove_beanstalk_jobs(run_name, machine_type) remove_paddles_jobs(run_name) - if kill_processes(run_name, run_info.get('pids')): + pids = [] + for job in run_info['jobs']: + if pid := job.get('pid'): + pids.append(int(pid)) + if kill_processes(run_name, pids): return if owner is not None: targets = find_targets(run_name) @@ -79,7 +70,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None, report.try_mark_run_dead(run_name) -def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False): +def kill_job(run_name, job_id, owner=None, skip_unlock=False): job_info = report.ResultsReporter().get_jobs(run_name, job_id) if not owner: if 'owner' not in job_info: @@ -111,34 +102,6 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False) lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id) -def find_run_info(serializer, run_name): - log.info("Assembling run information...") - run_info_fields = [ - 'machine_type', - 'owner', - ] - - pids = [] - run_info = {} - job_info = {} - job_num = 0 - jobs = serializer.jobs_for_run(run_name) - job_total = len(jobs) - for (job_id, job_dir) in jobs.items(): - if not os.path.isdir(job_dir): - continue - job_num += 1 - beanstalk.print_progress(job_num, job_total, 'Reading Job: ') - job_info = serializer.job_info(run_name, job_id, simple=True) - for key in job_info.keys(): - if key in run_info_fields and key not in run_info: - run_info[key] = job_info[key] - if 'pid' in job_info: - pids.append(job_info['pid']) - run_info['pids'] = pids - return run_info - - def remove_paddles_jobs(run_name): jobs = report.ResultsReporter().get_jobs(run_name, fields=['status']) job_ids = [job['job_id'] for job in jobs if job['status'] == 'queued'] @@ -229,7 +192,7 @@ def kill_processes(run_name, pids=None, job_id=None): def process_matches_run(pid, run_name): try: p = psutil.Process(pid) - cmd = p.cmdline() + cmd = ' '.join(p.cmdline()) if run_name in cmd and sys.argv[0] not in cmd: return True except psutil.NoSuchProcess: