From: Zack Cerza
Date: Wed, 20 Oct 2021 18:51:30 +0000 (-0600)
Subject: supervisor: Don't unlock nodes w/ bad description
X-Git-Tag: 1.2.0~228^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f4383d9bdf3a2cb78e72865ce87a9f77186ca772;p=teuthology.git

supervisor: Don't unlock nodes w/ bad description

Very rarely, we enter a situation where nodes get used by two jobs
simultaneously. We can break this cycle if jobs refuse to unlock a node
that is locked by a different job. This will not entirely prevent the
problem, but it will keep it from perpetuating itself.

Signed-off-by: Zack Cerza
---

diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py
index 8278fe1ed..336578230 100644
--- a/teuthology/dispatcher/supervisor.py
+++ b/teuthology/dispatcher/supervisor.py
@@ -175,10 +175,23 @@ def reimage(job_config):
 def unlock_targets(job_config):
     serializer = report.ResultsSerializer(teuth_config.archive_base)
     job_info = serializer.job_info(job_config['name'], job_config['job_id'])
-    machine_status = query.get_statuses(job_info['targets'].keys())
-    # only unlock/nuke targets if locked in the first place
-    locked = [shortname(_['name'])
-              for _ in machine_status if _['locked']]
+    machine_statuses = query.get_statuses(job_info['targets'].keys())
+    # only unlock/nuke targets if locked and description matches
+    locked = []
+    for status in machine_statuses:
+        name = shortname(status['name'])
+        description = status['description']
+        if not status['locked']:
+            continue
+        if description != job_info['archive_path']:
+            log.warning(
+                "Was going to unlock %s but it was locked by another job: %s",
+                name, description
+            )
+            # Skip it: this job must not unlock a node whose lock
+            # description points at a different job's archive path.
+            continue
+        locked.append(name)
     if not locked:
         return
     job_status = get_status(job_info)