From f4383d9bdf3a2cb78e72865ce87a9f77186ca772 Mon Sep 17 00:00:00 2001
From: Zack Cerza
Date: Wed, 20 Oct 2021 12:51:30 -0600
Subject: [PATCH] supervisor: Don't unlock nodes w/ bad description

Very rarely, we enter a situation where nodes get used by two jobs
simultaneously. We can break this cycle if jobs refuse to unlock a node
that is locked by a different job. This will not entirely prevent the
problem, but it will keep it from perpetuating itself.

Signed-off-by: Zack Cerza
---
 teuthology/dispatcher/supervisor.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py
index 8278fe1ed5..336578230f 100644
--- a/teuthology/dispatcher/supervisor.py
+++ b/teuthology/dispatcher/supervisor.py
@@ -175,10 +175,21 @@ def reimage(job_config):
 def unlock_targets(job_config):
     serializer = report.ResultsSerializer(teuth_config.archive_base)
     job_info = serializer.job_info(job_config['name'], job_config['job_id'])
-    machine_status = query.get_statuses(job_info['targets'].keys())
-    # only unlock/nuke targets if locked in the first place
-    locked = [shortname(_['name'])
-              for _ in machine_status if _['locked']]
+    machine_statuses = query.get_statuses(job_info['targets'].keys())
+    # only unlock/nuke targets if locked and description matches
+    locked = []
+    for status in machine_statuses:
+        name = shortname(status['name'])
+        description = status['description']
+        if not status['locked']:
+            continue
+        if description != job_info['archive_path']:
+            log.warning(
+                "Was going to unlock %s but it was locked by another job: %s",
+                name, description
+            )
+            continue
+        locked.append(name)
     if not locked:
         return
     job_status = get_status(job_info)
-- 
2.39.5
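
Reviewer note, not part of the patch: below is a minimal, self-contained
sketch of the guard this change adds. The helper name select_unlockable,
the node names, and the statuses standing in for query.get_statuses()
output are all hypothetical; it only illustrates which nodes would still
be unlocked under the new check.

    import logging

    log = logging.getLogger(__name__)

    def shortname(name):
        # Hypothetical stand-in for the real helper: keep only the host part.
        return name.split('.')[0]

    def select_unlockable(machine_statuses, archive_path):
        """Return shortnames of nodes this job may unlock: the node must be
        locked, and its lock description must match our archive_path."""
        locked = []
        for status in machine_statuses:
            name = shortname(status['name'])
            description = status['description']
            if not status['locked']:
                continue
            if description != archive_path:
                log.warning(
                    "Was going to unlock %s but it was locked by another job: %s",
                    name, description
                )
                continue
            locked.append(name)
        return locked

    # Hypothetical statuses: only smithi001's description matches this job.
    statuses = [
        {'name': 'smithi001.front.sepia.ceph.com', 'locked': True,
         'description': '/archive/this-run/1'},
        {'name': 'smithi002.front.sepia.ceph.com', 'locked': True,
         'description': '/archive/other-run/2'},
        {'name': 'smithi003.front.sepia.ceph.com', 'locked': False,
         'description': None},
    ]
    print(select_unlockable(statuses, '/archive/this-run/1'))
    # -> ['smithi001']; smithi002 is skipped with a warning, smithi003 is not locked.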