git.apps.os.sepia.ceph.com Git - teuthology.git/commitdiff
supervisor: Don't unlock nodes w/ bad description 1688/head
author Zack Cerza <zack@redhat.com>
Wed, 20 Oct 2021 18:51:30 +0000 (12:51 -0600)
committer Zack Cerza <zack@redhat.com>
Wed, 20 Oct 2021 18:51:30 +0000 (12:51 -0600)
Very rarely, we enter a situation where nodes get used by two jobs
simultaneously. We can break this cycle if jobs refuse to unlock a
node that is locked by a different job.

This will not entirely prevent the problem, but it will keep it from
perpetuating itself.

Signed-off-by: Zack Cerza <zack@redhat.com>
teuthology/dispatcher/supervisor.py

index 8278fe1ed58e4267d7224852c2d16c87b3961727..336578230fc3a692e82c4c06a5589c4d23c7d4e4 100644 (file)
@@ -175,10 +175,20 @@ def reimage(job_config):
 def unlock_targets(job_config):
     serializer = report.ResultsSerializer(teuth_config.archive_base)
     job_info = serializer.job_info(job_config['name'], job_config['job_id'])
-    machine_status = query.get_statuses(job_info['targets'].keys())
-    # only unlock/nuke targets if locked in the first place
-    locked = [shortname(_['name'])
-              for _ in machine_status if _['locked']]
+    machine_statuses = query.get_statuses(job_info['targets'].keys())
+    # only unlock/nuke targets if locked and description matches
+    locked = []
+    for status in machine_statuses:
+        name = shortname(status['name'])
+        description = status['description']
+        if not status['locked']:
+            continue
+        if description != job_info['archive_path']:
+            log.warning(
+                "Was going to unlock %s but it was locked by another job: %s",
+                name, description
+            )
+        locked.append(name)
     if not locked:
         return
     job_status = get_status(job_info)