From b74f66b877144b920c368920ce3a9810e5c1e341 Mon Sep 17 00:00:00 2001 From: David Galloway Date: Sat, 13 Dec 2025 10:14:21 -0500 Subject: [PATCH] maas: Retry system ID lookups This is really the only API call I've seen consistently not return what we expected. Signed-off-by: David Galloway --- teuthology/provision/maas.py | 52 ++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/teuthology/provision/maas.py b/teuthology/provision/maas.py index d2b3ed2ef..db97b10e7 100644 --- a/teuthology/provision/maas.py +++ b/teuthology/provision/maas.py @@ -1,7 +1,9 @@ import io import json import logging +import time import operator +import random import re from oauthlib.oauth1 import SIGNATURE_PLAINTEXT @@ -208,17 +210,45 @@ class MAAS(object): :returns: The machine data as a dictionary """ - resp = self.do_request( - "/machines/", params={"hostname": self.shortname} - ).json() - if len(resp) == 0: - raise RuntimeError(f"Machine '{self.shortname}' not found!") - if len(resp) > 1: - hostnames = ", ".join([m.get("hostname", "") for m in resp]) - raise RuntimeError( - f"More than one machine found for hostname '{self.shortname}': {hostnames}" - ) - return resp[0] + tries = 3 + base_sleep = 0.5 + + for attempt in range(tries): + resp_obj = self.do_request("/machines/", params={"hostname": self.shortname}) + # Defensive: ensure we really got JSON list + try: + resp = resp_obj.json() + except Exception as e: + # definitely transient / bad gateway / overload etc. + if attempt == tries - 1: + raise + time.sleep(base_sleep * (2 ** attempt) + random.random() * 0.2) + continue + + if isinstance(resp, list) and len(resp) == 1: + return resp[0] + + if isinstance(resp, list) and len(resp) > 1: + hostnames = ", ".join([m.get("hostname", "") for m in resp]) + raise RuntimeError( + f"More than one machine found for hostname '{self.shortname}': {hostnames}" + ) + + # Empty list: could be real “not found” OR MAAS is overloaded. + if isinstance(resp, list) and len(resp) == 0: + if attempt < tries - 1: + time.sleep(base_sleep * (2 ** attempt) + random.random() * 0.2) + continue + raise RuntimeError( + f"Machine '{self.shortname}' not found after {tries} attempts " + f"(MAAS may be overloaded)." + ) + + # Unexpected JSON type (dict/str/etc). Treat as transient-ish. + if attempt < tries - 1: + time.sleep(base_sleep * (2 ** attempt) + random.random() * 0.2) + continue + raise RuntimeError(f"Unexpected MAAS response for '{self.shortname}': {type(resp)} {resp}") def get_image_data(self) -> Dict[str, Any]: """Locate the image we want to use -- 2.47.3