From: David Galloway
Date: Thu, 12 Feb 2026 20:44:05 +0000 (-0500)
Subject: fog: Try ipmitool off&on if stuck in a reimage reboot hang
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=61b34a666790b2ed518e061f746e4fb46b6619f9;p=teuthology.git

fog: Try ipmitool off&on if stuck in a reimage reboot hang

Fixes: https://tracker.ceph.com/issues/74717
Signed-off-by: David Galloway
---

diff --git a/teuthology/provision/fog.py b/teuthology/provision/fog.py
index 101da2464..bd7256b7e 100644
--- a/teuthology/provision/fog.py
+++ b/teuthology/provision/fog.py
@@ -281,9 +281,36 @@ class FOG(object):
         resp.raise_for_status()
 
     def _wait_for_ready(self):
-        """ Attempt to connect to the machine via SSH """
-        with safe_while(sleep=6, timeout=config.fog_wait_for_ssh_timeout) as proceed:
+        """
+        Attempt to connect to the machine via SSH (power cycle once at 50% of timeout).
+        """
+
+        total_timeout = config.fog_wait_for_ssh_timeout
+        ipmi_cycle_after_seconds = total_timeout * 0.5
+
+        start = datetime.datetime.now(datetime.timezone.utc)
+        ipmi_cycle_sent = False
+
+        with safe_while(sleep=6, timeout=total_timeout) as proceed:
             while proceed():
+                now = datetime.datetime.now(datetime.timezone.utc)
+                elapsed = (now - start).total_seconds()
+
+                # ipmitool power cycle once at 50% of timeout
+                if not ipmi_cycle_sent and elapsed >= ipmi_cycle_after_seconds:
+                    ipmi_cycle_sent = True
+                    self.log.warning(
+                        f"{self.shortname}: SSH not up after {int(elapsed)}s "
+                        f"(~50% of timeout); power cycling and continuing to wait"
+                    )
+                    try:
+                        self.remote.console.power_off()
+                        self.remote.console.power_on()
+                    except Exception as e:
+                        self.log.error(
+                            f"{self.shortname}: power cycle failed but continuing: {e}"
+                        )
+
                 try:
                     self.remote.connect()
                     break