From 61b34a666790b2ed518e061f746e4fb46b6619f9 Mon Sep 17 00:00:00 2001
From: David Galloway
Date: Thu, 12 Feb 2026 15:44:05 -0500
Subject: [PATCH] fog: Try ipmitool off&on if stuck in a reimage reboot hang

Fixes: https://tracker.ceph.com/issues/74717
Signed-off-by: David Galloway
---
 teuthology/provision/fog.py | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/teuthology/provision/fog.py b/teuthology/provision/fog.py
index 101da2464..bd7256b7e 100644
--- a/teuthology/provision/fog.py
+++ b/teuthology/provision/fog.py
@@ -281,9 +281,36 @@ class FOG(object):
         resp.raise_for_status()
 
     def _wait_for_ready(self):
-        """ Attempt to connect to the machine via SSH """
-        with safe_while(sleep=6, timeout=config.fog_wait_for_ssh_timeout) as proceed:
+        """
+        Attempt to connect to the machine via SSH (power cycle once at 50% of timeout).
+        """
+
+        total_timeout = config.fog_wait_for_ssh_timeout
+        ipmi_cycle_after_seconds = total_timeout * 0.5
+
+        start = datetime.datetime.now(datetime.timezone.utc)
+        ipmi_cycle_sent = False
+
+        with safe_while(sleep=6, timeout=total_timeout) as proceed:
             while proceed():
+                now = datetime.datetime.now(datetime.timezone.utc)
+                elapsed = (now - start).total_seconds()
+
+                # ipmitool power cycle once at 50% of timeout
+                if not ipmi_cycle_sent and elapsed >= ipmi_cycle_after_seconds:
+                    ipmi_cycle_sent = True
+                    self.log.warning(
+                        f"{self.shortname}: SSH not up after {int(elapsed)}s "
+                        f"(~50% of timeout); power cycling and continuing to wait"
+                    )
+                    try:
+                        self.remote.console.power_off()
+                        self.remote.console.power_on()
+                    except Exception as e:
+                        self.log.error(
+                            f"{self.shortname}: power cycle failed but continuing: {e}"
+                        )
+
                 try:
                     self.remote.connect()
                     break
-- 
2.47.3