git-server-git.apps.pok.os.sepia.ceph.com Git - teuthology.git/commitdiff
fog: Try ipmitool off&on if stuck in a reimage reboot hang 2146/head
author David Galloway <david.galloway@ibm.com>
Thu, 12 Feb 2026 20:44:05 +0000 (15:44 -0500)
committer David Galloway <david.galloway@ibm.com>
Thu, 12 Feb 2026 23:16:12 +0000 (18:16 -0500)
Fixes: https://tracker.ceph.com/issues/74717
Signed-off-by: David Galloway <david.galloway@ibm.com>
teuthology/provision/fog.py

index 101da2464d3b4581d25f93288e09a002b1966a6f..bd7256b7ed22a36721cddf483e5596004ac9dba5 100644 (file)
@@ -281,9 +281,36 @@ class FOG(object):
         resp.raise_for_status()
 
     def _wait_for_ready(self):
-        """ Attempt to connect to the machine via SSH """
-        with safe_while(sleep=6, timeout=config.fog_wait_for_ssh_timeout) as proceed:
+        """
+        Attempt to connect to the machine via SSH (power cycle once at 50% of timeout).
+        """
+
+        total_timeout = config.fog_wait_for_ssh_timeout
+        ipmi_cycle_after_seconds = total_timeout * 0.5
+
+        start = datetime.datetime.now(datetime.timezone.utc)
+        ipmi_cycle_sent = False
+
+        with safe_while(sleep=6, timeout=total_timeout) as proceed:
             while proceed():
+                now = datetime.datetime.now(datetime.timezone.utc)
+                elapsed = (now - start).total_seconds()
+
+                # ipmitool power cycle once at 50% of timeout
+                if not ipmi_cycle_sent and elapsed >= ipmi_cycle_after_seconds:
+                    ipmi_cycle_sent = True
+                    self.log.warning(
+                        f"{self.shortname}: SSH not up after {int(elapsed)}s "
+                        f"(~50% of timeout); power cycling and continuing to wait"
+                    )
+                    try:
+                        self.remote.console.power_off()
+                        self.remote.console.power_on()
+                    except Exception as e:
+                        self.log.error(
+                            f"{self.shortname}: power cycle failed but continuing: {e}"
+                        )
+
                 try:
                     self.remote.connect()
                     break