qa: Improve failure reporting for watchdog timeouts (wip_tentacle_stat_mismatch_logging_invalidate_only)
author Bill Scales <bill_scales@uk.ibm.com>
Thu, 14 Aug 2025 12:02:49 +0000 (13:02 +0100)
committer Jon <jonathan.bailey1@ibm.com>
Fri, 3 Oct 2025 13:31:28 +0000 (14:31 +0100)
A daemon such as an osd or mon crashing can lead to teuthology tests
failing; these failures are detected by the daemon watchdog. Currently
the failure message just indicates that a watchdog timeout occurred.

Add an AssertTracker class that processes the stderr output from
daemon processes, taking inspiration from teuthology's scrape.py to
capture the assert string and backtrace if the daemon crashes.
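
For illustration, here is a minimal, self-contained sketch of the
parsing idea (not the patch itself; the sample stderr lines are
illustrative only, chosen to match the markers the tracker keys off:
the assert message, the " ceph version" line that opens a backtrace,
and the clone() frame that ends it):

    import io

    class MiniAssertTracker(io.StringIO):
        """Toy version of the AssertTracker added in this patch."""
        def __init__(self):
            super().__init__()
            self.assertion = None    # last assert/abort line seen
            self.bt_lines = []       # backtrace being accumulated
            self.in_bt = False
            self.backtrace = None    # final assertion + backtrace text

        def write(self, line):
            if "FAILED assert" in line or "__ceph_assert_fail" in line:
                self.assertion = line.strip()
            if line.startswith(" ceph version"):
                # a backtrace starts with the ceph version banner
                self.bt_lines = [line]
                self.in_bt = True
            elif self.in_bt and "clone()" in line:
                # the clone() frame marks the end of the backtrace
                if self.assertion is not None:
                    self.bt_lines.insert(0, self.assertion + "\n")
                self.backtrace = "".join(self.bt_lines).strip()
                self.in_bt = False
            elif self.in_bt:
                self.bt_lines.append(line)
            return len(line)

    # Illustrative stderr lines (format only, not from a real run):
    t = MiniAssertTracker()
    for l in ['osd.0: FAILED assert(0 == "example")\n',
              " ceph version 20.0.0 (hypothetical)\n",
              " 1: (ceph::__ceph_assert_fail(...)+0x12c) [0x5600deadbeef]\n",
              " 2: clone()\n"]:
        t.write(l)
    print(t.backtrace)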

Enhance the watchdog to report the assertion and backtrace if a
daemon has crashed, so this gets reported as the teuthology job
failure reason.
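
The resulting failure reason is roughly "Daemon X has timed out"
followed by "First crash: Daemon Y at <time>" and the captured
assertion/backtrace. A condensed sketch of the selection logic (the
helper name and arguments are invented for illustration; the real
code lives in daemonwatchdog.py below):

    import time

    def build_bark_reason(timed_out_names, trackers):
        """Pick the earliest recorded crash among the trackers and summarise it."""
        reasons = [f"Daemon {n} has timed out" for n in timed_out_names]
        earliest = None
        for t in trackers:
            if t.get_exception() is None:
                continue
            if earliest is None or t.get_failure_time() < earliest.get_failure_time():
                earliest = t
        if earliest is not None:
            reasons.append(f"First crash: Daemon {earliest.role}.{earliest.id_} "
                           f"at {time.ctime(earliest.get_failure_time())}")
            reasons.append(str(earliest.get_exception()))
        return "\n".join(reasons)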

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
qa/tasks/ceph.py
qa/tasks/daemonwatchdog.py

index ea860a135f9911c6f40188cdf4a7afee0889dc21..c566cafe2d9b8f94a8c7449f011d7d83c25d2645 100644 (file)
@@ -778,6 +778,7 @@ def cluster(ctx, config):
     ctx.ceph[cluster_name] = argparse.Namespace()
     ctx.ceph[cluster_name].conf = conf
     ctx.ceph[cluster_name].mons = mons
+    ctx.ceph[cluster_name].assert_trackers = []
 
     default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
     keyring_path = config.get('keyring_path', default_keyring)
@@ -1403,6 +1404,69 @@ def run_daemon(ctx, config, type_):
     :param config: Configuration
     :param type_: Role type
     """
+    class AssertTracker(StringIO):
+        MAX_BT_LINES = 100
+        DISPLAY_BT_LINES = 6
+
+        def __init__(self, cluster_name, type_, id_, logger):
+            super(AssertTracker, self).__init__()
+            self.role = cluster_name + '.' + type_
+            self.id_ = id_
+            self.bt_lines = []
+            self.assertion = None
+            self.backtrace = None
+            self.in_bt = False
+            self.logger = logger
+            self.failure_time = None
+
+        def write(self, line):
+            if "FAILED assert" in line or "__ceph_assert_fail" in line:
+                self.assertion = line.strip()
+            if line.startswith(" ceph version"):
+                # The start of a backtrace!
+                self.bt_lines = [line]
+                self.in_bt = True
+                self.failure_time = time.time()
+            elif line.startswith(" NOTE: a copy of the executable") or \
+                 line.strip().endswith("clone()") or \
+                 "clone()+0x" in line:
+                # The backtrace terminated
+                if len(self.bt_lines):
+                    if self.assertion is not None:
+                        self.bt_lines.insert(0, self.assertion)
+                        self.assertion = None
+                    # Limit number of lines of backtrace in the failure reason
+                    self.backtrace = ("".join(self.bt_lines[:self.DISPLAY_BT_LINES])).strip()
+                else:
+                    self.logger.warning("Saw end of backtrace but not start")
+                self.in_bt = False
+            elif self.in_bt:
+                # We're in a backtrace, push the line onto the list
+                if len(self.bt_lines) > self.MAX_BT_LINES:
+                    self.logger.warning("Failed to capture backtrace")
+                    self.bt_lines = ["Failed to capture backtrace"]
+                    self.in_bt = False
+                else:
+                    # Backtrace filtering - skip lines for libraries, but include
+                    # libraries with ceph or rados in their path or name
+                    if "ceph" in line or "rados" in line or not re.match(r" [0-9]+: /.*",line):
+                        # Truncate long lines
+                        line = re.sub(r"([0-9]+: .{80}).*(\+0x[0-9a-fA-F]*\) \[0x[0-9a-fA-F]*\])",r"\1 ... \2",line)
+                        self.bt_lines.append(line)
+
+        def get_exception(self):
+            if self.backtrace is not None:
+                return Exception(self.backtrace)
+            if self.assertion is not None:
+                return Exception(self.assertion)
+            return None
+
+        def get_failure_time(self):
+            return self.failure_time
+
+        def match_id(self, role, id_):
+            return self.role == role and self.id_ == id_
+
     cluster_name = config['cluster']
     log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
     testdir = teuthology.get_testdir(ctx)
@@ -1503,13 +1567,17 @@ def run_daemon(ctx, config, type_):
                 maybe_redirect_stderr(config, type_, run_cmd, log_path)
             if create_log_cmd:
                 remote.sh(create_log_cmd)
+
             # always register mgr; don't necessarily start
+            asserttracker = AssertTracker(cluster_name, type_, id_, log.getChild(role))
+            ctx.ceph[cluster_name].assert_trackers.append(asserttracker)
             ctx.daemons.register_daemon(
                 remote, type_, id_,
                 cluster=cluster_name,
                 args=run_cmd,
                 logger=log.getChild(role),
                 stdin=run.PIPE,
+                stderr=asserttracker,
                 wait=False
             )
             if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
index 7003a6b798598ea4950e979a160b002babb4437c..c81e2d3300add4c6d0e6a356c55f53687a4b1213 100644 (file)
@@ -1,6 +1,7 @@
 import logging
 import signal
 import time
+import traceback
 
 from gevent import sleep
 from gevent.greenlet import Greenlet
@@ -15,7 +16,7 @@ class BarkError(Exception):
     """
 
     def __init__(self, bark_reason):
-        self.message = f"The WatchDog barked due to {bark_reason}"
+        self.message = f"Watchdog timeout: {bark_reason}"
         super().__init__(self.message)
 
 class DaemonWatchdog(Greenlet):
@@ -33,8 +34,8 @@ class DaemonWatchdog(Greenlet):
     If an extended failure is detected in a daemon process (i.e. not intentional), then
     the watchdog will bark. It will also bark is an assert is raised during the running
     of any monitored Thrashers or WatchedProcesses
-    
-    When the watchdog barks it will 
+
+    When the watchdog barks it will
      - unmount file systems and send SIGTERM to all daemons.
      - stop all thrasher processes by calling their stop_and_join() method
      - stop any watched processes by calling their stop() method
@@ -61,6 +62,7 @@ class DaemonWatchdog(Greenlet):
         self.stopping = Event()
         self.thrashers = ctx.ceph[config["cluster"]].thrashers
         self.watched_processes = ctx.ceph[config["cluster"]].watched_processes
+        self.assert_trackers = ctx.ceph[config["cluster"]].assert_trackers
 
     def _run(self):
         try:
@@ -113,7 +115,7 @@ class DaemonWatchdog(Greenlet):
         daemon_timeout = int(self.config.get('daemon_timeout', 300))
         daemon_restart = self.config.get('daemon_restart', False)
         daemon_failure_time = {}
-        bark_reason: str = ""
+        bark_reason = []
         while not self.stopping.is_set():
             bark = False
             now = time.time()
@@ -131,39 +133,106 @@ class DaemonWatchdog(Greenlet):
             daemon_failures.extend(filter(lambda daemon: daemon.finished(), rgws))
             daemon_failures.extend(filter(lambda daemon: daemon.finished(), mgrs))
 
-            for daemon in daemon_failures:
-                name = daemon.role + '.' + daemon.id_
-                dt = daemon_failure_time.setdefault(name, (daemon, now))
-                assert dt[0] is daemon
-                delta = now-dt[1]
-                self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
-                if delta > daemon_timeout:
-                    bark = True
-                    bark_reason = f"Daemon {name} has failed"
-                if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
-                    self.log(f"attempting to restart daemon {name}")
-                    daemon.restart()
-
-            # If a daemon is no longer failed, remove it from tracking:
-            for name in list(daemon_failure_time.keys()):
-                if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
-                    self.log("daemon {name} has been restored".format(name=name))
-                    del daemon_failure_time[name]
-
-            for thrasher in self.thrashers:
-                if thrasher.exception is not None:
-                    self.log("{name} failed".format(name=thrasher.name))
-                    bark_reason = f"Thrasher {name} threw exception {thrasher.exception}"
-                    bark = True
-
-            for proc in self.watched_processes:
-                if proc.exception is not None:
-                    self.log("Remote process %s failed" % proc.id)
-                    bark_reason = f"Remote process {proc.id} threw exception {proc.exception}"
-                    bark = True
+            try:
+                earliest = None
+                failures = []
+                for daemon in daemon_failures:
+                    name = daemon.role + '.' + daemon.id_
+                    dt = daemon_failure_time.setdefault(name, (daemon, now))
+                    assert dt[0] is daemon
+                    delta = now-dt[1]
+                    self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
+                    if delta > daemon_timeout:
+                        bark = True
+                        failures.append(f"{name}")
+                        for at in self.assert_trackers:
+                            if at.match_id(daemon.role, daemon.id_):
+                                exception = at.get_exception()
+                                if exception is not None:
+                                    if earliest is None or at.get_failure_time() < earliest.get_failure_time():
+                                        earliest = at
+
+                    if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
+                        self.log(f"attempting to restart daemon {name}")
+                        daemon.restart()
+
+                if len(failures) > 1:
+                    # Remove common prefixes from failure list
+                    last = ""
+                    failures2 = []
+                    for f in failures:
+                        p = last.split(".")
+                        n = f.split(".")
+                        if p[0] != n[0] or len(p) == 1 or len(n) == 1:
+                            failures2.append(f)
+                        elif p[1] != n[1] or len(n) == 2:
+                            failures2.append(".".join(n[1:]))
+                        else:
+                            failures2.append(".".join(n[2:]))
+                        last = f
+                    failure_list = ",".join(failures2)
+                    bark_reason.append(f"Multiple daemons {failure_list} have timed out")
+                elif len(failures) == 1:
+                    bark_reason.append(f"Daemon {failures[0]} has timed out")
+
+                # If a daemon is no longer failed, remove it from tracking:
+                for name in list(daemon_failure_time.keys()):
+                    if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
+                        self.log("daemon {name} has been restored".format(name=name))
+                        del daemon_failure_time[name]
+
+                for thrasher in self.thrashers:
+                    if thrasher.exception is not None:
+                        self.log("{name} failed".format(name=thrasher.name))
+                        bark_reason.append(f"Thrasher {thrasher.name} threw exception {thrasher.exception}")
+                        bark = True
+
+                for proc in self.watched_processes:
+                    if proc.exception is not None:
+                        self.log("Remote process %s failed" % proc.id)
+                        bark_reason.append(f"Remote process {proc.id} threw exception {proc.exception}")
+                        bark = True
+
+                # If a thrasher or watched process failed then check for the earliest daemon failure
+                failures = []
+                if bark and earliest is None:
+                    for at in self.assert_trackers:
+                        exception = at.get_exception()
+                        if exception is not None:
+                            failures.append(f"{at.role}.{at.id_}")
+                            if earliest is None or at.get_failure_time() < earliest.get_failure_time():
+                                earliest = at
+
+                if len(failures) > 1:
+                    # Remove common prefixes from failure list
+                    last = ""
+                    failures2 = []
+                    for f in failures:
+                        p = last.split(".")
+                        n = f.split(".")
+                        if p[0] != n[0] or len(p) == 1 or len(n) == 1:
+                            failures2.append(f)
+                        elif p[1] != n[1] or len(n) == 2:
+                            failures2.append(".".join(n[1:]))
+                        else:
+                            failures2.append(".".join(n[2:]))
+                        last = f
+                    failure_list = ",".join(failures2)
+                    bark_reason.append(f"Multiple daemons {failure_list} have crashed")
+                elif len(failures) == 1:
+                    bark_reason.append(f"Daemon {failures[0]} has crashed")
+
+                # Only report details about one daemon failure
+                if earliest is not None:
+                    bark_reason.append(f"First crash: Daemon {earliest.role}.{earliest.id_} at {time.ctime(earliest.get_failure_time())}")
+                    bark_reason.append(f"{earliest.get_exception()}")
+
+            except Exception:
+                bark = True
+                bark_reason.append(f"Watchdog bug: {traceback.format_exc()}")
 
             if bark:
-                self.bark(bark_reason)
+                self.bark("\n".join(bark_reason))
                 return
 
             sleep(5)