From 63238d6e36da8c0663a3694d852a94c090857960 Mon Sep 17 00:00:00 2001
From: Bill Scales
Date: Thu, 14 Aug 2025 13:02:49 +0100
Subject: [PATCH] qa: Improve failure reporting for watchdog timeouts

A daemon such as an osd or mon crashing can lead to teuthology tests
failing; these failures are detected by the daemon watchdog. Currently
the failure message just indicates that a watchdog timeout occurred.

Add an AssertTracker class that processes the stderr output from daemon
processes, taking inspiration from teuthology's scrape.py, to capture the
assert string and backtrace if the daemon crashes.

Enhance the watchdog to report the assertion and backtrace if a daemon
has crashed, so this gets reported as the teuthology job failure reason.

Signed-off-by: Bill Scales
---
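Note for reviewers: the snippet below is a minimal, standalone sketch (not
code from this patch) of the stderr state machine that AssertTracker
implements, so it is easy to see what ends up in the failure reason. The
crash output, daemon names and addresses are invented, and the sketch leaves
out the StringIO plumbing, failure timestamps and long-line truncation of
the real class.

import re

class MiniAssertTracker:
    """Simplified stand-in for the AssertTracker added by this patch."""
    DISPLAY_BT_LINES = 6

    def __init__(self):
        self.assertion = None
        self.backtrace = None
        self.bt_lines = []
        self.in_bt = False

    def write(self, line):
        if "FAILED assert" in line or "__ceph_assert_fail" in line:
            self.assertion = line.strip()
        if line.startswith(" ceph version"):
            # " ceph version ..." marks the start of a dumped backtrace
            self.bt_lines = [line]
            self.in_bt = True
        elif line.startswith(" NOTE: a copy of the executable") or "clone()" in line:
            # end of the backtrace: keep the assert plus the first few frames
            if self.assertion is not None:
                self.bt_lines.insert(0, self.assertion + "\n")
            self.backtrace = "".join(self.bt_lines[:self.DISPLAY_BT_LINES]).strip()
            self.in_bt = False
        elif self.in_bt:
            # drop frames from unrelated libraries, as the real class does
            if "ceph" in line or "rados" in line or not re.match(r" [0-9]+: /.*", line):
                self.bt_lines.append(line)

sample_stderr = [  # entirely made-up crash output
    './src/osd/OSD.cc: 4242: FAILED assert(0 == "example bug")\n',
    " ceph version 20.0.0 (0000000) example (dev)\n",
    " 1: (OSD::do_example_thing()+0x4d2) [0x5555cafef00d]\n",
    " 2: /lib64/libc.so.6(+0x3cb20) [0x7f00deadbeef]\n",
    " 3: (Example::another_frame()+0x1b) [0x5555feedface]\n",
    " NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.\n",
]
tracker = MiniAssertTracker()
for line in sample_stderr:
    tracker.write(line)
print(tracker.backtrace)

Running it prints the assert line followed by the retained frames; the libc
frame is dropped by the library filter.
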
 qa/tasks/ceph.py           |  68 ++++++++++++++++++
 qa/tasks/daemonwatchdog.py | 139 +++++++++++++++++++++++++++----------
 2 files changed, 172 insertions(+), 35 deletions(-)

diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py
index ea860a135f9..c566cafe2d9 100644
--- a/qa/tasks/ceph.py
+++ b/qa/tasks/ceph.py
@@ -778,6 +778,7 @@ def cluster(ctx, config):
     ctx.ceph[cluster_name] = argparse.Namespace()
     ctx.ceph[cluster_name].conf = conf
     ctx.ceph[cluster_name].mons = mons
+    ctx.ceph[cluster_name].assert_trackers = []
 
     default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
     keyring_path = config.get('keyring_path', default_keyring)
@@ -1403,6 +1404,69 @@ def run_daemon(ctx, config, type_):
     :param config: Configuration
     :param type_: Role type
     """
+    class AssertTracker(StringIO):
+        MAX_BT_LINES = 100
+        DISPLAY_BT_LINES = 6
+
+        def __init__(self, cluster_name, type_, id_, logger):
+            super(AssertTracker, self).__init__()
+            self.role = cluster_name + '.' + type_
+            self.id_ = id_
+            self.bt_lines = []
+            self.assertion = None
+            self.backtrace = None
+            self.in_bt = False
+            self.logger = logger
+            self.failure_time = None
+
+        def write(self, line):
+            if "FAILED assert" in line or "__ceph_assert_fail" in line:
+                self.assertion = line.strip()
+            if line.startswith(" ceph version"):
+                # The start of a backtrace!
+                self.bt_lines = [line]
+                self.in_bt = True
+                self.failure_time = time.time()
+            elif line.startswith(" NOTE: a copy of the executable") or \
+                 line.strip().endswith("clone()") or \
+                 "clone()+0x" in line:
+                # The backtrace terminated
+                if len(self.bt_lines):
+                    if self.assertion is not None:
+                        self.bt_lines.insert(0, self.assertion)
+                        self.assertion = None
+                    # Limit number of lines of backtrace in the failure reason
+                    self.backtrace = ("".join(self.bt_lines[:self.DISPLAY_BT_LINES])).strip()
+                else:
+                    self.logger.warning("Saw end of backtrace but not start")
+                self.in_bt = False
+            elif self.in_bt:
+                # We're in a backtrace, push the line onto the list
+                if len(self.bt_lines) > self.MAX_BT_LINES:
+                    self.logger.warning("Failed to capture backtrace")
+                    self.bt_lines = ["Failed to capture backtrace", self.assertion]
+                    self.in_bt = False
+                else:
+                    # Backtrace filtering - skip lines for libraries, but include
+                    # libraries with ceph or rados in their path or name
+                    if "ceph" in line or "rados" in line or not re.match(r" [0-9]+: /.*", line):
+                        # Truncate long lines
+                        line = re.sub(r"([0-9]+: .{80}).*(\+0x[0-9a-fA-F]*\) \[0x[0-9a-fA-F]*\])", r"\1 ... \2", line)
+                        self.bt_lines.append(line)
+
+        def get_exception(self):
+            if self.backtrace is not None:
+                return Exception(self.backtrace)
+            if self.assertion is not None:
+                return Exception(self.assertion)
+            return None
+
+        def get_failure_time(self):
+            return self.failure_time
+
+        def match_id(self, role, id_):
+            return self.role == role and self.id_ == id_
+
     cluster_name = config['cluster']
     log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
     testdir = teuthology.get_testdir(ctx)
@@ -1503,13 +1567,17 @@ def run_daemon(ctx, config, type_):
         maybe_redirect_stderr(config, type_, run_cmd, log_path)
         if create_log_cmd:
             remote.sh(create_log_cmd)
+
         # always register mgr; don't necessarily start
+        asserttracker = AssertTracker(cluster_name, type_, id_, log.getChild(role))
+        ctx.ceph[cluster_name].assert_trackers.append(asserttracker)
        ctx.daemons.register_daemon(
            remote, type_, id_,
            cluster=cluster_name,
            args=run_cmd,
            logger=log.getChild(role),
            stdin=run.PIPE,
+           stderr=asserttracker,
            wait=False
        )
        if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
diff --git a/qa/tasks/daemonwatchdog.py b/qa/tasks/daemonwatchdog.py
index 7003a6b7985..c81e2d3300a 100644
--- a/qa/tasks/daemonwatchdog.py
+++ b/qa/tasks/daemonwatchdog.py
@@ -1,6 +1,7 @@
 import logging
 import signal
 import time
+import traceback
 
 from gevent import sleep
 from gevent.greenlet import Greenlet
@@ -15,7 +16,7 @@ class BarkError(Exception):
     """
     def __init__(self, bark_reason):
-        self.message = f"The WatchDog barked due to {bark_reason}"
+        self.message = f"Watchdog timeout: {bark_reason}"
         super().__init__(self.message)
 
 
 class DaemonWatchdog(Greenlet):
@@ -33,8 +34,8 @@ class DaemonWatchdog(Greenlet):
     If an extended failure is detected in a daemon process (i.e. not intentional),
     then the watchdog will bark. It will also bark is an assert is raised during
     the running of any monitored Thrashers or WatchedProcesses
-
-    When the watchdog barks it will 
+
+    When the watchdog barks it will
     - unmount file systems and send SIGTERM to all daemons.
     - stop all thrasher processes by calling their stop_and_join() method
     - stop any watched processes by calling their stop() method
@@ -61,6 +62,7 @@ class DaemonWatchdog(Greenlet):
         self.stopping = Event()
         self.thrashers = ctx.ceph[config["cluster"]].thrashers
         self.watched_processes = ctx.ceph[config["cluster"]].watched_processes
+        self.assert_trackers = ctx.ceph[config["cluster"]].assert_trackers
 
     def _run(self):
         try:
@@ -113,7 +115,7 @@
         daemon_timeout = int(self.config.get('daemon_timeout', 300))
         daemon_restart = self.config.get('daemon_restart', False)
         daemon_failure_time = {}
-        bark_reason: str = ""
+        bark_reason = []
         while not self.stopping.is_set():
             bark = False
             now = time.time()
@@ -131,39 +133,106 @@
             daemon_failures.extend(filter(lambda daemon: daemon.finished(), rgws))
             daemon_failures.extend(filter(lambda daemon: daemon.finished(), mgrs))
 
-            for daemon in daemon_failures:
-                name = daemon.role + '.' + daemon.id_
-                dt = daemon_failure_time.setdefault(name, (daemon, now))
-                assert dt[0] is daemon
-                delta = now-dt[1]
-                self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
-                if delta > daemon_timeout:
-                    bark = True
-                    bark_reason = f"Daemon {name} has failed"
-                    if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
-                        self.log(f"attempting to restart daemon {name}")
-                        daemon.restart()
-
-            # If a daemon is no longer failed, remove it from tracking:
-            for name in list(daemon_failure_time.keys()):
-                if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
-                    self.log("daemon {name} has been restored".format(name=name))
-                    del daemon_failure_time[name]
-
-            for thrasher in self.thrashers:
-                if thrasher.exception is not None:
-                    self.log("{name} failed".format(name=thrasher.name))
-                    bark_reason = f"Thrasher {name} threw exception {thrasher.exception}"
-                    bark = True
-
-            for proc in self.watched_processes:
-                if proc.exception is not None:
-                    self.log("Remote process %s failed" % proc.id)
-                    bark_reason = f"Remote process {proc.id} threw exception {proc.exception}"
-                    bark = True
+            try:
+                earliest = None
+                failures = []
+                for daemon in daemon_failures:
+                    name = daemon.role + '.' + daemon.id_
+                    dt = daemon_failure_time.setdefault(name, (daemon, now))
+                    assert dt[0] is daemon
+                    delta = now-dt[1]
+                    self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
+                    if delta > daemon_timeout:
+                        bark = True
+                        failures.append(f"{name}")
+                        for at in self.assert_trackers:
+                            if at.match_id(daemon.role, daemon.id_):
+                                exception = at.get_exception()
+                                if exception is not None:
+                                    if earliest is None or at.get_failure_time() < earliest.get_failure_time():
+                                        earliest = at
+
+                        if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
+                            self.log(f"attempting to restart daemon {name}")
+                            daemon.restart()
+
+                if len(failures) > 1:
+                    # Remove common prefixes from failure list
+                    last = ""
+                    failures2 = []
+                    for f in failures:
+                        p = last.split(".")
+                        n = f.split(".")
+                        if p[0] != n[0] or len(p) == 1 or len(n) == 1:
+                            failures2.append(f)
+                        elif p[1] != n[1] or len(n) == 2:
+                            failures2.append(".".join(n[1:]))
+                        else:
+                            failures2.append(".".join(n[2:]))
+                        last = f
+                    failure_list = ",".join(failures2)
+                    bark_reason.append(f"Multiple daemons {failure_list} have timed out")
+                elif len(failures) == 1:
+                    bark_reason.append(f"Daemon {failures[0]} has timed out")
+
+                # If a daemon is no longer failed, remove it from tracking:
+                for name in list(daemon_failure_time.keys()):
+                    if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
+                        self.log("daemon {name} has been restored".format(name=name))
+                        del daemon_failure_time[name]
+
+                for thrasher in self.thrashers:
+                    if thrasher.exception is not None:
+                        self.log("{name} failed".format(name=thrasher.name))
+                        bark_reason.append(f"Thrasher {thrasher.name} threw exception {thrasher.exception}")
+                        bark = True
+
+                for proc in self.watched_processes:
+                    if proc.exception is not None:
+                        self.log("Remote process %s failed" % proc.id)
+                        bark_reason.append(f"Remote process {proc.id} threw exception {proc.exception}")
+                        bark = True
+
+                # If a thrasher or watched process failed then check for the earliest daemon failure
+                failures = []
+                if bark and earliest is None:
+                    for at in self.assert_trackers:
+                        exception = at.get_exception()
+                        if exception is not None:
+                            failures.append(f"{at.role}.{at.id_}")
+                            if earliest is None or at.get_failure_time() < earliest.get_failure_time():
+                                earliest = at
+
+                    if len(failures) > 1:
+                        # Remove common prefixes from failure list
+                        last = ""
+                        failures2 = []
+                        for f in failures:
+                            p = last.split(".")
+                            n = f.split(".")
+                            if p[0] != n[0] or len(p) == 1 or len(n) == 1:
+                                failures2.append(f)
+                            elif p[1] != n[1] or len(n) == 2:
+                                failures2.append(".".join(n[1:]))
+                            else:
+                                failures2.append(".".join(n[2:]))
+                            last = f
+                        failure_list = ",".join(failures2)
+                        bark_reason.append(f"Multiple daemons {failure_list} have crashed")
+                    elif len(failures) == 1:
+                        bark_reason.append(f"Daemon {failures[0]} has crashed")
+
+                # Only report details about one daemon failure
+                if earliest is not None:
+                    bark_reason.append(f"First crash: Daemon {earliest.role}.{earliest.id_} at {time.ctime(earliest.get_failure_time())}")
+                    bark_reason.append(f"{earliest.get_exception()}")
+
+            except Exception:
+                bark = True
+                bark_reason.append(f"Watchdog bug {traceback.format_exc()}")
 
             if bark:
-                self.bark(bark_reason)
+                self.bark("\n".join(bark_reason))
                 return
 
             sleep(5)
-- 
2.39.5
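When several daemons time out, the watchdog now compresses the list of names
before building the failure reason. The helper below is a standalone
extraction of that loop (not code from the patch itself; the function name
and daemon names are invented) so its effect is easy to check:

def compress_names(failures):
    # Same "remove common prefixes" loop as in the watchdog, lifted out
    # so it can be run on its own.
    last = ""
    compressed = []
    for f in failures:
        p = last.split(".")
        n = f.split(".")
        if p[0] != n[0] or len(p) == 1 or len(n) == 1:
            compressed.append(f)                # different cluster (or no dots): keep the full name
        elif p[1] != n[1] or len(n) == 2:
            compressed.append(".".join(n[1:]))  # same cluster: drop the cluster prefix
        else:
            compressed.append(".".join(n[2:]))  # same cluster and type: keep only the id
        last = f
    return ",".join(compressed)

print(compress_names(["ceph.osd.0", "ceph.osd.1", "ceph.mon.a"]))
# prints: ceph.osd.0,1,mon.a

so the bark reason reads "Multiple daemons ceph.osd.0,1,mon.a have timed out"
rather than repeating the cluster name for every entry.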
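For reference, a rough illustration of the failure reason the watchdog can
now assemble. The entries follow the formats added by this patch, but the
daemon name, timestamp and assert text are invented; the joined string is
what gets passed to bark(), and if that reason ends up in a BarkError the
reworded message would read "Watchdog timeout: ...".

bark_reason = [
    "Daemon ceph.osd.3 has timed out",
    "First crash: Daemon ceph.osd.3 at Thu Aug 14 12:34:56 2025",
    './src/osd/OSD.cc: 4242: FAILED assert(0 == "example bug")',
]
print("Watchdog timeout: " + "\n".join(bark_reason))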